#!/usr/bin/perl -w package runsnppredictor; use strict; use IO::File; use Bio::EnsEMBL::Registry; ################################################################################################################## #General function ################################################################################################################## sub run($$$$$){ my ($mod, $specie, $file, $path, $date) = @_;#Options #Create data folder (if don't exist) unless (-e "$path/fic/snp_predictor_effect"){ system("mkdir fic/snp_predictor_effect"); } #Input file my $fic_snp = new IO::File $file, "r" || die "Can not open $file : $!"; if($mod == 0){#Mod 0 = rs id search0($specie,$fic_snp,$date); snp_predictor($specie,$date); } elsif($mod ==1){#Mod 1 = without rsid (the input file is just like the input of snp predictor) search1($fic_snp,$date); snp_predictor($specie,$date); } else{ die "Invalid option mod ($mod). Choose 0 if you use the rs synthax and 1 if you use other synthax.\n"; } $fic_snp->close(); } ################################################################################################################## #The next function extract and find the informations necessary for run snp_predictor using the rs ID. ################################################################################################################## sub search0($$$){ my ($specie,$fic_snp,$date) = @_; my $error = 0;#Numbers of errors founds #Connect to the Ensembl data my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous' ); my $stderr = new IO::File ">results/error_$date" || die "Can not create results/error_$date : $!";#Error file (where the untreated sequences goes) print $stderr "Untreated sequences:\n"; my $fic_out = new IO::File ">fic/snp_predictor_effect/snp_predictor_input_$date" || die "Can not create fic/snp_predictor_input_$date : $!";#Output file : input for snp_predictor_effect print "Searching for the rs id...\n"; while(<$fic_snp>){ #Check if the rs id exist in db if($_ =~ />(rs\d*)/){ my $id = $1; my $vfa = $registry->get_adaptor($specie,"variation","variationfeature"); my $vf = $registry->get_adaptor($specie,"variation","variation"); my $v = $vf->fetch_by_name($id); if(!defined($v)){ #print "Seq not found in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (not found in the Ensembl database)\n"; $error++; next; } #Take all variations for this rs (start, end, strand, allele string, chromosom) foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) { if(!defined($vf)){ #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no features found in Ensembl database)\n"; $error++; next; } #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug) my $start = $vf->start(); if(!defined($start)){ #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no start found in Ensembl database)\n"; $error++; next; } my $end = $vf->end(); if(!defined($end)){ #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no end found in Ensembl database)\n"; $error++; next; } my $strand = $vf->strand(); if(!defined($start)){ #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no strand found in Ensembl database)\n"; $error++; next; } my $alleles = $vf->allele_string(); if(!defined($start)){ #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no allele string found in Ensembl database)\n"; $error++; next; } my $chromosom = $vf->seq_region_name(); if(!defined($chromosom)){ #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n"; print $stderr "$id (no chromosom string found in Ensembl database)\n"; $error++; next; } my $name_snp = $vf->variation_name(); print $fic_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n"; } } else{ print "Error: untolerated rs sequence ($_). Skipping line.\n"; print $stderr "$_\n"; } } if($error!=0){ print "$error errors found. Consult results/error.out.\n"; } $fic_out->close(); $stderr->close(); } ####################################################################################################################### #Copy file in rsdata.txt #Save the name given by the input file in temp_name_save and cut it. ####################################################################################################################### sub search1($$){ my ($fic_snp, $date) = @_; my $rsdata = new IO::File ">fic/snp_predictor_effect/snp_predictor_input_$date" || die "Can not create fic/snp_predictor_input_$date : $!";#Output file : input for snp_predictor_effect my $temp_name_save = new IO::File ">fic/temp_name_save_$date" || die "Can not create file fic/temp_name_save_$date: $! \n";#The file containing names and output of snp_predictor_effect my $stderr = new IO::File ">results/error_$date" || die "Can not create results/error_$date : $!";#Error file (where the untreated sequences goes) while(<$fic_snp>){ if($_ =~ m&^(\S+)\s(chr)?(\S+)\s(\d+)\s\d+\s(\S+).*&){ print $temp_name_save "$3_$4_$5\t$1\n"; $_ =~ m/^\S*\s(.*)/; print $rsdata "$1\n";#Print the line without his name. } else{ print $stderr "File format problem: $_ \n Format required: Name chromosome start end strand\n"; } } $temp_name_save->close(); $rsdata->close(); $stderr->close(); } ################################################################################################################## #Run SNP predictor. #Batch : 50 SNPs at time. ################################################################################################################## sub snp_predictor($$){ my ($specie, $date) = @_; print "Running snp effect predictor.\n"; #Parse the file (for prevent the crash) my $rsdata = new IO::File "fic/snp_predictor_effect/snp_predictor_input_$date" || die "Can not open file fic/snp_predictor_effect/snp_predictor_input_$date : $! \n"; my $rsdata2 = new IO::File ">fic/snp_predictor_effect/temp-snp_predictor_input_$date" || die "Can not create temp file fic/snp_predictor_effect/temp-snp_predictor_input_$date : $! \n"; my $line = 0; while(<$rsdata>){ if($line <= 50){ print $rsdata2 $_; $line++; } else{ print $rsdata2 $_; $rsdata2->close(); `perl snp_effect_predictor.pl -s $specie -i fic/snp_predictor_effect/temp-snp_predictor_input_$date -o fic/snp_predictor_effect/temp-snp_predictor_output_$date -b 400`; #Concatenation in dbsnp.out my $dbsnp2 = new IO::File "fic/snp_predictor_effect/temp-snp_predictor_output_$date" || die "Can not open temp file fic/snp_predictor_effect/temp-snp_predictor_output_$date : $! \n"; my $dbsnp = new IO::File ">>fic/snp_predictor_effect/snp_predictor_output_$date" || die "Can not create file fic/snp_predictor_effect/snp_predictor_output_$date : $! \n"; while(<$dbsnp2>){ print $dbsnp $_; } $dbsnp->close(); $dbsnp2->close(); $rsdata2 = new IO::File ">fic/snp_predictor_effect/temp-snp_predictor_input_$date" || die "Can not create temp file fic/snp_predictor_effect/temp-snp_predictor_input_$date : $! \n";#Re-open file $line = 0; } } #For launch snp_effect_predictor on lasts SNPs `perl snp_effect_predictor.pl -s $specie -i fic/snp_predictor_effect/temp-snp_predictor_input_$date -o fic/snp_predictor_effect/temp-snp_predictor_output_$date -b 400`; my $dbsnp2 = new IO::File "fic/snp_predictor_effect/temp-snp_predictor_output_$date" || die "Can not open temp file fic/snp_predictor_effect/temp-snp_predictor_output_$date : $! \n"; my $dbsnp = new IO::File ">>fic/snp_predictor_effect/snp_predictor_output_$date" || die "Can not create file fic/snp_predictor_effect/snp_predictor_output_$date : $! \n"; while(<$dbsnp2>){ print $dbsnp $_; } $dbsnp->close(); $dbsnp2->close(); $rsdata->close(); $rsdata2->close(); system("rm -f fic/snp_predictor_effect/temp-snp_predictor_output_$date");#Clean the temp files system("rm -f fic/snp_predictor_effect/temp-snp_predictor_input_$date");#Clean the temp files } 1;