#!/usr/bin/perl -w package formatAlleleSeq; use strict; #~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); #~ require Exporter; use base 'Exporter'; #~ require Exporter; #~ @ISA = qw(Exporter); # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. # export the names of the fonction for that module (to be seen externally) our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding getVEPfile_fromSNPeffect getEnsemblformat_from_rsID analyseIndelChange/; #~ $VERSION = '0.01'; use IO::File; use Bio::SeqIO; use Bio::EnsEMBL::Registry; # Only for SNPs from dbSNP (no indels) sub getEnsemblformat_from_rsID($$$$){ my ($rsID_file, $outdir ,$registry_file_path, $specie) = @_; my ($id,$vfa,$vf,$v,$error); my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_all($registry_file_path); my $errorfile = $outdir."/errors.txt"; my $fh_in = new IO::File $rsID_file || die "Can not create $rsID_file : $!"; my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!"; #Error file (where the untreated sequences goes) print $fh_stderr "Untreated sequences:\n"; my $fh_out = new IO::File ">$outdir/ensembl.txt" || die "Can not create $outdir/ensembl.txt : $!";#Output file : input for snp_predictor_effect print "Searching for the rs id...\n"; $error = 0;#Numbers of errors founds while(<$fh_in>){ #Check if the rs id exist in db if($_ =~ /(rs\d*)/){ $id = $1; $vfa = $registry->get_adaptor($specie,"variation","variationfeature"); $vf = $registry->get_adaptor($specie,"variation","variation"); $v = $vf->fetch_by_name($id); if(!defined($v)){ #print "Seq not found in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (not found in the Ensembl database)\n"; $error++; next; } #Take all variations for this rs (start, end, strand, allele string, chromosom) foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) { if(!defined($vf)){ #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no features found in Ensembl database)\n"; $error++; next; } #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug) my $start = $vf->start(); if(!defined($start)){ #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no start found in Ensembl database)\n"; $error++; next; } my $end = $vf->end(); if(!defined($end)){ #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no end found in Ensembl database)\n"; $error++; next; } my $strand = $vf->strand(); if(!defined($start)){ #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no strand found in Ensembl database)\n"; $error++; next; } my $alleles = $vf->allele_string(); if(!defined($start)){ #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no allele string found in Ensembl database)\n"; $error++; next; } my $chromosom = $vf->seq_region_name(); if(!defined($chromosom)){ #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no chromosom string found in Ensembl database)\n"; $error++; next; } my $name_snp = $vf->variation_name(); print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n"; } } else{ print "Error: untolerated rs sequence ($_). Skipping line.\n"; print $fh_stderr "$_\n"; } } if($error!=0){ print "$error errors found. Consult results/error.out.\n"; } $fh_out->close(); $fh_stderr->close(); $fh_in->close(); } # this function is used to get the non_synonymous coding SNPs # to get the corresponding protein sequence from Ensembl # to generate a fasta file with these protein sequences, 1 per allele # so 2 protein sequences per SNP sub getProteinseqfile_fromSNPeffect($$$$$$){ my ($specie, $infile, $outfile, $registry_file_path, $outvep_res, $outvep_title) = @_; print "formatAlleleSeq::create: Searching amino acid sequence for each allele...\n"; #Connect to the Ensembl data my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_all($registry_file_path); my $errorfile = $outfile; $errorfile =~s/\/[^\/]*$/\/error.txt/; my $db_in = new IO::File "$infile", "r" || die "Can not open file : $infile $! \n";#Result of snppredictor my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file my $iterator = 1; my $iterator2 = 1; my $oname; my @tmp = (); #~ my ($ref,$alt); my ($position2,$allele1_len,$start,$end,$seq_alt); my $allele1; my $allele2; my ($alt,$name,$rsID,$gene,$transc,$position,$change,$name_ini,$chr,$pos,$check_name,$geno,$consequence); my ($transcript_adaptor,$transcript,$protein,$seq); my $fh_vep_out_title = new IO::File ">$outvep_title" || die "Can not create file $outvep_title! \n";#The final db print $fh_vep_out_title "SNP Name\tChromosome\tPosition\tGene ID\tTranscript ID\tConsequence\tGenotype\tAA change\tAA position\n"; $fh_vep_out_title->close(); my $fh_vep_out = new IO::File ">$outvep_res" || die "Can not create file $outvep_res $! \n";#The final db while(<$db_in>){ next if ($_=~/^\#/); #~ $_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\t([^\t]*)\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/; @tmp = (); @tmp = split(/\t/,$_); #~ $consequence = $6; #~ $alt = $3; #~ $name_ini = $1; #~ $rsID = $10; #~ $gene = $4; #~ $transc = $5; #~ $position = $8; #~ $change = $9; #~ $consequence = $tmp[6]; $alt = $tmp[2]; $name_ini = $tmp[0]; $rsID = $tmp[12]; $gene = $tmp[3]; $transc = $tmp[4]; $position = $tmp[9]; $change = $tmp[10]; ($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/; $check_name = $check_name."/".$alt; #~ if((defined($oname)) && ($oname eq $check_name)){#If previous name already exist (severals transcripts exists) #~ $iterator2++;#Increment $iterator #~ } #~ else{ #~ $iterator2 = 1; #~ } $oname = $check_name; #~ $name = $oname."_$rsID\_$transc\_transcript$iterator2\[$change-$position\]"; $name = $oname."_$rsID\_$transc\[$change-$position\]"; ($chr,$pos,$geno) = $oname =~ /([^_]+)_([^_]+)_(.+)/; print $fh_vep_out "$name\t$chr\t$pos\t$gene\t$transc\t$consequence\t$geno\t$change\t$position\n"; ############################################## VERIF OUTPUT VARIANT EFFECT PREDICTOR FORMAT ############################################### #~ my $ligne = $_; #~ my @line = split( "\t", $ligne); #~ if ( $line[4] !~ /ENS\D{3}T\d+/ ){ #~ print "Variant format unknow\n"; #~ exit; #~ } ################################################################ NON SYNONYMOUS CODING ################################################### if(($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tmissense_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/) || ($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tinitiator_codon_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/)){ #~ $alt = $3; #~ $name_ini = $1; #~ $rsID = $9; #~ $gene = $4; #~ $transc = $5; #~ $position = $7; #~ $change = $8; @tmp = (); @tmp = split(/\t/,$_); $consequence = $tmp[6]; $alt = $tmp[2]; $name_ini = $tmp[0]; $rsID = $tmp[12]; $gene = $tmp[3]; $transc = $tmp[4]; $position = $tmp[9]; $change = $tmp[10]; ($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/; $check_name = $check_name."/".$alt; #~ if((defined($oname)) && ($oname eq $check_name)){#If previous name already exist (severals transcripts exists) #~ $iterator++;#Increment $iterator #~ } #~ else{ #~ $iterator = 1; #~ } #~ $oname = $1; # in case where theere are more than 1 alternatvie allele compared to the reference # the name will be based on: Ref / ALt considered for that line in the vep file # for these cases, the annotation pipeline will consider only the comparison (lost / gain) # between the reference and the alternative base considered (not betwwen different ALT) # note: if there are more than 1 alt: # VEP will retreive 1 line per ALT #~ ($oname) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/; #~ $oname = $oname."/".$alt; $oname = $check_name; #~ print ">>$oname\n"; @tmp = (); @tmp = split(/_/,$oname); my $type_var = length($tmp[2]); # length($3); # SNP len = 1 elsif INDEL len > 1 ## If the variant is a SNP if(($type_var == 3) and (not $name_ini =~ /\/-$/) and (not $name_ini =~ /-\/.*$/)){### #~ my $name = $1."_$rsID\_$transc\_transcript$iterator\[$change-$position\]"; $name = $oname."_$rsID\_$transc\[$change-$position\]"; #~ $allele1; #~ $allele2; if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2 $allele1 = $1; $allele2 = $2; } else{ print "Error found: $name\n"; print $stderr "$transc (Alleles not found)"; next; } #Get all sequences from the gene ID. $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ); $transcript = $transcript_adaptor->fetch_by_stable_id("$transc"); #~ print "*$transc:$transcript*\n"; $protein = $transcript->translate(); if(defined $protein){ #Print the 2 alleles in the outputfile $seq = $protein->seq(); for (my $i=0; $i<2; $i++){ if($i==0){ print $db_out ">$name"."_allele1\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele1;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } elsif($i==1){ print $db_out ">$name"."_allele2\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele2;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } } } else{ print "Protein not found in Ensembl data: $transc\n"; print $stderr "$transc (Protein not found)\n"; next; } } # IF INDEL ! else{ # get the left and right parts of the protein sequence # concatenate: left + del / insert + right parts # get the indel sequence : $alt ($allele1,$allele2) = $change =~ /([^\/]+)\/(.*)/; # get the left and right parts $name = $oname."_$rsID\_$transc\[$change-$position\]"; #Get all sequences from the gene ID. $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ); $transcript = $transcript_adaptor->fetch_by_stable_id("$transc"); $protein = $transcript->translate(); if(defined $protein){ $seq = $protein->seq(); #Print the 2 alleles in the outputfile print $db_out ">$name"."_allele1\n"; print $db_out "$seq\n"; if(($name_ini =~ /\/-$/) or ($name_ini =~ /-\/.*$/)){### # deletion in the reference if($name_ini =~ /\/-$/){### print $db_out ">$name"."_allele2\n";### $seq_alt = substr($seq,0,$position-1);#First part of seq ### $seq_alt = $seq_alt.substr($seq,$position);#Second part of seq### print $db_out "$seq_alt\n"; ### }### # insertion in the reference if($name_ini =~ /-\/.*$/){### print $db_out ">$name"."_allele2\n";### $seq_alt = substr($seq,0,$position-1);#First part of seq ### $seq_alt = $seq_alt."$allele2"; # insertion ### $seq_alt = $seq_alt.substr($seq,$position);#Second part of seq### print $db_out "$seq_alt\n"; ### }### }### else{### ($start,$end) = $position =~ /(\d+)-(\d+)/; print $db_out ">$name"."_allele2\n"; $seq_alt = substr($seq,0,$start-1);#First part of seq $seq_alt = $seq_alt."$allele2"; $seq_alt = $seq_alt.substr($seq,$end);#Second part of seq ###!!!!! print $db_out "$seq_alt\n"; }### } else{ print "Protein not found in Ensembl data: $transc\n"; print $stderr "$transc (Protein not found)\n"; next; } } } =add1 ################################################################ INTRONIC, SPLICE_SITE ################################################### #Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE) elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){ } =cut } $db_in->close(); $db_out->close(); $stderr->close(); $fh_vep_out->close(); } # This function attributes 1 ID to each seq (some programs don't like huge names). sub encodeFastaProtName($$$$){ my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_; print "formatAlleleSeq::formatfile: formatting protein sequence names...\n"; my ($change,$type_var); my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n"; my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited #~ my $save_names_file = $outfile; #~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/; my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data my $i = 1; while(<$fic_in>){ if($_ =~ m/^>/){ chomp $_; ## EXAMPLES OF INDELS #~ >X_3359812_TTCTTCTTCTTCTT/TTCTTCTTCTT_-_ENSBTAT00000014275[KKKKK/KKKK-38-42]_allele1 #~ >X_138471854_AAT/-_-_ENSBTAT00000004339[I/--159]_allele2 #~ >X_14308536_-/ATC_-_ENSBTAT00000013812[P/RS-6023]_allele1 #~ >X_733293_G/A_-_ENSBTAT00000000209[P/S-600]_allele2 ($change) = $_ =~ /\[([^\/]+\/.+)\-{1}\d*\-*\d+\]_allele\d*/; if((length($change) > 3) or ($change =~ /\-/)){ $type_var = "indel"; } else{ $type_var = "snp"; } print $sav "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n"; print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID $_ = <$fic_in>; print $sub $_;#Sequence $i++; } } $fic_in->close(); $sav->close(); $sub->close(); return $i; } # this function splits the initial fasta file into 1 sequence per file sub format_multifasta_to_singlefastas($$){ my ($inputfile,$outdir) = @_; print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto"); system(@args) == 0 or die "system @args failed: $?" } # this function splits a fasta file into several fasta files with -n number of sequences per file # faire le mkdir dans ergatis sub splitfastaN($$$$){ my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_; my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln); print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; ($name) = $inputfile =~ /\/([^\/]*)$/; $name =~ s/\..*//; $name = $outdir."/".$name; $count = $numseqsperfile + 1; $countfile = 0; $fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n"; $/=">"; <$fh_in>; while($line=<$fh_in>){ $line=~s/>//; ($seq) = $line =~ /[^\n]+\n(.*)/; $ln = length($seq); if($ln < $sizeseqmin){ if($count < $numseqsperfile){ print $fh_out (">$line"); $count++; } else{ unless($countfile==0){$fh_out->close()}; $countfile ++; $outfile = $name.$numseqsperfile."_".$countfile.".fasta"; #~ print "$outfile\n"; #~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/; #~ print "$outfile\n"; $count = 1; $fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n"; print $fh_out (">$line"); } } } $fh_in->close(); } # GET THE ENCODING NAMES # get the encoded names and the position of the amino acid changein the protein sequence sub getEncoding($){ my ($name_encoding_file) = @_; my %encodingName = (); # initial name => short name my %encodingID = (); # short name => initial name my %name_allele = (); # short name => allele 1 or allele 2 my %name_position = (); # short name => AA position of SNP my %name_var_type = (); # short name => type of variation (snp, indel) my ($line,$fh_in,$fh_out,$pos,$name,$allele,$var_type); $fh_in = new IO::File $name_encoding_file || die "Can not open $name_encoding_file : $! \n"; $/="\n"; while($line=<$fh_in>){ if($line =~ m/^>\S*\s*.*/){ $line =~ /^>([^\t]+)\t([^\t]*)\t([^\n]*)\n/; $encodingID{$2} = $1; $encodingName{$1} = $2; #~ ($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/; ($pos,$allele,$name,$var_type) = $line =~ /\-(\d+\-*\d*)\]_(allele\d)\t([^\t]*)\t(.*)$/;### #~ print "$pos,$allele,*$name*,$var_type\n"; $name_position{$name} = $pos; $name_allele{$name} = $allele; $name_var_type{$name} = $var_type;### } else{ print "the protein sequences do not have the right format\n"; exit(11); } } $fh_in->close(); return \%encodingName,\%encodingID,\%name_allele,\%name_position,\%name_var_type;### } sub getVEPfile_fromSNPeffect($$$){ my ($infile,$out_vep_dir,$type) = @_; my (@type,$fh_in,$fh_out,@line, @args); my $nb_snp = 0; #~ @type = split(",", $type); #~ foreach my $var (@type){ my ($vep_var_result) = $out_vep_dir."/".$type."_file.vep"; ## ADD HERE NEW CATEGORY if ($type eq "NSC"){ #~ $type = "NON\_SYNONYMOUS\_CODING"; $type = "(missense\_variant|initiator_codon_variant)"; } elsif($type eq "STOP"){ #~ $type = "STOP\_(GAINED|LOST)"; $type = "stop\_(gained)"; } $fh_in = new IO::File $infile, "r" || die "Can not open $infile : $! \n"; $/="\n"; $fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n"; while(<$fh_in>){ @line = (); @line = split("\t", $_); next if ($line[0] =~/^#/); if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){ print $fh_out (join ("\t",@line)); $nb_snp++; } } $fh_in->close(); $fh_out->close(); #~ } } sub getVEPfile_fromSNPeffect_old($$$){ my ($infile,$out_vep_dir,$type) = @_; my (@type,$fh_in,$fh_out,@line, @args); my $nb_snp = 0; @type = split(",", $type); foreach my $var (@type){ #~ my ($vep_var_dir) = $out_vep_dir."/".$var."_vep_dir"; #~ system("mkdir $vep_var_dir"); my ($vep_var_result) = $out_vep_dir."/".$var."_file.vep"; #~ print $vep_var_dir."\n".$vep_var_result."\n\n"; ## ADD HERE NEW CATEGORY if ($var eq "NSC"){ $type = "NON\_SYNONYMOUS\_CODING" } elsif($var eq "STOP"){ $type = "STOP\_(GAINED|LOST)"; } print "$var:$type\n$infile\n"; $fh_in = new IO::File $infile, "r" || die "Can not open $infile : $! \n"; $/="\n"; $fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n"; while(<$fh_in>){ #print "*$_*\n"; @line = (); @line = split("\t", $_); #print "*$line[0]*: $line[6]\n"; next if ($line[0] =~/^#/); #print "heho $line[6]\t$line[2]"; if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){ print $fh_out (join ("\t",@line)); #print "ici"; $nb_snp++; } } $fh_in->close(); $fh_out->close(); #~ # if number of variants input is greather than snp_number_per_file, we split files #~ if ($nb_snp > $snp_number_per_file){ #~ chdir "$vep_var_dir" or die "cannot change dir getVEPfile_fromSNPeffect chdir $vep_var_dir : $?"; #~ #~ @args = ("split", "-l", $snp_number_per_file, "-d", $vep_var_result, "vep_".$var); #~ system(@args) == 0 or die "system @args failed: $?"; #~ #~ @args = ("rm", $vep_var_result); #~ system(@args) == 0 or die "system @args failed: $?"; #~ } } } # for INDELS only # This subroutine checks according to the VEP formats output types for indels ( I/--pos, -/I-pos, VN/VKN-pos, KKK/KKKK,pos-pos) # if the REF or ALT has to be checked for new signals sub analyseIndelChange($){ my ($info) = @_; my ($ref,$alt,$pos,$change,$start,$end,$indel,$end_alt); $ref=""; $alt=""; $start = 0; $end = 0; $end_alt = 0; ## EXAMPLES OF INDELS: #~ KKKKK/KKKK-38-42 #~ I/--159 #~ P/RS-6023 #~ P/S-600 #~ KKKKK/KKKK-38-42 if($info =~ /^[^\/]+\/[^\-]+\-\d+\-\d+$/){ # positions for the reference start and end ($ref,$alt,$start,$end) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)\-(\d+)$/; # Positions for the alternative $end_alt = $start + length($alt) - 1; } #~ I/--159 elsif($info =~ /^[^\/]+\/\-\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^([^\/]+)\/(\-)\-(\d+)$/; $start = $pos; $end = $pos; $end_alt = 0; # deletion => No bases to look at on the alternative ! } #~ -/I-159 elsif($info =~ /^\-\/[^\/]+\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^(\-)\/([^\/]+)\-(\d+)$/; $start = $pos; $end = 0; # deletion => No bases to look at on the reference ! $end_alt = $pos; } #~ P/RS-6023 #~ P/S-600 elsif($info =~ /^[^\/]+\/[^\-]+\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)$/; $start = $pos; $end = $pos; $end_alt = $start + length($alt) - 1; } else{ print ">>UNKNWON CASE: $info\n"; } #~ print "************************\n"; #~ print "$info\n"; #~ print "$ref,$alt,$start,$end,$end_alt\n";#>PFFS,PFS-203,206,PFFSPFS-203 #~ print "************************\n"; return ($start,$end,$end_alt); } 1;