#!/usr/bin/perl -w package formatAlleleSeq; use strict; #~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); #~ require Exporter; use base 'Exporter'; #~ require Exporter; #~ @ISA = qw(Exporter); # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. # export the names of the fonction for that module (to be seen externally) our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding/; #~ $VERSION = '0.01'; use IO::File; use Bio::SeqIO; use Bio::EnsEMBL::Registry; #~ sub getEnsembl_variant-effect-format_from_rsID($$$$){ #~ #~ ($rsID_file,$outfile,$errorfile,$registry_file_path,$specie) = @_; #~ #~ my ($id,$vfa,$vf,$v,$error); #~ #~ my $registry = 'Bio::EnsEMBL::Registry'; #~ $registry->load_all($registry_file_path); #~ #~ my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!";#Error file (where the untreated sequences goes) #~ print $fh_stderr "Untreated sequences:\n"; #~ #~ my $fh_out = new IO::File ">fic/snp_predictor_effect/snp_predictor_input_$date" || die "Can not create fic/snp_predictor_input_$date : $!";#Output file : input for snp_predictor_effect #~ #~ print "Searching for the rs id...\n"; #~ #~ $error = 0;#Numbers of errors founds #~ #~ #~ while(<$fic_snp>){ #~ #~ #Check if the rs id exist in db #~ if($_ =~ /(rs\d*)/){ #~ #~ $id = $1; #~ $vfa = $registry->get_adaptor($specie,"variation","variationfeature"); #~ $vf = $registry->get_adaptor($specie,"variation","variation"); #~ $v = $vf->fetch_by_name($id); #~ #~ if(!defined($v)){ #~ #print "Seq not found in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (not found in the Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ #Take all variations for this rs (start, end, strand, allele string, chromosom) #~ foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) { #~ if(!defined($vf)){ #~ #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no features found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug) #~ my $start = $vf->start(); #~ if(!defined($start)){ #~ #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no start found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ my $end = $vf->end(); #~ if(!defined($end)){ #~ #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no end found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ my $strand = $vf->strand(); #~ if(!defined($start)){ #~ #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no strand found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ my $alleles = $vf->allele_string(); #~ if(!defined($start)){ #~ #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no allele string found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ my $chromosom = $vf->seq_region_name(); #~ if(!defined($chromosom)){ #~ #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n"; #~ print $fh_stderr "$id (no chromosom string found in Ensembl database)\n"; #~ $error++; #~ next; #~ } #~ #~ my $name_snp = $vf->variation_name(); #~ print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n"; #~ } #~ } #~ else{ #~ print "Error: untolerated rs sequence ($_). Skipping line.\n"; #~ print $fh_stderr "$_\n"; #~ } #~ } #~ #~ if($error!=0){ #~ print "$error errors found. Consult results/error.out.\n"; #~ } #~ #~ $fh_out->close(); #~ $fh_stderr->close(); #~ } # this function is used to get the non_synonymous coding SNPs # to get the corresponding protein sequence from Ensembl # to generate a fasta file with these protein sequences, 1 per allele # so 2 protein sequences per SNP sub getProteinseqfile_fromSNPeffect($$$$){ my ($specie, $infile, $outfile, $registry_file_path) = @_; print "formatAlleleSeq::create: Searching amino acid sequence for each allele...\n"; #Connect to the Ensembl data my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_all($registry_file_path); my $errorfile = $outfile; $errorfile =~s/\/[^\/]*$/\/error.txt/; my $db_in = new IO::File "$infile", "r" || die "Can not open file : $infile $! \n";#Result of snppredictor my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file my $iterator = 1; my $oname; my @tmp = (); while(<$db_in>){ ##############################################VERIF OUTPUT VARIANT EFFECT PREDICTOR FORMAT ############################################### #~ my $ligne = $_; #~ my @line = split( "\t", $ligne); #~ if ( $line[4] !~ /ENS\D{3}T\d+/ ){ #~ print "Variant format unknow\n"; #~ exit; #~ } ################################################################ NON SYNONYMOUS CODING ################################################### if($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tNON_SYNONYMOUS_CODING\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/){ if((defined($oname)) && ($oname eq $1)){#If previous name already exist (severals transcripts exists) $iterator++;#Increment $iterator } else{ $iterator = 1; } $oname = $1; @tmp = (); @tmp = split(/_/,$oname); my $type_var = length($tmp[2]); # length($3); # SNP len = 1 elsif INDEL len > 1 my $rsID = $9; #~ my $location = $2; my $gene = $4; my $transc = $5; #~ print $transc; my $position = $7; my $change = $8; #~ my $change = $1; ## If the variant is a SNP if($type_var == 3 ){ my $name = $1."_$rsID\_$transc\_transcript$iterator\[$change-$position\]"; my $allele1; my $allele2; if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2 $allele1 = $1; $allele2 = $2; } else{ print "Error found: $name\n"; print $stderr "$transc (Alleles not found)"; next; } #Get all sequences from the gene ID. my $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ); my $transcript = $transcript_adaptor->fetch_by_stable_id("$transc"); my $protein = $transcript->translate(); if(defined $protein){ #Print the 2 alleles in the outputfile my $seq = $protein->seq(); for (my $i=0; $i<2; $i++){ if($i==0){ print $db_out ">$name"."_allele1\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele1;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } elsif($i==1){ print $db_out ">$name"."_allele2\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele2;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } } } else{ print "Protein not found in Ensembl data: $transc\n"; print $stderr "$transc (Protein not found)\n"; next; } } # IF INDEL ! } =add1 ################################################################ INTRONIC, SPLICE_SITE ################################################### #Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE) elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){ } =cut } $db_in->close(); $db_out->close(); $stderr->close(); } # This function attributes 1 ID to each seq (some programs don't like huge names). sub encodeFastaProtName($$$$){ my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_; print "formatAlleleSeq::formatfile: formatting protein sequence names...\n"; my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n"; my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited #~ my $save_names_file = $outfile; #~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/; my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data my $i = 1; while(<$fic_in>){ if($_ =~ m/^>/){ chomp $_; print $sav "$_\t".$BASE_SEQ_NAME.$i."\n"; print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID $_ = <$fic_in>; print $sub $_;#Sequence $i++; } } $fic_in->close(); $sav->close(); $sub->close(); return $i; } # this function splits the initial fasta file into 1 sequence per file sub format_multifasta_to_singlefastas($$){ my ($inputfile,$outdir) = @_; print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto"); system(@args) == 0 or die "system @args failed: $?" } # this function splits a fasta file into several fasta files with -n number of sequences per file # faire le mkdir dans ergatis sub splitfastaN($$$$){ my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_; my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln); print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; ($name) = $inputfile =~ /\/([^\/]*)$/; $name =~ s/\..*//; $name = $outdir."/".$name; $count = $numseqsperfile + 1; $countfile = 0; $fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n"; $/=">"; <$fh_in>; while($line=<$fh_in>){ $line=~s/>//; ($seq) = $line =~ /[^\n]+\n(.*)/; $ln = length($seq); if($ln < $sizeseqmin){ if($count < $numseqsperfile){ print $fh_out (">$line"); $count++; } else{ unless($countfile==0){$fh_out->close()}; $countfile ++; $outfile = $name.$numseqsperfile."_".$countfile.".fasta"; #~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/; #~ print "$outfile\n"; $count = 1; $fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n"; print $fh_out (">$line"); } } } $fh_in->close(); } # GET THE ENCODING NAMES # get the encoded names and the position of the amino acid changein the protein sequence sub getEncoding($){ my ($name_encoding_file) = @_; my %encodingName = (); # initial name => short name my %encodingID = (); # short name => initial name my %name_allele = (); # short name => allele 1 or allele 2 my %name_position = (); # short name => AA position of SNP my ($line,$fh_in,$fh_out,$pos,$name,$allele); $fh_in = new IO::File $name_encoding_file || die "Can not open $name_encoding_file : $! \n"; $/="\n"; while($line=<$fh_in>){ if($line =~ m/^>\S*\s*.*/){ $line =~ /^>([^\t]+)\t([^\n]*)\n/; $encodingID{$2} = $1; $encodingName{$1} = $2; ($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/; $name_position{$name} = $pos; $name_allele{$name} = $allele; } else{ print "the protein sequences do not have the right format\n"; exit(11); } } $fh_in->close(); return \%encodingName,\%encodingID,\%name_allele,\%name_position; } 1;