#!/usr/bin/perl -w package formatAlleleSeq; use strict; #~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); #~ require Exporter; use base 'Exporter'; use File::Basename; #~ require Exporter; #~ @ISA = qw(Exporter); # Items to export into callers namespace by default. Note: do not export # names by default without a very good reason. Use EXPORT_OK instead. # Do not simply export all your public functions/methods/constants. # export the names of the fonction for that module (to be seen externally) our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding getVEPfile_fromSNPeffect getEnsemblformat_from_rsID analyseIndelChange/; #~ $VERSION = '0.01'; use IO::File; use Bio::SeqIO; #~ printf STDERR "%s\n",join("\n",@INC); use Bio::EnsEMBL::Registry; use Bio::Seq; use Config::IniFiles; # Only for SNPs from dbSNP (no indels) sub getEnsemblformat_from_rsID($$$$){ my ($rsID_file, $outdir ,$registry_file_path, $specie) = @_; my ($id,$vfa,$vf,$v,$error); my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_all($registry_file_path); my $errorfile = $outdir."/errors.txt"; my $fh_in = new IO::File $rsID_file || die "Can not create $rsID_file : $!"; my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!"; #Error file (where the untreated sequences goes) print $fh_stderr "Untreated sequences:\n"; my $fh_out = new IO::File ">$outdir/ensembl.txt" || die "Can not create $outdir/ensembl.txt : $!";#Output file : input for snp_predictor_effect print "Searching for the rs id...\n"; $error = 0;#Numbers of errors founds while(<$fh_in>){ #Check if the rs id exist in db if($_ =~ /(rs\d*)/){ $id = $1; $vfa = $registry->get_adaptor($specie,"variation","variationfeature"); $vf = $registry->get_adaptor($specie,"variation","variation"); $v = $vf->fetch_by_name($id); if(!defined($v)){ #print "Seq not found in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (not found in the Ensembl database)\n"; $error++; next; } #Take all variations for this rs (start, end, strand, allele string, chromosom) foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) { if(!defined($vf)){ #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no features found in Ensembl database)\n"; $error++; next; } #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug) my $start = $vf->start(); if(!defined($start)){ #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no start found in Ensembl database)\n"; $error++; next; } my $end = $vf->end(); if(!defined($end)){ #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no end found in Ensembl database)\n"; $error++; next; } my $strand = $vf->strand(); if(!defined($start)){ #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no strand found in Ensembl database)\n"; $error++; next; } my $alleles = $vf->allele_string(); if(!defined($start)){ #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no allele string found in Ensembl database)\n"; $error++; next; } my $chromosom = $vf->seq_region_name(); if(!defined($chromosom)){ #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n"; print $fh_stderr "$id (no chromosom string found in Ensembl database)\n"; $error++; next; } my $name_snp = $vf->variation_name(); print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n"; } } else{ print "Error: untolerated rs sequence ($_). Skipping line.\n"; print $fh_stderr "$_\n"; } } if($error!=0){ print "$error errors found. Consult results/error.out.\n"; } $fh_out->close(); $fh_stderr->close(); $fh_in->close(); } # this function is used to get the non_synonymous coding SNPs # to get the corresponding protein sequence from Ensembl # to generate a fasta file with these protein sequences, 1 per allele # so 2 protein sequences per SNP sub getProteinseqfile_fromSNPeffect($$$$$$$$){ print "formatAlleleSeq::getProteinseqfile_fromSNPeffect: obtaining protein sequences ...\n"; my ($specie, $infile, $outfile, $registry_file_path, $outvep_res, $outvep_title,$gff_file, $genome_fasta_file) = @_; my $cfg = Config::IniFiles->new( -file => "/usr/local/bioinfo/src/galaxy-test/galaxy-dist/PATH.ini" ); my $file_path = $cfg->val( 'workPath', 'FILEPATH_DEV' ); print STDOUT "Warning : use of FILE PATH from PATH.ini !!!\n\n"; # get the path to the files my $dirresults = dirname($infile); `cd $dirresults; `; print STDOUT "getProteinseqfile_fromSNPeffect WORKING DIR:$dirresults\n$infile\n\n"; #Connect to the Ensembl data my $registry = 'Bio::EnsEMBL::Registry'; $registry->load_all($registry_file_path); my $errorfile = $outfile; $errorfile =~s/\/[^\/]*$/\/error.txt/; my $db_in = new IO::File "$infile", "r" || die "Can not open file : $infile $! \n";#Result of snppredictor my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file my $iterator = 1; my $iterator2 = 1; my $oname; my @tmp = (); #~ my ($ref,$alt); my ($position2,$allele1_len,$start,$end,$seq_alt); my $allele1; my $allele2; my ($alt,$name,$rsID,$gene,$transc,$position,$change,$name_ini,$chr,$pos,$check_name,$geno,$consequence); my ($transcript_adaptor,$transcript,$protein,$seq,$strand,$sift,$polyphen); my $fh_vep_out_title = new IO::File ">$outvep_title"; defined($fh_vep_out_title)|| die "Can not create file $outvep_title! \n";#The final db print $fh_vep_out_title "SNP Name\tChromosome\tPosition\tGene ID\tTranscript ID\tConsequence\tGenotype\tAA change\tAA position\tStrand\tSift prediction\tPolyphen prediction (human only)\n"; $fh_vep_out_title->close(); my $fh_vep_out = new IO::File ">$outvep_res" || die "Can not create file $outvep_res $! \n";#The final db my $transcript_file = "transcripts.fa"; # case of reference not included in Ensembl DB my ($transcript_id,$transcript_seq); my %transcript_list=(); ################################# Function Code ############################################# # case reference genome not found in Ensembl print ">>genome_fasta: $genome_fasta_file - $gff_file\n"; #~ if(not defined $genome_fasta_file){ if($genome_fasta_file ne "no" and defined $gff_file ne "no"){ # get the transcript sequences # STEP 1 à lancer => génére un fichier d'index # index the fasta file #~ `cp $genome_fasta_file "$dirresults/genome.fasta"; cp $gff_file "$dirresults/annot.gff";`; `samtools faidx "$genome_fasta_file" `; # STEP 2 à lancer => génère un fichier de transcripts # get the transcripts `gffread -x $transcript_file -g "$genome_fasta_file" "$gff_file"`; #~ print "gffread -x $transcript_file -g $fasta $gff\n\n"; # STEP 3 de galaxy : protseq.... # prend en entrée: # - fichier d'indexation du .fasta génomic # - fichier des trasnscripts .fa # - fichier .vep my $fh_fasta_in=new IO::File "$transcript_file" || die "Can not create file $transcript_file $! \n"; $/=">"; while(my $line=<$fh_fasta_in>){ my ($name)=$line=~/([^\s]+)/; my $seq=$line; $seq=~s/[^\n]+\n//; $seq=~s/>//; $seq=~s/\n//g; $transcript_list{$name}=$seq; } $fh_fasta_in->close(); } $/="\n"; while(<$db_in>){ next if ($_=~/^\#/); #~ print "$_"; @tmp = (); @tmp = split(/\t/,$_); $consequence = $tmp[6]; $alt = $tmp[2]; $name_ini = $tmp[0]; $rsID = $tmp[12]; $gene = $tmp[3]; $transc = $tmp[4]; $position = $tmp[9]; $change = $tmp[10]; if($tmp[13] =~/STRAND/){($strand) = $tmp[13] =~ /STRAND=(-*1)/;} else{$strand="";} if($tmp[13] =~/SIFT/){($sift) = $tmp[13] =~ /SIFT=([^\)]*\))/;} else{$sift="";} if($tmp[13] =~/PolyPhen/){($polyphen) = $tmp[13] =~ /PolyPhen=([^\)]*\))/;} else{$polyphen="";} ($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/; $check_name = $check_name."/".$alt; $oname = $check_name; $name = $oname."_$rsID\_$transc\[$change-$position\]"; ($chr,$pos,$geno) = $oname =~ /([^_]+)_([^_]+)_(.+)/; print $fh_vep_out "$name\t$chr\t$pos\t$gene\t$transc\t$consequence\t$geno\t$change\t$position\t$strand\t$sift\t$polyphen\n"; if(($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tmissense_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/) || ($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tinitiator_codon_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/)){ @tmp = (); @tmp = split(/\t/,$_); $consequence = $tmp[6]; $alt = $tmp[2]; $name_ini = $tmp[0]; $rsID = $tmp[12]; $gene = $tmp[3]; $transc = $tmp[4]; $position = $tmp[9]; $change = $tmp[10]; ($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/; $check_name = $check_name."/".$alt; # in case where theere are more than 1 alternatvie allele compared to the reference # the name will be based on: Ref / ALt considered for that line in the vep file # for these cases, the annotation pipeline will consider only the comparison (lost / gain) # between the reference and the alternative base considered (not betwwen different ALT) # note: if there are more than 1 alt: # VEP will retreive 1 line per ALT $oname = $check_name; # modification 28/08/15 # case where the seqname contains "_" !!! my ($var) = $oname =~ /.+_\d+_([^_]+)$/; my $type_var = length($var); # length($3); # SNP len = 1 elsif INDEL len > 1 ## If the variant is a SNP if(($type_var == 3) and (not $name_ini =~ /\/-$/) and (not $name_ini =~ /-\/.*$/)){### $name = $oname."_$rsID\_$transc\[$change-$position\]"; if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2 $allele1 = $1; $allele2 = $2; } else{ print "Error found: $name\n"; print $stderr "$transc (Alleles not found)"; next; } # get the transcript sequence from the transcript name: $transc ##### Without the reference in Ensembl #~ if(not defined $genome_fasta_file){ if($genome_fasta_file ne "no" and $gff_file ne "no"){ if(defined $transcript_list{$transc}){ $transcript_seq=$transcript_list{$transc}; $transcript = Bio::Seq->new(-seq => "$transcript_seq", -alphabet => 'dna' ); } else{ print "the transcript from the .vep ($transc) has a different nomenclature than the .gff one\n"; exit; } } ##### with reference in ENsembl else{ #**************************** #Get all sequences from the gene ID. $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ) || die "registry->get_adaptor $specie failed"; $transcript = $transcript_adaptor->fetch_by_stable_id("$transc") || die "transcript_adaptor->fetch_by_stable_id $transc failed"; #**************************** } #~ print "*$transc:$transcript*\n"; $protein = $transcript->translate(); if(defined $protein){ #Print the 2 alleles in the outputfile $seq = $protein->seq(); $seq =~s/\*$//; for (my $i=0; $i<2; $i++){ if($i==0){ print $db_out ">$name"."_allele1\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele1;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } elsif($i==1){ print $db_out ">$name"."_allele2\n"; print $db_out substr($seq,0,$position-1);#First part of seq print $db_out $allele2;#The amino acid wich change print $db_out substr($seq,$position);#Second part of seq print $db_out "\n"; } } } else{ print "Protein not found in Ensembl data: $transc\n"; print $stderr "$transc (Protein not found)\n"; next; } } # IF INDEL ! else{ # get the left and right parts of the protein sequence # concatenate: left + del / insert + right parts # get the indel sequence : $alt ($allele1,$allele2) = $change =~ /([^\/]+)\/(.*)/; # get the left and right parts $name = $oname."_$rsID\_$transc\[$change-$position\]"; # get the transcript sequence from the transcript name: $transc ##### Without the reference in Ensembl #~ if(not defined $genome_fasta_file){ if($genome_fasta_file ne "no" and $gff_file ne "no"){ if(defined $transcript_list{$transc}){ $transcript_seq=$transcript_list{$transc}; $transcript = Bio::Seq->new(-seq => "$transcript_seq", -alphabet => 'dna' ); } else{ print "the transcript from the .vep ($transc) has a different nomenclature than the .gff one\n"; exit; } } ##### with reference in ENsembl else{ #**************************** #Get all sequences from the gene ID. $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ); $transcript = $transcript_adaptor->fetch_by_stable_id("$transc"); #**************************** } $protein = $transcript->translate(); if(defined $protein){ $seq = $protein->seq(); $seq =~s/\*$//; #Print the 2 alleles in the outputfile print $db_out ">$name"."_allele1\n"; print $db_out "$seq\n"; if(($name_ini =~ /\/-$/) or ($name_ini =~ /-\/.*$/)){### # deletion in the reference if($name_ini =~ /\/-$/){### print $db_out ">$name"."_allele2\n";### $seq_alt = substr($seq,0,$position-1);#First part of seq ### $seq_alt = $seq_alt.substr($seq,$position);#Second part of seq### print $db_out "$seq_alt\n"; ### }### # insertion in the reference if($name_ini =~ /-\/.*$/){### print $db_out ">$name"."_allele2\n";### $seq_alt = substr($seq,0,$position-1);#First part of seq ### $seq_alt = $seq_alt."$allele2"; # insertion ### $seq_alt = $seq_alt.substr($seq,$position);#Second part of seq### print $db_out "$seq_alt\n"; ### }### }### else{### ($start,$end) = $position =~ /(\d+)-(\d+)/; print $db_out ">$name"."_allele2\n"; $seq_alt = substr($seq,0,$start-1);#First part of seq $seq_alt = $seq_alt."$allele2"; $seq_alt = $seq_alt.substr($seq,$end);#Second part of seq ###!!!!! print $db_out "$seq_alt\n"; }### } else{ print "Protein not found in Ensembl data: $transc\n"; print $stderr "$transc (Protein not found)\n"; next; } } } =add1 ################################################################ INTRONIC, SPLICE_SITE ################################################### #Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE) elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){ } =cut } #~ `cp $outvep_res "$dirresults/results_vep.txt"; cp $outfile "$dirresults/results.txt"; cp $errorfile "$dirresults/error.txt"`; $db_in->close(); $db_out->close(); $stderr->close(); $fh_vep_out->close(); } # This function attributes 1 ID to each seq (some programs don't like huge names). sub encodeFastaProtName($$$$){ my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_; print "formatAlleleSeq::encodeFastaProtName: formatting protein sequence names...\n"; my ($change,$type_var); my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n"; my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited #~ my $save_names_file = $outfile; #~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/; my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data my $i = 1; while(<$fic_in>){ if($_ =~ m/^>/){ chomp $_; #~ print STDOUT ">>>>$_"; ## EXAMPLES OF INDELS #~ >X_3359812_TTCTTCTTCTTCTT/TTCTTCTTCTT_-_ENSBTAT00000014275[KKKKK/KKKK-38-42]_allele1 #~ >X_138471854_AAT/-_-_ENSBTAT00000004339[I/--159]_allele2 #~ >X_14308536_-/ATC_-_ENSBTAT00000013812[P/RS-6023]_allele1 #~ >X_733293_G/A_-_ENSBTAT00000000209[P/S-600]_allele2 ($change) = $_ =~ /\[([^\/]+\/.+)\-{1}\d*\-*\d+\]_allele\d*/; if((length($change) > 3) or ($change =~ /\-/)){ $type_var = "indel"; } else{ $type_var = "snp"; } print $sav "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n"; #~ print STDOUT "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n"; print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID $_ = <$fic_in>; print $sub $_;#Sequence $i++; } } $fic_in->close(); $sav->close(); $sub->close(); return $i; } # this function splits the initial fasta file into 1 sequence per file sub format_multifasta_to_singlefastas($$){ my ($inputfile,$outdir) = @_; print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto"); system(@args) == 0 or die "system @args failed: $?" } # this function splits a fasta file into several fasta files with -n number of sequences per file # faire le mkdir dans ergatis sub splitfastaN($$$$){ my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_; my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln); print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n"; ($name) = $inputfile =~ /\/([^\/]*)$/; $name =~ s/\..*//; $name = $outdir."/".$name; $count = $numseqsperfile + 1; $countfile = 0; $fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n"; $/=">"; <$fh_in>; while($line=<$fh_in>){ $line=~s/>//; ($seq) = $line =~ /[^\n]+\n(.*)/; $ln = length($seq); if($ln < $sizeseqmin){ if($count < $numseqsperfile){ print $fh_out (">$line"); $count++; } else{ unless($countfile==0){$fh_out->close()}; $countfile ++; $outfile = $name.$numseqsperfile."_".$countfile.".fasta"; #~ print "$outfile\n"; #~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/; #~ print "$outfile\n"; $count = 1; $fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n"; print $fh_out (">$line"); } } } $fh_in->close(); } # GET THE ENCODING NAMES # get the encoded names and the position of the amino acid changein the protein sequence sub getEncoding($){ my ($name_encoding_file) = @_; my %encodingName = (); # initial name => short name my %encodingID = (); # short name => initial name my %name_allele = (); # short name => allele 1 or allele 2 my %name_position = (); # short name => AA position of SNP my %name_var_type = (); # short name => type of variation (snp, indel) my ($line,$fh_in,$fh_out,$pos,$name,$allele,$var_type); $fh_in = new IO::File $name_encoding_file; defined($fh_in)|| die "Can not open $name_encoding_file : $! \n"; $/="\n"; while($line=<$fh_in>){ if($line =~ m/^>\S*\s*.*/){ $line =~ /^>([^\t]+)\t([^\t]*)\t([^\n]*)\n/; $encodingID{$2} = $1; $encodingName{$1} = $2; #~ ($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/; ($pos,$allele,$name,$var_type) = $line =~ /\-(\d+\-*\d*)\]_(allele\d)\t([^\t]*)\t(.*)$/;### #~ print "$pos,$allele,*$name*,$var_type\n"; $name_position{$name} = $pos; $name_allele{$name} = $allele; $name_var_type{$name} = $var_type;### } else{ print "$name_encoding_file the protein sequences do not have the right format\n$line \n"; exit(11); } } $fh_in->close(); return \%encodingName,\%encodingID,\%name_allele,\%name_position,\%name_var_type;### } sub getVEPfile_fromSNPeffect($$$){ my ($infile,$out_vep_dir,$type) = @_; my (@type,$fh_in,$fh_out,@line, @args); my $nb_snp = 0; #~ @type = split(",", $type); #~ foreach my $var (@type){ my ($vep_var_result) = $out_vep_dir."/".$type."_file.vep"; ## ADD HERE NEW CATEGORY if ($type eq "NSC"){ #~ $type = "NON\_SYNONYMOUS\_CODING"; $type = "(missense\_variant|initiator_codon_variant)"; } elsif($type eq "STOP"){ #~ $type = "STOP\_(GAINED|LOST)"; $type = "stop\_(gained)"; } $fh_in = new IO::File $infile, "r" ; defined $fh_in || die "Can not open $infile : $! \n"; $/="\n"; $fh_out = new IO::File $vep_var_result, "w"; defined $fh_out || die "Can not create $vep_var_result: $! \n"; while(<$fh_in>){ @line = (); @line = split("\t", $_); next if ($line[0] =~/^#/); if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){ print $fh_out (join ("\t",@line)); $nb_snp++; } } $fh_in->close(); $fh_out->close(); #~ } } sub getVEPfile_fromSNPeffect_old($$$){ my ($infile,$out_vep_dir,$type) = @_; my (@type,$fh_in,$fh_out,@line, @args); my $nb_snp = 0; @type = split(",", $type); foreach my $var (@type){ #~ my ($vep_var_dir) = $out_vep_dir."/".$var."_vep_dir"; #~ system("mkdir $vep_var_dir"); my ($vep_var_result) = $out_vep_dir."/".$var."_file.vep"; #~ print $vep_var_dir."\n".$vep_var_result."\n\n"; ## ADD HERE NEW CATEGORY if ($var eq "NSC"){ $type = "NON\_SYNONYMOUS\_CODING" } elsif($var eq "STOP"){ $type = "STOP\_(GAINED|LOST)"; } print "$var:$type\n$infile\n"; $fh_in = new IO::File $infile, "r" || die "Can not open $infile : $! \n"; $/="\n"; $fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n"; while(<$fh_in>){ #print "*$_*\n"; @line = (); @line = split("\t", $_); #print "*$line[0]*: $line[6]\n"; next if ($line[0] =~/^#/); #print "heho $line[6]\t$line[2]"; if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){ print $fh_out (join ("\t",@line)); #print "ici"; $nb_snp++; } } $fh_in->close(); $fh_out->close(); #~ # if number of variants input is greather than snp_number_per_file, we split files #~ if ($nb_snp > $snp_number_per_file){ #~ chdir "$vep_var_dir" or die "cannot change dir getVEPfile_fromSNPeffect chdir $vep_var_dir : $?"; #~ #~ @args = ("split", "-l", $snp_number_per_file, "-d", $vep_var_result, "vep_".$var); #~ system(@args) == 0 or die "system @args failed: $?"; #~ #~ @args = ("rm", $vep_var_result); #~ system(@args) == 0 or die "system @args failed: $?"; #~ } } } # for INDELS only # This subroutine checks according to the VEP formats output types for indels ( I/--pos, -/I-pos, VN/VKN-pos, KKK/KKKK,pos-pos) # if the REF or ALT has to be checked for new signals sub analyseIndelChange($){ my ($info) = @_; my ($ref,$alt,$pos,$change,$start,$end,$indel,$end_alt); $ref=""; $alt=""; $start = 0; $end = 0; $end_alt = 0; ## EXAMPLES OF INDELS: #~ KKKKK/KKKK-38-42 #~ I/--159 #~ P/RS-6023 #~ P/S-600 #~ KKKKK/KKKK-38-42 if($info =~ /^[^\/]+\/[^\-]+\-\d+\-\d+$/){ # positions for the reference start and end ($ref,$alt,$start,$end) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)\-(\d+)$/; # Positions for the alternative $end_alt = $start + length($alt) - 1; } #~ I/--159 elsif($info =~ /^[^\/]+\/\-\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^([^\/]+)\/(\-)\-(\d+)$/; $start = $pos; $end = $pos; $end_alt = 0; # deletion => No bases to look at on the alternative ! } #~ -/I-159 elsif($info =~ /^\-\/[^\/]+\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^(\-)\/([^\/]+)\-(\d+)$/; $start = $pos; $end = 0; # deletion => No bases to look at on the reference ! $end_alt = $pos; } #~ P/RS-6023 #~ P/S-600 elsif($info =~ /^[^\/]+\/[^\-]+\-\d+$/){ ($ref,$alt,$pos) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)$/; $start = $pos; $end = $pos; $end_alt = $start + length($alt) - 1; } else{ print ">>UNKNWON CASE: $info\n"; } #~ print "************************\n"; #~ print "$info\n"; #~ print "$ref,$alt,$start,$end,$end_alt\n";#>PFFS,PFS-203,206,PFFSPFS-203 #~ print "************************\n"; return ($start,$end,$end_alt); } 1;