#!/usr/bin/perl -w =pod =head1 NAME runnetnglyc Module =head1 SYNOPSIS use runnetnglyc; =head1 OPTIONS =head1 DESCRIPTION =head1 VERSION Version 1 =head1 DATE 20 janvier 2012 =head1 AUTHORS Sabrina Rodriguez (sabrina.rodriguez@jouy.inra.fr) Johann Beghain =cut package runnetnglyc; use strict; use IO::File; use base 'Exporter'; our @EXPORT = qw/netnglyc_extract_results/; use formatAlleleSeq; # extract results after running netnglyc applied to 1 result file sub netnglyc_extract_results($$$$$){ my ($delta, $netnglyc_result_file, $name_encoding_file, $output_file,$BASE_SEQ_NAME) = @_; print "runnetnglyc::netnglyc_extract_results: Searching glycosylation site loss or gain between 2 alleles:\n"; print "Name\tgain/loss/diff\tscore_allele1\tscore_allele2...\n"; my ($seq_name,$pos,$score,$signal,$line,$fh_in,$fh_out,$name,$nameA1,$nameA2,$score_diff,$basis_name,$start,$end,$ref,$alt,$info,$end_alt,$i); # get the alleles results from netnglyc my %score_allele1 = (); # key : initial complete name => score value (for the alléle 1 position) my %score_allele2 = (); # key : initial complete name => signal value (for the alléle 1 position : ++) my %signal_allele1 = (); # key : initial complete name => score value (for the alléle 2 position) my %signal_allele2 = (); # key : initial complete name => signal value (for the alléle 2 position : ++) my %ref_pos = (); my %alt_pos = (); my ($h_encodingName,$h_encodingID,$h_name_allele,$h_name_position,$h_name_var_type) = getEncoding($name_encoding_file); my $prog = "netnglyc"; my $outpath = $output_file; $outpath =~ s/\/([^\/]+)$//; my $fh_title = new IO::File ">$outpath/$prog\_title.txt" || die "Can not open $outpath/$prog\_title.txt : $! \n"; print $fh_title ("Sequence Name\tCase\tscore allele1\tscore allele2\tsignal allele1\tsignal allele2\n"); $fh_title->close(); # get the encoded names and the position of the amino acid changein the protein sequence $fh_in = new IO::File $netnglyc_result_file || die "Can not open file $netnglyc_result_file : $! \n"; $fh_out = new IO::File ">$output_file" || die "Can not open $output_file : $! \n"; $/="\n"; while($line=<$fh_in>){ if($line =~ /^$BASE_SEQ_NAME\d+\s*\d+\s*[^\s]*\s*\d\.\d*\s*[^\s]*\s*[+]{1,}/){ ($seq_name,$pos,$score,$signal) = $line =~ /^($BASE_SEQ_NAME\d+)\s*(\d+)\s*[^\s]*\s*(\d\.\d*)\s*[^\s]*\s*([+]{1,})/; #~ print "\n> SEQ: seq_name:*$seq_name*, pred_pos:$pos, pred_score:$score, $signal\n"; ($basis_name) = $h_encodingID->{$seq_name} =~ /(.*)_allele\d$/; if($h_name_var_type->{$seq_name} eq "snp"){ ### if($pos eq $h_name_position->{$seq_name}){ if($h_name_allele->{$seq_name} eq "allele1"){ $signal_allele1{$basis_name} = $signal; $score_allele1{$basis_name} = $score; } elsif($h_name_allele->{$seq_name} eq "allele2"){ $signal_allele2{$basis_name} = $signal; $score_allele2{$basis_name} = $score; } else{ print "PB: $seq_name :$h_name_allele->{$basis_name}\n"; exit(12); } } } else{ #~ print "HERE\n"; # get the [REF/ALT-POS] information #~ print "$h_encodingID->{$name}\n"; ($info) = $h_encodingID->{$seq_name} =~ /\[([^\/]+\/.+\-\d*\-*\d+)\]_allele\d*/; # retrives the amino acids to check on ($start,$end,$end_alt) = analyseIndelChange($info); #~ print "coordinates: $start,$end,$end_alt\nPOSITION: $pos\n"; # get the signals from the reference %ref_pos = (); for($i=$start;$i<=$end;$i++){ $ref_pos{$i} = ""; #~ print "ref:$i\n"; } # get the signals from the alternative %alt_pos = (); for($i=$start;$i<=$end_alt;$i++){ $alt_pos{$i} = ""; #~ print "alt:$i\n"; } #~ print "--> POS FROM RES: $pos\n"; if((defined $ref_pos{$pos}) and ($h_name_allele->{$seq_name} eq "allele1")){ $score_allele1{$basis_name} = "indel"; if(not defined $signal_allele1{$basis_name}){ $signal_allele1{$basis_name} = 1; } else{ $signal_allele1{$basis_name} = $signal_allele1{$basis_name} + 1; # number of signals on the reference for the indel } #~ print "REF:$basis_name:$signal_allele1{$basis_name}\n"; } if((defined $alt_pos{$pos}) and ($h_name_allele->{$seq_name} eq "allele2")){ $score_allele2{$basis_name} = "indel"; if(not defined $signal_allele2{$basis_name}){ $signal_allele2{$basis_name} = 1; } else{ $signal_allele2{$basis_name} = $signal_allele2{$basis_name} + 1; # number of signals on the alternative for the indel } #~ print "ALT:$basis_name:$signal_allele2{$basis_name}\n"; } } } } foreach $basis_name (keys %signal_allele1){ #~ $nameA1 = $seq_name; #~ $name = $h_encodingID->{$seq_name}; #~ $name =~ s/allele1/allele2/; #~ $nameA2 = $h_encodingName->{$name}; #~ print "mmmmm $seq_name - *$seq_name*\n"; # indel if($score_allele1{$basis_name} eq "indel" ){ #~ print "$basis_name : $signal_allele1{$basis_name} != $signal_allele2{$basis_name}\n"; # the number of signals is different between reference and alternative at the indel part if ($signal_allele1{$basis_name} != $signal_allele2{$basis_name}){ print $fh_out ("$basis_name\tdiff\t\t\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); ##### } delete $signal_allele2{$basis_name}; } # snp else{ #~ ($basis_name) = $h_encodingID->{$seq_name} =~ /(.*)_allele\d$/; if((not defined $signal_allele2{$basis_name}) && ($signal_allele1{$basis_name} =~ /\+{1,}/)) { print $fh_out ("$basis_name\tloss\t$score_allele1{$basis_name}\t\t$signal_allele1{$basis_name}\t\n"); } elsif(defined $score_allele2{$basis_name}){ $score_diff = $score_allele1{$basis_name} - $score_allele2{$basis_name}; $score_diff =~ s/-//; if(($score_diff >= $delta) && ($score_allele1{$basis_name} > $score_allele2{$basis_name})){ print $fh_out ("$basis_name\tloss?\t$score_allele1{$basis_name}\t$score_allele2{$basis_name}\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); } elsif(($score_diff >= $delta) && ($score_allele1{$basis_name} < $score_allele2{$basis_name})){ print $fh_out ("$basis_name\tgain?\t$score_allele1{$basis_name}\t$score_allele2{$basis_name}\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); } } delete $signal_allele2{$basis_name}; } } foreach $basis_name (keys %signal_allele2){ #~ $nameA2 = $seq_name; #~ $name = $h_encodingID->{$seq_name}; #~ $name =~ s/allele2/allele1/; #~ $nameA1 = $h_encodingName->{$name}; # indel if($score_allele2{$basis_name} eq "indel" ){ #~ print ">>>$basis_name : $signal_allele1{$basis_name} != $signal_allele2{$basis_name}\n"; # the number of signals is different between reference and alternative at the indel part if ($signal_allele1{$basis_name} != $signal_allele2{$basis_name}){ print $fh_out ("$basis_name\tdiff\t\t\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); ##### } } # snp else{ #~ ($basis_name) = $h_encodingID->{$seq_name} =~ /(.*)_allele\d$/; if((not defined $signal_allele1{$basis_name})&& ($signal_allele2{$basis_name} =~ /\+{1,}/)) { print $fh_out ("$basis_name\tgain\t\t$score_allele2{$basis_name}\t\t$signal_allele2{$basis_name}\n"); } elsif(defined $score_allele1{$basis_name}){ $score_diff = $score_allele1{$basis_name} - $score_allele2{$basis_name}; $score_diff =~ s/-//; if(($score_diff >= $delta) && ($score_allele1{$basis_name} > $score_allele2{$basis_name})){ print $fh_out ("$basis_name\tloss?\t$score_allele1{$basis_name}\t$score_allele2{$basis_name}\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); } elsif(($score_diff >= $delta) && ($score_allele1{$basis_name} < $score_allele2{$basis_name})){ print $fh_out ("$basis_name\tgain?\t$score_allele1{$basis_name}\t$score_allele2{$basis_name}\t$signal_allele1{$basis_name}\t$signal_allele2{$basis_name}\n"); } } } } $fh_out->close; $fh_in->close; } #~ sub test($$$$){ #~ #~ my ($delta, $netnglyc_result_file, $name_encoding_file, $output_file) = @_; #~ #~ print "runnetnglyc::netnglyc_extract_results: Searching C-mannosilation loss or gain between 2 alleles:\n"; #~ print "Name\tgain/loss/diff\tscore_allele1\tscore_allele2...\n"; #~ #~ my %netnglyc_res = (); #~ my %change_position = (); #~ my %encoded_names= ();#Hash with all name / id #~ #~ my ($i,$fh_in,$name,$line,$pos,$score,$tag,$allele,$basis_name,$diff_score,$fh_out); #~ #~ my %allele1_score = (); #~ my %allele2_score = (); #~ my %allele1_tag = (); #~ my %allele2_tag = (); #~ #~ # get the encoded names and the position of the amino acid changein the protein sequence #~ $fh_in = new IO::File $name_encoding_file || die "Can not open $name_encoding_file : $! \n"; #~ $fh_out = new IO::File ">$output_file" || die "Can not open $output_file : $! \n"; #~ $/="\n"; #~ while($line=<$fh_in>){ #~ if($line =~ m/^>(\S*)\s*(.*)/){ #~ #~ ($pos,$name) = $line =~ /(\d+)\]_allele\d\t([^\n]*)$/; #~ $change_position{$name} = $pos; #~ #~ $line =~ /^>([^\t]+)\t[^\n]*/; #~ $encoded_names{$name} = $1; #~ #~ print "NAME $name:\n$change_position{$name} = $pos\n$encoded_names{$2} = *$1*\n"; #~ } #~ else{ #~ #~ print "the protein sequences do not have the right format\n"; #~ exit(11); #~ } #~ } #~ $fh_in->close(); #~ #~ # get the C mannolisation predictions #~ #extract name, position and score of C manosylation. #~ print "FILE::*$netnglyc_result_file*\n"; #~ $fh_in = new IO::File $netnglyc_result_file || die "Can not open file $netnglyc_result_file : $! \n"; #~ $/="\n"; #~ while($line=<$fh_in>){ #~ #~ if(not $line =~ /^\#/){ #~ #~ if(defined $encoded_names{$name}){ #~ #~ ($name,$pos,$score,$tag) = $line=~/([^\s]+)\s*[^\s]+\s*[^\s]+\s*([^\s]+)\s*[^\s]+\s*([^\s]+)\s*[^\s]+\s*([^\n]+)\n/; #~ ($basis_name,$allele) = $encoded_names{$name} =~ /(.*)_(allele\d)$/; #~ #~ print "$name,$change_position{$name},$pos,$score,$tag,*$basis_name*,$allele,$encoded_names{$name}\n"; #~ #~ #~ if(defined $encoded_names{$name} && ($change_position{$name} eq $pos) && ($allele eq "allele1")){ #~ #~ $allele1_score{$basis_name} = $score; #~ $allele1_tag{$basis_name} = $tag; #~ } #~ elsif(defined $encoded_names{$name} && ($change_position{$name} eq $pos) && ($allele eq "allele2")){ #~ #~ $allele2_score{$basis_name} = $score; #~ $allele2_tag{$basis_name} = $tag; #~ } #~ } #~ else{ #~ #~ print "unkown name: $name\n"; #~ } #~ } #~ } #~ $fh_in->close(); #~ #~ # to analyse the difference between the 2 alleles #~ foreach $name (keys %allele1_score){ #~ #~ #~ if(not defined $allele2_score{$name} && $allele1_tag{$name} eq "W"){ #~ #~ print $fh_out ("$name\tloss\t$allele1_score{$name}\t\n"); #~ print ("$name\tloss\t$allele1_score{$name}\t\n"); #~ #~ } #~ elsif(defined $allele2_score{$name}){ #~ #~ $diff_score = $allele1_score{$basis_name} - $allele2_score{$basis_name}; #~ $diff_score =~ s/-//; #~ #~ if($diff_score >= $delta){ #~ #~ print $fh_out ("$name\t>=(loss?)\t$allele1_score{$name}\t$allele2_score{$name}\n"); #~ } #~ #~ delete $allele2_score{$name}; #~ } #~ } #~ #~ foreach $name (keys %allele2_score){ #~ #~ #~ if(not defined $allele1_score{$name} && $allele2_tag{$name} eq "W"){ #~ #~ print $fh_out ("$name\tgain\t\t$allele2_score{$name}\n"); #~ #~ } #~ elsif(defined $allele1_score{$name}){ #~ #~ $diff_score = $allele1_score{$basis_name} - $allele2_score{$basis_name}; #~ $diff_score =~ s/-//; #~ #~ if($diff_score >= $delta){ #~ #~ print $fh_out ("$name\t>=(gain?)\t$allele1_score{$name}\t$allele2_score{$name}\n"); #~ } #~ } #~ #~ } #~ $fh_out->close; #~ } #~ sub init($$$){ #~ my ($delta, $path, $date) = @_; #~ print "Running NetNGlyc.\n"; #~ unless (-e "$path/results/NetNGlyc"){ #~ system("mkdir results/NetNGlyc"); #~ } #~ #Parse file #~ my $sub = new IO::File "fic/sub_$date" || die "Can not open file sub_$date : $!\n"; #~ my $temp = new IO::File ">fic/temp-netnglyc_$date" || die "$!"; #~ my $j = 0; #~ while(<$sub>){ #~ print $temp $_; #~ $j++; #~ if($j >= 100){ #~ $temp->close(); #~ $j = 0; #~ system("cp fic/temp-netnglyc_$date $path/Programmes/NetNGlyc/netNglyc-1.0a"); #~ chdir "$path/Programmes/NetNGlyc/netNglyc-1.0a"; #~ `./netNglyc temp-netnglyc_$date >>$path/results/NetNGlyc/netnglyc_$date`; #~ chdir "$path"; #~ $temp = new IO::File ">fic/temp-netnglyc_$date" || die "$!"; #~ } #~ } #~ #Last file #~ system("cp fic/temp-netnglyc_$date $path/Programmes/NetNGlyc/netNglyc-1.0a"); #~ chdir "$path/Programmes/NetNGlyc/netNglyc-1.0a"; #~ `./netNglyc temp-netnglyc_$date >>$path/results/NetNGlyc/netnglyc_$date`; #~ `rm -f temp-netnglyc_$date`;#Clean directory #~ `rm -f $path/fic/temp-netnglyc_$date`;#Clean directory #~ chdir "$path"; #~ $temp->close(); #~ $sub->close(); #~ extract_results($delta,$path,$date); #~ `rm -f fic/temp-netnglyc_$date`; #~ } #~ #~ sub extract_results($$$){ #~ #~ my ($delta, $path, $date) = @_; #~ #~ #Open and extract name, position and score of N glycosylation. #~ my $res = new IO::File "$path/results/NetNGlyc/netnglyc_$date" || die "Can not open the result file (netnglyc) : $! \n"; #~ my $tmp = new IO::File ">$path/results/programs/temp-netnglyc_$date" || die "Can not create the temp result file (netnglyc) : $! \n"; #~ #~ while(<$res>){ #~ if($_ =~ m/(\@\d*)\s*(\S*)\s\w*\s*(\S*)\s*\S*\s*(\S*)/){ #~ print $tmp "$1\t$2\t$3\t$4\n"; #~ } #~ } #~ $tmp->close(); #~ $res->close(); #~ #~ #Construct the hash table #~ my $sav = new IO::File "fic/sav_$date" || die "Can not open sav_$date : $! \n"; #~ my %id;#Hash with all name / id #~ while(<$sav>){ #~ if($_ =~ m/^>(\S*)\s*(.*)/){ #~ $id{$2} .= $1; #~ } #~ } #~ $sav->close(); #~ #~ #Find variations #~ $tmp = new IO::File "$path/results/programs/temp-netnglyc_$date" || die "Can not open the temp result file (netnglyc) : $! \n"; #~ my $netnglyc = new IO::File ">$path/results/programs/netnglyc_$date" || die "Can not create the final result file (NetNGlyc) $! \n"; #~ my %a1; #~ my %a2; #~ my %check; #~ my $allele = ""; #~ my $name = ""; #~ my $c = 2; #~ while(<$tmp>){ #~ my $line = $_; #~ if($line =~ s/^@(\d*)/$id{$1}/){#Substitute the ID by his coresponding name #~ if($line =~ m/^(\S*)_allele(\d)\s*(\d*)\s(\S*)\s(\S*)/){ #~ if($2 eq $allele){ #~ if($c == 0){#Same allele1 #~ $allele = $2; #~ my $position = $3; #~ my $score = $4; #~ if($5 =~ m/\++/){ #~ $a1{$position}.=$score; #~ $check{$position}.="1"; #~ } #~ } #~ elsif($c == 1){#Same allele2 #~ $allele = $2; #~ my $position = $3; #~ my $score = $4; #~ if($5 =~ m/\++/){ #~ $a2{$position}.=$score; #~ $check{$position}.="2"; #~ } #~ } #~ } #~ else{ #~ if($c == 0){#Other allele #~ $c++; #~ $allele = $2; #~ my $position = $3; #~ my $score = $4; #~ if($5 =~ m/\++/){ #~ $a2{$position}.=$score; #~ $check{$position}.="2"; #~ } #~ } #~ elsif($c == 1){#Other transcript #~ compare($netnglyc,$delta,\%a1,\%a2,\%check,$name); #~ %a1=(); #~ %a2=(); #~ %check=(); #~ $name = $1; #~ $allele = $2; #~ my $position = $3; #~ my $score = $4; #~ if($5 =~ m/\++/){ #~ $a1{$position}.=$score; #~ $check{$position}.="1"; #~ } #~ $c = 0; #~ } #~ elsif($c == 2){#Initialisation #~ $name = $1; #~ $allele = $2; #~ my $position = $3; #~ my $score = $4; #~ if($5 =~ m/\++/){ #~ $a1{$position}.=$score; #~ $check{$position}.="1"; #~ } #~ $c = 0; #~ } #~ } #~ } #~ } #~ } #~ compare($netnglyc,$delta,\%a1,\%a2,\%check,$name);#For the last transcript #~ $tmp->close(); #~ $netnglyc->close(); #~ } #~ #~ sub compare($$$$){ #~ #~ my ($netnglyc,$delta,$ref1,$ref2,$ref3,$name) = @_; #~ my %a1=%$ref1; #~ my %a2=%$ref2; #~ my %check=%$ref3; #~ foreach my $k (keys(%check)){ #~ my $v = $check{$k}; #~ if($v eq "12"){ #~ my $diffscore = $a1{$k} - $a2{$k}; #~ if($diffscore >= $delta){ #~ print $netnglyc "$name : Signal lost at position $k. Score allele 1 = $a1{$k} - Score allele 2 = $a2{$k}\n"; #~ } #~ elsif($diffscore <= -$delta){ #~ print $netnglyc "$name : Signal gain at position $k. Score allele 1 = $a1{$k} - Score allele 2 = $a2{$k}\n"; #~ } #~ } #~ elsif($v eq "1"){ #~ print $netnglyc "$name : Signal lost at position $k. Score : $a1{$k}\n"; #~ } #~ elsif($v eq "2"){ #~ print $netnglyc "$name : Signal gain at position $k. Score : $a2{$k}\n"; #~ } #~ } #~ } 1;