#!/usr/bin/perl -w

package formatAlleleSeq;

use strict;
#~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

#~ require Exporter;
use base 'Exporter';

#~ require Exporter;
#~ @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# export the names of the fonction for that module (to be seen externally)
our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding/;

#~ $VERSION = '0.01';

use IO::File;
use Bio::SeqIO;
use Bio::EnsEMBL::Registry;


#~ sub getEnsembl_variant-effect-format_from_rsID($$$$){
	#~ 
	#~ ($rsID_file,$outfile,$errorfile,$registry_file_path,$specie) = @_;
	#~ 
	#~ my ($id,$vfa,$vf,$v,$error);
	#~ 
	#~ my $registry = 'Bio::EnsEMBL::Registry';
  	#~ $registry->load_all($registry_file_path);
						#~ 
	#~ my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!";#Error file (where the untreated sequences goes)
	#~ print $fh_stderr "Untreated sequences:\n";
#~ 
	#~ my $fh_out = new IO::File ">fic/snp_predictor_effect/snp_predictor_input_$date" || die "Can not create fic/snp_predictor_input_$date : $!";#Output file : input for snp_predictor_effect
	#~ 
	#~ print "Searching for the rs id...\n";
	#~ 
	#~ $error = 0;#Numbers of errors founds
#~ 
#~ 
	#~ while(<$fic_snp>){
		#~ 
		#~ #Check if the rs id exist in db
		#~ if($_ =~ /(rs\d*)/){
			#~ 
			#~ $id = $1;
			#~ $vfa = $registry->get_adaptor($specie,"variation","variationfeature");
			#~ $vf = $registry->get_adaptor($specie,"variation","variation");
			#~ $v = $vf->fetch_by_name($id);
			#~ 
			#~ if(!defined($v)){
				#~ #print "Seq not found in the Ensembl data: $id . Skipping line.\n";
				#~ print $fh_stderr "$id (not found in the Ensembl database)\n";
				#~ $error++;
				#~ next;
			#~ }
			#~ #Take all variations for this rs (start, end, strand, allele string, chromosom)
			#~ foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) {
				#~ if(!defined($vf)){
					#~ #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no features found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug)
				#~ my $start = $vf->start();
				#~ if(!defined($start)){
					#~ #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no start found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ my $end = $vf->end();
				#~ if(!defined($end)){
					#~ #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no end found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ my $strand = $vf->strand();
				#~ if(!defined($start)){
					#~ #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no strand found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ my $alleles = $vf->allele_string();
				#~ if(!defined($start)){
					#~ #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no allele string found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ my $chromosom = $vf->seq_region_name();
				#~ if(!defined($chromosom)){
					#~ #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n";
					#~ print $fh_stderr "$id (no chromosom string found in Ensembl database)\n";
					#~ $error++;
					#~ next;
				#~ }
				#~ 
				#~ my $name_snp = $vf->variation_name();
				#~ print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n";
			#~ }
		#~ }
		#~ else{
			#~ print "Error: untolerated rs sequence ($_). Skipping line.\n";
			#~ print $fh_stderr "$_\n";
		#~ }
	#~ }
	#~ 
	#~ if($error!=0){
		#~ print "$error errors found. Consult results/error.out.\n";
	#~ }
	#~ 
	#~ $fh_out->close();
	#~ $fh_stderr->close();
#~ }


# this function is used to get the non_synonymous coding SNPs
# to get the corresponding protein sequence from Ensembl
# to generate a fasta file with these protein sequences, 1 per allele 
# so 2 protein sequences per SNP

sub getProteinseqfile_fromSNPeffect($$$$){
	
	my ($specie, $infile, $outfile, $registry_file_path) = @_;
	print "formatAlleleSeq::create: Searching amino acid sequence for each allele...\n";
	
	#Connect to the Ensembl data		
	my $registry = 'Bio::EnsEMBL::Registry';
  	$registry->load_all($registry_file_path);
  	
  	my $errorfile = $outfile;
  	$errorfile =~s/\/[^\/]*$/\/error.txt/;
	
	my $db_in = new IO::File "$infile", "r" || die "Can not open file  : $infile $! \n";#Result of snppredictor	
	my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db
	my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file

	my $iterator = 1;
	my $oname;
	my @tmp = ();
	
	while(<$db_in>){
		
		##############################################VERIF OUTPUT VARIANT EFFECT PREDICTOR FORMAT ###############################################
		#~ my $ligne = $_;
		#~ my @line = split( "\t", $ligne);
		#~ if ( $line[4] !~ /ENS\D{3}T\d+/ ){
			#~ print "Variant format unknow\n";
			#~ exit;
		#~ }

		################################################################ NON SYNONYMOUS CODING ###################################################
		if($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tNON_SYNONYMOUS_CODING\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/){
						
			if((defined($oname)) && ($oname eq $1)){#If previous name already exist (severals transcripts exists)
				$iterator++;#Increment $iterator
			}
			else{
				$iterator = 1;
			}
			$oname = $1;
			
			@tmp = ();
			@tmp = split(/_/,$oname);
			my $type_var = length($tmp[2]); # length($3); # SNP len = 1 elsif INDEL len > 1
			
			my $rsID = $9;
			#~ my $location = $2;			
			my $gene = $4;
			my $transc = $5;
			#~ print $transc;
			my $position = $7;
			
			my $change = $8;
			#~ my $change = $1;
			
			## If the variant is a SNP
			if($type_var == 3 ){
				
				my $name = $1."_$rsID\_$transc\_transcript$iterator\[$change-$position\]";
				my $allele1;
				my $allele2;
				if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2
					$allele1 = $1;
					$allele2 = $2;
				}
				else{
					print "Error found: $name\n";
					print $stderr "$transc (Alleles not found)";
					next;
				}
				
				#Get all sequences from the gene ID.
				my $transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' );
				my $transcript = $transcript_adaptor->fetch_by_stable_id("$transc");
				my $protein = $transcript->translate();
				if(defined $protein){
					#Print the 2 alleles in the outputfile
					my $seq = $protein->seq();
					for (my $i=0; $i<2; $i++){
						if($i==0){
							print $db_out ">$name"."_allele1\n";
							
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele1;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
						elsif($i==1){
							print $db_out ">$name"."_allele2\n";
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele2;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
					}
				}
				else{
					print "Protein not found in Ensembl data: $transc\n";
					print $stderr "$transc (Protein not found)\n";
					next;
				}
				
			}
			# IF INDEL !
		}
=add1
		################################################################ INTRONIC, SPLICE_SITE ###################################################
		#Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE)
		elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){
		}
=cut
	}
	$db_in->close();
	$db_out->close();
	$stderr->close();
}


# This function attributes 1 ID to each seq (some programs don't like huge names).

sub encodeFastaProtName($$$$){
	
	my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_;
	
	print "formatAlleleSeq::formatfile: formatting protein sequence names...\n";
	
	my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n";
	my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited
	
	#~ my $save_names_file = $outfile;
	#~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/;
	
	my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data
	my $i = 1;
	while(<$fic_in>){
		if($_ =~ m/^>/){
			chomp $_;
			print $sav "$_\t".$BASE_SEQ_NAME.$i."\n";
			print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID
			$_ = <$fic_in>;
			print $sub $_;#Sequence
			$i++;
		}
	}
	$fic_in->close();
	$sav->close();
	$sub->close();
	return $i;
}


# this function splits the initial fasta file into 1 sequence per file

sub format_multifasta_to_singlefastas($$){
	
	my ($inputfile,$outdir) = @_;
	
	print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n";
	
	my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto");
   	system(@args) == 0 or die "system @args failed: $?"
}


# this function splits a fasta file into several fasta files with -n number of sequences per file

# faire le mkdir dans ergatis
sub splitfastaN($$$$){
	
	my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_;
	
	my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln);
	
	print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n";


	($name) = $inputfile =~ /\/([^\/]*)$/;
	$name =~ s/\..*//;
	$name = $outdir."/".$name;
	
	$count = $numseqsperfile + 1;
	$countfile = 0;
	
	$fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n";
	$/=">";
	<$fh_in>;
	while($line=<$fh_in>){
				
		$line=~s/>//;		
		
		($seq) = $line =~ /[^\n]+\n(.*)/;
		
		$ln = length($seq);
		
		if($ln < $sizeseqmin){
			
			if($count < $numseqsperfile){
				
				print $fh_out (">$line");
				$count++;
			}
			else{
				
				unless($countfile==0){$fh_out->close()};
				
				$countfile ++;
				$outfile = $name.$numseqsperfile."_".$countfile.".fasta";
				#~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/;
				
				#~ print "$outfile\n";
				
				$count = 1;
				$fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n";
				print $fh_out (">$line");
				
			}
		}
	}
	$fh_in->close();
}


# GET THE ENCODING NAMES
# get the encoded names and the position of the amino acid changein the protein sequence

sub getEncoding($){
	
	my ($name_encoding_file) = @_;
	
	my %encodingName = (); # initial name => short name
	my %encodingID = (); # short name => initial name
	my %name_allele = (); # short name => allele 1 or allele 2
	my %name_position = (); # short name => AA position of SNP
	
	my ($line,$fh_in,$fh_out,$pos,$name,$allele);
	
	$fh_in = new IO::File $name_encoding_file || die "Can not open $name_encoding_file : $! \n";	
	$/="\n";
	while($line=<$fh_in>){				
		
		if($line =~ m/^>\S*\s*.*/){
			
			$line =~ /^>([^\t]+)\t([^\n]*)\n/;						
				
			$encodingID{$2} = $1;	
			$encodingName{$1} = $2;			

			($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/;			
			$name_position{$name} = $pos;						
			$name_allele{$name} = $allele;						
		}
		else{
			
			print "the protein sequences do not have the right format\n";
			exit(11);
		}
	} 
	$fh_in->close();
	
	return \%encodingName,\%encodingID,\%name_allele,\%name_position;
}

1;