#!/usr/bin/perl -w

package formatAlleleSeq;

use strict;
#~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

#~ require Exporter;
use base 'Exporter';
use File::Basename;

#~ require Exporter;
#~ @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# export the names of the fonction for that module (to be seen externally)
our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding getVEPfile_fromSNPeffect getEnsemblformat_from_rsID analyseIndelChange/;

#~ $VERSION = '0.01';

use IO::File;
use Bio::SeqIO;
#~ printf STDERR "%s\n",join("\n",@INC);

use Bio::EnsEMBL::Registry;
use Bio::Seq;

use Config::IniFiles;


# Only for SNPs from dbSNP (no indels)
sub getEnsemblformat_from_rsID($$$$){
	 	 
	 my ($rsID_file, $outdir ,$registry_file_path, $specie) = @_;
	 
	 my ($id,$vfa,$vf,$v,$error);
	 
	 my $registry = 'Bio::EnsEMBL::Registry';
  	 $registry->load_all($registry_file_path);
						
	 my $errorfile = $outdir."/errors.txt";				
	 
	 my $fh_in = new IO::File $rsID_file || die "Can not create $rsID_file : $!";
	 
	 my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!"; #Error file (where the untreated sequences goes)
	 print $fh_stderr "Untreated sequences:\n";

	 my $fh_out = new IO::File ">$outdir/ensembl.txt" || die "Can not create $outdir/ensembl.txt : $!";#Output file : input for snp_predictor_effect
	 
	 print "Searching for the rs id...\n";
	 
	 $error = 0;#Numbers of errors founds
 
 
	 while(<$fh_in>){
		 
		 #Check if the rs id exist in db
		 if($_ =~ /(rs\d*)/){
			 
			 $id = $1;
			 $vfa = $registry->get_adaptor($specie,"variation","variationfeature");
			 $vf = $registry->get_adaptor($specie,"variation","variation");
			 $v = $vf->fetch_by_name($id);
		
			 if(!defined($v)){
				 #print "Seq not found in the Ensembl data: $id . Skipping line.\n";
				 print $fh_stderr "$id (not found in the Ensembl database)\n";
				 $error++;
				 next;
			 }
			 #Take all variations for this rs (start, end, strand, allele string, chromosom)
			 foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) {
				 if(!defined($vf)){
					 #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no features found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug)
				 my $start = $vf->start();
				 if(!defined($start)){
					 #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no start found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $end = $vf->end();
				 if(!defined($end)){
					 #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no end found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $strand = $vf->strand();
				 if(!defined($start)){
					 #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no strand found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $alleles = $vf->allele_string();
				 if(!defined($start)){
					 #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no allele string found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $chromosom = $vf->seq_region_name();
				 if(!defined($chromosom)){
					 #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no chromosom string found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 
				 my $name_snp = $vf->variation_name();
				 print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n";
			 }
		 }
		 else{
			 print "Error: untolerated rs sequence ($_). Skipping line.\n";
			 print $fh_stderr "$_\n";
		 }
	 }
	 
	 if($error!=0){
		 print "$error errors found. Consult results/error.out.\n";
	 }
	 
	 $fh_out->close();
	 $fh_stderr->close();
	 $fh_in->close();
}


# this function is used to get the non_synonymous coding SNPs
# to get the corresponding protein sequence from Ensembl
# to generate a fasta file with these protein sequences, 1 per allele 
# so 2 protein sequences per SNP

sub getProteinseqfile_fromSNPeffect($$$$$$$$){
		
	print "formatAlleleSeq::getProteinseqfile_fromSNPeffect: obtaining protein sequences ...\n";
	
	my ($specie, $infile, $outfile, $registry_file_path, $outvep_res, $outvep_title,$gff_file, $genome_fasta_file) = @_;
	
	my $cfg = Config::IniFiles->new( -file => "/usr/local/bioinfo/src/galaxy-test/galaxy-dist/PATH.ini" );
	my $file_path = $cfg->val( 'workPath', 'FILEPATH_DEV' );
	print STDOUT "Warning : use of FILE PATH from PATH.ini !!!\n\n";
	
	# get the path to the files
	my $dirresults  = dirname($infile);
	`cd $dirresults; `; 
	
	print STDOUT "getProteinseqfile_fromSNPeffect WORKING DIR:$dirresults\n$infile\n\n";
		
		
	#Connect to the Ensembl data		
	my $registry = 'Bio::EnsEMBL::Registry';
  	$registry->load_all($registry_file_path);
  	
  	my $errorfile = $outfile;
  	$errorfile =~s/\/[^\/]*$/\/error.txt/;
	
	my $db_in = new IO::File "$infile", "r" || die "Can not open file  : $infile $! \n";#Result of snppredictor	
	my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db
	my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file

	my $iterator = 1;
	my $iterator2 = 1;
	my $oname;
	my @tmp = ();
	
	#~ my ($ref,$alt);
	my ($position2,$allele1_len,$start,$end,$seq_alt);
	my $allele1;
	my $allele2;
	my ($alt,$name,$rsID,$gene,$transc,$position,$change,$name_ini,$chr,$pos,$check_name,$geno,$consequence);
	my ($transcript_adaptor,$transcript,$protein,$seq,$strand,$sift,$polyphen);
	
	my $fh_vep_out_title = new IO::File ">$outvep_title";
	defined($fh_vep_out_title)|| die "Can not create file  $outvep_title! \n";#The final db
	print $fh_vep_out_title "SNP Name\tChromosome\tPosition\tGene ID\tTranscript ID\tConsequence\tGenotype\tAA change\tAA position\tStrand\tSift prediction\tPolyphen prediction (human only)\n";
	$fh_vep_out_title->close();
	
	my $fh_vep_out = new IO::File ">$outvep_res" || die "Can not create file $outvep_res $! \n";#The final db
	
	my $transcript_file = "transcripts.fa"; # case of reference not included in Ensembl DB
	my ($transcript_id,$transcript_seq);
	my %transcript_list=();
	
	
	################################# Function Code #############################################
	# case reference genome not found in Ensembl
	print ">>genome_fasta: $genome_fasta_file - $gff_file\n";
	#~ if(not defined $genome_fasta_file){
	if($genome_fasta_file ne "no" and defined $gff_file ne "no"){
		
		# get the transcript sequences
		
		# STEP 1 à lancer => génére un fichier d'index
		# index the fasta file
		#~ `cp $genome_fasta_file "$dirresults/genome.fasta"; cp $gff_file "$dirresults/annot.gff";`;
		`samtools faidx "$genome_fasta_file" `;

		# STEP 2 à lancer => génère un fichier de transcripts 
		# get the transcripts
		`gffread -x $transcript_file -g "$genome_fasta_file" "$gff_file"`;
		
		#~ print "gffread -x $transcript_file -g $fasta $gff\n\n";
		
		# STEP 3 de galaxy : protseq....
		# prend en entrée:
		# - fichier d'indexation du .fasta génomic
		# - fichier des trasnscripts .fa
		# - fichier .vep
	
		my $fh_fasta_in=new IO::File "$transcript_file" || die "Can not create file $transcript_file $! \n";
		$/=">";
		while(my $line=<$fh_fasta_in>){
			
			my ($name)=$line=~/([^\s]+)/;
			my $seq=$line;
			$seq=~s/[^\n]+\n//;
			$seq=~s/>//;
			$seq=~s/\n//g;
			
			$transcript_list{$name}=$seq;
		}
		$fh_fasta_in->close();
	}
	
	$/="\n";
	while(<$db_in>){
		
		next if ($_=~/^\#/);

		#~ print "$_";
		
		@tmp = ();
		@tmp = split(/\t/,$_);
		

        $consequence = $tmp[6];
		$alt = $tmp[2];
		$name_ini = $tmp[0];
		$rsID = $tmp[12];	
		$gene = $tmp[3];
		$transc = $tmp[4];
		$position = $tmp[9];		
		$change = $tmp[10];
		
		if($tmp[13] =~/STRAND/){($strand) = $tmp[13] =~ /STRAND=(-*1)/;} else{$strand="";}
		if($tmp[13] =~/SIFT/){($sift) = $tmp[13] =~ /SIFT=([^\)]*\))/;} else{$sift="";}
		if($tmp[13] =~/PolyPhen/){($polyphen) = $tmp[13] =~ /PolyPhen=([^\)]*\))/;} else{$polyphen="";}
													
		($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/;
		$check_name = $check_name."/".$alt;
						
						
		$oname = $check_name;					
		$name = $oname."_$rsID\_$transc\[$change-$position\]";	
				
		($chr,$pos,$geno) = $oname =~ /([^_]+)_([^_]+)_(.+)/;
		
		print $fh_vep_out "$name\t$chr\t$pos\t$gene\t$transc\t$consequence\t$geno\t$change\t$position\t$strand\t$sift\t$polyphen\n";
	
		if(($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tmissense_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/) ||
		   ($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tinitiator_codon_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/)){		
			   						
			@tmp = ();
			@tmp = split(/\t/,$_);
						
			$consequence = $tmp[6];
			$alt = $tmp[2];
			$name_ini = $tmp[0];
			$rsID = $tmp[12];	
			$gene = $tmp[3];
			$transc = $tmp[4];
			$position = $tmp[9];					
			$change = $tmp[10];						
						
			($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/;
			$check_name = $check_name."/".$alt;
						
			# in case where theere are more than 1 alternatvie allele compared to the reference
			# the name will be based on: Ref / ALt considered for that line in the vep file
			# for these cases, the annotation pipeline will consider only the comparison (lost / gain)
			# between the reference and the alternative base considered (not betwwen different ALT)
			# note: if there are more than 1 alt:
			# VEP will retreive 1 line per ALT
			
			$oname = $check_name;
			
			# modification 28/08/15 
			# case where the seqname contains "_" !!!
			my ($var) = $oname =~ /.+_\d+_([^_]+)$/;
			my $type_var = length($var); # length($3); # SNP len = 1 elsif INDEL len > 1	
								
			
			## If the variant is a SNP
			if(($type_var == 3) and (not $name_ini =~ /\/-$/) and (not $name_ini =~ /-\/.*$/)){###
				
				$name = $oname."_$rsID\_$transc\[$change-$position\]";
			
				if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2
					$allele1 = $1;
					$allele2 = $2;
				}
				else{
					print "Error found: $name\n";
					print $stderr "$transc (Alleles not found)";
					next;
				}
				
				# get the transcript sequence from the transcript name: $transc
				##### Without the reference in Ensembl
				#~ if(not defined $genome_fasta_file){
				if($genome_fasta_file ne "no" and $gff_file ne "no"){
					
					if(defined $transcript_list{$transc}){
					
						$transcript_seq=$transcript_list{$transc};								
		                $transcript = Bio::Seq->new(-seq => "$transcript_seq", -alphabet => 'dna' );		 										
					}			
					else{
						print "the transcript from the .vep ($transc) has a different nomenclature than the .gff one\n";
						exit;
					}
				}
				##### with reference in ENsembl
				else{						
					#****************************
					#Get all sequences from the gene ID.
					$transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' ) || die "registry->get_adaptor $specie failed";
					$transcript = $transcript_adaptor->fetch_by_stable_id("$transc") || die "transcript_adaptor->fetch_by_stable_id $transc failed";
					#****************************
				}
				
				#~ print "*$transc:$transcript*\n";
				$protein = $transcript->translate();
				if(defined $protein){
					#Print the 2 alleles in the outputfile
					$seq = $protein->seq();
					$seq =~s/\*$//;

					for (my $i=0; $i<2; $i++){
						if($i==0){
							print $db_out ">$name"."_allele1\n";						
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele1;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
						elsif($i==1){
							print $db_out ">$name"."_allele2\n";
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele2;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
					}
				}
				else{
					print "Protein not found in Ensembl data: $transc\n";
					print $stderr "$transc (Protein not found)\n";
					next;
				}				
			}
			# IF INDEL !
			else{
				
				# get the left and right parts of the protein sequence
				# concatenate: left + del / insert + right parts
				
				# get the indel sequence : $alt				
				($allele1,$allele2) = $change =~ /([^\/]+)\/(.*)/;
				
				# get the left and right parts
				$name = $oname."_$rsID\_$transc\[$change-$position\]";
				
				# get the transcript sequence from the transcript name: $transc
				##### Without the reference in Ensembl
				#~ if(not defined $genome_fasta_file){
				if($genome_fasta_file ne "no" and $gff_file ne "no"){
					
					if(defined $transcript_list{$transc}){
					
						$transcript_seq=$transcript_list{$transc};
								
		                $transcript = Bio::Seq->new(-seq => "$transcript_seq", -alphabet => 'dna' );
		 										
					}			
					else{
						print "the transcript from the .vep ($transc) has a different nomenclature than the .gff one\n";
						exit;
					}
				}
				##### with reference in ENsembl
				else{						
					#****************************				
					#Get all sequences from the gene ID.
					$transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' );
					$transcript = $transcript_adaptor->fetch_by_stable_id("$transc");
					#****************************
				}
				
				$protein = $transcript->translate();
				
				if(defined $protein){
					
					$seq = $protein->seq();
					$seq =~s/\*$//;

					
					#Print the 2 alleles in the outputfile
					print $db_out ">$name"."_allele1\n";
					print $db_out "$seq\n";					
								
					if(($name_ini =~ /\/-$/) or ($name_ini =~ /-\/.*$/)){###
						
						# deletion in the reference
						if($name_ini =~ /\/-$/){###
						
							print $db_out ">$name"."_allele2\n";###
							$seq_alt = substr($seq,0,$position-1);#First part of seq ###
							$seq_alt = $seq_alt.substr($seq,$position);#Second part of seq###
							print $db_out "$seq_alt\n";	###
								
						}###
						
						# insertion in the reference
						if($name_ini =~ /-\/.*$/){###
						
							print $db_out ">$name"."_allele2\n";###
							$seq_alt = substr($seq,0,$position-1);#First part of seq ###
							$seq_alt = $seq_alt."$allele2"; # insertion ###
							$seq_alt = $seq_alt.substr($seq,$position);#Second part of seq###
							print $db_out "$seq_alt\n";	###
								
						}###
						
						
					}###
					else{###
						($start,$end) = $position =~ /(\d+)-(\d+)/;										
							
						print $db_out ">$name"."_allele2\n";
						$seq_alt = substr($seq,0,$start-1);#First part of seq 
						$seq_alt = $seq_alt."$allele2";
						$seq_alt = $seq_alt.substr($seq,$end);#Second part of seq  ###!!!!!
						print $db_out "$seq_alt\n";				
						
					}###											
				}
				else{
					print "Protein not found in Ensembl data: $transc\n";
					print $stderr "$transc (Protein not found)\n";
					next;
				}								
			}
		}
=add1
		################################################################ INTRONIC, SPLICE_SITE ###################################################
		#Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE)
		elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){
		}
=cut
	}
	
	#~ `cp $outvep_res "$dirresults/results_vep.txt"; cp $outfile "$dirresults/results.txt"; cp $errorfile "$dirresults/error.txt"`;
	
	$db_in->close();
	$db_out->close();
	$stderr->close();
	$fh_vep_out->close();
}


# This function attributes 1 ID to each seq (some programs don't like huge names).

sub encodeFastaProtName($$$$){
	
	my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_;
	
	print "formatAlleleSeq::encodeFastaProtName: formatting protein sequence names...\n";
	
	my ($change,$type_var);
	
	my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n";
	my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited
	
	
	#~ my $save_names_file = $outfile;
	#~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/;
	
	my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data
	my $i = 1;
	while(<$fic_in>){
		if($_ =~ m/^>/){
			chomp $_;
			
			#~ print STDOUT ">>>>$_";
			
			## EXAMPLES OF INDELS
			#~ >X_3359812_TTCTTCTTCTTCTT/TTCTTCTTCTT_-_ENSBTAT00000014275[KKKKK/KKKK-38-42]_allele1 
			#~ >X_138471854_AAT/-_-_ENSBTAT00000004339[I/--159]_allele2 
			#~ >X_14308536_-/ATC_-_ENSBTAT00000013812[P/RS-6023]_allele1 
			#~ >X_733293_G/A_-_ENSBTAT00000000209[P/S-600]_allele2 
			
			
			($change) = $_ =~ /\[([^\/]+\/.+)\-{1}\d*\-*\d+\]_allele\d*/;
			
			
			if((length($change) > 3) or ($change =~ /\-/)){
				
				$type_var = "indel";
			}
			else{
				
				$type_var = "snp";
			}
			
			print $sav "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n";
			#~ print STDOUT "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n";
			
			print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID
			$_ = <$fic_in>;
			print $sub $_;#Sequence
			$i++;
		}
	}
	$fic_in->close();
	$sav->close();
	$sub->close();
	return $i;
}


# this function splits the initial fasta file into 1 sequence per file

sub format_multifasta_to_singlefastas($$){
	
	my ($inputfile,$outdir) = @_;
	
	print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n";
	
	my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto");
   	system(@args) == 0 or die "system @args failed: $?"
}


# this function splits a fasta file into several fasta files with -n number of sequences per file

# faire le mkdir dans ergatis
sub splitfastaN($$$$){
	
	my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_;
	
	my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln);
	
	print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n";


	($name) = $inputfile =~ /\/([^\/]*)$/;
	$name =~ s/\..*//;
	$name = $outdir."/".$name;
	
	$count = $numseqsperfile + 1;
	$countfile = 0;
	
	$fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n";
	$/=">";
	<$fh_in>;
	while($line=<$fh_in>){
				
		$line=~s/>//;		
		
		($seq) = $line =~ /[^\n]+\n(.*)/;
		
		$ln = length($seq);
		
		if($ln < $sizeseqmin){
			
			if($count < $numseqsperfile){
				
				print $fh_out (">$line");
				$count++;
			}
			else{
				
				unless($countfile==0){$fh_out->close()};
				
				$countfile ++;
				$outfile = $name.$numseqsperfile."_".$countfile.".fasta";
				#~ print "$outfile\n";
				#~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/;
				
				#~ print "$outfile\n";
				
				$count = 1;
				$fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n";
				print $fh_out (">$line");
				
			}
		}
	}
	$fh_in->close();
}


# GET THE ENCODING NAMES
# get the encoded names and the position of the amino acid changein the protein sequence

sub getEncoding($){
	
	my ($name_encoding_file) = @_;
	
	my %encodingName = (); # initial name => short name
	my %encodingID = (); # short name => initial name
	my %name_allele = (); # short name => allele 1 or allele 2
	my %name_position = (); # short name => AA position of SNP
	my %name_var_type = (); # short name => type of variation (snp, indel)
	
	my ($line,$fh_in,$fh_out,$pos,$name,$allele,$var_type);
	
	$fh_in = new IO::File $name_encoding_file;
	defined($fh_in)|| die "Can not open $name_encoding_file : $! \n";	
	$/="\n";
	while($line=<$fh_in>){				
		
		if($line =~ m/^>\S*\s*.*/){
			
			$line =~ /^>([^\t]+)\t([^\t]*)\t([^\n]*)\n/;						
				
			$encodingID{$2} = $1;	
			$encodingName{$1} = $2;			

			#~ ($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/;			
			($pos,$allele,$name,$var_type) = $line =~ /\-(\d+\-*\d*)\]_(allele\d)\t([^\t]*)\t(.*)$/;###		
						
			#~ print "$pos,$allele,*$name*,$var_type\n";	
			$name_position{$name} = $pos;						
			$name_allele{$name} = $allele;	
			$name_var_type{$name} = $var_type;###
		}
		else{
			
			print "$name_encoding_file the protein sequences do not have the right format\n$line \n";
			exit(11);
		}
	} 
	$fh_in->close();
	
	return \%encodingName,\%encodingID,\%name_allele,\%name_position,\%name_var_type;###
}


sub getVEPfile_fromSNPeffect($$$){
	
	my ($infile,$out_vep_dir,$type) = @_;
	
	my (@type,$fh_in,$fh_out,@line, @args);
	my $nb_snp = 0;
	#~ @type = split(",", $type);

	#~ foreach my $var (@type){
				
		my ($vep_var_result) = $out_vep_dir."/".$type."_file.vep";
		
		## ADD HERE NEW CATEGORY
		if ($type eq "NSC"){
			#~ $type = "NON\_SYNONYMOUS\_CODING";
			$type = "(missense\_variant|initiator_codon_variant)";
		}
		elsif($type eq "STOP"){
			#~ $type = "STOP\_(GAINED|LOST)";
			$type = "stop\_(gained)";
		}
	
		$fh_in = new IO::File $infile, "r" ;
		defined $fh_in || die "Can not open $infile : $! \n";
		$/="\n";	
		$fh_out = new IO::File $vep_var_result, "w";
		defined $fh_out || die "Can not create $vep_var_result: $! \n";

		while(<$fh_in>){
						
			@line = ();	
			@line = split("\t", $_);
						
			next if ($line[0] =~/^#/);
						
			if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){
				print $fh_out (join ("\t",@line));
				$nb_snp++;
			}
		} 
		$fh_in->close();
		$fh_out->close();
						
	#~ }
}	


sub getVEPfile_fromSNPeffect_old($$$){
	
	my ($infile,$out_vep_dir,$type) = @_;
	
	my (@type,$fh_in,$fh_out,@line, @args);
	my $nb_snp = 0;
	@type = split(",", $type);

	foreach my $var (@type){
		
		#~ my ($vep_var_dir) = $out_vep_dir."/".$var."_vep_dir";
		#~ system("mkdir $vep_var_dir");
		
		my ($vep_var_result) = $out_vep_dir."/".$var."_file.vep";
		#~ print $vep_var_dir."\n".$vep_var_result."\n\n";
		
		## ADD HERE NEW CATEGORY
		if ($var eq "NSC"){
			$type = "NON\_SYNONYMOUS\_CODING"
		}
		elsif($var eq "STOP"){
			$type = "STOP\_(GAINED|LOST)";
		}
		
		print "$var:$type\n$infile\n";
		
		$fh_in = new IO::File $infile, "r"  || die "Can not open $infile : $! \n";
		$/="\n";	
		$fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n";

		while(<$fh_in>){
		
			#print "*$_*\n";
				
			@line = ();	
			@line = split("\t", $_);
			
			#print "*$line[0]*: $line[6]\n";
			
			next if ($line[0] =~/^#/);
			
			#print "heho $line[6]\t$line[2]";
			
			if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){
				print $fh_out (join ("\t",@line));
				#print "ici";
				$nb_snp++;
			}
		} 
		$fh_in->close();
		$fh_out->close();
		
		#~ # if number of variants input is greather than snp_number_per_file, we split files
		#~ if ($nb_snp > $snp_number_per_file){
			#~ chdir "$vep_var_dir" or die "cannot change dir getVEPfile_fromSNPeffect chdir $vep_var_dir : $?";
			#~ 
			#~ @args = ("split", "-l", $snp_number_per_file, "-d", $vep_var_result, "vep_".$var);
			#~ system(@args) == 0 or die "system @args failed: $?";
#~ 
			#~ @args = ("rm", $vep_var_result);
			#~ system(@args) == 0 or die "system @args failed: $?";
		#~ }				
	}
}


# for INDELS only
# This subroutine checks according to the VEP formats output types for indels ( I/--pos, -/I-pos, VN/VKN-pos, KKK/KKKK,pos-pos) 
# if the REF or ALT has to be checked for new signals
sub analyseIndelChange($){
	
	my ($info) = @_;
	
	my ($ref,$alt,$pos,$change,$start,$end,$indel,$end_alt);
	
	$ref="";
	$alt="";
	$start = 0;
	$end = 0;
	$end_alt = 0;
	
	## EXAMPLES OF INDELS:
	#~ KKKKK/KKKK-38-42
	#~ I/--159
	#~ P/RS-6023
	#~ P/S-600
	
	#~ KKKKK/KKKK-38-42
	if($info =~ /^[^\/]+\/[^\-]+\-\d+\-\d+$/){
		
		# positions for the reference start and end
		($ref,$alt,$start,$end) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)\-(\d+)$/;
					
		# Positions for the alternative
		$end_alt = $start + length($alt) - 1;		
	}
	#~ I/--159	
	elsif($info =~ /^[^\/]+\/\-\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^([^\/]+)\/(\-)\-(\d+)$/;
		
		$start = $pos;
		$end = $pos;
		$end_alt = 0; # deletion => No bases to look at on the alternative !	
	}	
	#~ -/I-159
	elsif($info =~ /^\-\/[^\/]+\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^(\-)\/([^\/]+)\-(\d+)$/;
		
		$start = $pos;
		$end = 0; # deletion => No bases to look at on the reference !
		$end_alt = $pos;	
	}	
	#~ P/RS-6023
	#~ P/S-600
	elsif($info =~ /^[^\/]+\/[^\-]+\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)$/;
		
		$start = $pos;
		$end = $pos;
		$end_alt = $start + length($alt) - 1;
	}
	else{
		
		print ">>UNKNWON CASE: $info\n";
	}
	

	#~ print "************************\n";
	#~ print "$info\n";
	#~ print "$ref,$alt,$start,$end,$end_alt\n";#>PFFS,PFS-203,206,PFFSPFS-203
	#~ print "************************\n";

	
	return ($start,$end,$end_alt);
}


1;