#!/usr/bin/perl -w

package formatAlleleSeq;

use strict;
#~ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);

#~ require Exporter;
use base 'Exporter';

#~ require Exporter;
#~ @ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# export the names of the fonction for that module (to be seen externally)
our @EXPORT = qw/getProteinseqfile_fromSNPeffect encodeFastaProtName format_multifasta_to_singlefastas splitfastaN getEncoding getVEPfile_fromSNPeffect getEnsemblformat_from_rsID analyseIndelChange/;

#~ $VERSION = '0.01';

use IO::File;
use Bio::SeqIO;
use Bio::EnsEMBL::Registry;


# Only for SNPs from dbSNP (no indels)
sub getEnsemblformat_from_rsID($$$$){
	 	 
	 my ($rsID_file, $outdir ,$registry_file_path, $specie) = @_;
	 
	 my ($id,$vfa,$vf,$v,$error);
	 
	 my $registry = 'Bio::EnsEMBL::Registry';
  	 $registry->load_all($registry_file_path);
						
	 my $errorfile = $outdir."/errors.txt";				
	 
	 my $fh_in = new IO::File $rsID_file || die "Can not create $rsID_file : $!";
	 
	 my $fh_stderr = new IO::File ">$errorfile" || die "Can not create $errorfile : $!"; #Error file (where the untreated sequences goes)
	 print $fh_stderr "Untreated sequences:\n";

	 my $fh_out = new IO::File ">$outdir/ensembl.txt" || die "Can not create $outdir/ensembl.txt : $!";#Output file : input for snp_predictor_effect
	 
	 print "Searching for the rs id...\n";
	 
	 $error = 0;#Numbers of errors founds
 
 
	 while(<$fh_in>){
		 
		 #Check if the rs id exist in db
		 if($_ =~ /(rs\d*)/){
			 
			 $id = $1;
			 $vfa = $registry->get_adaptor($specie,"variation","variationfeature");
			 $vf = $registry->get_adaptor($specie,"variation","variation");
			 $v = $vf->fetch_by_name($id);
		
			 if(!defined($v)){
				 #print "Seq not found in the Ensembl data: $id . Skipping line.\n";
				 print $fh_stderr "$id (not found in the Ensembl database)\n";
				 $error++;
				 next;
			 }
			 #Take all variations for this rs (start, end, strand, allele string, chromosom)
			 foreach $vf (@{$vfa->fetch_all_by_Variation($v)}) {
				 if(!defined($vf)){
					 #print "Seq variation annotation not found in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no features found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 #Extract the informations: start of snp, end of snp (must be same as the start), strand, alleles, chromosom, rsid (optionnal - for debug)
				 my $start = $vf->start();
				 if(!defined($start)){
					 #print "Seq have not start position in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no start found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $end = $vf->end();
				 if(!defined($end)){
					 #print "Seq have not end position in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no end found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $strand = $vf->strand();
				 if(!defined($start)){
					 #print "Seq have not strand in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no strand found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $alleles = $vf->allele_string();
				 if(!defined($start)){
					 #print "Seq have not allele in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no allele string found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 my $chromosom = $vf->seq_region_name();
				 if(!defined($chromosom)){
					 #print "Seq have not chromosom in the Ensembl data: $id . Skipping line.\n";
					 print $fh_stderr "$id (no chromosom string found in Ensembl database)\n";
					 $error++;
					 next;
				 }
				 
				 my $name_snp = $vf->variation_name();
				 print $fh_out "chr$chromosom\t$start\t$end\t$alleles\t$strand\n";
			 }
		 }
		 else{
			 print "Error: untolerated rs sequence ($_). Skipping line.\n";
			 print $fh_stderr "$_\n";
		 }
	 }
	 
	 if($error!=0){
		 print "$error errors found. Consult results/error.out.\n";
	 }
	 
	 $fh_out->close();
	 $fh_stderr->close();
	 $fh_in->close();
}


# this function is used to get the non_synonymous coding SNPs
# to get the corresponding protein sequence from Ensembl
# to generate a fasta file with these protein sequences, 1 per allele 
# so 2 protein sequences per SNP

sub getProteinseqfile_fromSNPeffect($$$$$$){
	
	my ($specie, $infile, $outfile, $registry_file_path, $outvep_res, $outvep_title) = @_;
	
	print "formatAlleleSeq::create: Searching amino acid sequence for each allele...\n";
	
	#Connect to the Ensembl data		
	my $registry = 'Bio::EnsEMBL::Registry';
  	$registry->load_all($registry_file_path);
  	
  	my $errorfile = $outfile;
  	$errorfile =~s/\/[^\/]*$/\/error.txt/;
	
	my $db_in = new IO::File "$infile", "r" || die "Can not open file  : $infile $! \n";#Result of snppredictor	
	my $db_out = new IO::File ">$outfile" || die "Can not create file $outfile $! \n";#The final db
	my $stderr = new IO::File ">>$errorfile" || die "Can not open $errorfile: $! \n";#Error file

	my $iterator = 1;
	my $iterator2 = 1;
	my $oname;
	my @tmp = ();
	
	#~ my ($ref,$alt);
	my ($position2,$allele1_len,$start,$end,$seq_alt);
	my $allele1;
	my $allele2;
	my ($alt,$name,$rsID,$gene,$transc,$position,$change,$name_ini,$chr,$pos,$check_name,$geno,$consequence);
	my ($transcript_adaptor,$transcript,$protein,$seq);
	
	my $fh_vep_out_title = new IO::File ">$outvep_title" || die "Can not create file  $outvep_title! \n";#The final db
	print $fh_vep_out_title "SNP Name\tChromosome\tPosition\tGene ID\tTranscript ID\tConsequence\tGenotype\tAA change\tAA position\n";
	$fh_vep_out_title->close();
	
	my $fh_vep_out = new IO::File ">$outvep_res" || die "Can not create file $outvep_res $! \n";#The final db
	
	while(<$db_in>){
		
		next if ($_=~/^\#/);

		#~ $_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\t([^\t]*)\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/;
		
		@tmp = ();
		@tmp = split(/\t/,$_);
		
		#~ $consequence = $6;
		#~ $alt = $3;
		#~ $name_ini = $1;
		#~ $rsID = $10;			
		#~ $gene = $4;
		#~ $transc = $5;
		#~ $position = $8;			
		#~ $change = $9;		
					#~ 
        $consequence = $tmp[6];
		$alt = $tmp[2];
		$name_ini = $tmp[0];
		$rsID = $tmp[12];	
		$gene = $tmp[3];
		$transc = $tmp[4];
		$position = $tmp[9];		
		$change = $tmp[10];
													
		($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/;
		$check_name = $check_name."/".$alt;
						
							
		#~ if((defined($oname)) && ($oname eq $check_name)){#If previous name already exist (severals transcripts exists)
			#~ $iterator2++;#Increment $iterator
		#~ }
		#~ else{
			#~ $iterator2 = 1;
		#~ }
						
		$oname = $check_name;					
		#~ $name = $oname."_$rsID\_$transc\_transcript$iterator2\[$change-$position\]";	
		$name = $oname."_$rsID\_$transc\[$change-$position\]";	
				
		($chr,$pos,$geno) = $oname =~ /([^_]+)_([^_]+)_(.+)/;
		
		print $fh_vep_out "$name\t$chr\t$pos\t$gene\t$transc\t$consequence\t$geno\t$change\t$position\n";
	
				
		############################################## VERIF OUTPUT VARIANT EFFECT PREDICTOR FORMAT ###############################################
		#~ my $ligne = $_;
		#~ my @line = split( "\t", $ligne);
		#~ if ( $line[4] !~ /ENS\D{3}T\d+/ ){
			#~ print "Variant format unknow\n";
			#~ exit;
		#~ }

		################################################################ NON SYNONYMOUS CODING ###################################################
		if(($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tmissense_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/) ||
		   ($_ =~ m/(.*)\t(.*)\t(\S*)\t(.*)\t(.*)\t.*\tinitiator_codon_variant\t(\S*)\t\S*\t(\S*)\t(\S*)\t\S*\t(\S*)/)){						
						
			#~ $alt = $3;
			#~ $name_ini = $1;
			#~ $rsID = $9;			
			#~ $gene = $4;
			#~ $transc = $5;
			#~ $position = $7;			
			#~ $change = $8;	
			
			@tmp = ();
			@tmp = split(/\t/,$_);
						
			$consequence = $tmp[6];
			$alt = $tmp[2];
			$name_ini = $tmp[0];
			$rsID = $tmp[12];	
			$gene = $tmp[3];
			$transc = $tmp[4];
			$position = $tmp[9];		
			$change = $tmp[10];					
						
			($check_name) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/;
			$check_name = $check_name."/".$alt;
						
			#~ if((defined($oname)) && ($oname eq $check_name)){#If previous name already exist (severals transcripts exists)
				#~ $iterator++;#Increment $iterator
			#~ }
			#~ else{
				#~ $iterator = 1;
			#~ }						
			
			#~ $oname = $1;
			# in case where theere are more than 1 alternatvie allele compared to the reference
			# the name will be based on: Ref / ALt considered for that line in the vep file
			# for these cases, the annotation pipeline will consider only the comparison (lost / gain)
			# between the reference and the alternative base considered (not betwwen different ALT)
			# note: if there are more than 1 alt:
			# VEP will retreive 1 line per ALT
			
			#~ ($oname) = $name_ini =~ /([^_]+_[^_]+_[^\/]+)/;
			#~ $oname = $oname."/".$alt;
			$oname = $check_name;
			#~ print ">>$oname\n";
			
			@tmp = ();
			@tmp = split(/_/,$oname);
			my $type_var = length($tmp[2]); # length($3); # SNP len = 1 elsif INDEL len > 1									
			
			## If the variant is a SNP
			if(($type_var == 3) and (not $name_ini =~ /\/-$/) and (not $name_ini =~ /-\/.*$/)){###
				
				#~ my $name = $1."_$rsID\_$transc\_transcript$iterator\[$change-$position\]";
				$name = $oname."_$rsID\_$transc\[$change-$position\]";
				#~ $allele1;
				#~ $allele2;
				if($change =~ m/(\D)\/(\D)/){#Set allele 1 and 2
					$allele1 = $1;
					$allele2 = $2;
				}
				else{
					print "Error found: $name\n";
					print $stderr "$transc (Alleles not found)";
					next;
				}
				
				#Get all sequences from the gene ID.
				$transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' );
				$transcript = $transcript_adaptor->fetch_by_stable_id("$transc");
				
				#~ print "*$transc:$transcript*\n";
				$protein = $transcript->translate();
				if(defined $protein){
					#Print the 2 alleles in the outputfile
					$seq = $protein->seq();
					for (my $i=0; $i<2; $i++){
						if($i==0){
							print $db_out ">$name"."_allele1\n";						
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele1;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
						elsif($i==1){
							print $db_out ">$name"."_allele2\n";
							print $db_out substr($seq,0,$position-1);#First part of seq
							print $db_out $allele2;#The amino acid wich change
							print $db_out substr($seq,$position);#Second part of seq
							print $db_out "\n";
						}
					}
				}
				else{
					print "Protein not found in Ensembl data: $transc\n";
					print $stderr "$transc (Protein not found)\n";
					next;
				}				
			}
			# IF INDEL !
			else{
				
				# get the left and right parts of the protein sequence
				# concatenate: left + del / insert + right parts
				
				# get the indel sequence : $alt				
				($allele1,$allele2) = $change =~ /([^\/]+)\/(.*)/;
				
				# get the left and right parts
				$name = $oname."_$rsID\_$transc\[$change-$position\]";
				
				#Get all sequences from the gene ID.
				$transcript_adaptor = $registry->get_adaptor( $specie, 'Core', 'Transcript' );
				$transcript = $transcript_adaptor->fetch_by_stable_id("$transc");
				$protein = $transcript->translate();
				
				if(defined $protein){
					
					$seq = $protein->seq();
					
					#Print the 2 alleles in the outputfile
					print $db_out ">$name"."_allele1\n";
					print $db_out "$seq\n";					
								
					if(($name_ini =~ /\/-$/) or ($name_ini =~ /-\/.*$/)){###
						
						# deletion in the reference
						if($name_ini =~ /\/-$/){###
						
							print $db_out ">$name"."_allele2\n";###
							$seq_alt = substr($seq,0,$position-1);#First part of seq ###
							$seq_alt = $seq_alt.substr($seq,$position);#Second part of seq###
							print $db_out "$seq_alt\n";	###
								
						}###
						
						# insertion in the reference
						if($name_ini =~ /-\/.*$/){###
						
							print $db_out ">$name"."_allele2\n";###
							$seq_alt = substr($seq,0,$position-1);#First part of seq ###
							$seq_alt = $seq_alt."$allele2"; # insertion ###
							$seq_alt = $seq_alt.substr($seq,$position);#Second part of seq###
							print $db_out "$seq_alt\n";	###
								
						}###
						
						
					}###
					else{###
						($start,$end) = $position =~ /(\d+)-(\d+)/;										
							
						print $db_out ">$name"."_allele2\n";
						$seq_alt = substr($seq,0,$start-1);#First part of seq 
						$seq_alt = $seq_alt."$allele2";
						$seq_alt = $seq_alt.substr($seq,$end);#Second part of seq  ###!!!!!
						print $db_out "$seq_alt\n";				
						
					}###											
				}
				else{
					print "Protein not found in Ensembl data: $transc\n";
					print $stderr "$transc (Protein not found)\n";
					next;
				}								
			}
		}
=add1
		################################################################ INTRONIC, SPLICE_SITE ###################################################
		#Add here a class of SNPs if you want (3PRIME_UTR, 5PRIME_UTR, DOWNSTREAM, ESSENTIAL_SPLICE_SITE, INTERGENIC, INTRONIC, SPLICE_SITE, STOP_GAINED, SYNONYMOUS_CODING ,UPSTREAM ,WITHIN_NON_CODING_GENE)
		elsif($_ = m/(.*)\t(.*)\t(.*)\t(.*)\tINTRONIC\t(.*)\t(.*)\t(.*)\t(.*)//){
		}
=cut
	}
	$db_in->close();
	$db_out->close();
	$stderr->close();
	$fh_vep_out->close();
}


# This function attributes 1 ID to each seq (some programs don't like huge names).

sub encodeFastaProtName($$$$){
	
	my ($infile,$outfile,$outencodingtable,$BASE_SEQ_NAME) = @_;
	
	print "formatAlleleSeq::formatfile: formatting protein sequence names...\n";
	
	my ($change,$type_var);
	
	my $fic_in = new IO::File "$infile" || die "Can not open $infile: $! \n";
	my $sub = new IO::File ">$outfile" || die "Can not create sub $outfile: $! \n";#The file who will be submited
	
	#~ my $save_names_file = $outfile;
	#~ $save_names_file =~ s/\/[^\/]*$/\/saved_names.txt/;
	
	my $sav = new IO::File ">$outencodingtable" || die "Can not create sav $outencodingtable: $! \n";#The save data
	my $i = 1;
	while(<$fic_in>){
		if($_ =~ m/^>/){
			chomp $_;
			
			## EXAMPLES OF INDELS
			#~ >X_3359812_TTCTTCTTCTTCTT/TTCTTCTTCTT_-_ENSBTAT00000014275[KKKKK/KKKK-38-42]_allele1 
			#~ >X_138471854_AAT/-_-_ENSBTAT00000004339[I/--159]_allele2 
			#~ >X_14308536_-/ATC_-_ENSBTAT00000013812[P/RS-6023]_allele1 
			#~ >X_733293_G/A_-_ENSBTAT00000000209[P/S-600]_allele2 
			
			($change) = $_ =~ /\[([^\/]+\/.+)\-{1}\d*\-*\d+\]_allele\d*/;
			
			if((length($change) > 3) or ($change =~ /\-/)){
				
				$type_var = "indel";
			}
			else{
				
				$type_var = "snp";
			}
			
			print $sav "$_\t".$BASE_SEQ_NAME.$i."\t".$type_var."\n";
			
			print $sub ">".$BASE_SEQ_NAME.$i."\n";#new ID
			$_ = <$fic_in>;
			print $sub $_;#Sequence
			$i++;
		}
	}
	$fic_in->close();
	$sav->close();
	$sub->close();
	return $i;
}


# this function splits the initial fasta file into 1 sequence per file

sub format_multifasta_to_singlefastas($$){
	
	my ($inputfile,$outdir) = @_;
	
	print "formatAlleleSeq::format_multifasta_to_singlefastas: formats a fasta file containing multiple sequences into 1 sequence per file...\n";
	
	my @args = ("/usr/local/bioinfo/src/emboss/EMBOSS-6.0.1/bin/seqretsplit", "-sequence", $inputfile, "-osdirectory2", $outdir, "-auto");
   	system(@args) == 0 or die "system @args failed: $?"
}


# this function splits a fasta file into several fasta files with -n number of sequences per file

# faire le mkdir dans ergatis
sub splitfastaN($$$$){
	
	my ($inputfile,$outdir,$numseqsperfile,$sizeseqmin) = @_;
	
	my ($fh_in,$fh_out,$line,$count,$outfile,$countfile,$name,$seq,$ln);
	
	print "formatAlleleSeq::splitfastaN: formats a fasta file containing multiple sequences into 1 sequence per file...\n";


	($name) = $inputfile =~ /\/([^\/]*)$/;
	$name =~ s/\..*//;
	$name = $outdir."/".$name;
	
	$count = $numseqsperfile + 1;
	$countfile = 0;
	
	$fh_in= new IO::File $inputfile || die "cannot open file $inputfile: $! \n";
	$/=">";
	<$fh_in>;
	while($line=<$fh_in>){
				
		$line=~s/>//;		
		
		($seq) = $line =~ /[^\n]+\n(.*)/;
		
		$ln = length($seq);
		
		if($ln < $sizeseqmin){
			
			if($count < $numseqsperfile){
				
				print $fh_out (">$line");
				$count++;
			}
			else{
				
				unless($countfile==0){$fh_out->close()};
				
				$countfile ++;
				$outfile = $name.$numseqsperfile."_".$countfile.".fasta";
				#~ print "$outfile\n";
				#~ $outfile =~ s/\/$name[^\/]*$/\/$name\_seq$numseqsperfile\_$countfile\.fasta/;
				
				#~ print "$outfile\n";
				
				$count = 1;
				$fh_out = new IO::File ">$outfile" || die "cannot open file $outfile: $! \n";
				print $fh_out (">$line");
				
			}
		}
	}
	$fh_in->close();
}


# GET THE ENCODING NAMES
# get the encoded names and the position of the amino acid changein the protein sequence

sub getEncoding($){
	
	my ($name_encoding_file) = @_;
	
	my %encodingName = (); # initial name => short name
	my %encodingID = (); # short name => initial name
	my %name_allele = (); # short name => allele 1 or allele 2
	my %name_position = (); # short name => AA position of SNP
	my %name_var_type = (); # short name => type of variation (snp, indel)
	
	my ($line,$fh_in,$fh_out,$pos,$name,$allele,$var_type);
	
	$fh_in = new IO::File $name_encoding_file || die "Can not open $name_encoding_file : $! \n";	
	$/="\n";
	while($line=<$fh_in>){				
		
		if($line =~ m/^>\S*\s*.*/){
			
			$line =~ /^>([^\t]+)\t([^\t]*)\t([^\n]*)\n/;						
				
			$encodingID{$2} = $1;	
			$encodingName{$1} = $2;			

			#~ ($pos,$allele,$name) = $line =~ /(\d+)\]_(allele\d)\t([^\n]*)$/;			
			($pos,$allele,$name,$var_type) = $line =~ /\-(\d+\-*\d*)\]_(allele\d)\t([^\t]*)\t(.*)$/;###		
						
			#~ print "$pos,$allele,*$name*,$var_type\n";	
			$name_position{$name} = $pos;						
			$name_allele{$name} = $allele;	
			$name_var_type{$name} = $var_type;###
		}
		else{
			
			print "the protein sequences do not have the right format\n";
			exit(11);
		}
	} 
	$fh_in->close();
	
	return \%encodingName,\%encodingID,\%name_allele,\%name_position,\%name_var_type;###
}


sub getVEPfile_fromSNPeffect($$$){
	
	my ($infile,$out_vep_dir,$type) = @_;
	
	my (@type,$fh_in,$fh_out,@line, @args);
	my $nb_snp = 0;
	#~ @type = split(",", $type);

	#~ foreach my $var (@type){
				
		my ($vep_var_result) = $out_vep_dir."/".$type."_file.vep";
		
		## ADD HERE NEW CATEGORY
		if ($type eq "NSC"){
			#~ $type = "NON\_SYNONYMOUS\_CODING";
			$type = "(missense\_variant|initiator_codon_variant)";
		}
		elsif($type eq "STOP"){
			#~ $type = "STOP\_(GAINED|LOST)";
			$type = "stop\_(gained)";
		}
				
		$fh_in = new IO::File $infile, "r"  || die "Can not open $infile : $! \n";
		$/="\n";	
		$fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n";

		while(<$fh_in>){
						
			@line = ();	
			@line = split("\t", $_);
						
			next if ($line[0] =~/^#/);
						
			if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){
				print $fh_out (join ("\t",@line));
				$nb_snp++;
			}
		} 
		$fh_in->close();
		$fh_out->close();
						
	#~ }
}	


sub getVEPfile_fromSNPeffect_old($$$){
	
	my ($infile,$out_vep_dir,$type) = @_;
	
	my (@type,$fh_in,$fh_out,@line, @args);
	my $nb_snp = 0;
	@type = split(",", $type);

	foreach my $var (@type){
		
		#~ my ($vep_var_dir) = $out_vep_dir."/".$var."_vep_dir";
		#~ system("mkdir $vep_var_dir");
		
		my ($vep_var_result) = $out_vep_dir."/".$var."_file.vep";
		#~ print $vep_var_dir."\n".$vep_var_result."\n\n";
		
		## ADD HERE NEW CATEGORY
		if ($var eq "NSC"){
			$type = "NON\_SYNONYMOUS\_CODING"
		}
		elsif($var eq "STOP"){
			$type = "STOP\_(GAINED|LOST)";
		}
		
		print "$var:$type\n$infile\n";
		
		$fh_in = new IO::File $infile, "r"  || die "Can not open $infile : $! \n";
		$/="\n";	
		$fh_out = new IO::File $vep_var_result, "w"|| die "Can not create $vep_var_result: $! \n";

		while(<$fh_in>){
		
			#print "*$_*\n";
				
			@line = ();	
			@line = split("\t", $_);
			
			#print "*$line[0]*: $line[6]\n";
			
			next if ($line[0] =~/^#/);
			
			#print "heho $line[6]\t$line[2]";
			
			if ($line[6] =~ /$type/ && $line[1] !~ /.*\:\d*\-\d*/){
				print $fh_out (join ("\t",@line));
				#print "ici";
				$nb_snp++;
			}
		} 
		$fh_in->close();
		$fh_out->close();
		
		#~ # if number of variants input is greather than snp_number_per_file, we split files
		#~ if ($nb_snp > $snp_number_per_file){
			#~ chdir "$vep_var_dir" or die "cannot change dir getVEPfile_fromSNPeffect chdir $vep_var_dir : $?";
			#~ 
			#~ @args = ("split", "-l", $snp_number_per_file, "-d", $vep_var_result, "vep_".$var);
			#~ system(@args) == 0 or die "system @args failed: $?";
#~ 
			#~ @args = ("rm", $vep_var_result);
			#~ system(@args) == 0 or die "system @args failed: $?";
		#~ }				
	}
}


# for INDELS only
# This subroutine checks according to the VEP formats output types for indels ( I/--pos, -/I-pos, VN/VKN-pos, KKK/KKKK,pos-pos) 
# if the REF or ALT has to be checked for new signals
sub analyseIndelChange($){
	
	my ($info) = @_;
	
	my ($ref,$alt,$pos,$change,$start,$end,$indel,$end_alt);
	
	$ref="";
	$alt="";
	$start = 0;
	$end = 0;
	$end_alt = 0;
	
	## EXAMPLES OF INDELS:
	#~ KKKKK/KKKK-38-42
	#~ I/--159
	#~ P/RS-6023
	#~ P/S-600
	
	#~ KKKKK/KKKK-38-42
	if($info =~ /^[^\/]+\/[^\-]+\-\d+\-\d+$/){
		
		# positions for the reference start and end
		($ref,$alt,$start,$end) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)\-(\d+)$/;
					
		# Positions for the alternative
		$end_alt = $start + length($alt) - 1;		
	}
	#~ I/--159	
	elsif($info =~ /^[^\/]+\/\-\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^([^\/]+)\/(\-)\-(\d+)$/;
		
		$start = $pos;
		$end = $pos;
		$end_alt = 0; # deletion => No bases to look at on the alternative !	
	}	
	#~ -/I-159
	elsif($info =~ /^\-\/[^\/]+\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^(\-)\/([^\/]+)\-(\d+)$/;
		
		$start = $pos;
		$end = 0; # deletion => No bases to look at on the reference !
		$end_alt = $pos;	
	}	
	#~ P/RS-6023
	#~ P/S-600
	elsif($info =~ /^[^\/]+\/[^\-]+\-\d+$/){
		
		($ref,$alt,$pos) = $info =~ /^([^\/]+)\/([^\-]+)\-(\d+)$/;
		
		$start = $pos;
		$end = $pos;
		$end_alt = $start + length($alt) - 1;
	}
	else{
		
		print ">>UNKNWON CASE: $info\n";
	}
	

	#~ print "************************\n";
	#~ print "$info\n";
	#~ print "$ref,$alt,$start,$end,$end_alt\n";#>PFFS,PFS-203,206,PFFSPFS-203
	#~ print "************************\n";

	
	return ($start,$end,$end_alt);
}


1;