#!/usr/bin/perl -w

=pod

=head1  NAME
formatSNPeffect-protseq.pl
 
=head1  SYNOPSIS

formatSNPeffect-protseq.pl -arg1 <> -arg2 <> -arg3 <> -arg4 <>

=head1  OPTIONS

  --infile string, the path of the input file SNP_PREDICTOR_EFFECT FORMAT FILE
  --outfileprot string, the path of the output file which will contain the protein sequences FASTA FORMAT, 1 per allele from the input file
  --outfileprot_codedName string, the path of the output file  with protein sequences, 1 per allele, with short encoded/shorter name FASTA FORMAT FILE
  --outencodingtable string, the path of the output file  with name encoding (initial name ending by "_allele1" or "_allele2" => short name "seqx") TAB FORMAT FILE
  --out_fastas_single_dir string, directory containing the temporary fasta files (1 fasta file per sequence) DIR
  --out_fastas_Nseq_dir string, directory containing the temporary fasta files (1 fasta file per N sequence) DIR
  --out_fastas_Nseq_dir_protlenmin string, directory containing the temporary fasta files (1 fasta file per N sequence with protein sequence length min) DIR
  --Nseq string, number of sequences per fasta file for out_fastas_Nseq_dir directory NUMBER
  --Nseq_protlenmin string, number of sequences per fasta file for out_fastas_Nseq_dir_protlenmin directory NUMBER 
  --sizeseqmin string, minimum size for the proteins concerning the 2nd directory
  --species string, specie name from ENSEMBL names (ex cow, sus crofa...) STRING
  --registry_file registry file to local ensembl database FILE WITH PATH
  --delta string, value between 0 and 1. Discriminant for the score comparison. VALUE
  --base_name string, for encoded protein names specify a short string to be used to generate the short sequence name (ex: _seq)   
  --out_vep_NSC_dir, string dir for vep categories file (needed for conservation module) for non synonymous coding
  --out_vep_STOP_dir, string dir for vep categories file (needed for conservation module) for stop gained
  --outvep_res, string file for vep result information for the join step
  --outvep_title, string file for vep title information for the join step
  --annot_file, string gff or gtf file for reference genome not found in Ensembl
  --genome_fasta_file, string fasta file for reference genome not found in Ensembl (names and version corresponding to annot_file)
  
=head1 DESCRIPTION

formatSNPeffect-protseq.pl - This program is part of a pipeline of programs for SNP annotation.
							This program starts from a variant_effect_predictor file, uses ensembl local database to get protein sequences,
							2 proteins per SNP, 1 per allele (2 alleles per SNP are considered here). It retreives a directory with protein fasta
							sequences.
							The protein fasta file(s) generated by this program contain(s) a coded short name based on the parameter 
							base_name specified by	the user and a number (ex: _seq1, _seq2...). The program also generates the enconding 
							file with "long initial name ending by _allele1 or _allele2" <-> "short name, ex:_seqX".

WARNING:
THE FASTA FILES SHOULD HAVE FORMAT: name.fasta
ALL FILES MUST HAVE ONLY 1 EXTENSION: .txt or .tab....


=head1 DATE

20/02/2012

=head1 AUTHORS

Sabrina Rodriguez
Johann Beghain

=cut

use strict;

# find the absolute path to the local library
use FindBin;
# return the absolute path to the local library
use lib "$FindBin::RealBin/../lib";
#~ use lib '/usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/lib';

use Getopt::Long;
use Pod::Usage;
use formatAlleleSeq;


#~ PERL5LIB=${PERL5LIB}:/usr/local/bioinfo/src/ensembl-api/branch-66/ensembl/modules
#~ PERL5LIB=${PERL5LIB}:/usr/local/bioinfo/src/ensembl-api/branch-66/ensembl-variation/modules
#~ PERL5LIB=${PERL5LIB}:/usr/local/bioinfo/src/ensembl-api/branch-66/ensembl-compara/modules
#~ export PERL5LIB


#~ perl -I /usr/local/bioinfo/src/ensembl-api/branch-66/ensembl/modules -I /usr/local/bioinfo/src/ensembl-api/branch-66/ensembl-variation/modules -I /usr/local/bioinfo/src/ensembl-api/branch-66/ensembl-compara/modules /usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/bin/formatSNPeffect-protseq.pl --infile /home/sigenae/work/Sabrina/VEP_cow_output/variant_effect_predictor.txt --outfileprot /home/sigenae/work/Sabrina/cow_outputs/cow.form --outfileprot_codedName /home/sigenae/work/Sabrina/cow_outputs/cow.fasta --species cow --registry_file /usr/local/bioinfo/src/ensembl-api/variant_effect_predictor.registry --delta 0.5 --out_fastas_single_dir /home/sigenae/work/Sabrina/cow_outputs/protNseq --outencodingtable /home/sigenae/work/Sabrina/cow_outputs/names_encoding.txt --out_fastas_Nseq_dir /home/sigenae/work/Sabrina/cow_outputs/protNseq --Nseq 3000 --base_name "_seq" --sizeseqmin 4000 --out_fastas_Nseq_dir_protlenmin /home/sigenae/work/Sabrina/cow_outputs/protNseqSizemin --Nseq_protlenmin 50 --out_vep_dir /home/sigenae/work/Sabrina/cow_outputs/VEP_cow/

#~ perl /usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/bin/formatSNPeffect-protseq.pl --infile /home/sigenae/work/Sabrina/VEP_horse_output/variant_effect_predictor.txt --outfileprot /home/sigenae/work/Sabrina/horse_outputs/horse.form --outfileprot_codedName /home/sigenae/work/Sabrina/horse_outputs/horse.fasta --species horse --registry_file /usr/local/bioinfo/src/ensembl-api/variant_effect_predictor.registry --delta 0.5 --out_fastas_single_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --outencodingtable /home/sigenae/work/Sabrina/horse_outputs/names_encoding.txt --out_fastas_Nseq_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --Nseq 3000 --base_name "_seq" --sizeseqmin 4000 --out_fastas_Nseq_dir_protlenmin /home/sigenae/work/Sabrina/horse_outputs/protNseqSizemin --Nseq_protlenmin 50 --out_vep_NSC_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_NSC/ --out_vep_STOP_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_STOP/ --outvep_res /home/sigenae/work/Sabrina/horse_outputs/vep_results.txt -outvep_title /home/sigenae/work/Sabrina/horse_outputs/vep_title.txt 

#~ perl /usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/bin/formatSNPeffect-protseq.pl --infile /home/sigenae/work/Sabrina/VEP_horse_output/variant_effect_predictor_test.txt --outfileprot /home/sigenae/work/Sabrina/horse_outputs/horse.form --outfileprot_codedName /home/sigenae/work/Sabrina/horse_outputs/horse.fasta --species horse --registry_file /usr/local/bioinfo/src/ensembl-api/variant_effect_predictor.registry --delta 0.5 --out_fastas_single_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --outencodingtable /home/sigenae/work/Sabrina/horse_outputs/names_encoding.txt --out_fastas_Nseq_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --Nseq 3000 --base_name "_seq" --sizeseqmin 4000 --out_fastas_Nseq_dir_protlenmin /home/sigenae/work/Sabrina/horse_outputs/protNseqSizemin --Nseq_protlenmin 50 --out_vep_NSC_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_NSC/ --out_vep_STOP_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_STOP/ --outvep_res /home/sigenae/work/Sabrina/horse_outputs/vep_results.txt -outvep_title /home/sigenae/work/Sabrina/horse_outputs/vep_title.txt 

#~ perl /usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/bin/formatSNPeffect-protseq.pl --infile /work/sigenae/vmergatisdev/output_repository/SR_variant_effect_predictor/214_default/variant_effect_predictor_dir/variant_effect_predictor.txt --outfileprot /home/sigenae/work/Sabrina/horse_outputs/horse.form --outfileprot_codedName /home/sigenae/work/Sabrina/horse_outputs/horse.fasta --species cow --registry_file /usr/local/bioinfo/src/ensembl-api/variant_effect_predictor.registry --delta 0.5 --out_fastas_single_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --outencodingtable /home/sigenae/work/Sabrina/horse_outputs/names_encoding.txt --out_fastas_Nseq_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --Nseq 3000 --base_name "_seq" --sizeseqmin 4000 --out_fastas_Nseq_dir_protlenmin /home/sigenae/work/Sabrina/horse_outputs/protNseqSizemin --Nseq_protlenmin 50 --out_vep_NSC_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_NSC/ --out_vep_STOP_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_STOP/ --outvep_res /home/sigenae/work/Sabrina/horse_outputs/vep_results.txt -outvep_title /home/sigenae/work/Sabrina/horse_outputs/vep_title.txt 

#~ perl /usr/local/bioinfo/src/ergatisdev/current/bin/AnnotationPipelines/bin/formatSNPeffect-protseq.pl --infile  /work/sigenae/vmergatisdev/output_repository/SR_variant_effect_predictor/161_default/variant_effect_predictor_dir/variant_effect_predictor.txt --outfileprot /home/sigenae/work/Sabrina/horse_outputs/horse.form --outfileprot_codedName /home/sigenae/work/Sabrina/horse_outputs/horse.fasta --species cow --registry_file /usr/local/bioinfo/src/ensembl-api/variant_effect_predictor.registry --delta 0.5 --out_fastas_single_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --outencodingtable /home/sigenae/work/Sabrina/horse_outputs/names_encoding.txt --out_fastas_Nseq_dir /home/sigenae/work/Sabrina/horse_outputs/protNseq --Nseq 3000 --base_name "_seq" --sizeseqmin 4000 --out_fastas_Nseq_dir_protlenmin /home/sigenae/work/Sabrina/horse_outputs/protNseqSizemin --Nseq_protlenmin 50 --out_vep_NSC_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_NSC/ --out_vep_STOP_dir /home/sigenae/work/Sabrina/horse_outputs/VEP_horse_STOP/ --outvep_res /home/sigenae/work/Sabrina/horse_outputs/vep_results.txt -outvep_title /home/sigenae/work/Sabrina/horse_outputs/vep_title.txt 

############################ OPTIONS / PARAMETERS ############################

my @getopt_args = (
                    '-infile=s'  ,
                    '-outfileprot=s'  ,
                    '-outfileprot_codedName=s'  ,
                    '-outencodingtable=s'  ,
                    '-out_fastas_single_dir=s',
                    '-out_fastas_Nseq_dir=s',
                    '-Nseq=s',
                    '-Nseq_protlenmin=s',
                    '-registry_file=s'  ,
                    '-species=s'  ,    
                    '-delta=s',
                    '-base_name=s',
                    '-sizeseqmin=s',
                    '-out_fastas_Nseq_dir_protlenmin=s',
                    '-out_vep_NSC_dir=s',                                        
                    '-out_vep_STOP_dir=s',
                    '-outvep_res=s',                                        
                    '-outvep_title=s',
                    '-annot_file=s',
                    '-genome_fasta_file=s'
                    
                  );

#~ print STDOUT "TEST la\n";

my %options = ();

unless ( GetOptions( \%options, @getopt_args ) ) {
  usage();
}

#~ print STDOUT "TEST ici-bas\n";

sub usage {
  exec "pod2text $0";
  exit( 1 );
}

usage() if ( !exists $options{'infile'} );
usage() if ( !exists $options{'outfileprot'} );
usage() if ( !exists $options{'outfileprot_codedName'} );
usage() if ( !exists $options{'outencodingtable'} );
usage() if ( !exists $options{'out_fastas_single_dir'} );
usage() if ( !exists $options{'out_fastas_Nseq_dir'} );
usage() if ( !exists $options{'Nseq'} );
usage() if ( !exists $options{'Nseq_protlenmin'} );
usage() if ( !exists $options{'registry_file'} );
#~ usage() if ( !exists $options{'species'} );
usage() if ( !exists $options{'delta'} );
usage() if ( !exists $options{'base_name'} );
usage() if ( !exists $options{'sizeseqmin'} );
usage() if ( !exists $options{'out_fastas_Nseq_dir_protlenmin'} );
usage() if ( !exists $options{'out_vep_NSC_dir'} );
usage() if ( !exists $options{'out_vep_STOP_dir'} );
usage() if ( !exists $options{'outvep_res'} );
usage() if ( !exists $options{'outvep_title'} );
#~ usage() if ( !exists $options{'annot_file'} );
#~ usage() if ( !exists $options{'genome_fasta_file'} );

############################ PROGRAM ############################

my $species = $options{'species'};
my $infile = $options{'infile'};
my $outfileprot = $options{'outfileprot'};
my $outfileprot_codedName = $options{'outfileprot_codedName'};
my $outencodingtable = $options{'outencodingtable'};
my $out_fastas_single_dir = $options{'out_fastas_single_dir'};
my $out_fastas_Nseq_dir = $options{'out_fastas_Nseq_dir'};
my $Nseq_protlenmin = $options{'Nseq_protlenmin'};
my $Nseq = $options{'Nseq'};
my $registry_file = $options{'registry_file'};
my $delta = $options{'delta'};
my $base_name = $options{'base_name'};
my $sizeseqmin = $options{'sizeseqmin'};
my $out_fastas_Nseq_dir_protlenmin = $options{'out_fastas_Nseq_dir_protlenmin'};
my $out_vep_NSC_dir = $options{'out_vep_NSC_dir'};
my $out_vep_STOP_dir = $options{'out_vep_STOP_dir'};
my $outvep_res = $options{'outvep_res'};
my $outvep_title = $options{'outvep_title'};
my $annot_file = $options{'annot_file'};
my $genome_fasta_file = $options{'genome_fasta_file'};

# if you have a particular type of SNP to extract in vep format, please add here
#~ my $type = "NSC,STOP";

# mkdir in ergatis (to be set in web interface) to specifye the directory to work in 

#~ # library formatAlleleSeq (formatAlleleSeq::getProteinseqfile_fromSNPeffect)
#~ # get the protein sequences from snp-perdictor-effect and Ensembl (1 protein per allele, 2 prot per SNP)

#### ATTENTION A INCLURE DANS LES OPTIONS!!!!
#~ my $gff_file="";
#~ my $genome_fasta_file ="";
getProteinseqfile_fromSNPeffect($species,$infile,$outfileprot,$registry_file,$outvep_res,$outvep_title,$annot_file,$genome_fasta_file);
# encode the name of each protein
encodeFastaProtName($outfileprot,$outfileprot_codedName,$outencodingtable,$base_name);
# split fasta files, 1 file per protein sequence
#~## format_multifasta_to_singlefastas($outfileprot_codedName,$out_fastas_single_dir);
# split fasta files, 1 file per N protein sequences
splitfastaN($outfileprot_codedName,$out_fastas_Nseq_dir,$Nseq,100000);
# minimum protein sequence length
splitfastaN($outfileprot_codedName,$out_fastas_Nseq_dir_protlenmin,$Nseq_protlenmin,$sizeseqmin);
#~ #get the VEP file with non synonymous SNP only
getVEPfile_fromSNPeffect($infile,$out_vep_NSC_dir,"NSC");
getVEPfile_fromSNPeffect($infile,$out_vep_STOP_dir,"STOP");



