#!/usr/bin/perl -w

# usage : perl bwa_samtools.pl <FASTA ref file> <FASTQ1 file> <FASTQ1 file> <output BAM file> 
# Sarah Maman - 11 juillet 2013 - Mis en place dans le cadre du projet PhyloFish
#perl bwa_samtools.pl tests/results_from_merged_rename.fasta toto tests/TEST_R1.fastq tests/TEST_R2.fastq galaxy_dataset_128.dat
# Copyright (C) 2013 INRA
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#


use strict;
use File::Basename;

my $input_fasta        = $ARGV[0];
my $tailleFichier      = (stat($input_fasta))[7]; #Selon la taille du FASTA de reference, l indexation est differente
my $output_stats       = $ARGV[1];
my $input_1_fastq      = $ARGV[2];
my $input_2_fastq      = $ARGV[3];
my $output_BAM         = $ARGV[4];
my $maxmis             = $ARGV[5];
my $qualalign          = $ARGV[6];
my $NOM                = $ARGV[7];

my $path = dirname($input_fasta);
my ($nb1) = ($output_BAM=~/galaxy_dataset_(\d+)\.\S+$/);
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); 
my $ALL =  $mday."-".($mon+1)."_".$hour."h".$min."mn".$sec."secPHYLOFISH"; 
my $nb = $nb1."_".$ALL;



#pour eviter que la commande ne fonctionne plus si le $PATH est perdu suite a un restart de galaxy
my $BWA = '/usr/local/bioinfo/bin/bwa';
my $SAMTOOLS = '/usr/local/bioinfo/bin/samtools';
my $cmd1 = ''; my $cmd2 = ''; my $cmd3 = ''; my $cmd6 = ''; my $cmd7 = '';

#fonction exit
sub control ()
{
if ($? != 0){print STDERR "Incident. Fin du job. Veuillez relancer votre outil\n"; exit (1);}
}


####################################################################################
#									           #			
#		INDEXATION FASTA REF						   #
#                                                                                  #
####################################################################################
#Indexation du genome de reference (ne pas placer cette commande en background)
#si le genome de référence est gros , plus de 10 MB, alors indexer avec l'option -a bwtsw
#si le genome est petit, indexer avec -a is 

#if (! -e "${input_fasta}.ann"){#si le fasta est deja indexe
#	if ($tailleFichier > 10485760) {
#        	$cmd1 = "$BWA index -a bwtsw $input_fasta  >> ./bwaindex.log 2>&1";
#     	}
#	if ($tailleFichier < 2147483648){
#       	$cmd1 = "$BWA index -a is $input_fasta  >> ./bwaindex.log 2>&1";
#     	}
#	else {
#        	$cmd1 ="$BWA index $input_fasta  >> ./bwaindex.log 2>&1";
#     	}
#	system $cmd1;
#
#	#Information en STDOUT pour les biologistes
#	print STDOUT "Etape 1 \nIndexation : $cmd1 \n\n";
#	control();
#}
#else {print STDOUT "Etape 1 \nIndexation  deja realisee en amont - fasta : $input_fasta\n\n";}


####################################################################################
#									           #			
#		BWA - OUTPUT : BAM NON TRIE | RECUPERATION SEQ ALIGNEE		   #
#                                                                                  #
####################################################################################
#Alignement
#-1 	When -b is specified, only use the first read in a read pair in mapping (skip single-end reads and the second reads).
#-2 	When -b is specified, only use the second read in a read pair in mapping. 

#PREPARATION des inputs avec la bonne extension
#`cp $input_fasta ${input_fasta}.fasta; cp $input_1_fastq ${input_1_fastq}.fastq;`;
`cp $input_1_fastq ${input_1_fastq}.fastq;`;

my $FASTA = "/work/galaxy-dev/$NOM/*.fasta";

#$cmd2 ="($BWA aln -n $maxmis ${input_fasta}.fasta ${input_1_fastq}.fastq > ${input_1_fastq}_1.sai) >& ./bwaaln1.log 2>&1";
$cmd2 ="($BWA aln -n $maxmis $FASTA ${input_1_fastq}.fastq 2>&1 1> ${input_1_fastq}_1.sai ) >& ./bwaaln1.log";
system $cmd2;if ($?!=0){print STDOUT "cmd2 failed:  $?"};
if (! -e "${input_1_fastq}_1.sai"){print STDERR "SAI FILE1 NOT FOUND\n";}
control();


#PREPARATION des inputs avec la bonne extension
`cp $input_2_fastq ${input_2_fastq}.fastq;`;

#$cmd3 ="($BWA aln -n $maxmis ${input_fasta}.fasta ${input_2_fastq}.fastq > ${input_2_fastq}_2.sai) >& ./bwaaln2.log 2>&1";
$cmd3 ="($BWA aln -n $maxmis $FASTA ${input_2_fastq}.fastq 2>&1 1> ${input_2_fastq}_2.sai) >& ./bwaaln2.log";
system $cmd3;if ($?!=0){print STDOUT "cmd3 failed:  $?"};
if (! -e "${input_2_fastq}_2.sai"){print STDERR "SAI FILE2 NOT FOUND\n";}
control();

#Information en STDOUT pour les biologistes
print STDOUT "Etape 2 \nAlignement du premier fastq : $cmd2 \nAlignement du second fastq: $cmd3 \n\n";

#bwa sampe / sequences alignees uniquement
#$cmd6 = "($BWA sampe $input_fasta ${input_1_fastq}-1.sai ${input_2_fastq}-2.sai $input_1_fastq $input_2_fastq | $SAMTOOLS view -bS -F 4 - | samtools sort - ${input_fasta}-sampe ) >& ./bwa.log 2>&1";
#$cmd6 = "($BWA sampe ${input_fasta}.fasta ${input_1_fastq}_1.sai ${input_2_fastq}_2.sai ${input_1_fastq}.fastq ${input_2_fastq}.fastq | $SAMTOOLS view -bS -F 4 -q $qualalign - | $SAMTOOLS sort - $path/${nb}_sampe) >& ./bwa.log 2>&1";
$cmd6 = "($BWA sampe $FASTA ${input_1_fastq}_1.sai ${input_2_fastq}_2.sai ${input_1_fastq}.fastq ${input_2_fastq}.fastq | $SAMTOOLS view -bS -F 4 -q $qualalign - | $SAMTOOLS sort - $path/${nb}_sampe) >& ./bwa.log 2>&1";
system $cmd6;if ($?!=0){print STDOUT "cmd6 failed:  $?"};
control();

#Information en STDOUT pour les biologistes
print STDOUT "Etape 3 \nBWA, Conversion des SAM en BAM puis BWA sur les sequences alignees uniquement : $cmd6 \n\n";


####################################################################################
#									           #			
#		TRI INDEX STATS      						   #
#                                                                                  #
####################################################################################
#Creation du BAM index et generation des statistiques 
#$cmd7 = "$SAMTOOLS index ${input_fasta}-sampe.bam ; $SAMTOOLS idxstats ${input_fasta}-sampe.bam > ${input_fasta}.txt;";
$cmd7 = "$SAMTOOLS index $path/${nb}_sampe.bam ; $SAMTOOLS idxstats $path/${nb}_sampe.bam > $path/$nb.txt;";
system $cmd7;if ($?!=0){print STDOUT "cmd7 failed:  $?"};
control();
#Information en STDOUT pour les biologistes
print STDOUT "Etape 4 \nCreation du BAM index et generation des statistiques : $cmd7\n\n";



####################################################################################
#									           #			
#		ELIMINATION DES FICHIERS INTERMEDIAIRES          		   #
#                                                                                  #
####################################################################################

#`rm ${input_2_fastq}_2.sai ${input_1_fastq}_1.sai $path/${nb}_sampe.bam.bai;`;


####################################################################################
#									           #			
#		GALAXY OUTPUT         						   #
#                                                                                  #
####################################################################################
# Recuperation du fichier sortant de statistiques
#if (! -e "${input_fasta}.txt"){print STDERR "Echec lors de la recuperation du fichier de statitiques. \n";}
#else {`cp -a "${input_fasta}.txt" $output_stats `;}
if (! -e "$path/$nb.txt"){print STDERR "Echec lors de la recuperation du fichier de statistiques. \n";}
else {`cp -a "$path/$nb.txt" $output_stats `;}

# Recuperation du fichier sortant BAM
#if (! -e "${input_fasta}-sampe.bam"){print STDERR "Echec lors de la recuperation du BAM. \n";}
#else {`cp -a "${input_fasta}-sampe.bam" $output_BAM `;}

if (! -e "$path/${nb}_sampe.bam"){print STDERR "Echec lors de la recuperation du BAM. \n";}
else {`cp -a "$path/${nb}_sampe.bam" $output_BAM `;}
