#!/usr/bin/perl -w
use strict;

my @a_pSam;            # Previous sam line
my @aa_tmplocus;       # Current locus (tab of tab sam line)
my @a_locus;           # Tab of locus (ref\tstart\tend\tlength\tnbSamLine)
my %h_locusAln;
my ($deb, $end, $cmp);
my $locusId = 0;

# First mapped read (sam line)
while (my $line=<STDIN>) {
  @a_pSam = split('\s+', $line);
  ($cmp, $deb, $end) = (1, $a_pSam[3], $a_pSam[3]+length($a_pSam[9])-1);
  last if($a_pSam[1] != 4);
}

while (my $line=<STDIN>) {
  chomp $line;
  my @a_sam = split('\s+',$line);

  next if($a_sam[1] == 4);

  my @a_tmp = @a_pSam;
  push @aa_tmplocus, [ @a_tmp ];
  if($a_sam[2] eq $a_pSam[2]  &&  $a_sam[3] <= $end) {
    $cmp++;
    $end = max($a_sam[3]+length($a_sam[9])-1, $end);
  }
  else {
    $locusId++;
    my $strand = $aa_tmplocus[0][1];
    for (my $i=0; $i<=$#aa_tmplocus; $i++) {
      $strand = -1 if($aa_tmplocus[$i][1] ne $strand);
      my $lenshift = $aa_tmplocus[$i][3] - $deb;
      my $shift = "";
      for (my $j=$deb; $j<$aa_tmplocus[$i][3]; $j++) { $shift .= "-"; }
      $aa_tmplocus[$i][9] = $shift.$aa_tmplocus[$i][9];
      push @{$aa_tmplocus[$i]}, "XX:Z:locus_$locusId";
      print STDERR join("\t",@{$aa_tmplocus[$i]})."\n";
    }
    if($end-$deb+1<=38  &&  $cmp>0) {  ##25 cmp>1
      push(@a_locus, "locus_$locusId\t$strand\t$a_pSam[2]\t$deb\t$end\t".($end-$deb+1)."\tisoform:$cmp");
      for (my $i=0; $i<=$#aa_tmplocus; $i++) {
	$h_locusAln{"locus_$locusId"} .= "$aa_tmplocus[$i][0]\t$aa_tmplocus[$i][9]\n";
      }
    }
    print STDERR "##locus_$locusId\t$strand\t$a_pSam[2]\t$deb\t$end\t".($end-$deb+1)."\tisoform:$cmp\n";
    $cmp = 1;
    $deb = $a_sam[3];
    $end = $a_sam[3]+length($a_sam[9])-1;
    @aa_tmplocus = ();
  }
  @a_pSam = @a_sam;
}


my $link = 0;
my $refseq = "";
my @a_aln;
my @a_locusBeg;
my @a_locusEnd;
my @a_locusDescr;
my @a_miRNADescr;
for (my $i=0; $i<=$#a_locus; $i++) {
  print "##\nlocus\t" if($link == 0);

  my @a_curr = split('\s+', $a_locus[$i]);
  my @a_next = split('\s+', $a_locus[$i+1]);

  if($a_curr[2] eq $a_next[2] && $a_curr[4]+30 > $a_next[3] && $a_curr[4]+4 < $a_next[3]) {
    if($link == 0) {
      push @a_locusDescr, "$a_locus[$i]";
      push @a_aln, [ split('\n', $h_locusAln{$a_curr[0]}) ];
      push @a_locusBeg, $a_curr[3];
      push @a_locusEnd, $a_curr[4];
      $link++;
    }

    push @a_locusDescr, "$a_locus[$i+1]";
    push @a_aln, [ split('\n', $h_locusAln{$a_next[0]}) ];
    push @a_locusBeg, $a_next[3];
    push @a_locusEnd, $a_next[4];
    $link++;
    $refseq = $a_next[2];
  }
  else {
    if($link == 0) {
      push @a_locusDescr, "$a_locus[$i]";
      push @a_aln, [ split('\n', $h_locusAln{$a_curr[0]}) ];
      push @a_locusBeg, max(1, $a_curr[3]-50);
      push @a_locusEnd, $a_curr[4]+50;
      $link++;
      $refseq = $a_curr[2];

      # Consensus (extractseq) and folding (RNAfold)
      my $exs = extractseq($refseq, max(1,$a_curr[3]-50), $a_curr[4]);
      my $fol = fastaOneLiner($exs);
      my $rfo = rnafold($fol);
      my ($rfoScore) = ( $rfo =~ /([+-]?\d+\.\d+)/ );
      push @a_aln, [ split('\n', $rfo) ];
      push @a_locusBeg, max(1, $a_curr[3]-50);
      push @a_locusEnd, $a_curr[4];
      push @a_miRNADescr, "miRNA-miRNA*\t$refseq\t".max(1,$a_curr[3]-50)."\t$a_curr[4]".
	"\tfoldScore:$rfoScore\tfoldFlag:".computeRnafoldFlag($rfo);

      $exs = extractseq($refseq, $a_curr[3], $a_curr[4]+50);
      $fol = fastaOneLiner($exs);
      $rfo = rnafold($fol);
      ($rfoScore) = ( $rfo =~ /([+-]?\d+\.\d+)/ );
      push @a_aln, [ split('\n', $rfo) ];
      push @a_locusBeg, $a_curr[3];
      push @a_locusEnd, $a_curr[4]+50;
      push @a_miRNADescr, "miRNA-miRNA*\t$refseq\t$a_curr[3]\t".($a_curr[4]+50).
	"\tfoldScore:$rfoScore\tfoldFlag:".computeRnafoldFlag($rfo);
    }
    else {
      for(my $x=0; $x<=($link-2); $x++) {
	# Consensus (extractseq) and folding (RNAfold)
	my $exs = extractseq($refseq, $a_locusBeg[0+$x], $a_locusEnd[1+$x]);
	my $fol = fastaOneLiner($exs);
	my $rfo = rnafold($fol);
	my ($rfoScore1) = ( $rfo =~ /([+-]?\d+\.\d+)/ );
	my $rfoFlag1    = computeRnafoldFlag($rfo);
	push @a_aln, [ split('\n', $rfo) ];
	push @a_locusBeg, $a_locusBeg[0+$x];
	push @a_locusEnd, $a_locusEnd[1+$x];
	
	# Folding with N in the loop
	my $l1   = $a_locusEnd[0+$x] - $a_locusBeg[0+$x] + 1;
	my $loop = $a_locusBeg[1+$x] - $a_locusEnd[0+$x] - 1;
	$fol =~ s/(.{$l1})([ATCGNatcgn]{$loop})(.*)$/$1.("N" x length($2)).$3/e;
	$rfo = rnafold($fol);
	my ($rfoScore2) = ( $rfo =~ /([+-]?\d+\.\d+)/ );
	my $rfoFlag2    = computeRnafoldFlag($rfo);
	push @a_aln, [ split('\n', $rfo) ];
	push @a_locusBeg, $a_locusBeg[0+$x];
	push @a_locusEnd, $a_locusEnd[1+$x];
	
	# Align miRNA vs miRNA*
	my $needleScore  = 0;
	my @a_fol   = split('\n', $fol);
	my @a_mirna = split('NNNN+', $a_fol[1]);   ## Min 4 N for the loop
	if($#a_mirna == 1) {                       ## If only mirna + mirna* (one loop)
	  my $revcmp =  reverse($a_mirna[1]);
	  $revcmp    =~ tr/ACGTacgt/TGCAtgca/;
	  open(TMP, ">sam2locus.$$.needle1") || die "Enable to create sam2locus.$$.needle1 file";
	  print TMP ">miRNA_locus1\n".$a_mirna[0];
	  close TMP;
	  open(TMP, ">sam2locus.$$.needle2") || die "Enable to create sam2locus.$$.needle2 file";
	  print TMP ">miRNA_locus2\n".$revcmp;
	  close TMP;
	  `needle sam2locus.$$.needle1 sam2locus.$$.needle2 -gapopen 10 -gapextend 0.5 -outfile sam2locus.$$.needle 2> /dev/null`;
	  open(NEEDLE, "sam2locus.$$.needle") || die "Enable to open sam2locus.$$.needle output file";
	  my $needleOutput = "";
	  $needleScore = 0;
	  while (my $line=<NEEDLE>) {
	    chomp $line;
	    if($line =~ /^# Score:/) {
	      $needleScore = $line;
	      $needleScore =~ s/# Score:\s+//;
	    }
	    if($line =~ /^miRNA_locus1/) {
	      $line =~ s/miRNA_locus1\s+/>miRNA_locus1\($needleScore\)\t/;
	      $needleOutput = "$line                                            \n";
	    }
	    if($line =~ /^miRNA_locus2/) {
	      $line =~ s/miRNA_locus2\s+/>miRNA_locus2\($needleScore\)\t/;
	      $needleOutput .= "$line                                           \n";
	    }
	  }
	  close NEEDLE;
	  unlink("sam2locus.$$.needle1");
	  unlink("sam2locus.$$.needle2");
	  unlink("sam2locus.$$.needle");
	  push @a_aln, [ split('\n', $needleOutput) ];
	  push @a_locusBeg, $a_locusBeg[0+$x];
	  push @a_locusEnd, $a_locusEnd[1+$x];
	}
	push @a_miRNADescr,
	  "miRNA-miRNA*\t$refseq\t".$a_locusBeg[0+$x]."\t".$a_locusEnd[1+$x].
	  "\tfoldScore:$rfoScore1\tfoldFlag:$rfoFlag1".
	  "\tNfoldScore:$rfoScore2\tNfoldFlag:$rfoFlag2".
	  "\tneedleScore:$needleScore";
      }
    }

    print "$a_curr[2]\t$a_locusBeg[0]\t$a_locusEnd[-1]\tlocusNumber:$link\n";
    foreach my $descr (@a_locusDescr) {
      print "$descr\n";
    }
    foreach my $descr (@a_miRNADescr) {
      print "$descr\n";
    }
    print "#\n";

    my $maxName = 0;
    for (my $j=0; $j<=$#a_aln; $j++) {
      for (my $k=0; $k<=$#{$a_aln[$j]}; $k++) {
	my @a_tmp = split('\t', $a_aln[$j][$k]);
	$maxName = max(length($a_tmp[0]), $maxName);
      }
    }
    for (my $j=0; $j<=$#a_aln; $j++) {
      for (my $k=0; $k<=$#{$a_aln[$j]}; $k++) {
	my @a_tmp = split('\t', $a_aln[$j][$k]);
	for (my $l=length($a_tmp[0]); $l<$maxName; $l++) { $a_tmp[0] .= " "; }
	if($link == 1 && $j<=$#a_aln-2) {
	  for (my $l=0; $l<50; $l++) { $a_tmp[1] =  "-".$a_tmp[1]; }
	}
	for (my $l=$a_locusBeg[0];    $l<$a_locusBeg[$j]; $l++) { $a_tmp[1] =  "-".$a_tmp[1]; }
	for (my $l=length($a_tmp[1]); $l<$a_locusEnd[-1]-$a_locusBeg[0]+1; $l++) { $a_tmp[1] .= "-"; }
	print "$a_tmp[0]  $a_tmp[1]\n";
      }
      print "#\n" if($j<$#a_aln);
    }

    $link   = 0;
    $refseq = "";
    @a_aln        = ();
    @a_locusBeg   = ();
    @a_locusEnd   = ();
    @a_locusDescr = ();
    @a_miRNADescr = ();
  }
}


sub max { if ($_[0] > $_[1]) { $_[0]; } else{ $_[1]; } }

sub min { if ($_[0] < $_[1]) { $_[0]; } else{ $_[1]; } }

sub extractseq {
  my ($refseq, $beg, $end) = @_;
  return `extractseq V4_454Scaffolds.fna:$refseq -regions $beg-$end -outseq stdout 2> /dev/null`;
}

sub fastaOneLiner {
  my ($fasta) = @_;
  my @a_lines = split(/\n/, $fasta);
  my $ac = shift @a_lines;
  $ac =~ /^(\S+)/;
  my $r = "$1\n".join('', @a_lines);
  return $r;
}

sub rnafold {
  my ($seq) = @_;
  ## 3 lines in RNAfold output (1:header fasta - 2:seq - 3:(((...())[...] (nrj))
  my @a_res  = split ('\n', `perl -e '{print "$seq"}' | RNAfold -noPS -noconv`);
  $a_res[2]  =~ s/\( +/\(/;
  my @a_stru = split (' ', $a_res[2]);
  return "$a_res[0]\t$a_res[1]\n$a_res[0]$a_stru[1]\t$a_stru[0]\n";
}

sub extractseq_rnafold {
  my ($refseq, $beg, $end) = @_;
  my $cmd_extractseq = "extractseq V4_454Scaffolds.fna:$refseq -regions $beg-$end -outseq stdout";
  my $cmd_rnafold    = "RNAfold -noPS -noconv";
  ## 3 lines in RNAfold output (1:header fasta - 2:seq - 3:(((...())[...] (nrj))
  my @a_res  = split ('\n', `$cmd_extractseq | ./fastaOneliner.pl | $cmd_rnafold`);
  $a_res[2]  =~ s/\( /\(/;
  my @a_stru = split (' ', $a_res[2]);
  return "$a_res[0]\t$a_res[1]\n$a_res[0]$a_stru[1]\t$a_stru[0]\n";
}

sub computeRnafoldFlag {
  my ($rfo) = @_;
  my ($fold) = ($rfo =~ /\s+([\(\)\.]{10,})\s+/);
  if(!defined $fold) {print "###$rfo\n";exit();}
  return 0 if($fold =~ /^\.+$/);
  my ($fold1,$loop,$fold2) = ($fold =~ /^([\(\.]+\()(.+?)(\)[\)\.]+)$/);
  return 0 if(!defined $fold1 || !defined $loop || !defined $fold2);
  my $loopSize = length($loop);
  $loop =~ s/\.//g;
  if($loopSize <= 30   && length($loop) <= 4 &&
     $fold1 !~ /\.{5,}/ && $fold2 !~ /\.{5,}/) {
    return 1;
  }
  return 0;
}

### TODO :
###  1 Attention les deux mirna mirna* doivent etre sur le meme brin !!! => flag -1
###  2 Faire descendre les comptages d'occurence
###  3 Produire un alignement avec les deux mirna et mirna*              => ok

###  4 extraire la réf de la zone (avec  contexte) !
###    splitfasta avec fastaexplode
###    puis extractseq...

#
#foreach i (*locus)
#grep "###" -c $i
#end

#foreach i ( *locus.stderr )
#grep -c "##locu" $i
#end

#foreach i (*locus)
#grep "###" $i | grep -c "\-1"
#end

#foreach i (*locus)
#grep "###" $i | grep -cPv '###2\n'
#end

#grep "###" s_6_uT25.fastq.cuta.nr.sort.bam.NM0_NoXA.redon.locus | perl -lane '{`extractseq V4_454Scaffolds.fna:$F[2].fa -regions $F[3]-$F[11] -outseq stdout | tee $F[2]_$F[3]-$F[11].fa | RNAfold -noPS -noconv >> $F[2]_$F[3]-$F[11].fa`}'

#perl -lne '{if($_ =~ /^\>/){print "$_"} else { $seq.=$_ if($_ =~ /^[ATCGNatcgn]+$/); if($_=~/^[\.\(\)]+ /) {$_ =~s/ (.+)//; $str.=$_}; print "$seq\n$str" if ( eof ) ;} }' scaffold58014_1652-1717.fa


# Banque Rfam 10.0 : /bank/sanger/rfam/current/blast/
