#!/usr/bin/env python

""" This program is aimed to rebuild the clone sequence given a crossmatch masked cap3 output contig file.
It take the masked cap3 contig file as input.
Checks if the largest contig is at least of 90% of the total sequence length (if not you have to check it manually)
Splits the largest contig if needed.
Rebuilds the resulting sequence.
Removes the masking Xs from the crossmatch masking step.
Outputs the resulting sequence.

The output sequence can the be used for annotation and publication.

Author : Ch. Klopp

Date : Tue Apr 14 15:40:09 CEST 2009
"""

from Bio import SeqIO
from Bio import Seq
from Bio.Alphabet import IUPAC
import sys
import os
import re
import string
#import matplotlib.mlab as mlab
#import matplotlib.pyplot as plt
from Bio.SeqRecord import SeqRecord


# arguments
# 1 : first fasta file name

block = 10 # block size to search the X
max1 = 6 # number of Xs to determine the X blocks

#---------------------------------------------
def checkFile(file):

   print "Checking input %s" % file
   if not os.path.exists(file):
      print "File %s" % file + " is not valid"
      sys.exit(1)
   else:
      print "exists."

#---------------------------------------------
def readCrossMatch(crossmatch_file):
	overlap = []
	xmatch_obj=open(crossmatch_file, 'r')

	for line in xmatch_obj:
		###reverse matches
		rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
		rm = rev_regex.match(line)
		###forward matches
		fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")
		fm = fwd_regex.match(line)


		if rm != None:
			#print "GR: %s" % line
			print "REVERSE: %s %s %s %s %s %s %s" % (fm.group(1), fm.group(2), fm.group(3), fm.group(4), fm.group(5), fm.group(7), fm.group(8))
			(percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(rm.group(2)), rm.group(3), float(rm.group(4)), float(rm.group(5)), rm.group(6), float(rm.group(7)), float(rm.group(8)))
			overlap.append((startFirstMatch, endFirstMatch))
		elif fm != None:
			#print "GF: %s" % line
			print "FORWARD: %s %s %s %s %s %s %s" % (fm.group(1), fm.group(2), fm.group(3), fm.group(4), fm.group(5), fm.group(6), fm.group(7))
			(percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), float(fm.group(4)), float(fm.group(5)), fm.group(6), float(fm.group(7)), float(fm.group(8)))
			overlap.append((startFirstMatch, endFirstMatch))
		#else:
			#print "NO RE:%s" % line

	xmatch_obj.close()

	return overlap

#---------------------------------------------
def runCrossMatch(seq1,seq2):
	# creating the record set
	rsec1 = SeqRecord(seq1,id="left",description="left")
	rsec2 = SeqRecord(seq2,id="right",description="right")
	# creating the record sets
	rseq1s = [rsec1]
	rseq2s = [rsec2]
	# creating the temporary sequence files
	tfile1 = os.tmpnam()
	print "temp file name 1 = "+tfile1+"\t",
	tfile2 = os.tmpnam()
	print "temp file name 2 = "+tfile2+"\t",
	tcfile = os.tmpnam()
	print "cross_match temp file name = "+tcfile
	# getting the file handles
	handle1 = open(tfile1, "w")
	handle2 = open(tfile2, "w")
	# writing the sequences 
	SeqIO.write(rseq1s, handle1, "fasta")
	SeqIO.write(rseq2s, handle2, "fasta")
	# closing the files 
	handle1.close()
	handle2.close()
	# processing the sequences 
	cmd = 'cross_match '+tfile1+" "+tfile2+' -minmatch 10 -minscore 20 > '+tcfile
	result = os.system(cmd)
	print "Crossmatch return code = "+str(result)
	result = readCrossMatch(tcfile)
	# processing crossmatch result
	print "seq1 length ="+str(len(seq1.tostring()))+"\tseq2 length ="+str(len(seq2.tostring()))+"\ttotal ="+str(len(seq1.tostring())+len(seq2.tostring()))
	end_coord = 0
	if result:
		print "Overlap = "+str(result)
		# checking if the overlap is at the end of the left sequence and at the start of the right one
		for ov in result :
			if (len(seq1.tostring())-ov[1]) < block :
				print "Found an ending overlap : new end sequence coordinate = "+str(ov[0])
				end_coord = int(ov[0])
	else :
		print "No overlap"		

	# seq = Seq(seq1+seq2, IUPAC.unambiguous_dna)
	if end_coord > 0:
		seq = seq1[0:end_coord]+seq2
	else:
		seq = seq1+seq2
	# returning the resulting sequence 
	return seq

# processing positiv sequences
handle = open(sys.argv[1])

print "################### "+sys.argv[1]+" #######################"
l = 0
seq1 = ''
char = 'X'
ltot = 0;
countChar = 0 

for seq_record in SeqIO.parse(handle, "fasta") :
	ltot = ltot + len(seq_record.seq.tostring())
	countChar = countChar + seq_record.seq.count(char)
	print seq_record.id+"\t"+str(len(seq_record.seq.tostring()))+"\t"+str(seq_record.seq.count(char))
	if len(seq_record.seq.tostring()) > l :
		l = len(seq_record.seq.tostring())
		seq1 = seq_record

# stopping the program if no vector cleaned sequence was found
if countChar == 0:
	print "This sequence file has not been cleaned or has no vector detected"
	sys.exit()

# verification if the selected contig is really big
if ((l*1.0)/(ltot*1.0)) < 0.9 :
	print "ATTENTION There might be anoter contig to add to this one !!!!!!!!!!!!!!!!!"+str(l)+"\t"+str(ltot)+"\t"+str(l/ltot)

#print sys.argv[1]+"\t"+seq1.id+" sequence length = "+str(len(seq1.seq.tostring()))
xx = []
start = 0
stop = 0
cont = 0 #to determine if we are in a X block or not

for index in range(0,len(seq1.seq.tostring())-block):
	# print str(len(seq1.seq.tostring())-block)
	if (seq1.seq[index:(index+block)].tostring().upper().count(char) > max1) :
		# print seq1.seq[index]+"\t"+str(index)+"\t"+str(seq1.seq[index:(index+block)].tostring().upper().count('X'))+"\t"+str(max1)+"\t"+str(cont)
		if (cont == 0):
			start = index
			cont = 1
	else :
		if (cont == 1):
			stop = index
			xx.append((start,stop))
			cont = 0

if (cont == 1):
	stop = index
	xx.append((start,stop))

# processing the different vector sequence locations
# if the vector sequence is close to the beginning or the end of the contig then it is removed
# if the vector sequence is in the middle of the contig the the contig is modified (split and join)

seq2 = ''
case = 0
start1 = 0
stop1 = 0

for block1 in xx:
	if block1[0] < block :
		start = block1[1]+(block/2)
		case = case + 1 
	elif (len(seq1.seq.tostring()) - block1[1]) < (block*4) :
		stop = block1[0]
		case = case + 2
	else:
		stop1 = block1[0]
		start1 = block1[1]
		case = case + 4

l = len(seq1.seq.tostring())
print "Selected contig = "+seq1.id+"\t"+str(len(seq1.seq.tostring()))+"\t",
print xx
print str(case)+"\t"+"Vector location / Clone sequence start\t"+str(start1)+"\tstop\t"+str(stop1)	 

if case == 1:
	seq2 = seq1.seq[start:l]
elif case == 2:
	seq2 = seq1.seq[0:stop]
elif case == 3:
	seq2 = seq1.seq[start:stop]
elif case == 4:
	seq2 = runCrossMatch(seq1.seq[start1:l],seq1.seq[0:stop1])
elif case == 5:
	seq2 = runCrossMatch(seq1.seq[start1:l],seq1.seq[start:stop1])
elif case == 6:
	seq2 = runCrossMatch(seq1.seq[start1:stop],seq1.seq[0:stop1])
elif case == 7:
	seq2 = runCrossMatch(seq1.seq[start1:stop],seq1.seq[start:stop1])
else:
	print "Problem!!!!!"

#print seq2
seq3 = seq2.tomutable()

for j in range(0,block):
	if seq3[0] == char :
		seq3 = seq3[1:-1]
	if seq3[-1] == char :
		seq3 = seq3[0:-2]

#print "Modified sequence ",
#print seq3
print str(l)+"\t"+str(len(seq1.seq.tostring()))+"\t"+str(len(seq3.tostring()))


rsec3 = SeqRecord(seq3,id="selected_contig",description="")
seqs3 = [rsec3]

handle3 = open(sys.argv[1]+".modified", "w")
SeqIO.write(seqs3, handle3, "fasta")
handle3.close()