#!/usr/bin/env python """ This program is aimed to rebuild the clone sequence given a crossmatch masked cap3 output contig file. It take the masked cap3 contig file as input. Checks if the largest contig is at least of 90% of the total sequence length (if not you have to check it manually) Splits the largest contig if needed. Rebuilds the resulting sequence. Removes the masking Xs from the crossmatch masking step. Outputs the resulting sequence. The output sequence can the be used for annotation and publication. Author : Ch. Klopp Date : Tue Apr 14 15:40:09 CEST 2009 """ from Bio import SeqIO from Bio import Seq from Bio.Alphabet import IUPAC import sys import os import re import string #import matplotlib.mlab as mlab #import matplotlib.pyplot as plt from Bio.SeqRecord import SeqRecord # arguments # 1 : first fasta file name block = 10 # block size to search the X max1 = 6 # number of Xs to determine the X blocks #--------------------------------------------- def checkFile(file): print "Checking input %s" % file if not os.path.exists(file): print "File %s" % file + " is not valid" sys.exit(1) else: print "exists." #--------------------------------------------- def readCrossMatch(crossmatch_file): overlap = [] xmatch_obj=open(crossmatch_file, 'r') for line in xmatch_obj: ###reverse matches rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)") rm = rev_regex.match(line) ###forward matches fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+") fm = fwd_regex.match(line) if rm != None: #print "GR: %s" % line print "REVERSE: %s %s %s %s %s %s %s" % (fm.group(1), fm.group(2), fm.group(3), fm.group(4), fm.group(5), fm.group(7), fm.group(8)) (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(rm.group(2)), rm.group(3), float(rm.group(4)), float(rm.group(5)), rm.group(6), float(rm.group(7)), float(rm.group(8))) overlap.append((startFirstMatch, endFirstMatch)) elif fm != None: #print "GF: %s" % line print "FORWARD: %s %s %s %s %s %s %s" % (fm.group(1), fm.group(2), fm.group(3), fm.group(4), fm.group(5), fm.group(6), fm.group(7)) (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), float(fm.group(4)), float(fm.group(5)), fm.group(6), float(fm.group(7)), float(fm.group(8))) overlap.append((startFirstMatch, endFirstMatch)) #else: #print "NO RE:%s" % line xmatch_obj.close() return overlap #--------------------------------------------- def runCrossMatch(seq1,seq2): # creating the record set rsec1 = SeqRecord(seq1,id="left",description="left") rsec2 = SeqRecord(seq2,id="right",description="right") # creating the record sets rseq1s = [rsec1] rseq2s = [rsec2] # creating the temporary sequence files tfile1 = os.tmpnam() print "temp file name 1 = "+tfile1+"\t", tfile2 = os.tmpnam() print "temp file name 2 = "+tfile2+"\t", tcfile = os.tmpnam() print "cross_match temp file name = "+tcfile # getting the file handles handle1 = open(tfile1, "w") handle2 = open(tfile2, "w") # writing the sequences SeqIO.write(rseq1s, handle1, "fasta") SeqIO.write(rseq2s, handle2, "fasta") # closing the files handle1.close() handle2.close() # processing the sequences cmd = 'cross_match '+tfile1+" "+tfile2+' -minmatch 10 -minscore 20 > '+tcfile result = os.system(cmd) print "Crossmatch return code = "+str(result) result = readCrossMatch(tcfile) # processing crossmatch result print "seq1 length ="+str(len(seq1.tostring()))+"\tseq2 length ="+str(len(seq2.tostring()))+"\ttotal ="+str(len(seq1.tostring())+len(seq2.tostring())) end_coord = 0 if result: print "Overlap = "+str(result) # checking if the overlap is at the end of the left sequence and at the start of the right one for ov in result : if (len(seq1.tostring())-ov[1]) < block : print "Found an ending overlap : new end sequence coordinate = "+str(ov[0]) end_coord = int(ov[0]) else : print "No overlap" # seq = Seq(seq1+seq2, IUPAC.unambiguous_dna) if end_coord > 0: seq = seq1[0:end_coord]+seq2 else: seq = seq1+seq2 # returning the resulting sequence return seq # processing positiv sequences handle = open(sys.argv[1]) print "################### "+sys.argv[1]+" #######################" l = 0 seq1 = '' char = 'X' ltot = 0; countChar = 0 for seq_record in SeqIO.parse(handle, "fasta") : ltot = ltot + len(seq_record.seq.tostring()) countChar = countChar + seq_record.seq.count(char) print seq_record.id+"\t"+str(len(seq_record.seq.tostring()))+"\t"+str(seq_record.seq.count(char)) if len(seq_record.seq.tostring()) > l : l = len(seq_record.seq.tostring()) seq1 = seq_record # stopping the program if no vector cleaned sequence was found if countChar == 0: print "This sequence file has not been cleaned or has no vector detected" sys.exit() # verification if the selected contig is really big if ((l*1.0)/(ltot*1.0)) < 0.9 : print "ATTENTION There might be anoter contig to add to this one !!!!!!!!!!!!!!!!!"+str(l)+"\t"+str(ltot)+"\t"+str(l/ltot) #print sys.argv[1]+"\t"+seq1.id+" sequence length = "+str(len(seq1.seq.tostring())) xx = [] start = 0 stop = 0 cont = 0 #to determine if we are in a X block or not for index in range(0,len(seq1.seq.tostring())-block): # print str(len(seq1.seq.tostring())-block) if (seq1.seq[index:(index+block)].tostring().upper().count(char) > max1) : # print seq1.seq[index]+"\t"+str(index)+"\t"+str(seq1.seq[index:(index+block)].tostring().upper().count('X'))+"\t"+str(max1)+"\t"+str(cont) if (cont == 0): start = index cont = 1 else : if (cont == 1): stop = index xx.append((start,stop)) cont = 0 if (cont == 1): stop = index xx.append((start,stop)) # processing the different vector sequence locations # if the vector sequence is close to the beginning or the end of the contig then it is removed # if the vector sequence is in the middle of the contig the the contig is modified (split and join) seq2 = '' case = 0 start1 = 0 stop1 = 0 for block1 in xx: if block1[0] < block : start = block1[1]+(block/2) case = case + 1 elif (len(seq1.seq.tostring()) - block1[1]) < (block*4) : stop = block1[0] case = case + 2 else: stop1 = block1[0] start1 = block1[1] case = case + 4 l = len(seq1.seq.tostring()) print "Selected contig = "+seq1.id+"\t"+str(len(seq1.seq.tostring()))+"\t", print xx print str(case)+"\t"+"Vector location / Clone sequence start\t"+str(start1)+"\tstop\t"+str(stop1) if case == 1: seq2 = seq1.seq[start:l] elif case == 2: seq2 = seq1.seq[0:stop] elif case == 3: seq2 = seq1.seq[start:stop] elif case == 4: seq2 = runCrossMatch(seq1.seq[start1:l],seq1.seq[0:stop1]) elif case == 5: seq2 = runCrossMatch(seq1.seq[start1:l],seq1.seq[start:stop1]) elif case == 6: seq2 = runCrossMatch(seq1.seq[start1:stop],seq1.seq[0:stop1]) elif case == 7: seq2 = runCrossMatch(seq1.seq[start1:stop],seq1.seq[start:stop1]) else: print "Problem!!!!!" #print seq2 seq3 = seq2.tomutable() for j in range(0,block): if seq3[0] == char : seq3 = seq3[1:-1] if seq3[-1] == char : seq3 = seq3[0:-2] #print "Modified sequence ", #print seq3 print str(l)+"\t"+str(len(seq1.seq.tostring()))+"\t"+str(len(seq3.tostring())) rsec3 = SeqRecord(seq3,id="selected_contig",description="") seqs3 = [rsec3] handle3 = open(sys.argv[1]+".modified", "w") SeqIO.write(seqs3, handle3, "fasta") handle3.close()