#!/usr/bin/python # usage : python mb_join_annot_results.py # Maria Bernard 20/08/2015 # Copyright (C) 2012 INRA # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # import argparse parser = argparse.ArgumentParser(description="") parser.add_argument('-o', default=None, required=True, help='outupt merged file') parser.add_argument('-v', default=None, required=True, help='input vep file to merge') parser.add_argument('-i', default=None, required=True, nargs='+', help='input file to merge') args = parser.parse_args() # to keep vep order seq_list=[] seq_hash={} column_header=[] dic_full_name={} header=() fh_vep = open(args.v) for line in fh_vep: if line.startswith("##"): continue if not header : #parse header header = line.strip().replace("#","").split('\t')[1:] column_header += header else: base_id = "_".join(line.strip().split("\t")[0].split("_")[:-1]) #~ print "vep base_id", base_id allel = line.strip().split("\t")[0].split("_")[-1].split("/") transcript_id = line.strip().split("\t")[4] for a in allel[1:]: seq_id=base_id+"_"+allel[0]+"/"+a+"_"+transcript_id h = dict(zip(header,line.strip().split('\t')[1:] )) #12_113425679_T/C_-_ENST00000547342[M/V-89] if not seq_id in seq_hash: seq_hash[seq_id]=h seq_list.append(seq_id) #~ print "vep seq_id",seq_id break fh_vep.close() for f in args.i: fh_in = open(f) header=() for line in fh_in : if line.startswith("##"): continue if not header : #parse header header = line.strip().replace("#","").split('\t')[1:] column_header += header else: #parse annot # 12_113425679_T/C_-_ENST00000547342[M/V-89] full_id = line.strip().split("\t")[0] #~ print "annot full_id", full_id # 12_11342567_T/C_ENST00000547342 l=full_id.replace("_"," ").replace("[", " ").split(" ") seq_id="_".join(l[:-3])+"_"+l[-2] #~ print "annot seq_id",seq_id h = dict(zip(header,line.strip().split('\t')[1:] )) if seq_id in seq_hash : seq_hash[seq_id].update(h) else : seq_hash[seq_id]=h seq_list.append(seq_id) fh_in.close() fh_out=open(args.o,"w") fh_out.write("Sequence Name\t"+"\t".join(column_header)+"\n") for seq_id in seq_list: string = seq_id for col in column_header : string += "\t"+seq_hash[seq_id][col] if col in seq_hash[seq_id] and seq_hash[seq_id][col]!="" else "\t-" fh_out.write(string+"\n")