#!/usr/bin/python

# usage : python mb_join_annot_results.py
# Maria Bernard 20/08/2015
# Copyright (C) 2012 INRA
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import argparse

parser = argparse.ArgumentParser(description="")
parser.add_argument('-o', default=None, required=True, help='outupt merged file')
parser.add_argument('-v', default=None, required=True, help='input vep file to merge')
parser.add_argument('-i', default=None, required=True, nargs='+', help='input file to merge')
args = parser.parse_args()

# to keep vep order
seq_list=[]
seq_hash={}
column_header=[]
dic_full_name={}
header=()

fh_vep = open(args.v)
for line in fh_vep:
	if line.startswith("##"):
			continue
	if not header :
		#parse header
		header = line.strip().replace("#","").split('\t')[1:]
		column_header += header
	else:
		base_id = "_".join(line.strip().split("\t")[0].split("_")[:-1])
		#~ print "vep base_id", base_id
		allel = line.strip().split("\t")[0].split("_")[-1].split("/")
		transcript_id = line.strip().split("\t")[4]
		for a in allel[1:]:
			seq_id=base_id+"_"+allel[0]+"/"+a+"_"+transcript_id
			h = dict(zip(header,line.strip().split('\t')[1:] ))	
		#12_113425679_T/C_-_ENST00000547342[M/V-89]
			if not seq_id in seq_hash:
				seq_hash[seq_id]=h
				seq_list.append(seq_id)
				#~ print "vep seq_id",seq_id
				break
			
fh_vep.close()

for f in args.i:
	fh_in = open(f)
	header=()
	
	for line in fh_in :
		if line.startswith("##"):
			continue
		if not header :
			#parse header
			header = line.strip().replace("#","").split('\t')[1:]
			column_header += header
		else:
			#parse annot
			# 12_113425679_T/C_-_ENST00000547342[M/V-89]
			full_id = line.strip().split("\t")[0]
			#~ print "annot full_id", full_id
			# 12_11342567_T/C_ENST00000547342
			l=full_id.replace("_"," ").replace("[", " ").split(" ")
			seq_id="_".join(l[:-3])+"_"+l[-2]
			#~ print "annot seq_id",seq_id
			h = dict(zip(header,line.strip().split('\t')[1:] ))	
			if seq_id in  seq_hash :
				seq_hash[seq_id].update(h)
			else :
				seq_hash[seq_id]=h
				seq_list.append(seq_id)
				
	
	fh_in.close()

fh_out=open(args.o,"w")
fh_out.write("Sequence Name\t"+"\t".join(column_header)+"\n")

for seq_id in seq_list:
	string = seq_id
	for col in column_header :
		string += "\t"+seq_hash[seq_id][col] if col in seq_hash[seq_id] and seq_hash[seq_id][col]!="" else "\t-"
	fh_out.write(string+"\n")