# Source code for iCallSV.mergeFinalFiles

"""
mergeFinalFiles
~~~~~~~~~~~~~~~

:Description: Merge VCF, iAnnotateSV tab and targetSeqView tab file into a single tab-delimited file
"""
'''
Created on May 17, 2015
Description: Merge VCF, iAnnotateSV tab and targetSeqView tab file into a single tab-delimited file
@author: Ronak H Shah
::Input::
aId: Sample ID for case that has the structural aberrations
bId: Sample ID for control
vcfFile: Delly filtered and merged VCF file
annoTab: iAnnotateSV tab-delimited file with annotations
confTab: targetSeqView tab-delimited file with probability score
outDir: Directory to write the output file
outputPrefix: Output File Prefix
::Output::
outputFile: File with following header
"TumorId\tNormalId\tChr1\tPos1\tChr2\tPos2\tSV_Type\tGene1\tGene2\tTranscript1\tTranscript2\tSite1Description\tSite2Description\tFusion\tProbabilityScore\tConfidence\tComments\tConnection_Type\tSV_LENGTH\tMAPQ\tPairEndReadSupport\tSplitReadSupport\tBrkptType\tConsensusSequence\tTumorReferenceCount\tTumorSplitReferenceCount\tTumorVariantCount\tTumorSplitVariantCount\tTumorReadCount\tTumorGenotypeQScore\tNormalReferenceCount\tNormalSplitReferenceCount\tNormalVariantCount\tNormalSplitVariantCount\tNormalReadCount\tNormalGenotypeQScore\tCosmic_Fusion_Counts\trepName-repClass-repFamily:-site1\trepName-repClass-repFamily:-site2\tCC_Chr_Band\tCC_Tumour_Types(Somatic)\tCC_Cancer_Syndrome\tCC_Mutation_Type\tCC_Translocation_Partner\tDGv_Name-DGv_VarType-site1\tDGv_Name-DGv_VarType-site2\n";
'''
import sys
import os
import logging
import vcf
import checkparameters as cp
import pandas as pd
import re
import coloredlogs
import numpy as np

# Logger shared by everything in this module. Note that coloredlogs is
# installed at DEBUG level as a side effect of importing this module.
logger = logging.getLogger('iCallSV.mergeFinalFiles')
coloredlogs.install(level='DEBUG')

# Column order of the merged output file; every row built by _merge_record()
# follows exactly this order.
_MERGED_COLUMNS = [
    "TumorId", "NormalId", "Chr1", "Pos1", "Chr2", "Pos2", "SV_Type",
    "Gene1", "Gene2", "Transcript1", "Transcript2",
    "Site1Description", "Site2Description", "Fusion",
    "ProbabilityScore", "Confidence", "Comments", "Connection_Type",
    "SV_LENGTH", "MAPQ", "PairEndReadSupport", "SplitReadSupport",
    "BrkptType", "ConsensusSequence",
    "TumorReferenceCount", "TumorSplitReferenceCount", "TumorVariantCount",
    "TumorSplitVariantCount", "TumorReadCount", "TumorGenotypeQScore",
    "NormalReferenceCount", "NormalSplitReferenceCount", "NormalVariantCount",
    "NormalSplitVariantCount", "NormalReadCount", "NormalGenotypeQScore",
    "Cosmic_Fusion_Counts",
    "repName-repClass-repFamily:-site1", "repName-repClass-repFamily:-site2",
    "CC_Chr_Band", "CC_Tumour_Types(Somatic)", "CC_Cancer_Syndrome",
    "CC_Mutation_Type", "CC_Translocation_Partner",
    "DGv_Name-DGv_VarType-site1", "DGv_Name-DGv_VarType-site2",
]

# Annotation-table columns copied verbatim into the output, split in two
# groups because the output places other columns between them.
_ANNO_GENE_COLUMNS = (
    'gene1', 'gene2', 'transcript1', 'transcript2', 'site1', 'site2', 'fusion',
)
_ANNO_DB_COLUMNS = (
    'repName-repClass-repFamily:-site1', 'repName-repClass-repFamily:-site2',
    'CC_Chr_Band', 'CC_Tumour_Types(Somatic)', 'CC_Cancer_Syndrome',
    'CC_Mutation_Type', 'CC_Translocation_Partner',
    'DGv_Name-DGv_VarType-site1', 'DGv_Name-DGv_VarType-site2',
)

# Genotype fields read per sample, in output order
# (ReferenceCount, SplitReferenceCount, VariantCount, SplitVariantCount,
# ReadCount, GenotypeQScore).
_GENOTYPE_FIELDS = ("DR", "RR", "DV", "RV", "RC", "GQ")

# CT value "<a>to<b>" -> (str1, str2) flags matched against the annotation
# table's str1/str2 columns.
_CT_TO_STRANDS = {(3, 3): (0, 0), (3, 5): (0, 1), (5, 3): (1, 0), (5, 5): (1, 1)}

# Confidence-interval bounds smaller than this are replaced by this value
# before the confidence-table start coordinates are recomputed.
_MIN_FLANK = 50


def _strands_from_ct(contype, verbose):
    """Translate a CT value such as ``"3to5"`` into the ``(str1, str2)`` pair.

    Returns ``(0, 0)`` when *contype* is ``None`` or is not one of the four
    known 3/5 combinations (a warning is logged for the latter if *verbose*).
    """
    if contype is None:
        return 0, 0
    startCT, endCT = contype.split("to")
    strands = _CT_TO_STRANDS.get((int(startCT), int(endCT)))
    if strands is None:
        if verbose:
            logger.warning(
                "mergeFinalFiles: The connection type (CT) given in the vcf file is incorrect.CT: %s",
                contype)
        return 0, 0
    return strands


def _flank(confidenceInterval):
    """Return the offset subtracted from a breakpoint for the confidence lookup.

    NOTE(review): as in the original code, a small *upper* bound also forces
    the *lower* bound to ``_MIN_FLANK``. This looks like a copy/paste slip,
    but the result decides which Start1/Start2 values match the targetSeqView
    table, so it is kept until the code writing that table is checked.
    """
    lower, upper = confidenceInterval
    if abs(lower) < _MIN_FLANK or abs(upper) < _MIN_FLANK:
        return _MIN_FLANK
    return abs(lower)


def _format_int(call, field):
    """Return genotype *field* of *call* as int; 0 when absent or missing."""
    value = getattr(call.data, field, None)
    return 0 if value is None else int(value)


def _single_match(table, mask, duplicateMessage, verbose):
    """Return the only row of *table* selected by *mask*, or ``None`` if none.

    More than one selected row is treated as fatal: *duplicateMessage* is
    logged (if *verbose*) and the process exits with status 1.
    """
    matches = table[mask]
    if len(matches) > 1:
        if verbose:
            logger.fatal(duplicateMessage)
        sys.exit(1)
    if matches.empty:
        return None
    return matches.iloc[0]


def _normalise_tables(annoDF, confDF):
    """Coerce the lookup columns of both tables to the types used for matching."""
    annoDF[['chr1', 'chr2']] = annoDF[['chr1', 'chr2']].astype(str)
    # Plain assignment instead of a chained ``fillna(..., inplace=True)``,
    # which does not reliably write back to the frame.
    annoDF['Cosmic_Fusion_Counts'] = annoDF['Cosmic_Fusion_Counts'].fillna(0).astype(int)
    if confDF is not None:
        confDF[['Chr1', 'Chr2']] = confDF[['Chr1', 'Chr2']].astype(str)


def _merge_record(record, aId, bId, caseIDinVcf, controlIDinVcf, annoDF, confDF, verbose):
    """Build one output row (ordered like ``_MERGED_COLUMNS``) for a VCF record."""
    info = record.INFO

    # --- values taken from the VCF record itself ---
    chrom1 = str(record.CHROM)
    start1 = record.POS
    start2 = info['END'] if "END" in info else None
    chrom2 = str(info['CHR2']) if "CHR2" in info else None
    svtype = info['SVTYPE'] if "SVTYPE" in info else None
    if "SVLEN" in info:
        svlength = int(info['SVLEN'])
    elif svtype == "TRA":
        svlength = 0
    else:
        svlength = int(abs(start2 - start1))
    mapq = int(info['MAPQ']) if "MAPQ" in info else 0
    peSupport = int(info['PE']) if "PE" in info else 0
    srSupport = int(info['SR']) if "SR" in info else 0
    contype = info['CT'] if "CT" in info else None
    str1, str2 = _strands_from_ct(contype, verbose)
    conseq = info['CONSENSUS'] if "CONSENSUS" in info else None
    # The spelling "IMPPRECISE" is written to the output file and is kept
    # unchanged in case downstream consumers match on it.
    brktype = "PRECISE" if record.is_sv_precise else "IMPPRECISE"
    endFlank = _flank(info['CIEND']) if "CIEND" in info else 0
    posFlank = _flank(info['CIPOS']) if "CIPOS" in info else 0

    caseCalls = record.genotype(caseIDinVcf)
    controlCalls = record.genotype(controlIDinVcf)
    caseCounts = [_format_int(caseCalls, field) for field in _GENOTYPE_FIELDS]
    controlCounts = [_format_int(controlCalls, field) for field in _GENOTYPE_FIELDS]

    # --- annotation lookup: exact match on both breakpoints and strands ---
    annoMask = (
        annoDF['chr1'].isin([chrom1]) & annoDF['pos1'].isin([int(start1)]) &
        annoDF['chr2'].isin([chrom2]) & annoDF['pos2'].isin([int(start2)]) &
        annoDF['str1'].isin([str1]) & annoDF['str2'].isin([str2]))
    annoRow = _single_match(
        annoDF, annoMask,
        "iCallSV::MergeFinalFile: More then one sv have same coordinate in same sample for annotated file. Please check and rerun",
        verbose)
    if annoRow is None:
        # Previously an IndexError; the annotation columns are left empty.
        if verbose:
            logger.warning(
                "iCallSV::MergeFinalFile: No annotation found for %s:%s - %s:%s; annotation columns left empty",
                chrom1, start1, chrom2, start2)
        geneValues = [None] * len(_ANNO_GENE_COLUMNS)
        dbValues = [None] * len(_ANNO_DB_COLUMNS)
        cosmic_fusion_counts = 0
    else:
        geneValues = [annoRow[column] for column in _ANNO_GENE_COLUMNS]
        dbValues = [annoRow[column] for column in _ANNO_DB_COLUMNS]
        cosmic_fusion_counts = int(annoRow['Cosmic_Fusion_Counts'])

    # --- confidence lookup: start coordinates shifted by the CI flank ---
    confidenceScore = None
    if confDF is not None:
        confMask = (
            confDF['Chr1'].isin([chrom1]) &
            confDF['Start1'].isin([int(start1 - posFlank)]) &
            confDF['Chr2'].isin([chrom2]) &
            confDF['Start2'].isin([int(start2 - endFlank)]))
        confRow = _single_match(
            confDF, confMask,
            "iCallSV::MergeFinalFile: More then one sv have same coordinate in same sample for confidence score. Please check and rerun",
            verbose)
        if confRow is None:
            # Previously an IndexError; the score is left empty.
            if verbose:
                logger.warning(
                    "iCallSV::MergeFinalFile: No confidence score found for %s:%s - %s:%s; ProbabilityScore left empty",
                    chrom1, start1, chrom2, start2)
        else:
            confidenceScore = float(confRow['ProbabilityScore'])

    # "Confidence" and "Comments" are always written empty (the two None's).
    return (
        [aId, bId, chrom1, start1, chrom2, start2, svtype] +
        geneValues +
        [confidenceScore, None, None, contype, svlength, mapq, peSupport,
         srSupport, brktype, conseq] +
        caseCounts +
        controlCounts +
        [cosmic_fusion_counts] +
        dbValues)


def run(aId, bId, vcfFile, annoTab, confTab, outDir, outputPrefix, verbose):
    """
    Merge VCF, iAnnotateSV tab and targetSeqView tab file into a single
    tab-delimited file.

    Each record of ``vcfFile`` becomes one output row: values from the record,
    plus the matching row of ``annoTab`` and, when ``confTab`` exists, the
    probability score of the matching row of ``confTab``. A record without a
    matching annotation or confidence row keeps empty values in the
    corresponding columns.

    :param str aId: Sample ID for case that has the structural aberrations;
        used as a regular expression to pick the case sample in the VCF,
        any other sample is taken as the control
    :param str bId: Sample ID for control
    :param str vcfFile: Delly filtered and merged VCF file
    :param str annoTab: iAnnotateSV tab-delimited file with annotations
    :param str confTab: targetSeqView tab-delimited file with probability
        score; optional, skipped when it is not an existing file
    :param str outDir: Directory to write the output file
    :param str outputPrefix: Output File Prefix
    :param bool verbose: Log progress, warnings and fatal messages when true
    :return: path of the tab-delimited file,
        ``<outDir>/<outputPrefix>_merged.txt``
    :rtype: str
    :raises SystemExit: when more than one annotation or confidence row
        matches the same record
    """
    if verbose:
        logger.info(
            "iCallSV::MergeFinalFile: Merging Delly Filtered VCF, iAnnotateSV tab and targetSeqView tab file into a single tab-delimited file")
    cp.checkFile(vcfFile)
    cp.checkFile(annoTab)
    # confTab is deliberately not checked: the confidence table is optional.
    cp.checkDir(outDir)

    annoDF = pd.read_csv(annoTab, sep="\t", header=0, keep_default_na=True)
    confDF = None
    if confTab and os.path.isfile(confTab):
        confDF = pd.read_csv(confTab, sep="\t", header=0, keep_default_na=True)

    rows = []
    tablesReady = False
    with open(vcfFile, 'r') as vcfHandle:
        vcf_reader = vcf.Reader(vcfHandle)

        # Get the case and control id
        pattern = re.compile(aId)
        caseIDinVcf = None
        controlIDinVcf = None
        for sample in vcf_reader.samples:
            if re.search(pattern, sample):
                caseIDinVcf = sample
            else:
                controlIDinVcf = sample

        # traverse through the vcf
        for record in vcf_reader:
            if not tablesReady:
                # Done once, on the first record, so an empty VCF still never
                # touches the lookup tables.
                _normalise_tables(annoDF, confDF)
                tablesReady = True
            rows.append(_merge_record(
                record, aId, bId, caseIDinVcf, controlIDinVcf, annoDF, confDF, verbose))

    # Write Output. dtype=object keeps ints/None as they are instead of
    # letting pandas upcast columns.
    outDF = pd.DataFrame(rows, columns=_MERGED_COLUMNS, dtype=object)
    outFile = outDir + "/" + outputPrefix + "_merged.txt"
    outDF.to_csv(outFile, sep='\t', index=False)
    if verbose:
        logger.info("iCallSV::MergeFinalFile: Finished merging, Final data written in %s", outFile)
    return outFile