"""
filterAnnotatedSV
~~~~~~~~~~~~~~~~~
:Description: This module will filter calls from the merged file
"""
'''
Created on Mar 17, 2015
Description: This module will filter calls from the merged file
@author: Ronak H Shah
::Inputs::
inputTxt: Tab-delimited text file of merged calls to be filtered
outputDir: Output directory
outPrefix: Prefix of the output file
blacklistGenesFile: List of genes that should be eliminated
verbose: Boolean flag; when True, progress and filtered records are logged
genesToKeepFile: List of genes that should be kept
::Output::
Filtered Output files
'''
import os
import pandas as pd
import logging
import checkparameters as cp
import re
import coloredlogs
# Initiate the module-level logger used by every function in this file.
# NOTE(review): the logger is named 'iCallSV.FilterDellyCalls' although the
# log messages in this module are prefixed "iCallSV::FilterFinalFile" --
# confirm the name is intentional before relying on it for log filtering.
logger = logging.getLogger('iCallSV.FilterDellyCalls')
# Runs at import time, so importing this module configures coloredlogs at
# DEBUG level as a side effect.
coloredlogs.install(level='DEBUG')
def run(inputTxt, outputDir, outPrefix, blacklistGenesFile, verbose, genesToKeepFile="somefile.txt"):
    """
    This will ``filter sv calls`` from the final merged file.

    A record is dropped when any of the following holds:

    * a keep-gene list was supplied and neither gene is in it,
    * both site descriptions contain ``IGR``,
    * either gene is in the blacklist,
    * :func:`checkEventInIntronFlag` flags the record as a purely intronic
      event (only evaluated for same-gene, intron/intron records).

    The surviving rows are written tab-delimited to
    ``<outputDir>/<outPrefix>_final.txt``.

    :param str inputTxt: str for the txt file to be filtered
    :param str outputDir: str for the output directory
    :param str outPrefix: str prefix for the output File
    :param str blacklistGenesFile: str for the txt file containing blacklisted genes
    :param bool verbose: a boolean; when True filtered records are logged
    :param str genesToKeepFile: str for the txt file containing genes to keep;
        ignored when the path is not an existing file
    :return: A str name of final sv file
    :rtype: str
    """
    cp.checkFile(inputTxt)
    cp.checkFile(blacklistGenesFile)
    cp.checkDir(outputDir)
    cp.checkEmpty(outPrefix, "Prefix for the output file")

    if os.path.isfile(genesToKeepFile):
        logger.info(
            "iCallSV::FilterFinalFile: Genes to Keep File Given %s and will be used.",
            genesToKeepFile)
        # Context manager guarantees the handle is closed.
        with open(genesToKeepFile, 'r') as keepHandle:
            keepGenes = [line.strip() for line in keepHandle]
    else:
        keepGenes = None

    inputDF = pd.read_table(inputTxt, keep_default_na=True)
    outputFile = os.path.join(outputDir, outPrefix + "_final.txt")

    # The blacklist does not change between rows: read it once, not per record.
    with open(blacklistGenesFile, 'r') as blacklistHandle:
        blacklistGenes = [line.strip() for line in blacklistHandle]

    # Collect the labels of rejected rows and drop them in one call afterwards;
    # dropping inside the loop copies the whole frame for every rejected row.
    rowsToDrop = []
    for index, row in inputDF.iterrows():
        gene1 = row.loc['Gene1']
        gene2 = row.loc['Gene2']
        site1 = row.loc['Site1Description']
        site2 = row.loc['Site2Description']

        # Records where both breakpoints are intergenic are skipped.
        igrFlag = "IGR" in site1 and "IGR" in site2

        # An empty or missing keep list means every gene is kept.
        if keepGenes:
            keepGeneFlag = checkGeneListToKeep(gene1, gene2, keepGenes)
        else:
            keepGeneFlag = True

        blacklistGeneFlag = checkBlackListGene(gene1, gene2, blacklistGenes)

        # Skip records occurring entirely within an intron of a single gene.
        eventInIntronFlag = False
        if (gene1 == gene2
                and not (igrFlag and blacklistGeneFlag)
                and "Intron" in site1 and "Intron" in site2):
            eventInIntronFlag = checkEventInIntronFlag(gene1, gene2, site1, site2)

        if (not keepGeneFlag) or igrFlag or blacklistGeneFlag or eventInIntronFlag:
            if verbose:
                logger.warning(
                    "iCallSV::FilterFinalFile: Record: gene1:%s; gene2:%s; site1:%s; site2:%s; will be Filtered as keepGeneFlag:%s; IGR:%s; blackListGene:%s; Intronic Event:%s",
                    gene1,
                    gene2,
                    site1,
                    site2,
                    str(keepGeneFlag),
                    str(igrFlag),
                    str(blacklistGeneFlag),
                    str(eventInIntronFlag))
            rowsToDrop.append(index)

    # drop() returns a new frame, so inputDF itself is left untouched.
    outputDF = inputDF.drop(rowsToDrop)
    outputDF[['SV_LENGTH', 'Cosmic_Fusion_Counts']] = outputDF[
        ['SV_LENGTH', 'Cosmic_Fusion_Counts']].astype(int)

    # Write The Final Output File
    outputDF.to_csv(outputFile, sep='\t', index=False)
    if verbose:
        logger.info(
            "iCallSV::FilterFinalFile: Finished Filtering, Final data written in %s",
            outputFile)
    return outputFile
# Check if the gene is a Keep gene
def checkGeneListToKeep(gene1, gene2, keepGenes):
    """
    This will ``check for genes to keep``

    :param str gene1: str for the name of gene at breakpoint 1
    :param str gene2: str for the name of gene at breakpoint 2
    :param list keepGenes: list containing genes to keep
    :return: True when at least one of the two genes is in ``keepGenes``,
        False otherwise
    :rtype: bool
    """
    return (gene1 in keepGenes) or (gene2 in keepGenes)
# Check if the gene is a blacklist gene
def checkBlackListGene(gene1, gene2, blacklistGenes):
    """
    This will ``check for blacklisted genes``

    :param str gene1: str for the name of gene at breakpoint 1
    :param str gene2: str for the name of gene at breakpoint 2
    :param list blacklistGenes: list containing blacklisted genes
    :return: True when at least one of the two genes is blacklisted,
        False otherwise
    :rtype: bool
    """
    return (gene1 in blacklistGenes) or (gene2 in blacklistGenes)
# Check if the event is in the intron only and not affecting splicing
def checkEventInIntronFlag(gene1, gene2, site1, site2):
    """
    This will ``Check if the event is in the intron only and not affecting
    splicing``

    Each site description is parsed as ``<prefix>:<distance> <direction> <word>
    <exon>`` -- exactly one ``:`` and exactly four space-separated tokens after
    it (e.g. ``Intron of ABC(+):3bp before exon 2``).

    The flag is False (the record is kept) only when both breakpoints lie on
    the same side (``before``/``after``) of the same exon number and at least
    one of them is given in ``bp`` and is fewer than 5 bp from that exon.
    Every other case returns True.

    :param str gene1: str for the name of gene at breakpoint 1
    :param str gene2: str for the name of gene at breakpoint 2
    :param str site1: str for the description of site in breakpoint 1
    :param str site2: str for the description of site in breakpoint 2
    :return: A boolean tag indicating True or False
    :rtype: bool
    :raises ValueError: if a site description does not match the expected
        layout or the exon token is not an integer
    """
    if gene1 != gene2:
        return True

    # Strict unpacking on purpose: a malformed description raises ValueError
    # instead of being silently misread.
    (_, s1Detail) = site1.split(":")
    (_, s2Detail) = site2.split(":")
    (s1Distance, _, _, s1Exon) = s1Detail.split(" ")
    (s2Distance, _, _, s2Exon) = s2Detail.split(" ")

    sameSide = (("before" in site1 and "before" in site2) or
                ("after" in site1 and "after" in site2))
    if not sameSide:
        return True
    if int(s1Exon) != int(s2Exon):
        return True

    # Judge each breakpoint on its own unit. Checking only site1 for "bp"
    # let a site2 distance such as "2Kb" be compared as if it were 2 bp, and
    # skipped a near-exon site2 whenever site1 was expressed in Kb.
    nearExon = (_isNearExonBoundary(s1Distance) or
                _isNearExonBoundary(s2Distance))
    return not nearExon


def _isNearExonBoundary(distance, window=5):
    """Return True if ``distance`` is in bp and smaller than ``window``."""
    if "bp" not in distance:
        return False
    return int(re.findall(r'\d+', distance)[0]) < window