import pysam
import cycler
import numpy as np
import pybedtools as pb
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import namedtuple, Counter, defaultdict
[docs]class Region(namedtuple('Region', ['reference_name', 'reference_start', 'reference_end'])):
def __new__(cls, reference_name, reference_start, reference_end):
assert reference_start <= reference_end, 'Invalid genomic region %s:%d-%d.' % (reference_name, reference_start, reference_end)
return super(Region, cls).__new__(cls, reference_name, reference_start, reference_end)
def __str__(self):
return '%s:%d-%d' % (self.reference_name, self.reference_start, self.reference_end)
[docs]def get_bam_handle(bam_path):
"""Returns `pysam.AlignmentFile` object for BAM file."""
return pysam.AlignmentFile(bam_path)
[docs]def get_binary_string_representation(read):
"""Returns a binary representation string of methylation state of a read.
Unmethylated state = 0, methylated state = 1.
"""
return ''.join([['0', '1'][c == 'Z'] for c in read.get_tag('XM') if c in ['z', 'Z']])
[docs]def is_fully_methylated_or_unmethylated(pattern):
return len(set(str(pattern))) == 1
[docs]def pattern_counters_from_met(fp):
"""Parse MET file and generate header, and counter of patterns."""
header, binary_patterns = None, []
with open(fp) as inFile:
for line in inFile.readlines():
if line.startswith('>'):
if header is not None:
pattern_counter = Counter(binary_patterns)
yield header[1:], pattern_counter
header, binary_patterns = line.strip(), []
else:
header = line.strip()
else:
binary_patterns.append(line.strip())
pattern_counter = Counter(binary_patterns)
yield header[1:], pattern_counter
[docs]def random_pattern(l, n):
"""Generate `n` random methylation pattern with length `l`."""
s = set()
ps = []
p = np.random.randint(2, size=l)
s.add(''.join(map(str, p)))
ps.append(p)
while True:
if len(ps) == n:
break
p = list(np.random.randint(2, size=l))
if ''.join(map(str, p)) not in s:
s.add(''.join(map(str, p)))
ps.append(p)
return np.array(list(ps)).flatten()
[docs]def jaccard_similarity(s1, s2):
"""Compute Jaccard similarity of two sets."""
s1 = set(s1)
s2 = set(s2)
return len(s1 & s2) / len(s1 | s2)
[docs]def preset_rc(scale=1, font_family=None):
"""Set visually plausible matplotlib rc file."""
plt.rc('axes', linewidth=1.33, labelsize=14)
plt.rc('xtick', labelsize=8 * scale)
plt.rc('ytick', labelsize=8 * scale)
plt.rc('xtick', bottom=True)
plt.rc('xtick.major', size=5 * scale, width=1.33)
plt.rc('xtick.minor', size=5 * scale, width=1.33)
plt.rc('ytick', left=True)
plt.rc('ytick.major', size=5 * scale, width=1.33)
plt.rc('ytick.minor', size=5 * scale, width=1.33)
plt.rc('legend', fontsize=7 * scale)
plt.rc('grid', color='grey', linewidth=0.5, alpha=0.33)
if font_family:
plt.rc('font', family=font_family)
color_palette = [
'#005AC8',
'#AA0A3C',
'#0AB45A',
'#FA7850',
'#8214A0',
'#FA78FA',
'#A0FA82',
'#006E82',
'#00A0FA',
'#14D2DC',
'#F0F032',
'#FAE6BE',
]
mpl.rcParams['axes.prop_cycle'] = cycler.cycler(color=color_palette)
[docs]def parse_result_line(line):
"""Parse each line in the PRISM result file."""
fields = line.strip().split()
header = fields[0]
cluster = int(fields[1])
subclone = int(fields[2])
depths = np.array(list(map(int, fields[3].split(','))))
counts = np.array(list(map(int, fields[4].split(','))))
fingerprint_fractions = list(map(float, fields[5].split(',')))
return header, cluster, subclone, depths, counts, fingerprint_fractions