"""Source code for prism.deconvolute."""

from prism.mixture import BetaBinomialMixture
from collections import Counter

import numpy as np
import prism.util as util
import cleanlog

# Module-level logger for this module; ColoredLogger comes from the cleanlog package.
logger = cleanlog.ColoredLogger('deconvolute')

def postfiltered(pattern_counter_generator, full_pattern_proportion):
    """Generator filter for postfiltered patterns.

    :param generator pattern_counter_generator: Generator emitting epiloci headers and pattern counters.
    :param float full_pattern_proportion: Proportion of fully methylated and unmethylated patterns to be retained.
    :returns: Yields retained header and pattern counter.
    """
    for header, counter in pattern_counter_generator:
        # A locus with a single observed pattern cannot form a fingerprint pair.
        if len(counter) == 1:
            continue

        (top1, _), (top2, _) = counter.most_common(2)
        total_reads = sum(counter.values())

        both_full = (
            util.is_fully_methylated_or_unmethylated(top1)
            and util.is_fully_methylated_or_unmethylated(top2)
        )
        # Retain only loci dominated by the two fully (un)methylated patterns.
        if both_full and counter[top1] + counter[top2] > total_reads * full_pattern_proportion:
            yield header, counter
def methylated_pattern(p1, p2):
    """Given two methylation patterns, returns the pattern with more methylated CpGs.

    :param string p1: Binarized methylation pattern (0: unmethylated, 1: methylated).
    :param string p2: Binarized methylation pattern (0: unmethylated, 1: methylated).
    :returns: The pattern with more methylated CpGs (p1 on a tie).
    """
    ones_in_first = str(p1).count('1')
    ones_in_second = str(p2).count('1')
    if ones_in_second > ones_in_first:
        return p2
    return p1
def parse_met_file(fp, full_pattern_proportion=0.8):
    """Parse entries in MET file and yield depths, counts, and epiloci headers
    for postfiltered fingerprint epiloci.

    :param string fp: File path to (corrected) met file.
    :param float full_pattern_proportion: Proportion of fully methylated and unmethylated patterns to be retained.
    :returns: Yields arrays of depths, counts and headers of postfiltered fingerprint epiloci.
    """
    depths, counts, headers = [], [], []

    filtered = postfiltered(util.pattern_counters_from_met(fp), full_pattern_proportion)
    for header, counter in filtered:
        (top1, _), (top2, _) = counter.most_common(2)
        # Depth is restricted to reads carrying one of the two dominant patterns.
        depths.append(counter[top1] + counter[top2])
        # Count tracks the more-methylated of the two dominant patterns.
        counts.append(counter[methylated_pattern(top1, top2)])
        headers.append(header)

    return np.array(depths), np.array(counts), np.array(headers)
def common_intersection(headers_list):
    """Given two or more lists of epiloci headers, returns the list of common headers.

    :param list headers_list: A list of epiloci headers from two or more samples.
    :returns: List of common headers that appear in all of the samples.
    """
    first, *rest = headers_list
    shared = set(first)
    for headers in rest:
        shared &= set(headers)
    return list(shared)
def merge_met_files(met_files, full_pattern_proportion, intersection_method, jaccard_cutoff=0.5):
    """Given met files, return depths, fingerprint pattern counts and epiloci headers
    for each common fingerprint epilocus.

    :param list met_files: List of file paths to (corrected) met files.
    :param float full_pattern_proportion: Proportion of fully methylated and unmethylated patterns to be retained.
    :param string intersection_method: Possible values are one of ['common', 'jaccard'].
        'common': only epiloci that exactly appears in all of the samples will be retained.
        'jaccard': This only applies to two-sample analysis. A pair of epiloci that have
        jaccard similarity greater than `jaccard_cutoff` will be retained.
    :param float jaccard_cutoff: Jaccard similarity cutoff used when intersection_method is 'jaccard'.
    :returns: Depths, fingerprint pattern counts and epiloci headers for each common fingerprint epilocus.
    :raises ValueError: If intersection_method is not one of ['common', 'jaccard'].
    """
    n_samples = len(met_files)

    depths_list, counts_list, headers_list = [], [], []
    for met_file in met_files:
        logger.debug('Parsing %s.' % met_file)
        depths, counts, headers = parse_met_file(met_file, full_pattern_proportion=full_pattern_proportion)
        depths_list.append(depths)
        counts_list.append(counts)
        headers_list.append(headers)

    # Per-sample lookup tables: header -> depth, header -> count.
    header_depth_dicts, header_count_dicts = [], []
    for depths, counts, headers in zip(depths_list, counts_list, headers_list):
        header_depth_dicts.append(dict(zip(headers, depths)))
        header_count_dicts.append(dict(zip(headers, counts)))

    if intersection_method == 'common':
        common_headers = common_intersection(headers_list)
        merged_depths = np.array([
            [header_depth_dicts[i][h] for i in range(n_samples)] for h in common_headers
        ])
        merged_counts = np.array([
            [header_count_dicts[i][h] for i in range(n_samples)] for h in common_headers
        ])
    elif intersection_method == 'jaccard':
        assert len(headers_list) == 2, 'Extracting common headers by jaccard similarity is applicable only for two samples.'

        common_h1, common_h2, common_headers = util.get_common_headers_by_jaccard_similarity(
            headers_list[0], headers_list[1], cutoff=jaccard_cutoff)
        merged_depths = np.array([
            [header_depth_dicts[0][h1], header_depth_dicts[1][h2]] for h1, h2 in zip(common_h1, common_h2)
        ])
        merged_counts = np.array([
            [header_count_dicts[0][h1], header_count_dicts[1][h2]] for h1, h2 in zip(common_h1, common_h2)
        ])
    else:
        # BUGFIX: previously an unrecognized method fell through and crashed later
        # with a NameError on common_headers; fail fast with a clear message instead.
        raise ValueError("intersection_method should be one of ['common', 'jaccard'], got %r." % intersection_method)

    return merged_depths, merged_counts, common_headers
def mark_outlier_clusters(model, outlier_dispersion_cutoff=0.2):
    """Given model fit, mark overdispered clusters as outlier clusters.

    :param BetaBinomialMixture model: Beta-binomial mixture model fit.
    :param float outlier_dispersion_cutoff: Cutoff for dispersion to mark a cluster as an outlier.
    :returns: Boolean mask that denotes if each of the cluster is an outlier.
    """
    flags = []
    for cluster_dispersions in model.get_dispersions():
        # A cluster is an outlier if it is overdispersed in any dimension.
        flags.append(any(d > outlier_dispersion_cutoff for d in cluster_dispersions))
    return np.array(flags)
def merge_subclones(subclones, cluster_a, cluster_b):
    """Merge two clusters containing cluster a and cluster b.
    If a and b are already in the same subclone, just return the subclones unchanged.

    :param list subclones: List of sets of clusters (subclones).
    :param int cluster_a: Cluster index to merge.
    :param int cluster_b: Cluster index to merge.
    :return: Merged subclones as a list.
    """
    group_with_a, group_with_b = None, None
    untouched = []

    for group in subclones:
        has_a = cluster_a in group
        has_b = cluster_b in group
        # Already merged: nothing to do.
        if has_a and has_b:
            return subclones
        if has_a:
            group_with_a = group
        elif has_b:
            group_with_b = group
        else:
            untouched.append(group)

    # The union of the two groups goes to the end of the result.
    untouched.append(group_with_a | group_with_b)
    return untouched
def identify_subclone(model, merge_cutoff, outlier_cluster_mask):
    """Given beta-binomial model fit, identify mergeable clusters, merge them, and mark outliers.

    :param BetaBinomialMixture model: Beta-binomial model fit.
    :param float merge_cutoff: Cutoff for the distance from midpoint of the two clusters to (0.5, ..., 0.5) to be merged.
    :param list outlier_cluster_mask: A boolean mask denoting if the cluster is outlier.
    :returns: A list of identified subclones, and boolean mask marking outlier subclones.
    """
    def _midpoint_distance(mean_a, mean_b):
        # Euclidean distance between the two clusters' midpoint and (0.5, ..., 0.5).
        return np.sqrt(np.square((mean_a + mean_b) / 2 - 0.5).sum())

    cluster_means = model.get_means()
    n_clusters = model.get_n_components()
    subclones = [{c} for c in range(n_clusters)]

    # For 1-dimensional analysis, skip merging clusters.
    if model.get_n_dimensions() == 1:
        return subclones, outlier_cluster_mask[:]

    # Merge every cluster pair whose midpoint lies close to (0.5, ..., 0.5)
    # ('reflected' cluster pairs).
    for a in range(n_clusters):
        for b in range(a + 1, n_clusters):
            if _midpoint_distance(cluster_means[a], cluster_means[b]) < merge_cutoff:
                subclones = merge_subclones(subclones, a, b)

    # A subclone is an outlier if it contains any outlier cluster.
    outlier_subclone_mask = [
        any(outlier_cluster_mask[cluster] for cluster in subclone)
        for subclone in subclones
    ]

    # Stable sort puts non-outlier subclones first (indices from 0), preserving
    # relative order within each group.
    ordered = sorted(zip(subclones, outlier_subclone_mask), key=lambda pair: pair[1])
    final_subclones = [subclone for subclone, _ in ordered]
    final_outlier_subclone_mask = [is_outlier for _, is_outlier in ordered]
    return final_subclones, final_outlier_subclone_mask
def posthoc_process(model, merge_cutoff, outlier_dispersion_cutoff):
    """Post-hoc processing step.
    In this step, clusters are merged if they seemed to be 'reflected' clusters.
    Also, overdispered clusters are marked so that they can be excluded in further analyses.

    :param BetaBinomialMixture model: Beta-binomial model fit.
    :param float merge_cutoff: Cutoff for the distance from midpoint of the two clusters to (0.5, ..., 0.5) to be merged.
    :param float outlier_dispersion_cutoff: Cutoff for dispersion to mark a cluster as an outlier.
    :returns: List of subclones, and boolean mask representing if each of them is an outlier.
    """
    outlier_mask = mark_outlier_clusters(model, outlier_dispersion_cutoff)
    return identify_subclone(model, merge_cutoff, outlier_mask)
def get_subclone_assignment(subclones, assignment, outlier_subclone_mask):
    """Given cluster assignment, returns the subclone assignment.
    Note that if the subclone is found to be an outlier, -1 will be returned.

    :param list subclones: List of subclones.
    :param int assignment: Index of assigned clsuter.
    :param list outlier_subclone_mask: Boolean mask denoting outlier subclones.
    :returns: Index of assigned subclone.
    """
    for index, (subclone, is_outlier) in enumerate(zip(subclones, outlier_subclone_mask)):
        if assignment not in subclone:
            continue
        # Outlier subclones are reported as -1 so they can be excluded downstream.
        return -1 if is_outlier else index
def run(input_fps, full_pattern_proportion=0.8, merge_cutoff=0.05, outlier_dispersion_cutoff=0.2,
        num_max_cluster=15, seed=12345, intersection_method='common', verbose=False, output_fp=None):
    """Run the full deconvolution pipeline: merge met files, fit beta-binomial mixture
    models, select the best one by BIC, and post-process clusters into subclones.

    :param list input_fps: File paths to (corrected) met files.
    :param float full_pattern_proportion: Proportion of fully methylated and unmethylated patterns to be retained.
    :param float merge_cutoff: Cutoff for the distance from midpoint of two clusters to (0.5, ..., 0.5) to be merged.
    :param float outlier_dispersion_cutoff: Cutoff for dispersion to mark a cluster as an outlier.
    :param int num_max_cluster: Maximum number of clusters to try when fitting mixture models.
    :param int seed: Random seed for model fitting.
    :param string intersection_method: One of ['common', 'jaccard']; how to intersect epiloci across samples.
    :param bool verbose: If True, enable debug logging.
    :param string output_fp: If given, write per-epilocus assignments to this file.
    :returns: The selected (best-BIC) BetaBinomialMixture model.
    """
    if verbose:
        logger.setLevel(cleanlog.DEBUG)

    merged_depths, merged_counts, common_headers = merge_met_files(
        input_fps,
        full_pattern_proportion=full_pattern_proportion,
        intersection_method=intersection_method)
    logger.debug('Total %d fingerprint epiloci will be used for deconvolution.' % len(common_headers))

    # Fit a model for each candidate number of clusters.
    models = []
    for n_components in range(1, num_max_cluster + 1):
        logger.debug('Fitting beta-binomial mixture model with %d clusters.' % n_components)
        bbmm = BetaBinomialMixture(n_components=n_components, seed=seed)
        bbmm.fit(merged_depths, merged_counts, common_headers)
        models.append(bbmm)

    # Model selection by Bayesian information criterion.
    selected_model = min(models, key=lambda m: m.bic())
    logger.debug('The best model had %d clusters.' % selected_model.get_n_components())

    subclones, outlier_subclone_mask = posthoc_process(selected_model, merge_cutoff, outlier_dispersion_cutoff)
    cluster_assignments = selected_model.predict_proba(merged_depths, merged_counts).argmax(axis=0)
    subclone_assignments = [
        get_subclone_assignment(subclones, assignment, outlier_subclone_mask)
        for assignment in cluster_assignments
    ]
    logger.debug('The clusters were processed, and resulted in %d subclones.' % len(subclones))

    if output_fp is None:
        return selected_model

    # BUGFIX: a missing comma previously fused 'fingerprint_counts' and
    # 'fingerprint_fractions' into a single header, producing a 5-column
    # header row for 6 columns of data.
    headers = [
        'epilocus',
        'cluster',
        'subclone',
        'depths',
        'fingerprint_counts',
        'fingerprint_fractions',
    ]
    with open(output_fp, 'w') as outFile:
        print('\t'.join(headers), file=outFile)

        for header, cluster, subclone, depths, counts in zip(
                common_headers, cluster_assignments, subclone_assignments, merged_depths, merged_counts):
            # Fingerprint fraction = count / depth, per sample.
            ffs = ",".join(str(count / depth) for depth, count in zip(depths, counts))
            print(f'{header}\t{cluster}\t{subclone}\t{",".join(map(str, depths))}\t{",".join(map(str, counts))}\t{ffs}', file=outFile)

    return selected_model