import numpy as np
import cleanlog
from scipy.special import gammaln as logG
from scipy.special import digamma
from scipy.special import polygamma
from sklearn.mixture import GaussianMixture
logger = cleanlog.ColoredLogger(name='PRISM')
def trigamma(x):
    """Trigamma function, the first derivative of the digamma function."""
    return polygamma(1, x)
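# For reference, trigamma(1.0) equals pi**2 / 6 (~1.6449); trigamma enters the
# Newton updates in BetaBinomialMixture._bebin_mle below.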
class BetaBinomialMixture():
    """Beta-binomial mixture model for the PRISM core algorithm."""
def __init__(self, n_components=1, max_iter=10000, tol=1e-3, seed=None, verbose=False):
self.n_components = n_components
self.max_iter = max_iter
self.tol = tol
self.seed = seed
self.verbose = verbose
    def _bebin_mle(self, ns, ks, ws, n_iter=4):
        """Given depths (ns), counts (ks), and posterior weights (ws), find the
        maximum-likelihood estimates of alpha and beta, the two parameters of
        the beta-binomial distribution."""
a, b = 1, 1
N = ws.sum()
p1_bar = np.exp(1 / N * np.sum(ws * (digamma(a + ks) - digamma(a + b + ns))))
p2_bar = np.exp(1 / N * np.sum(ws * (digamma(b + (ns - ks)) - digamma(a + b + ns))))
        # Find good initial estimates.
a = 1 / 2 * (1 - p2_bar) / (1 - p1_bar - p2_bar)
b = 1 / 2 * (1 - p1_bar) / (1 - p1_bar - p2_bar)
for _ in range(n_iter):
p1_bar = np.exp(1 / N * np.sum(ws * (digamma(a + ks) - digamma(a + b + ns))))
p2_bar = np.exp(1 / N * np.sum(ws * (digamma(b + (ns - ks)) - digamma(a + b + ns))))
n_log_p1 = np.sum(ws * (digamma(a + ks) - digamma(a + b + ns)))
n_log_p2 = np.sum(ws * (digamma(b + (ns - ks)) - digamma(a + b + ns)))
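            # Newton step for (a, b). The (approximate) Hessian of the weighted
            # log-likelihood has the form diag(q_1, q_2) + z * ones * ones^T, so it
            # can be inverted in closed form (Sherman-Morrison); this mirrors the
            # Newton scheme in Minka's "Estimating a Dirichlet distribution".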
q_1 = -N * trigamma(a)
q_2 = -N * trigamma(b)
g_1 = N * digamma(a + b) - N * digamma(a) + n_log_p1
g_2 = N * digamma(a + b) - N * digamma(b) + n_log_p2
z = N * trigamma(a + b)
b_ = (g_1 / q_1 + g_2 / q_2) / (1 / z + 1 / q_1 + 1 / q_2)
a = a - (g_1 - b_) / q_1
b = b - (g_2 - b_) / q_2
return a, b
def _bebin_likelihood(self, n, k, a, b):
return np.exp(self._bebin_loglikelihood(n, k, a, b))
    def _bebin_loglikelihood(self, n, k, a, b):
        """Beta-binomial log-likelihood, log C(n, k) + log B(k + a, n - k + b) - log B(a, b),
        expressed via log-gamma terms and summed over dimensions."""
        tmp = logG(n + 1) + logG(k + a) + logG(n - k + b) + logG(a + b) - \
            (logG(k + 1) + logG(n - k + 1) + logG(a) + logG(b) + logG(n + a + b))
        return tmp.sum(axis=1)
    def _gmm_initialize(self, n, k):
        """Initialize alphas and betas by roughly fitting a Gaussian mixture model.

        Alphas and betas are computed from the mean and variance of each component
        by the method of moments.
        """
# Ratio of methylated reads.
r = k / n
r = r.reshape(self.n_data, self.n_dim)
        # Fit a Gaussian mixture model; 'diag' covariance gives each dimension its own variance.
model = GaussianMixture(n_components=self.n_components, covariance_type='diag')
model.fit(r)
alphas, betas = [], []
for i in range(self.n_components):
# Compute alpha and beta from mean and variance of GMM fit.
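            # Method of moments for Beta(alpha, beta): given mean mu and variance var,
            # alpha = mu * (mu * (1 - mu) / var - 1) and beta = alpha * (1 / mu - 1),
            # which are exactly the expressions below.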
mu = model.means_[i]
var = model.covariances_[i]
alpha = ((1 - mu) / var - 1 / mu) * mu**2
beta = alpha * (1 / mu - 1)
alphas.append(alpha)
betas.append(beta)
return np.array(alphas), np.array(betas)
    def fit(self, n, k, headers):
        """Fit the mixture model to the given fingerprint epiloci via EM.

        :param n: Depths of fingerprint epiloci.
        :param k: Fingerprint pattern counts of fingerprint epiloci.
        :param headers: Headers of fingerprint epiloci.
        """
        if self.seed is not None:
            np.random.seed(self.seed)
self.n_data, self.n_dim = len(n), 1 if n.ndim == 1 else len(n[0])
# If 1d data are given, reshape them into 2d matrix.
if self.n_dim == 1:
n = n.reshape(self.n_data, self.n_dim)
k = k.reshape(self.n_data, self.n_dim)
self.depths, self.counts, self.headers = n, k, headers
self.alphas_, self.betas_ = self._gmm_initialize(n, k)
self.pi_ = np.ones(self.n_components) / self.n_components
prev_weighted_loglikelihood, self.converged_ = np.inf, False
for iteration in range(self.max_iter):
            # Compute the likelihood of each data point under each component.
            likelihoods = np.array([self._bebin_likelihood(n, k, a, b) for a, b in zip(self.alphas_, self.betas_)])
            # Compute posterior.
            weighted_likelihood = likelihoods * self.pi_.reshape([self.n_components, 1])
curr_weighted_loglikelihood = np.log(weighted_likelihood.sum(axis=0)).sum()
# Check for convergence.
if np.abs(curr_weighted_loglikelihood - prev_weighted_loglikelihood) < self.tol:
if self.verbose:
logger.debug(f'Met convergence criterion at iteration {iteration}. Terminating.')
self.converged_ = True
break
prev_weighted_loglikelihood = curr_weighted_loglikelihood
w = weighted_likelihood / weighted_likelihood.sum(axis=0)
# Compute relative size of clusters.
self.pi_ = w.sum(axis=1) / self.n_data
# Compute maximum likelihood estimates of alpha and beta.
alphas, betas = [], []
for i in range(self.n_components):
alpha_list, beta_list = [], []
for j in range(self.n_dim):
tmp_alpha, tmp_beta = self._bebin_mle(n[:, j], k[:, j], w[i])
alpha_list.append(tmp_alpha)
beta_list.append(tmp_beta)
alphas.append(alpha_list)
betas.append(beta_list)
self.alphas_ = np.array(alphas)
self.betas_ = np.array(betas)
# END for iteration
if not self.converged_:
if self.verbose:
                logger.warning('The EM algorithm did not converge. Try increasing max_iter to ensure convergence.')
self.log_likelihood_ = curr_weighted_loglikelihood
    def predict_proba(self, n, k):
        """Return the posterior probability of each cluster for each fingerprint epilocus.

        :param list n: Depths of fingerprint epiloci.
        :param list k: Fingerprint pattern counts of fingerprint epiloci.
        :returns: Posterior probabilities of each fingerprint epilocus.
        """
        likelihoods = np.array([self._bebin_likelihood(n, k, a, b) for a, b in zip(self.alphas_, self.betas_)])
        weighted_likelihood = likelihoods * self.pi_.reshape([self.n_components, 1])
return weighted_likelihood / weighted_likelihood.sum(axis=0)
    def _n_parameters(self):
        """Return the number of parameters estimated while fitting the model:
        alpha and beta per dimension per component, plus the free mixture weights.

        :returns: Number of parameters in the model.
        """
return int(2 * self.n_dim * self.n_components + (self.n_components - 1))
    def get_weights(self):
        """Return the list of cluster weights.

        Note that a cluster weight is computed as the sum of posterior probabilities
        that each data point is assigned to that cluster.

        :returns: Cluster weights.
        """
return self.pi_
@property
def means_(self):
return np.array([a / (a + b) for a, b in zip(self.alphas_, self.betas_)])
    def get_means(self):
"""
:returns: Cluster means.
"""
return self.means_
@property
def dispersions_(self):
return np.array([1 / (a + b + 1) for a, b in zip(self.alphas_, self.betas_)])
    def get_dispersions(self):
"""
:returns: Cluster dispersions.
"""
return self.dispersions_
    def bic(self):
"""
:returns: Bayesian Information Criterion (BIC) value of the model.
"""
return -2 * self.log_likelihood_ + np.log(self.n_data) * self._n_parameters()
    def get_n_dimensions(self):
"""
:returns: Number of dimensions.
"""
return self.n_dim
    def get_n_components(self):
"""
:returns: Number of clusters.
"""
return self.n_components
    def get_depths(self):
"""
:returns: Depths of fingerprint epiloci used for fitting the model.
"""
return self.depths
    def get_counts(self):
"""
:returns: Fingerprint pattern counts of fingerprint epiloci used for fitting the model.
"""
return self.counts
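

if __name__ == '__main__':
    # A minimal, hypothetical usage sketch (not part of the PRISM API): draw
    # synthetic depths and counts from two beta-binomial components, fit a
    # two-component model, and inspect the recovered parameters. All numbers
    # below are arbitrary assumptions for illustration.
    rng = np.random.RandomState(42)
    n_per_component = 200
    depths = rng.randint(20, 60, size=2 * n_per_component)
    # Component 1 is mostly unmethylated (Beta(2, 18)); component 2 is mostly
    # methylated (Beta(18, 2)).
    p = np.concatenate([rng.beta(2, 18, n_per_component),
                        rng.beta(18, 2, n_per_component)])
    counts = rng.binomial(depths, p)
    model = BetaBinomialMixture(n_components=2, verbose=True)
    model.fit(depths, counts, headers=None)
    print('weights:', model.get_weights())
    print('means:', model.get_means())
    print('dispersions:', model.get_dispersions())
    print('BIC:', model.bic())
    # Posterior cluster probabilities for the training data (2D input expected).
    posteriors = model.predict_proba(depths.reshape(-1, 1), counts.reshape(-1, 1))
    print('posterior shape:', posteriors.shape)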