Module bearclaw.feature_extraction
Expand source code
from pathlib import Path
from typing import Literal
from pandas import DataFrame, read_csv
from sklearn.decomposition import NMF
# Absolute path of the `resources` directory that sits next to this module.
RESOURCE_DIR = Path(__file__).parent.resolve().joinpath("resources")

# Maps each supported mutation spectrum to its COSMIC v3.3 (GRCh37) reference
# signature file inside RESOURCE_DIR.
_COSMIC_MUTATIONAL_SIGNATURES = {
    spectrum: RESOURCE_DIR / f"COSMIC_v3.3_{tag}_GRCh37.txt"
    for spectrum, tag in (
        ("single_base_substitutions", "SBS"),
        ("doublet_base_substitutions", "DBS"),
        ("indel", "ID"),
        ("cnv", "CN"),
    )
}

# SBS signatures listed as possible sequencing artefacts.
# Extracted from https://cancer.sanger.ac.uk/signatures/sbs/.
# Covers SBS27, SBS43 and the contiguous run SBS45 through SBS60.
_POSSIBLE_SBS_SEQUENCING_ARTEFACTS = [
    f"SBS{number}" for number in (27, 43, *range(45, 61))
]

# Copy-number signatures listed as possible sequencing artefacts (CN22-CN24).
_POSSIBLE_CN_SEQUENCING_ARTEFACTS = [f"CN{number}" for number in range(22, 25)]
def _get_cosmic_feature_names(
    representation: Literal[
        "single_base_substitutions",
        "doublet_base_substitutions",
        "indel",
        "cnv",
    ],
    signatures: bool = True,
) -> list:
    """Return the signature or feature labels of one COSMIC spectrum.

    The labels are read from the tab-separated COSMIC reference file
    registered for `representation`; its first column is used as the row
    index.

    Args:
        representation: Key selecting which COSMIC reference file to read.
        signatures: If True, return the column labels (COSMIC signature
            names); otherwise return the row labels (transition names).

    Returns:
        The requested labels as a plain list.

    Raises:
        KeyError: If `representation` is not a registered spectrum.
    """
    signature_table = read_csv(
        _COSMIC_MUTATIONAL_SIGNATURES[representation], sep="\t", index_col=0
    )
    # Columns hold the signatures, rows hold the mutation classes.
    labels = signature_table.columns if signatures else signature_table.index
    return labels.to_list()
class CosmicNMF(NMF):
    """Load NMF model fit with COSMIC GRCh37 signatures.

    The fitted attributes (`components_`, `n_components_`, `n_features_in_`,
    `feature_names_in_`) are populated in `__init__` from the COSMIC
    reference file rather than by calling `fit`.

    Given the mutational signature H, the `transform` method computes `W` so
    that

        X = WH

    where X is the mutation spectrum.
    """

    def __init__(
        self,
        cosmic_signature: Literal[
            "single_base_substitutions", "doublet_base_substitutions", "indel", "cnv"
        ],
        tol: float = 1e-6,
        max_iter: int = 10000,
    ):
        """Load NMF COSMIC GRCh37 mutational signature checkpoint.

        Args:
            cosmic_signature: Input features correspond to the 96 single base
                substitution (SBS), 78 doublet base substitution (DBS), 83
                indel mutational spectrum, or 48 copy number variant classes.
            tol: Tolerance forwarded to `NMF`.
            max_iter: Maximum number of iterations forwarded to `NMF`.

        Raises:
            KeyError: If `cosmic_signature` is not a registered spectrum.
        """
        self.cosmic_signature = cosmic_signature
        # After the transpose each row is one signature (component) and each
        # column one input feature, which is the layout of `components_`.
        H = read_csv(
            _COSMIC_MUTATIONAL_SIGNATURES[cosmic_signature], sep="\t", index_col=0
        ).T
        self.n_components_, self.n_features_in_ = H.shape
        super().__init__(
            n_components=self.n_components_,
            max_iter=max_iter,
            solver="cd",
            init="nndsvda",
            tol=tol,
        )
        self.components_ = H.to_numpy()
        self.feature_names_in_ = H.columns
        self.feature_names_out_ = H.index

    def transform(self, X):
        """Compute `W` for `X` and return it as a pandas DataFrame.

        Args:
            X: Mutation spectrum. When it carries a pandas-style `index`
                (e.g. a DataFrame) the row labels are preserved; any other
                input accepted by `NMF.transform` gets a default integer
                index.

        Returns:
            DataFrame whose columns are the signature names stored in
            `feature_names_out_`.
        """
        X_numpy = super().transform(X)
        # Plain arrays have no `.index`, and a list exposes `index` as a
        # method; in both cases fall back to pandas' default index instead
        # of failing after the decomposition has already been computed.
        row_labels = getattr(X, "index", None)
        if callable(row_labels):
            row_labels = None
        return DataFrame(X_numpy, index=row_labels, columns=self.feature_names_out_)
Classes
class CosmicNMF (cosmic_signature: Literal['single_base_substitutions', 'doublet_base_substitutions', 'indel', 'cnv'], tol: float = 1e-06, max_iter: int = 10000)
-
Load NMF model fit with COSMIC GRCh37 signatures.
Given the mutational signature H, the `transform` method computes W so that X = WH, where X is the mutation spectrum.
Load NMF COSMIC GRCh37 mutational signature checkpoint.
Args
cosmic_signature
- Input features correspond to the 96 single base substitution (SBS), 78 doublet base substitution (DBS), 83 indel mutational spectrum, or 48 copy number variant classes.
Expand source code
class CosmicNMF(NMF):
    """Load NMF model fit with COSMIC GRCh37 signatures.

    The fitted attributes (`components_`, `n_components_`, `n_features_in_`,
    `feature_names_in_`) are populated in `__init__` from the COSMIC
    reference file rather than by calling `fit`.

    Given the mutational signature H, the `transform` method computes `W` so
    that

        X = WH

    where X is the mutation spectrum.
    """

    def __init__(
        self,
        cosmic_signature: Literal[
            "single_base_substitutions", "doublet_base_substitutions", "indel", "cnv"
        ],
        tol: float = 1e-6,
        max_iter: int = 10000,
    ):
        """Load NMF COSMIC GRCh37 mutational signature checkpoint.

        Args:
            cosmic_signature: Input features correspond to the 96 single base
                substitution (SBS), 78 doublet base substitution (DBS), 83
                indel mutational spectrum, or 48 copy number variant classes.
            tol: Tolerance forwarded to `NMF`.
            max_iter: Maximum number of iterations forwarded to `NMF`.

        Raises:
            KeyError: If `cosmic_signature` is not a registered spectrum.
        """
        self.cosmic_signature = cosmic_signature
        # After the transpose each row is one signature (component) and each
        # column one input feature, which is the layout of `components_`.
        H = read_csv(
            _COSMIC_MUTATIONAL_SIGNATURES[cosmic_signature], sep="\t", index_col=0
        ).T
        self.n_components_, self.n_features_in_ = H.shape
        super().__init__(
            n_components=self.n_components_,
            max_iter=max_iter,
            solver="cd",
            init="nndsvda",
            tol=tol,
        )
        self.components_ = H.to_numpy()
        self.feature_names_in_ = H.columns
        self.feature_names_out_ = H.index

    def transform(self, X):
        """Compute `W` for `X` and return it as a pandas DataFrame.

        Args:
            X: Mutation spectrum. When it carries a pandas-style `index`
                (e.g. a DataFrame) the row labels are preserved; any other
                input accepted by `NMF.transform` gets a default integer
                index.

        Returns:
            DataFrame whose columns are the signature names stored in
            `feature_names_out_`.
        """
        X_numpy = super().transform(X)
        # Plain arrays have no `.index`, and a list exposes `index` as a
        # method; in both cases fall back to pandas' default index instead
        # of failing after the decomposition has already been computed.
        row_labels = getattr(X, "index", None)
        if callable(row_labels):
            row_labels = None
        return DataFrame(X_numpy, index=row_labels, columns=self.feature_names_out_)
Ancestors
- sklearn.decomposition._nmf.NMF
- sklearn.base._ClassNamePrefixFeaturesOutMixin
- sklearn.base.TransformerMixin
- sklearn.base.BaseEstimator
Methods
def transform(self, X)
-
Transform back to pandas dataframe.
Expand source code
def transform(self, X):
    """Compute `W` for `X` and return it as a pandas DataFrame.

    Args:
        X: Mutation spectrum. When it carries a pandas-style `index` (e.g. a
            DataFrame) the row labels are preserved; any other input accepted
            by `NMF.transform` gets a default integer index.

    Returns:
        DataFrame whose columns are the signature names stored in
        `feature_names_out_`.
    """
    X_numpy = super().transform(X)
    # Plain arrays have no `.index`, and a list exposes `index` as a method;
    # in both cases fall back to pandas' default index instead of failing
    # after the decomposition has already been computed.
    row_labels = getattr(X, "index", None)
    if callable(row_labels):
        row_labels = None
    return DataFrame(X_numpy, index=row_labels, columns=self.feature_names_out_)