Module bearclaw.transforms
Expand source code
from pathlib import Path
from pandas import DataFrame, concat
from bearclaw.feature_extraction import CosmicNMF, _POSSIBLE_SBS_SEQUENCING_ARTEFACTS
from bearclaw.io import extract_mutation_spectra, from_tso500_trace, vcf
def spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract single + doublet base substitution and indel spectra from a VCF file.

    Args:
        vcf_file: File handed to `extract_mutation_spectra`.
        exome: Forwarded unchanged as the `exome` keyword of
            `extract_mutation_spectra`.

    Returns:
        Every matrix returned by `extract_mutation_spectra`, joined
        column-wise in the iteration order of the returned mapping.
    """
    per_type_matrices = extract_mutation_spectra(vcf_file, exome=exome)
    return concat([*per_type_matrices.values()], axis="columns")
@from_tso500_trace
def spectrum_from_trace(trace_file: Path) -> DataFrame:
    """Extract single + doublet base substitution & indel spectra from a TSO500 trace.

    N.B. The spectrum covers all mutations that are eligible for a TSO500 TMB
    calculation (i.e., both synonymous and non-synonymous).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.

    Returns:
        Whatever `spectrum` yields for this input with `exome=False`.
    """
    # NOTE(review): the `from_tso500_trace` decorator presumably adapts the
    # trace file before this body runs -- confirm in bearclaw.io.
    return spectrum(vcf_file=trace_file, exome=False)
@vcf.non_synonymous
def non_synonymous_spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Compute the mutation spectrum via `spectrum` under `vcf.non_synonymous`.

    NOTE(review): the decorator presumably restricts the input to
    non-synonymous variants before this body runs -- confirm in bearclaw.io.vcf.

    Args:
        vcf_file: File handed to `spectrum`.
        exome: Forwarded unchanged to `spectrum`.

    Returns:
        The result of `spectrum(vcf_file, exome)`.
    """
    return spectrum(vcf_file=vcf_file, exome=exome)
def mutational_signature(
    vcf_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract mutation spectra and deconvolute them into COSMIC signatures.

    !! N.B.: Assumes that variants are from the GRCh37 reference genome. !!

    Args:
        vcf_file: File to analyse.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts
            (the columns listed in `_POSSIBLE_SBS_SEQUENCING_ARTEFACTS`).

    Returns:
        The `CosmicNMF.transform` outputs for the single base substitution,
        doublet base substitution and indel spectra, joined column-wise.
    """
    mutation_matrices = extract_mutation_spectra(vcf_file, exome=False)

    # One decomposer per mutation type; the same key selects both the COSMIC
    # signature set and the matching spectrum matrix.
    mutation_types = (
        "single_base_substitutions",
        "doublet_base_substitutions",
        "indel",
    )
    exposures = [
        CosmicNMF(cosmic_signature=mutation_type).transform(  # type: ignore
            mutation_matrices[mutation_type]
        )
        for mutation_type in mutation_types
    ]
    combined = concat(exposures, axis="columns")

    if not filter_seq_artefact_signatures:
        return combined
    return combined.drop(columns=_POSSIBLE_SBS_SEQUENCING_ARTEFACTS)
@from_tso500_trace
def mutational_signature_from_trace(
    trace_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract TSO500 mutation spectra and deconvolute them into COSMIC signatures.

    N.B. The deconvolution uses all mutations that are eligible for a TMB
    calculation (i.e., both synonymous and non-synonymous). This differs from
    the true TMB definition (which includes only non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        Whatever `mutational_signature` yields for this input.
    """
    # NOTE(review): the `from_tso500_trace` decorator presumably adapts the
    # trace file before this body runs -- confirm in bearclaw.io.
    return mutational_signature(
        vcf_file=trace_file,
        filter_seq_artefact_signatures=filter_seq_artefact_signatures,
    )
Functions
def mutational_signature(vcf_file: pathlib.Path, filter_seq_artefact_signatures: bool = True) ‑> pandas.core.frame.DataFrame
-
Extract mutation spectra and deconvolute into COSMIC signatures.
!! N.B.: Assumes that variants are from the GRCh37 reference genome. !!
Args
vcf_file
- File to analyse.
filter_seq_artefact_signatures
- When True, remove mutational signatures that are potentially related to sequencing artefacts.
Expand source code
def mutational_signature(
    vcf_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract mutation spectra and deconvolute them into COSMIC signatures.

    !! N.B.: Assumes that variants are from the GRCh37 reference genome. !!

    Args:
        vcf_file: File to analyse.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts
            (the columns listed in `_POSSIBLE_SBS_SEQUENCING_ARTEFACTS`).

    Returns:
        The `CosmicNMF.transform` outputs for the single base substitution,
        doublet base substitution and indel spectra, joined column-wise.
    """
    mutation_matrices = extract_mutation_spectra(vcf_file, exome=False)

    # One decomposer per mutation type; the same key selects both the COSMIC
    # signature set and the matching spectrum matrix.
    mutation_types = (
        "single_base_substitutions",
        "doublet_base_substitutions",
        "indel",
    )
    exposures = [
        CosmicNMF(cosmic_signature=mutation_type).transform(  # type: ignore
            mutation_matrices[mutation_type]
        )
        for mutation_type in mutation_types
    ]
    combined = concat(exposures, axis="columns")

    if not filter_seq_artefact_signatures:
        return combined
    return combined.drop(columns=_POSSIBLE_SBS_SEQUENCING_ARTEFACTS)
def mutational_signature_from_trace(trace_file: pathlib.Path, filter_seq_artefact_signatures: bool = True) ‑> pandas.core.frame.DataFrame
-
Extract TSO500 mutation spectra and deconvolute into COSMIC signatures.
N.B. This signature deconvolution uses all mutations that are eligible for a TMB calculation (i.e., both synonymous and non-synonymous). This is different from the true TMB definition (which includes only non-synonymous variants).
Args
trace_file
- Path to the *_TMB_Trace.tsv file generated by TSO500 app.
filter_seq_artefact_signatures
- When True, remove mutational signatures that are potentially related to sequencing artefacts.
Expand source code
@from_tso500_trace
def mutational_signature_from_trace(
    trace_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract TSO500 mutation spectra and deconvolute them into COSMIC signatures.

    N.B. The deconvolution uses all mutations that are eligible for a TMB
    calculation (i.e., both synonymous and non-synonymous). This differs from
    the true TMB definition (which includes only non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        Whatever `mutational_signature` yields for this input.
    """
    # NOTE(review): the `from_tso500_trace` decorator presumably adapts the
    # trace file before this body runs -- confirm in bearclaw.io.
    return mutational_signature(
        vcf_file=trace_file,
        filter_seq_artefact_signatures=filter_seq_artefact_signatures,
    )
def non_synonymous_spectrum(vcf_file: pathlib.Path, exome: bool = False) ‑> pandas.core.frame.DataFrame
-
Expand source code
@vcf.non_synonymous
def non_synonymous_spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Compute the mutation spectrum via `spectrum` under `vcf.non_synonymous`.

    NOTE(review): the decorator presumably restricts the input to
    non-synonymous variants before this body runs -- confirm in bearclaw.io.vcf.

    Args:
        vcf_file: File handed to `spectrum`.
        exome: Forwarded unchanged to `spectrum`.

    Returns:
        The result of `spectrum(vcf_file, exome)`.
    """
    return spectrum(vcf_file=vcf_file, exome=exome)
def spectrum(vcf_file: pathlib.Path, exome: bool = False) ‑> pandas.core.frame.DataFrame
-
Extract single + doublet base substitutions and indel spectra from VCF file.
Expand source code
def spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract single + doublet base substitution and indel spectra from a VCF file.

    Args:
        vcf_file: File handed to `extract_mutation_spectra`.
        exome: Forwarded unchanged as the `exome` keyword of
            `extract_mutation_spectra`.

    Returns:
        Every matrix returned by `extract_mutation_spectra`, joined
        column-wise in the iteration order of the returned mapping.
    """
    per_type_matrices = extract_mutation_spectra(vcf_file, exome=exome)
    return concat([*per_type_matrices.values()], axis="columns")
def spectrum_from_trace(trace_file: pathlib.Path) ‑> pandas.core.frame.DataFrame
-
Extract single + doublet base substitutions & indel spectra from TSO500 trace.
N.B. This spectrum incorporates all mutations that are eligible for a TSO500 TMB calculation (i.e., both synonymous and non-synonymous).
Args
trace_file
- Path to the *_TMB_Trace.tsv file generated by TSO500 app.
Expand source code
@from_tso500_trace
def spectrum_from_trace(trace_file: Path) -> DataFrame:
    """Extract single + doublet base substitution & indel spectra from a TSO500 trace.

    N.B. The spectrum covers all mutations that are eligible for a TSO500 TMB
    calculation (i.e., both synonymous and non-synonymous).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.

    Returns:
        Whatever `spectrum` yields for this input with `exome=False`.
    """
    # NOTE(review): the `from_tso500_trace` decorator presumably adapts the
    # trace file before this body runs -- confirm in bearclaw.io.
    return spectrum(vcf_file=trace_file, exome=False)