Module bearclaw.transforms

Expand source code
from pathlib import Path

from pandas import DataFrame, concat

from bearclaw.feature_extraction import CosmicNMF, _POSSIBLE_SBS_SEQUENCING_ARTEFACTS
from bearclaw.io import extract_mutation_spectra, from_tso500_trace, vcf


def spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract single + doublet base substitutions and indel spectra from VCF file.

    Args:
        vcf_file: File to analyse.
        exome: Forwarded unchanged to `extract_mutation_spectra`.

    Returns:
        Every matrix returned by `extract_mutation_spectra`, joined column-wise
        into a single data frame.
    """
    matrices_by_type = extract_mutation_spectra(vcf_file, exome=exome)
    # One table per mutation type comes back; glue them side by side.
    return concat([*matrices_by_type.values()], axis="columns")


@from_tso500_trace
def spectrum_from_trace(trace_file: Path) -> DataFrame:
    """Extract single + doublet base substitutions & indel spectra from TSO500 trace.

    N.B. Every mutation that is eligible for a TSO500 TMB calculation contributes
    to this spectrum (i.e., synonymous as well as non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.

    Returns:
        The result of `spectrum` called with `exome=False`.
    """
    combined_spectra = spectrum(trace_file, exome=False)
    return combined_spectra


@vcf.non_synonymous
def non_synonymous_spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract mutation spectra through `spectrum`, wrapped by `vcf.non_synonymous`.

    NOTE(review): the decorator presumably restricts the input to non-synonymous
    variants before this body runs -- confirm against `bearclaw.io.vcf`.

    Args:
        vcf_file: File to analyse.
        exome: Forwarded unchanged to `spectrum`.

    Returns:
        Whatever `spectrum` returns for the given arguments.
    """
    return spectrum(vcf_file, exome=exome)


def mutational_signature(
    vcf_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """
    Extract mutation spectra and deconvolute into COSMIC signatures.

    !! N.B.: Assumes that variants are from the GRCh37 reference genome. !!

    Args:
        vcf_file: File to analyse.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        Column-wise concatenation of the `CosmicNMF.transform` output for each
        spectrum type, minus the `_POSSIBLE_SBS_SEQUENCING_ARTEFACTS` columns
        when filtering is enabled.
    """
    matrices = extract_mutation_spectra(vcf_file, exome=False)
    spectrum_names = (
        "single_base_substitutions",
        "doublet_base_substitutions",
        "indel",
    )
    # A separate decomposer is built per spectrum type and applied to the
    # matching mutation matrix.
    exposures = [
        CosmicNMF(cosmic_signature=name).transform(matrices[name])  # type: ignore
        for name in spectrum_names
    ]

    combined = concat(exposures, axis="columns")
    if not filter_seq_artefact_signatures:
        return combined
    return combined.drop(columns=_POSSIBLE_SBS_SEQUENCING_ARTEFACTS)


@from_tso500_trace
def mutational_signature_from_trace(
    trace_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract TSO500 mutation spectra and deconvolute into COSMIC signatures.

    N.B. All mutations that are eligible for a TMB calculation feed into this
    deconvolution (i.e., both synonymous and non-synonymous). This is different from
    the true TMB definition (which includes only non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        The result of `mutational_signature` for the same arguments.
    """
    return mutational_signature(
        trace_file,
        filter_seq_artefact_signatures=filter_seq_artefact_signatures,
    )

Functions

def mutational_signature(vcf_file: pathlib.Path, filter_seq_artefact_signatures: bool = True) ‑> pandas.core.frame.DataFrame

Extract mutation spectra and deconvolute into COSMIC signatures.

!! N.B.: Assumes that variants are from the GRCh37 reference genome. !!

Args

vcf_file
File to analyse.
filter_seq_artefact_signatures
When True, remove mutational signatures that are potentially related to sequencing artefacts.
Expand source code
def mutational_signature(
    vcf_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """
    Extract mutation spectra and deconvolute into COSMIC signatures.

    !! N.B.: Assumes that variants are from the GRCh37 reference genome. !!

    Args:
        vcf_file: File to analyse.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        Column-wise concatenation of the `CosmicNMF.transform` output for each
        spectrum type, minus the `_POSSIBLE_SBS_SEQUENCING_ARTEFACTS` columns
        when filtering is enabled.
    """
    matrices = extract_mutation_spectra(vcf_file, exome=False)
    spectrum_names = (
        "single_base_substitutions",
        "doublet_base_substitutions",
        "indel",
    )
    # A separate decomposer is built per spectrum type and applied to the
    # matching mutation matrix.
    exposures = [
        CosmicNMF(cosmic_signature=name).transform(matrices[name])  # type: ignore
        for name in spectrum_names
    ]

    combined = concat(exposures, axis="columns")
    if not filter_seq_artefact_signatures:
        return combined
    return combined.drop(columns=_POSSIBLE_SBS_SEQUENCING_ARTEFACTS)
def mutational_signature_from_trace(trace_file: pathlib.Path, filter_seq_artefact_signatures: bool = True) ‑> pandas.core.frame.DataFrame

Extract TSO500 mutation spectra and deconvolute into COSMIC signatures.

N.B. This signature deconvolution uses all mutations that are eligible for a TMB calculation (i.e., both synonymous and non-synonymous). This is different from the true TMB definition (which includes only non-synonymous variants).

Args

trace_file
Path to the *_TMB_Trace.tsv file generated by TSO500 app.
filter_seq_artefact_signatures
When True, remove mutational signatures that are potentially related to sequencing artefacts.
Expand source code
@from_tso500_trace
def mutational_signature_from_trace(
    trace_file: Path, filter_seq_artefact_signatures: bool = True
) -> DataFrame:
    """Extract TSO500 mutation spectra and deconvolute into COSMIC signatures.

    N.B. All mutations that are eligible for a TMB calculation feed into this
    deconvolution (i.e., both synonymous and non-synonymous). This is different from
    the true TMB definition (which includes only non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.
        filter_seq_artefact_signatures: When True, remove mutational
            signatures that are potentially related to sequencing artefacts.

    Returns:
        The result of `mutational_signature` for the same arguments.
    """
    return mutational_signature(
        trace_file,
        filter_seq_artefact_signatures=filter_seq_artefact_signatures,
    )
def non_synonymous_spectrum(vcf_file: pathlib.Path, exome: bool = False) ‑> pandas.core.frame.DataFrame
Expand source code
@vcf.non_synonymous
def non_synonymous_spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract mutation spectra through `spectrum`, wrapped by `vcf.non_synonymous`.

    NOTE(review): the decorator presumably restricts the input to non-synonymous
    variants before this body runs -- confirm against `bearclaw.io.vcf`.

    Args:
        vcf_file: File to analyse.
        exome: Forwarded unchanged to `spectrum`.

    Returns:
        Whatever `spectrum` returns for the given arguments.
    """
    return spectrum(vcf_file, exome=exome)
def spectrum(vcf_file: pathlib.Path, exome: bool = False) ‑> pandas.core.frame.DataFrame

Extract single + doublet base substitutions and indel spectra from VCF file.

Expand source code
def spectrum(vcf_file: Path, exome: bool = False) -> DataFrame:
    """Extract single + doublet base substitutions and indel spectra from VCF file.

    Args:
        vcf_file: File to analyse.
        exome: Forwarded unchanged to `extract_mutation_spectra`.

    Returns:
        Every matrix returned by `extract_mutation_spectra`, joined column-wise
        into a single data frame.
    """
    matrices_by_type = extract_mutation_spectra(vcf_file, exome=exome)
    # One table per mutation type comes back; glue them side by side.
    return concat([*matrices_by_type.values()], axis="columns")
def spectrum_from_trace(trace_file: pathlib.Path) ‑> pandas.core.frame.DataFrame

Extract single + doublet base substitutions & indel spectra from TSO500 trace.

N.B. This spectrum incorporates all mutations that are eligible for a TSO500 TMB calculation (i.e., both synonymous and non-synonymous).

Args

trace_file
Path to the *_TMB_Trace.tsv file generated by TSO500 app.
Expand source code
@from_tso500_trace
def spectrum_from_trace(trace_file: Path) -> DataFrame:
    """Extract single + doublet base substitutions & indel spectra from TSO500 trace.

    N.B. Every mutation that is eligible for a TSO500 TMB calculation contributes
    to this spectrum (i.e., synonymous as well as non-synonymous variants).

    Args:
        trace_file: Path to the *_TMB_Trace.tsv file generated by TSO500 app.

    Returns:
        The result of `spectrum` called with `exome=False`.
    """
    combined_spectra = spectrum(trace_file, exome=False)
    return combined_spectra