Module bearclaw.io.tso
Expand source code
from datetime import datetime
from pathlib import Path
from pandas import notna, read_csv
import vcfpy
from vcfpy import DEL, INS, SNV, MNV, Header, HeaderLine, SamplesInfos, Substitution
import bearclaw
def _build_header() -> Header:
    """Assemble the VCF header used for every converted TMB trace.

    The header declares no samples and carries, in this order: the
    ``fileformat``, ``fileDate``, ``source``, ``reference`` and ``platform``
    meta lines, a ``PASS`` filter line, and the ``AF`` and ``DP`` INFO
    definitions.

    Returns:
        The populated header.
    """
    vcf_header = Header(samples=SamplesInfos(sample_names=[]))

    # Plain key/value meta lines, listed in the order they are added.
    meta_lines = (
        ("fileformat", "VCFv4.3"),
        ("fileDate", datetime.now().strftime(r"%Y%m%d")),
        ("source", f"Bearclaw {bearclaw.__version__}"),
        ("reference", "GRCh37"),
        ("platform", "Derived from TSO500"),
    )
    for meta_key, meta_value in meta_lines:
        vcf_header.add_line(HeaderLine(key=meta_key, value=meta_value))

    vcf_header.add_filter_line({"ID": "PASS", "Description": "All filters passed"})

    # INFO field definitions; "A" means one value per alternate allele.
    info_definitions = (
        {
            "ID": "AF",
            "Number": "A",
            "Type": "Float",
            "Description": "Allele Frequency",
        },
        {
            "ID": "DP",
            "Number": 1,
            "Type": "Integer",
            "Description": "Total Depth",
        },
    )
    for info_definition in info_definitions:
        vcf_header.add_info_line(info_definition)

    return vcf_header
def tmb_trace_to_vcf(input_tsv: Path, output_vcf: Path):
    """Convert *_TMB_Trace.tsv from TSO500 run to VCF format.

    Keeps only the variants that are marked as `IncludedInTMBNumerator`.

    Each kept row becomes one VCF record with a single alternate allele,
    the ``PASS`` filter, and ``DP``/``AF`` INFO values taken from the
    ``Depth`` and ``VAF`` columns.

    Args:
        input_tsv: Tab-separated TMB trace file to read.
        output_vcf: Path of the VCF file to write.
    """
    # Use a literal tab rather than the regex r"\t": multi-character
    # separators are treated as regular expressions by pandas, which forces
    # the slower python parsing engine and emits a ParserWarning.
    variant_dataframe = read_csv(input_tsv, sep="\t")
    tmb_variants = variant_dataframe[variant_dataframe.IncludedInTMBNumerator]

    # Loop-invariant translation from the trace file's variant labels to the
    # vcfpy substitution type constants.
    variant_types = {"SNV": SNV, "MNV": MNV, "insertion": INS, "deletion": DEL}

    header = _build_header()
    with vcfpy.Writer.from_path(output_vcf, header) as writer:
        for _, row in tmb_variants.iterrows():
            info = {"DP": row["Depth"], "AF": [row["VAF"]]}

            # Unknown labels are tolerated and yield None instead of raising.
            variant_type = variant_types.get(row["VariantType"])

            # "." is the VCF placeholder for a missing identifier.
            variant_id = "."
            if notna(row["CosmicIDs"]):
                variant_id = row["CosmicIDs"]

            alternate = Substitution(type_=variant_type, value=row["AltCall"])
            record = vcfpy.Record(
                CHROM=row["Chromosome"],
                POS=row["Position"],
                ID=[variant_id],
                REF=row["RefCall"],
                ALT=[alternate],
                INFO=info,
                FILTER=["PASS"],
                QUAL=".",
            )
            writer.write_record(record)
Functions
def tmb_trace_to_vcf(input_tsv: pathlib.Path, output_vcf: pathlib.Path)
-
Convert *_TMB_Trace.tsv from TSO500 run to VCF format.
Keeps only the variants that are marked as `IncludedInTMBNumerator`.
Expand source code
def tmb_trace_to_vcf(input_tsv: Path, output_vcf: Path):
    """Convert *_TMB_Trace.tsv from TSO500 run to VCF format.

    Keeps only the variants that are marked as `IncludedInTMBNumerator`.

    Each kept row becomes one VCF record with a single alternate allele,
    the ``PASS`` filter, and ``DP``/``AF`` INFO values taken from the
    ``Depth`` and ``VAF`` columns.

    Args:
        input_tsv: Tab-separated TMB trace file to read.
        output_vcf: Path of the VCF file to write.
    """
    # Use a literal tab rather than the regex r"\t": multi-character
    # separators are treated as regular expressions by pandas, which forces
    # the slower python parsing engine and emits a ParserWarning.
    variant_dataframe = read_csv(input_tsv, sep="\t")
    tmb_variants = variant_dataframe[variant_dataframe.IncludedInTMBNumerator]

    # Loop-invariant translation from the trace file's variant labels to the
    # vcfpy substitution type constants.
    variant_types = {"SNV": SNV, "MNV": MNV, "insertion": INS, "deletion": DEL}

    header = _build_header()
    with vcfpy.Writer.from_path(output_vcf, header) as writer:
        for _, row in tmb_variants.iterrows():
            info = {"DP": row["Depth"], "AF": [row["VAF"]]}

            # Unknown labels are tolerated and yield None instead of raising.
            variant_type = variant_types.get(row["VariantType"])

            # "." is the VCF placeholder for a missing identifier.
            variant_id = "."
            if notna(row["CosmicIDs"]):
                variant_id = row["CosmicIDs"]

            alternate = Substitution(type_=variant_type, value=row["AltCall"])
            record = vcfpy.Record(
                CHROM=row["Chromosome"],
                POS=row["Position"],
                ID=[variant_id],
                REF=row["RefCall"],
                ALT=[alternate],
                INFO=info,
                FILTER=["PASS"],
                QUAL=".",
            )
            writer.write_record(record)