Module bearclaw.io.vcf
Expand source code
from functools import wraps
from pathlib import Path
import re
from tempfile import TemporaryDirectory
import warnings
import vcfpy
from vcfpy.exceptions import FieldInfoNotFound, VCFPyException
def non_synonymous(method):
    """Decorator that pre-filters synonymous variants from the VCF file.

    The wrapped callable does not receive the original VCF. Instead it gets
    the path of a temporary copy from which the filtered records were removed
    (`filter_synonymous_variants` with `soft=False`). The temporary file only
    exists while the wrapped callable runs.

    Args:
        method: Callable whose first positional argument is the path of a VCF
            file; any further arguments are passed through unchanged.

    Returns:
        Wrapper around `method` that returns whatever `method` returns.
    """

    @wraps(method)
    def decorated_transform(input_vcf: Path, *args, **kwargs):
        # Accept plain strings as well as `Path` objects.
        input_vcf = Path(input_vcf)
        # Strip only a trailing ".gz". `str.replace` would also remove ".gz"
        # occurring elsewhere in the name (e.g. "a.gz.b.vcf.gz" -> "a.b.vcf").
        target_name = input_vcf.name.removesuffix(".gz")
        with TemporaryDirectory() as tmp_dir:
            target_vcf = Path(tmp_dir) / target_name
            filter_synonymous_variants(input_vcf, target_vcf, soft=False)
            # Call `method` inside the `with` block: the directory and the
            # filtered file are deleted as soon as the block is left.
            return method(target_vcf, *args, **kwargs)

    return decorated_transform
def _is_non_synonymous(annotation: dict) -> bool:
"""Test if an annotated variant is non-synonymous.
Args:
annotation: dict where keys and values are according to ANN standard.
"""
field_names = tuple(annotation.keys())
# The standard for the annotation field can be found here:
# https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf
annotation_field = field_names[1]
hgsvp_field = field_names[10]
# Only variants affecting protein.
if annotation[hgsvp_field] != "":
# Only variants that change amino-acid sequence.
var_annot_values = annotation[annotation_field].split("&")
synonymous_type = (
"synonymous_variant",
"stop_retained_variant",
"start_retained_variant",
)
is_non_synonymous = all(
v.strip() not in synonymous_type for v in var_annot_values
)
if is_non_synonymous:
return True
return False
def _check_vcf_ann_header(reader: vcfpy.Reader):
    """Fetch the header entry describing the `ANN` INFO field.

    While the header is queried, all warnings are escalated to exceptions so
    that a `FieldInfoNotFound` can be caught here (presumably vcfpy reports a
    missing field as a warning; confirm against the vcfpy documentation).

    Args:
        reader: VCFpy `Reader` instance with header information.

    Returns:
        The value of `reader.header.get_info_field_info("ANN")`.

    Raises:
        VCFPyException: When the header specifies no `ANN` field.
    """
    with warnings.catch_warnings():
        # Inside this context every warning is raised as an exception.
        warnings.filterwarnings("error")
        try:
            return reader.header.get_info_field_info("ANN")
        except FieldInfoNotFound:
            raise VCFPyException("Missing ANN field in VCF.")
def _get_annotations_keys(reader: vcfpy.Reader) -> list:
    """Extract the annotation's data fields (=keys) from the VCF header.

    This corresponds to the data fields in the value corresponding to the
    `ANN` key in the `INFO` field. The keys are read from the single-quoted,
    "|"-separated part of the header description of `ANN`.

    Args:
        reader: VCFpy `Reader` instance with header information.

    Returns:
        Keys corresponding to record ANN array.

    Raises:
        VCFPyException: When the header specifies no `ANN` field, or when its
            description does not list exactly 16 data fields.
    """
    annotation = _check_vcf_ann_header(reader)
    match = re.search(r"'(.+)'", annotation.description)
    # Without a quoted key list there is nothing to parse; report it as an
    # incompatible header instead of failing on `None.group`.
    if match is None:
        raise VCFPyException("Variant annotations not compatible with standard.")
    ann_keys = [key.strip() for key in match.group(1).split("|")]
    # According to standard, there are 16 annotations data fields. Raise
    # explicitly: an `assert` is skipped when Python runs with `-O`.
    if len(ann_keys) != 16:
        raise VCFPyException("Variant annotations not compatible with standard.")
    return ann_keys
def _get_annotations(record: vcfpy.Record, reader: vcfpy.Reader) -> list[dict]:
    """Convert the `ANN` entries of a record into dictionaries.

    Each entry of `record.INFO["ANN"]` is split on "|" and paired with the
    annotation keys taken from the VCF header. The lookup of `"ANN"` is not
    guarded here, so a record without that entry makes the lookup fail.

    Args:
        record: Single record in the VCF file.
        reader: VCFpy `Reader` instance with header information.

    Returns:
        One dictionary (annotation key -> value) per `ANN` entry.
    """
    # Resolve the keys first so header problems surface before the record
    # itself is inspected.
    field_names = _get_annotations_keys(reader)
    parsed = []
    for raw_annotation in record.INFO["ANN"]:
        values = raw_annotation.split("|")
        parsed.append(dict(zip(field_names, values)))
    return parsed
def filter_synonymous_variants(input_vcf: Path, filtered_vcf: Path, soft: bool = False):
    """Filter synonymous (not amino-acid sequence changing) variants.

    A record is kept untouched when at least one of its annotations is
    non-synonymous. Records without an `ANN` annotation are filtered as well.

    Args:
        input_vcf: Variant file (possibly gzipped) to filter.
        filtered_vcf: Store filtered VCF to this (possibly gzipped) file.
        soft: When `True`, apply a soft filter that only changes the FILTER
            column (but keeps the record). When `False`, remove filtered
            records.

    Raises:
        VCFPyException: When header specifies no `ANN` field or incorrect `ANN`
            keys.
    """
    with vcfpy.Reader.from_path(str(input_vcf)) as reader:
        if soft:
            # The soft filter label has to be declared in the header, which
            # is handed to the writer below.
            filter_line = vcfpy.OrderedDict(
                [
                    ("ID", "IS_SYNON"),
                    ("Description", "The variant is not amino-acid sequence changing."),
                ]
            )
            reader.header.add_filter_line(filter_line)

        with vcfpy.Writer.from_path(filtered_vcf, reader.header) as writer:
            for record in reader:
                try:
                    annotations = _get_annotations(record, reader)
                except KeyError:
                    # No annotation available: filter the record.
                    keep = False
                else:
                    keep = any(_is_non_synonymous(entry) for entry in annotations)

                if not keep:
                    if not soft:
                        # Hard filter: the record is not written at all.
                        continue
                    # Soft filter: flag the record but still write it.
                    record.add_filter("IS_SYNON")
                writer.write_record(record)
Functions
def filter_synonymous_variants(input_vcf: pathlib.Path, filtered_vcf: pathlib.Path, soft: bool = False)
-
Filter synonymous (not amino-acid sequence changing) variants.
Args
input_vcf
- Variant file (possibly gzipped) to filter.
filtered_vcf
- Store filtered VCF to this (possibly gzipped) file.
soft
- When True, apply a soft filter that only changes the FILTER column (but keeps the record). When False, remove filtered records.
Raises
VCFPyException
- When header specifies no ANN field or incorrect ANN keys.
Expand source code
def filter_synonymous_variants(input_vcf: Path, filtered_vcf: Path, soft: bool = False):
    """Filter synonymous (not amino-acid sequence changing) variants.

    A record is kept untouched when at least one of its annotations is
    non-synonymous. Records without an `ANN` annotation are filtered as well.

    Args:
        input_vcf: Variant file (possibly gzipped) to filter.
        filtered_vcf: Store filtered VCF to this (possibly gzipped) file.
        soft: When `True`, apply a soft filter that only changes the FILTER
            column (but keeps the record). When `False`, remove filtered
            records.

    Raises:
        VCFPyException: When header specifies no `ANN` field or incorrect `ANN`
            keys.
    """
    with vcfpy.Reader.from_path(str(input_vcf)) as reader:
        if soft:
            # The soft filter label has to be declared in the header, which
            # is handed to the writer below.
            filter_line = vcfpy.OrderedDict(
                [
                    ("ID", "IS_SYNON"),
                    ("Description", "The variant is not amino-acid sequence changing."),
                ]
            )
            reader.header.add_filter_line(filter_line)

        with vcfpy.Writer.from_path(filtered_vcf, reader.header) as writer:
            for record in reader:
                try:
                    annotations = _get_annotations(record, reader)
                except KeyError:
                    # No annotation available: filter the record.
                    keep = False
                else:
                    keep = any(_is_non_synonymous(entry) for entry in annotations)

                if not keep:
                    if not soft:
                        # Hard filter: the record is not written at all.
                        continue
                    # Soft filter: flag the record but still write it.
                    record.add_filter("IS_SYNON")
                writer.write_record(record)
def non_synonymous(method)
-
Decorator that pre-filters synonymous variants from the VCF file.
Expand source code
def non_synonymous(method):
    """Decorator that pre-filters synonymous variants from the VCF file.

    The wrapped callable does not receive the original VCF. Instead it gets
    the path of a temporary copy from which the filtered records were removed
    (`filter_synonymous_variants` with `soft=False`). The temporary file only
    exists while the wrapped callable runs.

    Args:
        method: Callable whose first positional argument is the path of a VCF
            file; any further arguments are passed through unchanged.

    Returns:
        Wrapper around `method` that returns whatever `method` returns.
    """

    @wraps(method)
    def decorated_transform(input_vcf: Path, *args, **kwargs):
        # Accept plain strings as well as `Path` objects.
        input_vcf = Path(input_vcf)
        # Strip only a trailing ".gz". `str.replace` would also remove ".gz"
        # occurring elsewhere in the name (e.g. "a.gz.b.vcf.gz" -> "a.b.vcf").
        target_name = input_vcf.name.removesuffix(".gz")
        with TemporaryDirectory() as tmp_dir:
            target_vcf = Path(tmp_dir) / target_name
            filter_synonymous_variants(input_vcf, target_vcf, soft=False)
            # Call `method` inside the `with` block: the directory and the
            # filtered file are deleted as soon as the block is left.
            return method(target_vcf, *args, **kwargs)

    return decorated_transform