#######################################################################
#
# Copyright (C) 2021, 2022 David Palao
#
# This file is part of PacBio data processing.
#
# PacBioDataProcessing is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PacBio data processing is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
import subprocess
from functools import cached_property, cache
from collections import namedtuple
import logging
from hashlib import md5
from pathlib import Path
import warnings
from typing import Protocol
import pysam
from pacbio_data_processing.constants import (
SAMTOOLS_GET_HEADER, SAMTOOLS_GET_BODY, SAMTOOLS_WRITE_BAM,
)
MOLECULE_MARKER = b"zm:i:"
BAM_POS_COLUMN = 3
BAM_FLAG_COLUMN = 1
MAX_LINES_TO_CHECK_IN_BAM = 1000
_BAMFILEPYSAM_STRATEGY_CANONICAL = "_BamFilePysam"
_BAMFILEPYSAM_STRATEGY_ALIASES = (
_BAMFILEPYSAM_STRATEGY_CANONICAL, "pysam"
)
_BAMFILESAMTOOLS_STRATEGY_CANONICAL = "_BamFileSamtools"
_BAMFILESAMTOOLS_STRATEGY_ALIASES = (
_BAMFILESAMTOOLS_STRATEGY_CANONICAL, "samtools"
)
_DEFAULT_BAMFILE_STRATEGY = _BAMFILEPYSAM_STRATEGY_CANONICAL
[docs]class BamFileStrategy(Protocol):
def __init__(self, bamfilename):
...
def _read_header(self):
...
def _read_body(self):
...
def _write(self, *, header, body):
...
[docs]def pack_lines(lines):
for line in lines:
yield b"\t".join(line)+b"\n"
[docs]def set_pysam_verbosity():
"""Ad-hoc function to remove unpleasant errors messages by pysam."""
api_error_common_msg = (
"You might see some non-critical error messages from pysam."
)
try:
pysam.set_verbosity(0)
except AttributeError as e:
logging.error(f"{e}. {api_error_common_msg}")
except TypeError as e:
logging.error(
"'pysam.set_verbosity' failed. It looks like pysam changed its"
" API. Continuing without setting the verbosity. "
f"{api_error_common_msg}"
)
except Exception as e:
logging.error(
f"Unexpected error calling 'pysam.set_verbosity':\n{e}\n"
f"Continuing without setting the verbosity. {api_error_common_msg}"
)
[docs]def _strategy_factory(
name: str = _DEFAULT_BAMFILE_STRATEGY) -> BamFileStrategy:
"""Internal function that returns the *strategy* class in a concrete
``BamFile`` instance.
:meta public:
"""
strategy = _BamFilePysam
if name in _BAMFILEPYSAM_STRATEGY_ALIASES:
pass
elif name in _BAMFILESAMTOOLS_STRATEGY_ALIASES:
strategy = _BamFileSamtools
else:
msg = (
f"Unknown strategy name ({name}); using default strategy "
f"({_DEFAULT_BAMFILE_STRATEGY})"
)
warnings.warn(msg)
return strategy
@cache
def _BamLine_factory(*, num_columns: int, molecule_column: int) -> namedtuple:
"""Internal factory function that creates a custom BamLine class.
It is used by ``_ReadBamFile`` to return a meaningful object for
each line read.
It is cached for obvious reasons: we want to avoid creation of
the same class multiple times (every line!).
"""
line_attrs = [f"attr{_}" for _ in range(num_columns)]
line_attrs[molecule_column] = "zmw"
class BamLine(namedtuple("BamLine", line_attrs)):
@property
def molecule_id(instance):
return instance.zmw[len(MOLECULE_MARKER):]
@property
def flag(instance) -> int:
return int(instance[BAM_FLAG_COLUMN])
return BamLine
[docs]class BamFile:
"""Proxy class for ``_BamFileSamtools`` and ``_BamFilePysam``. This is
a high level class whose only role is to choose among different
possible *states*: ``_ReadableBamFile`` and ``_WritableBamFile``
and to select the underlying implementation (*strategy*) to
interact with the BAM file:
- ``_BamFileSamtools``: implementation that simply wraps the
'samtools' command line, and
- ``_BamFilePysam``: implementation that uses 'pysam'
The code is ready to permit the choice of strategy. With the current
implementation it is, intentionally, a bit convoluted. For instance,
instead of the default implementation (``pysam``), another one can
be chosen as follows::
from pacbio_data_processing.bam import BamFile
BamFile.bamfile_strategy_name = "samtools"
bam = BamFile("my.bam")
and ``samtools`` will be used under the hood to get access to the data
in a BAM file.
"""
bamfile_strategy_name = _DEFAULT_BAMFILE_STRATEGY
[docs] def __init__(self, bam_file_name, mode="r"):
if mode == "r":
self.__class__ = _ReadableBamFile
elif mode == "w":
self.__class__ = _WritableBamFile
else:
raise ValueError(f"invalid mode: '{mode}'")
self._bamfile_strategy = _strategy_factory(self.bamfile_strategy_name)
self._real_subject = self._bamfile_strategy(bam_file_name)
def __getattr__(self, attr):
return getattr(self._real_subject, attr)
class _ReadableBamFile(BamFile):
"""This class provides the attributes necessary for BamFile to be
readable. Most attributes, e.g.
- header
- _BamLine
- molecule_column
- num_columns
are cached, to avoid unnecessary IO ops (in the case of being a
_BamFileSamtools object, an IO op would imply calling "samtools" each
time the given attribute is read, which makes no sense since that would
mean that the file has been modified after being opened, i.e. it has been
most probably corrupted).
The 'body' property is not cached as it is a generator of lines and it
might make sense to read several times the same file.
:meta public:
"""
@cached_property
def header(self):
return self._read_header()
@property
def _BamLine(self):
return _BamLine_factory(
num_columns=self.num_columns,
molecule_column=self.molecule_column
)
@property
def body(self):
for line in self._read_body():
try:
bamline = self._BamLine(*line)
except TypeError:
self.num_columns = len(line)
bamline = self._BamLine(*line)
yield bamline
def __iter__(self):
return self.body
@cached_property
def num_items(self) -> int:
mols = set()
for i, line in enumerate(self):
mols.add(line.molecule_id)
return {"molecules": len(mols), "subreads": i+1}
@cached_property
def molecule_column(self) -> int:
line = next(self._read_body())
for i, item in enumerate(line):
if item.startswith(MOLECULE_MARKER):
return i
@cached_property
def num_columns(self) -> int:
"""This property is tricky: the lines in a BAM file can have
different number of columns within the same BAM file. The
``body`` generator will adjust the ``num_columns`` attribute
as necessary.
"""
line = next(self._read_body())
return len(line)
@property
def num_molecules(self) -> int:
return self.num_items["molecules"]
@property
def num_subreads(self) -> int:
return self.num_items["subreads"]
def __len__(self) -> int:
return self.num_items["subreads"]
@property
def all_molecules(self):
last = None
for i, line in enumerate(self):
mol_id = line.molecule_id
if mol_id != last:
yield mol_id
last = mol_id
@cached_property
def is_aligned(self) -> bool:
at_most = MAX_LINES_TO_CHECK_IN_BAM
for iline, line in enumerate(self):
if line[BAM_POS_COLUMN] == b"0":
return False
if iline > at_most:
# exit early here assuming all the rest will be ok:
break
return True
def is_plausible_aligned_version_of(self, other: BamFile) -> bool:
"""The main purpose of this *ad-hoc method* is to help
SingleMoleculeAnalysis by providing a plausible answer to
the question: *Does this BamFile look like an aligned version*
*of another BamFile?*
The implementation checks that the subject is aligned, the other
is not and that the subject's set of molecules is a (proper)
subset of the molecules in the other BamFile.
Exceptions are propagated.
"""
if self.is_aligned and (not other.is_aligned):
mols = set(self.all_molecules)
other_mols = set(other.all_molecules)
if mols <= other_mols:
return True
return False
@property
def last_subreads_map(self) -> dict[bytes, int]:
"""It returns a mapping that answers the question: what is the
index of the last subread corresponding to a certain molecule id?
.. admonition:: Implementation detail
This is *manually cached* because I was experiencing weird
problems with ``cached_property`` that might have to do with
https://github.com/python/cpython/issues/86293
Anyway, I chose a poor man's solution that simply works.
"""
try:
subreads_map = self._subreads_map
except AttributeError:
subreads_map = {}
for idx, subread in enumerate(self):
subreads_map[subread.molecule_id] = idx
self._subreads_map = subreads_map
return subreads_map
@cached_property
def md5sum_body(self) -> str:
"""MD5 checksum of only the body of the BAM file, excluding the
header"""
checksum = md5()
for line in pack_lines(self._read_body()):
checksum.update(line)
return checksum.hexdigest()
@cached_property
def full_md5sum(self) -> str:
"""MD5 checksum of the full file."""
return md5(open(self.bam_file_name, "rb").read()).hexdigest()
# It would be good to have something like:
# def __getitem__(self, mol_id):
# for line in self:
# if line.molecule_id == mol_id:
# yield line
@property
def size_in_bytes(self) -> int:
return Path(self.bam_file_name).stat().st_size
class _WritableBamFile(BamFile):
def write(self, *, header, body):
self._write(header=header, body=body)
class _BamFileSamtools:
"""Strategy implementation that wraps the 'samtools' executable.
:meta public:
"""
def __init__(self, bam_file_name):
# A refactor of __init__ would probably need an ABC...
self.bam_file_name = bam_file_name
def _read_header(self):
# check for errors and report them if any. Need FT!
return subprocess.run(
SAMTOOLS_GET_HEADER+(self.bam_file_name,), capture_output=True
).stdout
def _read_body(self):
with subprocess.Popen(
SAMTOOLS_GET_BODY+(self.bam_file_name,), stdout=subprocess.PIPE
) as body:
for line in body.stdout:
yield line.split()
def _write(self, *, header, body):
with open(self.bam_file_name, "wb") as bam_file:
with subprocess.Popen(
SAMTOOLS_WRITE_BAM, stdin=subprocess.PIPE, stdout=bam_file
) as proc:
proc.stdin.write(header)
for line in pack_lines(body):
proc.stdin.write(line)
class _BamFilePysam:
"""Strategy implementation that uses 'pysam'.
:meta public:
"""
_PYSAM_VERBOSITY_SET = False
def __init__(self, bam_file_name):
self.bam_file_name = bam_file_name
self.set_pysam_verbosity_once()
@classmethod
def set_pysam_verbosity_once(cls):
"""This class method helps to ensure that set_pysam_verbosity
is called only once, the first time an instance is created."""
if cls._PYSAM_VERBOSITY_SET is False:
set_pysam_verbosity()
cls._PYSAM_VERBOSITY_SET = True
@property
def _ralignment_file(self):
"""An AlignmentFile instance is created for READING."""
return pysam.AlignmentFile(self.bam_file_name, "rb", check_sq=False)
def _read_header(self):
return str(self._ralignment_file.header).encode()
def _read_body(self):
for line in self._ralignment_file:
yield line.to_string().encode().split()
def _write(self, *, header, body):
# Need to be sure that there is a single "\n" at the end:
h = pysam.AlignmentHeader.from_text(header.decode().rstrip()+"\n")
with pysam.AlignmentFile(self.bam_file_name, "wb", header=h) as g:
header = g.header
for line in pack_lines(body):
g.write(pysam.AlignedSegment.fromstring(
# Need to be sure that there is no "\n" between lines:
line.decode().strip(), header)
)