Source code for craw.coverage

###########################################################################
#                                                                         #
# This file is part of Counter RNAseq Window (craw) package.              #
#                                                                         #
#    Authors: Bertrand Néron                                              #
#    Copyright © 2017  Institut Pasteur (Paris).                          #
#    see COPYRIGHT file for details.                                      #
#                                                                         #
#    craw is free software: you can redistribute it and/or modify         #
#    it under the terms of the GNU General Public License as published by #
#    the Free Software Foundation, either version 3 of the License, or    #
#    (at your option) any later version.                                  #
#                                                                         #
#    craw is distributed in the hope that it will be useful,              #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of       #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                 #
#    See the GNU General Public License for more details.                 #
#                                                                         #
#    You should have received a copy of the GNU General Public License    #
#    along with craw (see COPYING file).                                  #
#    If not, see <http://www.gnu.org/licenses/>.                          #
#                                                                         #
###########################################################################

import logging

try:
    # for pysam>=0.9.1.4
    from pysam.calignmentfile import AlignmentFile
except ImportError:
    # for pysam>=0.10
    from pysam import AlignmentFile

from .wig import Genome

_log = logging.getLogger(__name__)


def get_coverage_function(input):
    """
    Pick the coverage routine that matches the kind of *input*.

    :param input: the data source, either an alignment file opened with pysam
                  or a genome built from a wig file (see wig module)
    :type input: :class:`wig.Genome` or :class:`pysam.calignmentfile.AlignmentFile` object
    :return: :func:`get_bam_coverage` for an alignment file,
             :func:`get_wig_coverage` for a wig genome
    :rtype: function
    :raise RuntimeError: when input is neither a :class:`pysam.calignmentfile.AlignmentFile`
                         nor a :class:`wig.Genome`
    """
    # Checked in order: the alignment file type is tested before the wig genome type.
    dispatch = (
        (AlignmentFile, get_bam_coverage),
        (Genome, get_wig_coverage),
    )
    for supported_type, coverage_func in dispatch:
        if isinstance(input, supported_type):
            return coverage_func
    raise RuntimeError("get_coverage support only 'wig.Genome' or "
                       "'pysam.calignmentfile.AlignmentFile' as Input, not {}".format(input.__class__.__name__))
def get_wig_coverage(genome, annot_entry, start=None, stop=None, max_left=0, max_right=0, qual_thr=None):
    """
    Extract the coverage of a region, position by position on each strand,
    from a genome built from a wig file.

    The two returned lists are padded with ``None`` so that the reference
    position sits at index ``max_left`` and each list holds
    ``max_left + 1 + max_right`` items, as :func:`get_bam_coverage` does.
    For an entry on the '-' strand the coverages are reversed.

    :param genome: the genome holding the coverage; it is indexed by chromosome name
                   and each chromosome is sliced with ``[start:stop]`` to get
                   the forward and reverse coverages.
    :type genome: :class:`wig.Genome` object
    :param annot_entry: an entry of the annotation file
    :type annot_entry: :class:`annotation.Entry` object
    :param start: The position to start to compute the coverage
                  (coordinates are 0-based, start position is included).
                  It can be negative when the window extends before the beginning
                  of the reference; these positions are reported as ``None``.
                  Despite the ``None`` default, an int is required.
    :type start: int
    :param stop: The position to stop to compute the coverage
                 (coordinates are 0-based, stop position is excluded).
                 Despite the ``None`` default, an int is required.
    :type stop: int
    :param max_left: The highest number of base before the reference position to take in account.
    :type max_left: int
    :param max_right: The highest number of base after the reference position to take in account.
    :type max_right: int
    :param qual_thr: this parameter is not used, It's here to have the same api as get_bam_coverage.
    :type qual_thr: None
    :return: the coverage on the forward strand then on the reverse strand
    :rtype: tuple of 2 list containing int (or None for padded positions)
    """
    # Positions requested before the beginning of the reference do not exist:
    # the chromosome is queried from 0 and these positions are filled with None.
    pad_neg_start = [None] * abs(start) if start < 0 else []

    chromosome = genome[annot_entry.chromosome]
    forward_cov, reverse_cov = chromosome[max(start, 0):stop]

    # The alignment padding must be computed from the *requested* start, not the
    # clamped one, otherwise the negative positions are counted twice
    # (once here and once in pad_neg_start).
    # -1 because ref is 1-based whereas start and stop are 0-based.
    before_ref = annot_entry.ref - 1 - start
    after_ref = stop - annot_entry.ref

    if annot_entry.strand == '+':
        pad_left = [None] * (max_left - before_ref) + pad_neg_start
        pad_right = [None] * (max_right - after_ref)
    else:
        # On the reverse strand the window is flipped: what follows the reference
        # on the chromosome ends up on the left, and the negative positions
        # end up at the far right.
        pad_left = [None] * (max_left - after_ref)
        pad_right = [None] * (max_right - before_ref) + pad_neg_start
        # reversed copies: do not mutate the lists handed out by the chromosome
        forward_cov = forward_cov[::-1]
        reverse_cov = reverse_cov[::-1]

    forward_cov = pad_left + forward_cov + pad_right
    reverse_cov = pad_left + reverse_cov + pad_right
    return forward_cov, reverse_cov
def get_bam_coverage(sam_file, annot_entry, start=None, stop=None, qual_thr=15, max_left=0, max_right=0):
    """
    Compute the coverage for a region position by position on each strand.

    Both returned lists are padded with ``None`` on each side according to
    *max_left* and *max_right*. For an entry on the '-' strand the coverages
    are reversed.

    :param sam_file: the samfile opened with pysam
    :type sam_file: :class:`pysam.AlignmentFile` object.
    :param annot_entry: an entry of the annotation file
    :type annot_entry: :class:`annotation.Entry` object
    :param start: The first position of the region
                  (coordinates are 0-based, start position is included).
    :type start: int
    :param stop: The end position of the region
                 (coordinates are 0-based, stop position is excluded).
    :type stop: int
    :param qual_thr: The quality threshold
    :type qual_thr: int
    :param max_left: The highest number of base before the reference position to take in account.
    :type max_left: int
    :param max_right: The highest number of base after the reference position to take in account.
    :type max_right: int
    :return: the coverage (all bases) on the forward strand then on the reverse strand
    :rtype: tuple of 2 list containing int
    """

    def on_forward(al_seg):
        """
        :param al_seg: a pysam aligned segment (the object pysam uses to represent an aligned read)
        :type al_seg: :class:`pysam.AlignedSegment`
        :return: True if the read is mapped to the forward strand
        :rtype: boolean
        """
        return not al_seg.is_reverse

    def on_reverse(al_seg):
        """
        :param al_seg: a pysam aligned segment (the object pysam uses to represent an aligned read)
        :type al_seg: :class:`pysam.AlignedSegment`
        :return: True if the read is mapped to the reverse strand
        :rtype: boolean
        """
        return al_seg.is_reverse

    def coverage_one_strand(sam_file, chromosome, start, stop, qual, strand):
        """
        Compute the coverage of each position between start and stop
        on the chromosome, for the reads mapped on one strand.

        :param sam_file: the sam alignment to use
        :type sam_file: a :class:`pysam.AlignmentFile` object
        :param chromosome: the name of the chromosome
        :type chromosome: basestring
        :param start: The first position (0-based, included). It may be negative.
        :type start: int
        :param stop: The end position (0-based, excluded).
        :type stop: int
        :param qual: The quality threshold.
        :type qual: int
        :param strand: the strand on which the reads must match ('+' for forward,
                       anything else for reverse)
        :type strand: string
        :return: the coverage on the requested strand: for each position,
                 the sum of the four per-base counts; positions before
                 the beginning of the reference are None.
        :rtype: list of int
        """
        if strand == '+':
            call_back = on_forward
        else:
            call_back = on_reverse

        # A negative start happens when the window is large and the reference
        # position is close to the beginning of the reference; pysam crashes
        # in that case (see issue #10). So the coverage is requested from 0
        # and the missing positions are filled with None afterwards.
        nb_missing = 0
        if start < 0:
            nb_missing = abs(start)
            start = 0

        try:
            per_base = sam_file.count_coverage(reference=chromosome,
                                               start=start,
                                               end=stop,
                                               quality_threshold=qual,
                                               read_callback=call_back)
        except SystemError as err:
            import sys
            print("ERROR when call count_coverage with following arguments\n",
                  "reference=", chromosome, "\n",
                  "start=", start, "\n",
                  "end=", stop, "\n",
                  "quality_threshold=", qual, "\n",
                  "read_callback=", call_back,
                  file=sys.stderr)
            raise err

        # count_coverage yields four per-base arrays; merge them in one count per position
        base_1, base_2, base_3, base_4 = (counts.tolist() for counts in per_base)
        per_position = [n_1 + n_2 + n_3 + n_4
                        for n_1, n_2, n_3, n_4 in zip(base_1, base_2, base_3, base_4)]
        return [None] * nb_missing + per_position

    chrom_name = annot_entry.chromosome
    forward_cov = coverage_one_strand(sam_file, chrom_name, start, stop, qual_thr, '+')
    reverse_cov = coverage_one_strand(sam_file, chrom_name, start, stop, qual_thr, '-')

    # Number of positions covered on each side of the reference position.
    # -1 because the reference itself must not be counted in the padding and
    # start is 0-based. stop is excluded here, so (stop - 1) - (ref - 1) => stop - ref.
    if annot_entry.strand == '+':
        nb_left = max_left - (annot_entry.ref - 1 - start)
        nb_right = max_right - (stop - annot_entry.ref)
    else:
        # reverse strand: the window is flipped around the reference
        nb_left = max_left - (stop - annot_entry.ref)
        nb_right = max_right - (annot_entry.ref - 1 - start)
        forward_cov = forward_cov[::-1]
        reverse_cov = reverse_cov[::-1]

    # a negative count gives an empty padding
    pad_left = [None] * nb_left
    pad_right = [None] * nb_right
    return pad_left + forward_cov + pad_right, pad_left + reverse_cov + pad_right