Source code for mgkit.align

"""
Module dealing with BAM/SAM files
"""
from future.utils import viewitems
import logging
try:
    # In Python2
    from itertools import izip as zip
except ImportError:
    pass
from builtins import object
import pandas
import numpy
import pysam
from tqdm import tqdm
from mgkit.io.utils import open_file

LOG = logging.getLogger(__name__)


[docs]def get_region_coverage(bam_file, seq_id, feat_from, feat_to):
    """
    Return coverage for an annotation.

    .. note::

        feat_from and feat_to are 1-based indexes

    :param Samfile bam_file: instance of :class:`pysam.Samfile`
    :param str seq_id: sequence id
    :param int feat_from: start position of feature
    :param int feat_to: end position of feature

    :return int: coverage array for the annotation
    """
    norm_start, norm_end = feat_from - 1, feat_to - 1
    iterator = bam_file.pileup(
        reference=seq_id,
        start=norm_start,
        end=norm_end,
        truncate=True
    )

    coverage = pandas.Series(
        dict(
            (pileup_proxy.pos, pileup_proxy.n)
            for pileup_proxy in iterator
        ),
        dtype=numpy.int,
    )

    return coverage


[docs]def covered_annotation_bp(files, annotations, min_cov=1, progress=False):
    """
    .. versionadded:: 0.1.14

    Returns the number of base pairs covered of annotations over multiple
    samples.

    Arguments:
        files (iterable): an iterable that returns the alignment file names
        annotations (iterable): an iterable that returns annotations
        min_cov (int): minumum coverage for a base to counted
        progress (bool): if *True*, a progress bar is used

    Returns:
        dict: a dictionary whose keys are the uid and the values the number of
        bases that are covered by reads among all samples

    """
    annotations = [
        (annotation.uid, annotation.seq_id, annotation.start, annotation.end)
        for annotation in annotations
    ]

    covered = {}

    for file_name in files:
        # pysam 0.8.1 changed Samfile to AlignmentFile
        alg_file = pysam.AlignmentFile(file_name, 'rb')

        if progress:
            annotations = tqdm(annotations)

        for uid, seq_id, start, end in annotations:
            cov = get_region_coverage(alg_file, seq_id, start, end)

            try:
                covered[uid] = cov.add(covered[uid], fill_value=0)
            except KeyError:
                covered[uid] = cov

    return dict(
        (uid, len(cov[cov >= min_cov])) for uid, cov in viewitems(covered)
    )


[docs]def add_coverage_info(annotations, bam_files, samples, attr_suff='_cov'):
    """
    .. versionchanged:: 0.3.4
        the coverage now is returned as floats instead of int

    Adds coverage information to annotations, using BAM files.

    The coverage information is added for each sample as a 'sample_cov' and the
    total coverage as as 'cov' attribute in the annotations.

    .. note::

        The bam_files and sample variables must have the same order

    :param iterable annotations: iterable of annotations
    :param iterable bam_files: iterable of :class:`pysam.Samfile` instances
    :param iterable sample: names of the samples for the BAM files
    """
    LOG.info("Adding coverage info for %d annotations", len(annotations))
    for bam_file, sample in zip(bam_files, samples):
        LOG.info("Sample %s, file %s", sample, bam_file.filename)

    tot_coverage = {}

    for bam_file, sample in zip(bam_files, samples):
        LOG.info("Adding coverage for sample %s", sample)
        for index, annotation in enumerate(annotations):
            sample_coverage = get_region_coverage(
                bam_file,
                annotation.seq_id,
                annotation.start,
                annotation.end
            )
            # adds the results of the coverage to the total coverage
            # uses the add method of a Series to make sure that possible
            # nan values are filled with 0 before the sum
            try:
                tot_coverage[annotation.uid] = sample_coverage.add(
                    tot_coverage[annotation.uid],
                    fill_value=0
                )
            except KeyError:
                tot_coverage[annotation.uid] = sample_coverage

            cov_mean = sample_coverage.mean()

            annotation.set_attr(
                sample + attr_suff,
                0 if numpy.isnan(cov_mean) else int(cov_mean)
            )
            if (index + 1) % 1000 == 0:
                LOG.debug(
                    'Added coverage for (%d/%d) annotations',
                    index + 1,
                    len(annotations)
                )
    for annotation in annotations:
        cov_mean = tot_coverage[annotation.uid].mean()
        annotation.set_attr(
            'cov',
            0 if numpy.isnan(cov_mean) else cov_mean
        )


[docs]def read_samtools_depth(file_handle, num_seqs=10000, seq_ids=None):
    """
    .. versionchanged:: 0.4.0
        now returns 3 array, instead of 2. Also added *seq_ids* to skip lines

    .. versionchanged:: 0.3.4
        *num_seqs* can be None to avoid a log message

    .. versionadded:: 0.3.0

    Reads a samtools *depth* file, returning a generator that yields the
    array of each base coverage on a per-sequence base.

    .. note::

        The information on position is not used, to use numpy and save memory.
        samtools *depth* should be called with the `-aa` option::

             `samtools depth -aa bamfile`

        This options will output both base position with 0 coverage and
        sequneces with no aligned reads

    Arguments:
        file_handle (file): file handle of the coverage file
        num_seqs (int or None): number of sequence that fires a log message. If
            None, no message is triggered
        seq_ids (dict, set): a hashed container like a dictionary or set with
            the sequences to return

    Yields:
        tuple: the first element is the sequence identifier, the second one
        is the *numpy* array with the positions, the third element is the
        *numpy* array with the coverages
    """
    curr_key = ''
    curr_pos = []
    curr_cov = []

    file_handle = open_file(file_handle, 'rb')

    LOG.info(
        'Reading coverage from file (%s)',
        getattr(file_handle, 'name', repr(file_handle))
    )
    line_no = 0
    for line in file_handle:
        line = line.decode('ascii')
        name, pos, cov = line.strip().split('\t')
        pos = int(pos)
        cov = int(cov)
        if (seq_ids is not None) and (name not in seq_ids):
            continue
        if curr_key == name:
                curr_pos.append(pos)
                curr_cov.append(cov)
        else:
            if curr_key == '':
                curr_cov.append(cov)
                curr_pos.append(pos)
                curr_key = name
            else:
                line_no += 1
                if (num_seqs is not None) and (line_no % num_seqs == 0):
                    LOG.info('Read %d sequence coverage', line_no)
                yield curr_key, numpy.array(curr_pos), numpy.array(curr_cov)
                curr_key = name
                curr_cov = [cov]
                curr_cov = [pos]
    else:
        yield curr_key, numpy.array(curr_pos), numpy.array(curr_cov)

    LOG.info('Read a total of %d sequence coverage', line_no + 1)


[docs]class SamtoolsDepth(object):
    """
    .. versionchanged:: 0.4.0
        uses pandas.SparseArray now. It should use less memory, but needs
        pandas version > 0.24

    .. versionadded:: 0.3.0

    A class used to cache the results of :func:`read_samtools_depth`, while
    reading only the necessary data from a`samtools depth -aa` file.
    """
    file_handle = None
    data = None
    max_size = None
    max_size_dict = None

    def __init__(self, file_handle, num_seqs=10**4, max_size=10**6,
                 max_size_dict=None):
        """
        .. versionchanged:: 0.4.0
            added *max_size* and max_size_dict

        Arguments:
            file_handle (file): the file handle to pass to
                :func:`read_samtools_depth`
            num_seqs (int): number of sequence that fires a log message
            max_size (int): max size to use in the SparseArray
            max_size_dict (dict): dictionary with max size for each seq_id
                requested. If None, *max_size* is used
        """
        self.max_size = max_size
        if max_size_dict is None:
            self.max_size_dict = {}
        else:
            self.max_size_dict = max_size_dict

        self.file_handle = read_samtools_depth(
            file_handle,
            num_seqs=num_seqs,
            seq_ids=max_size_dict
        )
        self.data = {}


[docs]    def region_coverage(self, seq_id, start, end):
        """

        Returns the mean coverage of a region. The *start* and *end* parameters
        are expected to be 1-based coordinates, like the correspondent
        attributes in :class:`mgkit.io.gff.Annotation` or
        :class:`mgkit.io.gff.GenomicRange`.

        If the sequence for which the coverage is requested is not found, the
        *depth* file is read (and cached) until it is found.

        Arguments:
            seq_id (str): sequence for which to return mean coverage
            start (int): start of the region
            end (int): end of the region

        Returns:
            float: mean coverage of the requested region
        """
        try:
            cov = self.data[seq_id][start-1:end]
        except KeyError:
            for key, pos_array, cov_array in self.file_handle:
                # If the max_size_dict was passed, skips arrays that are not in
                # of interests
                if self.max_size_dict and (key not in self.max_size_dict):
                    continue
                # Init a SparseArray (by passing the correct dtype)
                value = pandas.Series(
                    # brings start position to 0
                    dict(zip(pos_array - 1, cov_array)),
                    dtype="Sparse[int]"
                # reindex to the correct size to allow
                ).reindex(
                    range(self.max_size_dict.get(seq_id, self.max_size)),
                    fill_value=0
                )
                self.data[key] = value
                # if the key is the one requested, the loop is stopped and and
                # the value is kept
                if key == seq_id:
                    cov = value[start-1:end]
                    break
            else:
                raise ValueError(
                    "No coverage information found for {}".format(seq_id)
                )

        return cov.mean()