Source code for mgkit.align

"""
Module dealing with BAM/SAM files
"""
from future.utils import viewitems
import logging
try:
    # In Python2
    from itertools import izip as zip
except ImportError:
    pass
from builtins import object
import pandas
import numpy
import pysam
from tqdm import tqdm
from mgkit.io.utils import open_file

LOG = logging.getLogger(__name__)


[docs]def get_region_coverage(bam_file, seq_id, feat_from, feat_to): """ Return coverage for an annotation. .. note:: feat_from and feat_to are 1-based indexes :param Samfile bam_file: instance of :class:`pysam.Samfile` :param str seq_id: sequence id :param int feat_from: start position of feature :param int feat_to: end position of feature :return int: coverage array for the annotation """ norm_start, norm_end = feat_from - 1, feat_to - 1 iterator = bam_file.pileup( reference=seq_id, start=norm_start, end=norm_end, truncate=True ) coverage = pandas.Series( dict( (pileup_proxy.pos, pileup_proxy.n) for pileup_proxy in iterator ), dtype=numpy.int, ) return coverage
[docs]def covered_annotation_bp(files, annotations, min_cov=1, progress=False): """ .. versionadded:: 0.1.14 Returns the number of base pairs covered of annotations over multiple samples. Arguments: files (iterable): an iterable that returns the alignment file names annotations (iterable): an iterable that returns annotations min_cov (int): minumum coverage for a base to counted progress (bool): if *True*, a progress bar is used Returns: dict: a dictionary whose keys are the uid and the values the number of bases that are covered by reads among all samples """ annotations = [ (annotation.uid, annotation.seq_id, annotation.start, annotation.end) for annotation in annotations ] covered = {} for file_name in files: # pysam 0.8.1 changed Samfile to AlignmentFile alg_file = pysam.AlignmentFile(file_name, 'rb') if progress: annotations = tqdm(annotations) for uid, seq_id, start, end in annotations: cov = get_region_coverage(alg_file, seq_id, start, end) try: covered[uid] = cov.add(covered[uid], fill_value=0) except KeyError: covered[uid] = cov return dict( (uid, len(cov[cov >= min_cov])) for uid, cov in viewitems(covered) )
[docs]def add_coverage_info(annotations, bam_files, samples, attr_suff='_cov'): """ .. versionchanged:: 0.3.4 the coverage now is returned as floats instead of int Adds coverage information to annotations, using BAM files. The coverage information is added for each sample as a 'sample_cov' and the total coverage as as 'cov' attribute in the annotations. .. note:: The bam_files and sample variables must have the same order :param iterable annotations: iterable of annotations :param iterable bam_files: iterable of :class:`pysam.Samfile` instances :param iterable sample: names of the samples for the BAM files """ LOG.info("Adding coverage info for %d annotations", len(annotations)) for bam_file, sample in zip(bam_files, samples): LOG.info("Sample %s, file %s", sample, bam_file.filename) tot_coverage = {} for bam_file, sample in zip(bam_files, samples): LOG.info("Adding coverage for sample %s", sample) for index, annotation in enumerate(annotations): sample_coverage = get_region_coverage( bam_file, annotation.seq_id, annotation.start, annotation.end ) # adds the results of the coverage to the total coverage # uses the add method of a Series to make sure that possible # nan values are filled with 0 before the sum try: tot_coverage[annotation.uid] = sample_coverage.add( tot_coverage[annotation.uid], fill_value=0 ) except KeyError: tot_coverage[annotation.uid] = sample_coverage cov_mean = sample_coverage.mean() annotation.set_attr( sample + attr_suff, 0 if numpy.isnan(cov_mean) else int(cov_mean) ) if (index + 1) % 1000 == 0: LOG.debug( 'Added coverage for (%d/%d) annotations', index + 1, len(annotations) ) for annotation in annotations: cov_mean = tot_coverage[annotation.uid].mean() annotation.set_attr( 'cov', 0 if numpy.isnan(cov_mean) else cov_mean )
[docs]def read_samtools_depth(file_handle, num_seqs=10000, seq_ids=None): """ .. versionchanged:: 0.4.0 now returns 3 array, instead of 2. Also added *seq_ids* to skip lines .. versionchanged:: 0.3.4 *num_seqs* can be None to avoid a log message .. versionadded:: 0.3.0 Reads a samtools *depth* file, returning a generator that yields the array of each base coverage on a per-sequence base. .. note:: The information on position is not used, to use numpy and save memory. samtools *depth* should be called with the `-aa` option:: `samtools depth -aa bamfile` This options will output both base position with 0 coverage and sequneces with no aligned reads Arguments: file_handle (file): file handle of the coverage file num_seqs (int or None): number of sequence that fires a log message. If None, no message is triggered seq_ids (dict, set): a hashed container like a dictionary or set with the sequences to return Yields: tuple: the first element is the sequence identifier, the second one is the *numpy* array with the positions, the third element is the *numpy* array with the coverages """ curr_key = '' curr_pos = [] curr_cov = [] file_handle = open_file(file_handle, 'rb') LOG.info( 'Reading coverage from file (%s)', getattr(file_handle, 'name', repr(file_handle)) ) line_no = 0 for line in file_handle: line = line.decode('ascii') name, pos, cov = line.strip().split('\t') pos = int(pos) cov = int(cov) if (seq_ids is not None) and (name not in seq_ids): continue if curr_key == name: curr_pos.append(pos) curr_cov.append(cov) else: if curr_key == '': curr_cov.append(cov) curr_pos.append(pos) curr_key = name else: line_no += 1 if (num_seqs is not None) and (line_no % num_seqs == 0): LOG.info('Read %d sequence coverage', line_no) yield curr_key, numpy.array(curr_pos), numpy.array(curr_cov) curr_key = name curr_cov = [cov] curr_cov = [pos] else: yield curr_key, numpy.array(curr_pos), numpy.array(curr_cov) LOG.info('Read a total of %d sequence coverage', line_no + 1)
[docs]class SamtoolsDepth(object): """ .. versionchanged:: 0.4.0 uses pandas.SparseArray now. It should use less memory, but needs pandas version > 0.24 .. versionadded:: 0.3.0 A class used to cache the results of :func:`read_samtools_depth`, while reading only the necessary data from a`samtools depth -aa` file. """ file_handle = None data = None max_size = None max_size_dict = None def __init__(self, file_handle, num_seqs=10**4, max_size=10**6, max_size_dict=None): """ .. versionchanged:: 0.4.0 added *max_size* and max_size_dict Arguments: file_handle (file): the file handle to pass to :func:`read_samtools_depth` num_seqs (int): number of sequence that fires a log message max_size (int): max size to use in the SparseArray max_size_dict (dict): dictionary with max size for each seq_id requested. If None, *max_size* is used """ self.max_size = max_size if max_size_dict is None: self.max_size_dict = {} else: self.max_size_dict = max_size_dict self.file_handle = read_samtools_depth( file_handle, num_seqs=num_seqs, seq_ids=max_size_dict ) self.data = {}
[docs] def region_coverage(self, seq_id, start, end): """ Returns the mean coverage of a region. The *start* and *end* parameters are expected to be 1-based coordinates, like the correspondent attributes in :class:`mgkit.io.gff.Annotation` or :class:`mgkit.io.gff.GenomicRange`. If the sequence for which the coverage is requested is not found, the *depth* file is read (and cached) until it is found. Arguments: seq_id (str): sequence for which to return mean coverage start (int): start of the region end (int): end of the region Returns: float: mean coverage of the requested region """ try: cov = self.data[seq_id][start-1:end] except KeyError: for key, pos_array, cov_array in self.file_handle: # If the max_size_dict was passed, skips arrays that are not in # of interests if self.max_size_dict and (key not in self.max_size_dict): continue # Init a SparseArray (by passing the correct dtype) value = pandas.Series( # brings start position to 0 dict(zip(pos_array - 1, cov_array)), dtype="Sparse[int]" # reindex to the correct size to allow ).reindex( range(self.max_size_dict.get(seq_id, self.max_size)), fill_value=0 ) self.data[key] = value # if the key is the one requested, the loop is stopped and and # the value is kept if key == seq_id: cov = value[start-1:end] break else: raise ValueError( "No coverage information found for {}".format(seq_id) ) return cov.mean()