Source code for mgkit.align

"""
Module dealing with BAM/SAM files
"""
from future.utils import viewitems
import logging
try:
    # In Python2
    from itertools import izip as zip
except ImportError:
    pass
from builtins import object
import pandas
import numpy
import pysam
from tqdm import tqdm
from mgkit.io.utils import open_file

LOG = logging.getLogger(__name__)


[docs]def get_region_coverage(bam_file, seq_id, feat_from, feat_to): """ Return coverage for an annotation. .. note:: feat_from and feat_to are 1-based indexes :param Samfile bam_file: instance of :class:`pysam.Samfile` :param str seq_id: sequence id :param int feat_from: start position of feature :param int feat_to: end position of feature :return int: coverage array for the annotation """ norm_start, norm_end = feat_from - 1, feat_to - 1 iterator = bam_file.pileup( reference=seq_id, start=norm_start, end=norm_end, truncate=True ) coverage = pandas.Series( dict( (pileup_proxy.pos, pileup_proxy.n) for pileup_proxy in iterator ), dtype=numpy.int, ) return coverage
[docs]def covered_annotation_bp(files, annotations, min_cov=1, progress=False): """ .. versionadded:: 0.1.14 Returns the number of base pairs covered of annotations over multiple samples. Arguments: files (iterable): an iterable that returns the alignment file names annotations (iterable): an iterable that returns annotations min_cov (int): minumum coverage for a base to counted progress (bool): if *True*, a progress bar is used Returns: dict: a dictionary whose keys are the uid and the values the number of bases that are covered by reads among all samples """ annotations = [ (annotation.uid, annotation.seq_id, annotation.start, annotation.end) for annotation in annotations ] covered = {} for file_name in files: # pysam 0.8.1 changed Samfile to AlignmentFile alg_file = pysam.AlignmentFile(file_name, 'rb') if progress: annotations = tqdm(annotations) for uid, seq_id, start, end in annotations: cov = get_region_coverage(alg_file, seq_id, start, end) try: covered[uid] = cov.add(covered[uid], fill_value=0) except KeyError: covered[uid] = cov return dict( (uid, len(cov[cov >= min_cov])) for uid, cov in viewitems(covered) )
[docs]def add_coverage_info(annotations, bam_files, samples, attr_suff='_cov'): """ .. versionchanged:: 0.3.4 the coverage now is returned as floats instead of int Adds coverage information to annotations, using BAM files. The coverage information is added for each sample as a 'sample_cov' and the total coverage as as 'cov' attribute in the annotations. .. note:: The bam_files and sample variables must have the same order :param iterable annotations: iterable of annotations :param iterable bam_files: iterable of :class:`pysam.Samfile` instances :param iterable sample: names of the samples for the BAM files """ LOG.info("Adding coverage info for %d annotations", len(annotations)) for bam_file, sample in zip(bam_files, samples): LOG.info("Sample %s, file %s", sample, bam_file.filename) tot_coverage = {} for bam_file, sample in zip(bam_files, samples): LOG.info("Adding coverage for sample %s", sample) for index, annotation in enumerate(annotations): sample_coverage = get_region_coverage( bam_file, annotation.seq_id, annotation.start, annotation.end ) # adds the results of the coverage to the total coverage # uses the add method of a Series to make sure that possible # nan values are filled with 0 before the sum try: tot_coverage[annotation.uid] = sample_coverage.add( tot_coverage[annotation.uid], fill_value=0 ) except KeyError: tot_coverage[annotation.uid] = sample_coverage cov_mean = sample_coverage.mean() annotation.set_attr( sample + attr_suff, 0 if numpy.isnan(cov_mean) else int(cov_mean) ) if (index + 1) % 1000 == 0: LOG.debug( 'Added coverage for (%d/%d) annotations', index + 1, len(annotations) ) for annotation in annotations: cov_mean = tot_coverage[annotation.uid].mean() annotation.set_attr( 'cov', 0 if numpy.isnan(cov_mean) else cov_mean )
[docs]def read_samtools_depth(file_handle, num_seqs=10000, seq_ids=None): """ .. versionchanged:: 0.4.2 the function returns **lists** instead of numpy arrays for speed (at least in my tests it seems ~4x increase) .. versionchanged:: 0.4.0 now returns 3 array, instead of 2. Also added *seq_ids* to skip lines .. versionchanged:: 0.3.4 *num_seqs* can be None to avoid a log message .. versionadded:: 0.3.0 Reads a samtools *depth* file, returning a generator that yields the array of each base coverage on a per-sequence base. .. note:: There's no need anymore to use `samtools depth -aa`, because the function returns the position array and this can be used to create a Pandas SparseArray which can be reindexed to include missing positions (with values of 0) **Valid for version < 0.4.0**: The information on position is not used, to use numpy and save memory. samtools *depth* should be called with the `-aa` option:: `samtools depth -aa bamfile` This options will output both base position with 0 coverage and sequneces with no aligned reads Arguments: file_handle (file): file handle of the coverage file num_seqs (int or None): number of sequence that fires a log message. If None, no message is triggered seq_ids (dict, set): a hashed container like a dictionary or set with the sequences to return Yields: tuple: the first element is the sequence identifier, the second one is the list with the positions, the third element is the list with the coverages """ curr_key = '' curr_pos = [] curr_cov = [] file_handle = open_file(file_handle, 'rb') LOG.info( 'Reading coverage from file (%s)', getattr(file_handle, 'name', repr(file_handle)) ) line_no = 0 for line in file_handle: line = line.decode('ascii') # From Python3 the default is Universal newlines, and it's not expected # to have more than '\n' at the end of the line - increases speed # slightly name, pos, cov = line[:-1].split('\t') if (seq_ids is not None) and (name not in seq_ids): continue # only converts if sequence is to be used pos = int(pos) cov = int(cov) if curr_key == name: curr_pos.append(pos) curr_cov.append(cov) else: if curr_key == '': curr_cov.append(cov) curr_pos.append(pos) curr_key = name else: line_no += 1 if (num_seqs is not None) and (line_no % num_seqs == 0): LOG.info('Read %d sequence coverage', line_no) yield curr_key, curr_pos, curr_cov curr_key = name curr_cov = [cov] curr_cov = [pos] else: yield curr_key, curr_pos, curr_cov LOG.info('Read a total of %d sequence coverage', line_no + 1)
[docs]class SamtoolsDepth(object): """ .. versionchanged:: 0.4.2 several optimisations and changes to support a scanning approach, instead of lookup table. No exception is raised when a sequence is not found in the file, instead assuming that the coverage is 0 .. versionchanged:: 0.4.0 uses pandas.SparseArray now. It should use less memory, but needs pandas version > 0.24 .. versionadded:: 0.3.0 This a class that helps use the results from :func:`read_samtools_depth`. There are 2 modes of operations: 1) Request a region coverage via :meth:`SamtoolsDepth.region_coverage` 2) Advance the `samtools depth` file until a sequence coverage is read The method 1) was the default in MGKit < 0.4.2, when the user would request a region coverage and this class would advance the reading until the requested region is found. While scanning, all the sequences encountered are kept as pandas Series with SparseArray declared, reindexed from 0 to the *max_size_dict* values if provided, or *max_size*. The advantage is that the it's easier to use, but with bigger datasets will keep high memory usage. This is the case for case 2) which involves calling directly :meth:`SamtoolsDepth.advance_file` returning the sequence ID read. Then the region coverage can be requested the same way as before, but won't involve scanning the file. The use of a SparseArray in a pandas Series allows the use of `samtools depth` files that weren't produced with `samtools depth -aa` and save memory. .. note:: The amount of memory used is fairly high, especially with the increasing number of sequences in a GFF/Depth file. It is recoommended to use max_size_dict to 1) only creates arrays for sequences needed and 2) use arrays of smaller size .. warning:: In MGKit 0.4.2, the internal dictionary to keep the SparseArray(s) is a :class:`weakref.WeakValueDictionary`, to improve the release of memory. That on Linux seems to release the array **too fast**. Upgrade to MGKit version 0.4.3 if this class is needed """ file_handle = None data = None max_size = None max_size_dict = None not_found = None closed = False density = None def __init__(self, file_handle, num_seqs=10**4, max_size=10**6, max_size_dict=None, calc_density=False, dtype='uint32'): """ .. versionchanged:: 0.4.2 added *raise_error* to control sequences not found in depth files, *calc_density* to debug the density of the SparseArray used and also dtype to control the dtype to use in the SparseArray .. versionchanged:: 0.4.0 added *max_size* and max_size_dict Arguments: file_handle (file): the file handle to pass to :func:`read_samtools_depth` num_seqs (int): number of sequence that fires a log message max_size (int): max size to use in the SparseArray max_size_dict (dict): dictionary with max size for each seq_id requested. If None, *max_size* is used calc_density (bool): If True, the density of the SparseArray(s) used are kept in :attr:`SamtoolsDepth.density` dtype (str): dtype to pass to the SparseArray """ self.max_size = max_size if max_size_dict is None: self.max_size_dict = {} else: self.max_size_dict = max_size_dict self.file_handle = read_samtools_depth( file_handle, num_seqs=num_seqs, seq_ids=max_size_dict ) self.not_found = set() self.closed = False self.data = {} self.dtype = 'Sparse[{}]'.format(dtype) if calc_density: self.density = []
[docs] def advance_file(self): """ .. versionadded:: 0.4.2 Requests the scan of the file and returns the sequence found, the coverage can then be retrivied with :meth:`SamtoolsDepth.region_coverage` Returns: (str, None): the return value is None if the file is fully read, otherwise the the sequnece ID found is returned """ # continue scanning the file try: key, pos_array, cov_array = next(self.file_handle) except StopIteration: self.closed = True return None # Init a SparseArray (by passing the correct dtype) value = pandas.Series( # brings start position to 0 dict(zip((pos - 1 for pos in pos_array), cov_array)), dtype=self.dtype # reindex to the correct size to allow ).reindex( range(self.max_size_dict.get(key, self.max_size)), fill_value=0 ) self.data[key] = value if self.density is not None: self.density.append(value.sparse.density) return key
[docs] def region_coverage(self, seq_id, start, end): """ .. versionchanged:: 0.4.2 now using :meth:`SamtoolsDepth.advance_file` to scan the file Returns the mean coverage of a region. The *start* and *end* parameters are expected to be 1-based coordinates, like the correspondent attributes in :class:`mgkit.io.gff.Annotation` or :class:`mgkit.io.gff.GenomicRange`. If the sequence for which the coverage is requested is not found, the *depth* file is read (and cached) until it is found. Arguments: seq_id (str): sequence for which to return mean coverage start (int): start of the region end (int): end of the region Returns: float: mean coverage of the requested region """ if seq_id in self.data: # data already in dictionary return self.data[seq_id][start-1:end].mean() elif seq_id in self.not_found: # already asked for and not found in depth file return 0. # depth file is closed and we assume # so no coverage is available elif self.closed: self.not_found.add(seq_id) return 0. else: while True: key = self.advance_file() if key is None: self.not_found.add(seq_id) self.closed = True return 0. elif key == seq_id: return self.data[seq_id][start-1:end].mean()
[docs] def drop_sequence(self, seq_id): """ .. versionadded:: 0.4.2 Remove the sequence passed from the internal dictionary """ if seq_id in self.data: del self.data[seq_id]