Source code for mgkit.counts.func

"""
.. versionadded:: 0.1.13

Misc functions for count data
"""
import sys
import logging
import itertools
import functools
from collections import Counter
from future.utils import viewitems

from mgkit.filter import taxon as tx_filters
from mgkit.io import open_file
import mgkit.simple_cache

import pandas

LOG = logging.getLogger(__name__)

SKIP = set(
    [
        'no_feature',
        'ambiguous',
        'too_low_aQual',
        'not_aligned',
        'alignment_not_unique'
    ]
)


[docs]def load_htseq_counts(file_handle, conv_func=int):
    """
    .. versionchanged:: 0.1.15
        added *conv_func* parameter

    Loads an HTSeq-count result file

    Arguments:

        file_handle (file or str): file handle or string with file name
        conv_func (func): function to convert the number from string, defaults
            to *int*, but *float* can be used as well

    Yields:
        tuple: first element is the gene_id and the second is the count

    """

    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'rb')

    if getattr(file_handle, 'name', None) is not None:
        LOG.info("Loading HTSeq-count file %s", file_handle.name)

    for line in file_handle:
        line = line.decode('ascii')
        gene_id, count = line.rstrip().split('\t')

        if line.startswith('__') or (gene_id in SKIP):
            continue

        yield gene_id, conv_func(count)


[docs]def batch_load_htseq_counts(count_files, samples=None, cut_name=None):
    """
    Loads a list of htseq count result files and returns a DataFrame
    (IDxSAMPLE)

    The sample names are names are the file names if *samples* and *cut_name*
    are *None*, supplying a list of sample names with *samples* is the
    preferred way, and *cut_name* is used for backward compatibility and as an
    option in cases a string replace is enough.

    Arguments:
        count_files (file or str): file handle or string with file name
        samples (iterable): list of sample names, in the same order as
            *count_files*
        cut_name (str): string to delete from the the file names to get the
            sample names

    Returns:
        pandas.DataFrame: with sample names as columns and gene_ids as index
    """
    counts = {}

    iterator = itertools.zip_longest(
        count_files,
        [] if samples is None else samples
    )

    for fname, sample in iterator:
        if sample is None:
            if cut_name is not None:
                sample = fname.replace(cut_name, '')
            else:
                sample = fname

        counts[sample] = pandas.Series(load_htseq_counts(fname))

    return pandas.DataFrame.from_dict(counts)


[docs]def get_uid_info(info_dict, uid):
    """
    Simple function to get a value from a dictionary of tuples
    (gene_id, taxon_id)
    """
    return info_dict[uid]


[docs]def get_uid_info_ann(annotations, uid):
    """
    Simple function to get a value from a dictionary of annotations
    """
    annotation = annotations[uid]
    return annotation.gene_id, annotation.taxon_id


[docs]def filter_counts(counts_iter, info_func, gfilters=None, tfilters=None):
    """
    Returns counts that pass filters for each *uid* associated *gene_id* and
    *taxon_id*.

    Arguments:
        counts_iter (iterable): iterator that yields a tuple (uid, count)
        info_func (func): function accepting a *uid* that returns a tuple
            *(gene_id, taxon_id)*
        gfilters (iterable): list of filters to apply to each *uid* associated
            *gene_id*
        tfilters (iterable): list of filters to apply to each *uid* associated
            *taxon_id*

    Yields:
        tuple: *(uid, count)* that pass filters
    """
    for uid, count in counts_iter:
        try:
            gene_id, taxon_id = info_func(uid)
        except KeyError:
            continue

        if tfilters is not None:
            if taxon_id is None:
                continue

            if not all(tfilter(taxon_id) for tfilter in tfilters):
                continue

        if gfilters is not None:
            if not all(gfilter(taxon_id) for gfilter in gfilters):
                continue

        yield uid, count


[docs]def map_counts(counts_iter, info_func, gmapper=None, tmapper=None, index=None,
               uid_used=None):
    """
    .. versionchanged:: 0.1.14
        added *index* parameter

    .. versionchanged:: 0.1.15
        added *uid_used* parameter

    Maps counts according to the gmapper and tmapper functions. Each mapped
    gene ID count is the sum of all uid that have the same ID(s). The same is
    true for the taxa.

    Arguments:
        counts_iter (iterable): iterator that yields a tuple (uid, count)
        info_func (func): function accepting a *uid* that returns a tuple
            *(gene_id, taxon_id)*
        gmapper (func): fucntion that accepts a *gene_id* and returns a list
            of mapped IDs
        tmapper (func): fucntion that accepts a *taxon_id* and returns a new
            *taxon_id*
        index (None, str): if None, the index of the Series if
            *(gene_id, taxon_id)*, if a str, it can be either *gene* or
            *taxon*, to specify a single value
        uid_used (None, dict): an empty dictionary in which to store the *uid*
            that were assigned to each key of the returned pandas.Series. If
            *None*, no information is saved

    Returns:
        pandas.Series: array with MultiIndex *(gene_id, taxon_id)* with the
        mapped counts
    """
    mapped_counts = {}

    for uid, count in counts_iter:
        try:
            gene_id, taxon_id = info_func(uid)
        except KeyError:
            continue

        if gmapper is not None:
            gene_ids = gmapper(gene_id)
        else:
            gene_ids = [gene_id]

        if tmapper is not None:
            taxon_id = tmapper(taxon_id)
            if taxon_id is None:
                continue

        for map_id in gene_ids:
            if index is None:
                key = (map_id, taxon_id)
            elif index == 'gene':
                key = map_id
            elif index == 'taxon':
                key = taxon_id

            try:
                mapped_counts[key] += count
                if uid_used is not None:
                    uid_used[key].add(uid)
            except KeyError:
                mapped_counts[key] = count
                if uid_used is not None:
                    uid_used[key] = set([uid])

    return pandas.Series(mapped_counts)


[docs]def map_taxon_id_to_rank(taxonomy, rank, taxon_id, include_higher=True):
    """
    Maps a *taxon_id* to the request taxon rank. Returns *None* if
    *include_higher* is False and the found rank is not the one requested.

    Internally uses :meth:`mgkit.taxon.Taxonomy.get_ranked_taxon`

    Arguments:
        taxonomy: taxonomy instance
        rank (str): taxonomic rank requested
        taxon_id (int): taxon_id to map
        include_higher (bool): if False, any rank different than the requested
            one is discarded

    Returns:
        (int, None): if the mapping is successful, the ranked taxon_id is
        returned, otherwise *None* is returned
    """
    if taxon_id is None:
        return None

    ranked = taxonomy.get_ranked_taxon(taxon_id, rank)

    if (include_higher is False) and (ranked.rank != rank):
        return None
    return ranked.taxon_id


[docs]def map_gene_id_to_map(gene_map, gene_id):
    """
    Function that extract a list of gene mappings from a dictionary and returns
    an empty list if the *gene_id* is not found.
    """
    return gene_map.get(gene_id, [])


[docs]def load_sample_counts(info_dict, counts_iter, taxonomy, inc_anc=None,
                       rank=None, gene_map=None, ex_anc=None,
                       include_higher=True, cached=True, uid_used=None):
    """
    .. versionchanged:: 0.1.14
        added *cached* argument

    .. versionchanged:: 0.1.15
        added *uid_used* parameter

    .. versionchanged:: 0.2.0
        info_dict can be a function

    Reads sample counts, filtering and mapping them if requested. It's an
    example of the usage of the above functions.

    Arguments:
        info_dict (dict): dictionary that has *uid* as key and
            *(gene_id, taxon_id)* as value. In alternative a function that
            accepts a *uid* as sole argument and returns *(gene_id, taxon_id)*
        counts_iter (iterable): iterable that yields a *(uid, count)*
        taxonomy: taxonomy instance
        inc_anc (int, list): ancestor taxa to include
        rank (str): rank to which map the counts
        gene_map (dict): dictionary with the gene mappings
        ex_anc (int, list): ancestor taxa to exclude
        include_higher (bool): if False, any rank different than the requested
            one is discarded
        cached (bool): if *True*, the function will use
            :class:`mgkit.simple_cache.memoize` to cache some of the functions
            used
        uid_used (None, dict): an empty dictionary in which to store the *uid*
            that were assigned to each key of the returned pandas.Series. If
            *None*, no information is saved

    Returns:
        pandas.Series: array with MultiIndex *(gene_id, taxon_id)* with the
        filtered and mapped counts
    """
    if isinstance(info_dict, dict):
        if isinstance(info_dict[list(info_dict.keys())[0]], tuple):
            info_func = functools.partial(get_uid_info, info_dict)
        else:
            info_func = functools.partial(get_uid_info_ann, info_dict)
    else:
        info_func = info_dict

    tfilters = []

    if inc_anc is not None:
        if not isinstance(inc_anc, (list, set, tuple)):
            inc_anc = [inc_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=inc_anc,
                exclude=False,
                taxonomy=taxonomy
            )
        )
    if ex_anc is not None:
        if not isinstance(ex_anc, (list, set, tuple)):
            ex_anc = [ex_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=ex_anc,
                exclude=True,
                taxonomy=taxonomy
            )
        )

    if cached:
        tfilters = [
            mgkit.simple_cache.memoize(tfilter)
            for tfilter in tfilters
        ]

    gmapper = None
    tmapper = None

    if rank is not None:
        tmapper = functools.partial(
            map_taxon_id_to_rank,
            taxonomy,
            rank,
            include_higher=include_higher
        )
        if cached:
            tmapper = mgkit.simple_cache.memoize(tmapper)

    if gene_map is not None:
        gmapper = functools.partial(
            map_gene_id_to_map,
            gene_map
        )
        if cached:
            gmapper = mgkit.simple_cache.memoize(gmapper)

    series = map_counts(
        filter_counts(
            counts_iter,
            info_func,
            gfilters=None,
            tfilters=tfilters
        ),
        info_func,
        gmapper=gmapper,
        tmapper=tmapper,
        uid_used=uid_used
    )

    return series


[docs]def load_sample_counts_to_taxon(info_func, counts_iter, taxonomy, inc_anc=None,
                                rank=None, ex_anc=None, include_higher=True,
                                cached=True, uid_used=None):
    """
    .. versionadded:: 0.1.14

    .. versionchanged:: 0.1.15
        added *uid_used* parameter

    Reads sample counts, filtering and mapping them if requested. It's a
    variation of :func:`load_sample_counts`, with the counts being mapped only
    to each specific taxon. Another difference is the absence of any assumption
    on the first parameter. It is expected to return a (gene_id, taxon_id)
    tuple.

    Arguments:
        info_func (callable): any callable that accept an *uid* as the only
            parameter and and returns *(gene_id, taxon_id)* as value
        counts_iter (iterable): iterable that yields a *(uid, count)*
        taxonomy: taxonomy instance
        inc_anc (int, list): ancestor taxa to include
        rank (str): rank to which map the counts
        ex_anc (int, list): ancestor taxa to exclude
        include_higher (bool): if False, any rank different than the requested
            one is discarded
        cached (bool): if *True*, the function will use
            :class:`mgkit.simple_cache.memoize` to cache some of the functions
            used
        uid_used (None, dict): an empty dictionary in which to store the *uid*
            that were assigned to each key of the returned pandas.Series. If
            *None*, no information is saved

    Returns:
        pandas.Series: array with Index *taxon_id* with the filtered and mapped
        counts
    """
    tfilters = []

    if inc_anc is not None:
        if not isinstance(inc_anc, (list, set, tuple)):
            inc_anc = [inc_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=inc_anc,
                exclude=False,
                taxonomy=taxonomy
            )
        )
    if ex_anc is not None:
        if not isinstance(ex_anc, (list, set, tuple)):
            ex_anc = [ex_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=ex_anc,
                exclude=True,
                taxonomy=taxonomy
            )
        )

    if cached:
        tfilters = [
            mgkit.simple_cache.memoize(tfilter)
            for tfilter in tfilters
        ]

    tmapper = None

    if rank is not None:
        tmapper = functools.partial(
            map_taxon_id_to_rank,
            taxonomy,
            rank,
            include_higher=include_higher
        )
        if cached:
            tmapper = mgkit.simple_cache.memoize(tmapper)

    series = map_counts(
        filter_counts(
            counts_iter,
            info_func,
            gfilters=None,
            tfilters=tfilters
        ),
        info_func,
        tmapper=tmapper,
        index='taxon',
        uid_used=uid_used
    )

    return series


[docs]def load_sample_counts_to_genes(info_func, counts_iter, taxonomy, inc_anc=None,
                                gene_map=None, ex_anc=None, cached=True,
                                uid_used=None):
    """
    .. versionadded:: 0.1.14

    .. versionchanged:: 0.1.15
        added *uid_used* parameter

    Reads sample counts, filtering and mapping them if requested. It's a
    variation of :func:`load_sample_counts`, with the counts being mapped only
    to each specific gene_id. Another difference is the absence of any
    assumption on the first parameter. It is expected to return a
    (gene_id, taxon_id) tuple.

    Arguments:
        info_func (callable): any callable that accept an *uid* as the only
            parameter and and returns *(gene_id, taxon_id)* as value
        counts_iter (iterable): iterable that yields a *(uid, count)*
        taxonomy: taxonomy instance
        inc_anc (int, list): ancestor taxa to include
        rank (str): rank to which map the counts
        gene_map (dict): dictionary with the gene mappings
        ex_anc (int, list): ancestor taxa to exclude
        cached (bool): if *True*, the function will use
            :class:`mgkit.simple_cache.memoize` to cache some of the functions
            used
        uid_used (None, dict): an empty dictionary in which to store the *uid*
            that were assigned to each key of the returned pandas.Series. If
            *None*, no information is saved

    Returns:
        pandas.Series: array with Index *gene_id* with the filtered and mapped
        counts
    """
    tfilters = []

    if inc_anc is not None:
        if not isinstance(inc_anc, (list, set, tuple)):
            inc_anc = [inc_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=inc_anc,
                exclude=False,
                taxonomy=taxonomy
            )
        )
    if ex_anc is not None:
        if not isinstance(ex_anc, (list, set, tuple)):
            ex_anc = [ex_anc]
        tfilters.append(
            functools.partial(
                tx_filters.filter_by_ancestor,
                filter_list=ex_anc,
                exclude=True,
                taxonomy=taxonomy
            )
        )

    if cached:
        tfilters = [
            mgkit.simple_cache.memoize(tfilter)
            for tfilter in tfilters
        ]

    if gene_map is not None:
        gmapper = functools.partial(
            map_gene_id_to_map,
            gene_map
        )
        if cached:
            gmapper = mgkit.simple_cache.memoize(gmapper)

    series = map_counts(
        filter_counts(
            counts_iter,
            info_func,
            gfilters=None,
            tfilters=tfilters
        ),
        info_func,
        gmapper=gmapper,
        index='gene',
        uid_used=uid_used
    )

    return series


[docs]def load_deseq2_results(file_name, taxon_id=None):
    """
    .. versionadded:: 0.1.14

    Reads a CSV file output with DESeq2 results, adding a taxon_id to the index
    for concatenating multiple results from different taxonomic groups.

    Arguments:
        file_name (str): file name of the CSV
    Returns:
        pandas.DataFrame: a MultiIndex DataFrame with the results

    """
    dataframe = pandas.DataFrame.from_csv(file_name)

    dataframe = dataframe.rename(
        index=dict(
            (gene_id, (gene_id, taxon_id))
            for gene_id in dataframe.index
        )
    )
    dataframe.index.names = ['gene_id', 'taxon_id']

    return dataframe


[docs]def map_counts_to_category(counts, gene_map, nomap=False, nomap_id='NOMAP'):
    """
    Used to map the counts from a certain gene identifier to another. Genes
    with no mappings are not counted, unless *nomap=True*, in which case they
    are counted as *nomap_id*.

    Arguments:
        counts (iterator): an iterator that yield a tuple, with the first value
            being the gene_id and the second value the count for it
        gene_map (dictionary): a dictionary whose keys are the gene_id yield by
            *counts* and the values are iterable of mapping identifiers
        nomap (bool): if False, counts for genes with no mappings in *gene_map*
            are discarded, if True, they a counted as *nomap_id*
        nomap_id (str): name of the mapping for genes with no mappings

    Returns:
        pandas.Series: mapped counts
    """

    newcounts = {}
    for gene_id, count in counts:
        try:
            map_ids = gene_map[gene_id]
        except KeyError:
            continue

        if nomap and (not map_ids):
            map_ids = [nomap_id]

        for map_id in map_ids:
            try:
                newcounts[map_id] += count
            except KeyError:
                newcounts[map_id] = count

    return pandas.Series(newcounts)


[docs]def load_counts_from_gff(annotations, elem_func=lambda x: x.uid,
                         sample_func=None, nozero=True):
    """
    .. versionadded:: 0.2.5

    Loads counts for each annotations that are stored into the annotation
    *counts_* attributes. Annotations with a total of 0 counts are skipped by
    default (nozero=True), the row index is set to the *uid* of the annotation
    and the column to the sample name. The functions used to transform the
    indices expect the annotation (for the row, *elem_func*) and the sample
    name (for the column, *sample_func*).

    Arguments:
        annotations (iter): iterable of annotations
        elem_func (func): function that accepts an annotation and return a
            str/int for a Index or a tuple for a MultiIndex, defaults to
            returning the *uid* of the annotation
        sample_func (func, None): function that accepts the sample name and
            returns tuple for a MultiIndex. Defaults to *None* so no
            transformation is performed
        nozero (bool): if *True*, annotations with no counts are skipped
    """

    count_dict = {}

    for annotation in annotations:
        counts = pandas.Series(annotation.counts)

        if nozero and (counts.sum() == 0):
            continue

        count_dict[elem_func(annotation)] = counts

    # LOG.debug('Returning DataFrame')

    count_dict = pandas.DataFrame.from_dict(count_dict, orient='index')
    if sample_func is not None:
        count_dict.columns = pandas.MultiIndex.from_tuples(
            [
                sample_func(column)
                for column in count_dict.columns
            ]
        )

    return count_dict


[docs]def from_gff(annotations, samples, ann_func=None, sample_func=None):
    """
    .. versionadded:: 0.3.1

    Loads count data from a GFF file, only for the requested samples. By
    default the function returns a DataFrame where the index is the *uid* of
    each annotation and the columns the requested samples.

    This can be customised by supplying *ann_func* and *sample_func*.
    *sample_func* is a function that accept a sample name and is expected to
    return a string or a tuple. This will be used to change the columns in the
    DataFrame. *ann_func* must accept an :class:`mgkit.io.gff.Annotation`
    instance and return an iterable, with each iteration yielding either a
    single element or a tuple (for a MultiIndex DataFrame), each element
    yielded will have the count of that annotation added to.

    Arguments:
        annotation (iterable): iterable yielding annotations
        samples (iterable): list of samples to keep
        ann_func (func): function used to customise the output
        sample_func (func): function to customise the column elements

    Returns:
        DataFrame: dataframe with the count data, columns are the samples and
        rows the annotation counts (unless mapped with *ann_func*)

    Exmples:
        Assuming we have a list of *annotations* and sample SAMPLE1 and SAMPLE2
        we can obtain the count table for all annotations with this

        >>> from_gff(annotations, ['SAMPLE1', 'SAMPLE2'])

        Assuming we want to group the samples, for example treatment1,
        treatment2 and control1, control2 into a MultiIndex DataFrame column

        >>> sample_func = lambda x: ('T' if x.startswith('t') else 'C', x)
        >>> from_gff(annotations, ['treatment1', 'treatment2', 'control1',
        'control2'], sample_func=sample_func)

        Annotations can be mapped to other levels for example instead of using
        the *uid* that is the default, it can be mapped to the gene_id,
        taxon_id information that is included in the annotation, resulting in a
        MultiIndex index for the rows, with (gene_id, taxon_id) as key.

        >>> ann_func = lambda x: [(x.gene_id, x.taxon_id)]
        >>> from_gff(annotations, ['SAMPLE1', 'SAMPLE2'], ann_func=ann_func)
    """
    if ann_func is None:
        def ann_func(x): return (x.uid, )

    if sample_func is None:
        def sample_func(x): return x

    counters = {
        sample_func(sample): Counter()
        for sample in samples
    }

    for annotation in annotations:
        for sample, count in viewitems(annotation.counts):
            sample = sample_func(sample)
            if sample not in counters:
                continue
            for ann_id in ann_func(annotation):
                counters[sample][ann_id] += count

    return pandas.DataFrame(counters)