Source code for mgkit.io.blast

"""
Blast routines and parsers

"""
import logging
from builtins import range, zip
from future.utils import viewitems
from . import gff
from . import open_file

NUM_LINES = 10 ** 6

LOG = logging.getLogger(__name__)


[docs]def add_blast_result_to_annotation(annotation, gi_taxa_dict, taxonomy,
                                   threshold=60):
    """
    .. deprecated:: 0.4.0

    Adds blast information to a GFF annotation.

    :param annotation: GFF annotation object
    :param dict gi_taxa_dict: dictionary returned by
        :func:`parse_gi_taxa_table`.
    :param taxonomy: Uniprot taxonomy, used to add the taxon name to the
        annotation
    """

    try:
        hit = gi_taxa_dict[annotation.attributes.ko_idx]
    except KeyError:
        return

    if hit.score <= threshold:
        return

    try:
        # skips if the ID is not in the Uniprot Taxonomy
        annotation.attributes.blast_taxon = taxonomy[hit.taxon_id].s_name
        annotation.attributes.blast_taxon_idx = hit.taxon_id
    except KeyError:
        return


[docs]def parse_blast_tab(file_handle, seq_id=0, ret_col=(0, 1, 2, 6, 7, 11),
                    key_func=None, value_funcs=None):
    """
    .. versionadded:: 0.1.12

    Parses blast output tab format, returning for each line a key (the query
    id) and the columns requested in a tuple.

    Arguments:
        file_handle (file): file name or file handle for the blast ouput
        seq_id (int): index for the column which has the query id
        ret_col (list, None): list of indexes for the columns to be returned or
            *None* if all columns must be returned
        key_func (None, func): function to transform the query id value in the
            key returned. If *None*, the query id is used
        value_funcs (None, list): list of functions to transform the value of
            all the requested columns. If *None* the values are not converted

    Yields:
        tuple: iterator of tuples with the first element being the query id
        after key_func is applied, if requested and the second element of
        the tuple is a tuple with the requested columns *ret_col*

    .. table:: BLAST+ used with `-outfmt 6`, default columns

        +--------------+--------------------------------+
        | column index |          description           |
        +==============+================================+
        |            0 | query name                     |
        +--------------+--------------------------------+
        |            1 | subject name                   |
        +--------------+--------------------------------+
        |            2 | percent identities             |
        +--------------+--------------------------------+
        |            3 | aligned length                 |
        +--------------+--------------------------------+
        |            4 | number of mismatched positions |
        +--------------+--------------------------------+
        |            5 | number of gap positions        |
        +--------------+--------------------------------+
        |            6 | query sequence start           |
        +--------------+--------------------------------+
        |            7 | query sequence end             |
        +--------------+--------------------------------+
        |            8 | subject sequence start         |
        +--------------+--------------------------------+
        |            9 | subject sequence end           |
        +--------------+--------------------------------+
        |           10 | e-value                        |
        +--------------+--------------------------------+
        |           11 | bit score                      |
        +--------------+--------------------------------+

    """

    if key_func is None:
        def key_func(x): return x

    if ret_col is None:
        ret_col = list(range(12))

    if value_funcs is None:
        value_funcs = [lambda x: x for y in range(len(ret_col))]

    if isinstance(file_handle, str):
        file_handle = open_file(file_handle, 'r')

    LOG.info(
        "Reading BLAST results from file (%s)",
        getattr(file_handle, 'name', repr(file_handle))
    )

    lineno = 0
    comments = 0

    for lineno, line in enumerate(file_handle):
        line = line.decode('ascii')
        if line.startswith('#'):
            comments += 1
            continue

        line = line.strip()

        if not line:
            continue

        line = line.split('\t')
        key = key_func(line[seq_id])
        values = tuple(
            func(line[index])
            for index, func in zip(ret_col, value_funcs)
        )

        yield key, values

    LOG.info('Read %d BLAST records', lineno + 1 - comments)


[docs]def parse_uniprot_blast(file_handle, bitscore=40, db='UNIPROT-SP', dbq=10,
                        name_func=None, feat_type='CDS', seq_lengths=None):
    """
    .. versionadded:: 0.1.12

    .. versionchanged:: 0.1.13
        added *name_func* argument

    .. versionchanged:: 0.2.1
        added *feat_type*

    .. versionchanged:: 0.2.3
        added *seq_lengths* and added subject *start* and *end* and *e-value*

    Parses BLAST results in tabular format using :func:`parse_blast_tab`,
    applying a basic bitscore filter. Returns the annotations associated with
    each BLAST hit.

    Arguments:
        file_handle (str, file): file name or open file handle
        bitscore (int, float): the minimum bitscore for an annotation to be
            accepted
        db (str): database used
        dbq (int): an index indicating the quality of the sequence database
            used; this value is used in the filtering of annotations
        name_func (func): function to convert the name of the database
            sequences. Defaults to `lambda x: x.split('|')[1]`, which can be
            be used with fasta files provided by Uniprot
        feat_type (str): feature type in the GFF
        seq_lengths (dict): dictionary with the sequences lengths, used to
            deduct the frame of the '-' strand

    Yields:
        Annotation: instances of :class:`mgkit.io.gff.Annotation` instance of
        each BLAST hit.
    """

    if name_func is None:
        def name_func(x): return x.split('|')[1]

    ret_col = (0, 1, 2, 6, 7, 8, 9, 10, 11)

    # the second function extract the Uniprot ID from the sequence header
    value_funcs = (
        str,
        name_func,
        float,
        int,
        int,
        int,
        int,
        float,
        float
    )

    for seq_id, hit in parse_blast_tab(file_handle, ret_col=ret_col,
                                       value_funcs=value_funcs):
        if hit[-1] < bitscore:
            continue

        seq_len = None if seq_lengths is None else seq_lengths[seq_id]

        yield gff.from_nuc_blast(hit, db=db, dbq=dbq, feat_type=feat_type,
                                 seq_len=seq_len)


[docs]def parse_fragment_blast(file_handle, bitscore=40.0):
    """
    .. versionadded:: 0.1.13

    Parse the output of a BLAST output where the sequences are the single
    annotations, so the sequence names are the *uid* of the annotations.

    The only returned values are the best hits, maxed by bitscore and identity.

    Arguments:
        file_handle (str, file): file name or open file handle
        bitscore (float): minimum bitscore for accepting a hit

    Yields:
        tuple: a tuple whose first element is the *uid* (the sequence name) and
        the second is the a list of tuples whose first element is the GID (NCBI
        identifier), the second one is the identity and the third is the
        bitscore of the hit.

    """

    value_funcs = (lambda x: x.split('|')[1], float, float)

    iterator = parse_blast_tab(
        file_handle,
        ret_col=(1, 2, 11),
        value_funcs=value_funcs
    )

    uidmap = {}

    for uid, hit in iterator:

        if hit[-1] < bitscore:
            continue

        try:
            uidmap[uid].append(hit)
        except KeyError:
            uidmap[uid] = [hit]

    for uid, hits in viewitems(uidmap):
        # returns the hit with the max bitscore and max identity
        yield uid, hits


[docs]def parse_accession_taxa_table(file_handle, acc_ids=None, key=1, value=2,
                               num_lines=NUM_LINES, no_zero=True):
    """
    .. versionadded:: 0.2.5

    .. versionchanged:: 0.3.0
        added *no_zero*

    This function superseeds :func:`parse_gi_taxa_table`, since NCBI is
    deprecating the GIDs in favor of accessions like *X53318*. The new file can
    be found at the NCBI `ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid`,
    for DNA sequences (*nt* DB) *nucl_gb.accession2taxid.gz*.

    The file contains 4 columns, the first one is the accession without its
    version, the second one includes the version, the third column is the
    taxonomic identifier and the fourth is either the old GID or **na**.

    The column used as key is the *second*, since by default the fasta headers
    used in NCBI DBs use the versioned identifier. To use the GID as key, the
    *key* parameter can be set to 3, but if no identifier is found (*na* as per
    the file README), the line is skipped.

    Arguments:
        file_handle (str, file): file name or open file handle
        acc_ids (None, list): if it's not `None` only the keys included in the
            passed `acc_ids` list will be returned
        key (int): 0-based index for the column to use as accession. Defaults
            to the versioned accession that is used in GenBank fasta files.
        num_lines (None, int): number of which a message is logged. If None,
            no message is logged
        no_zero (bool): if True (default) a key with taxon_id of 0 is not yield

    .. note::

        GIDs are being phased out in September 2016:
        http://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/

    """

    file_handle = open_file(file_handle, 'rb')

    LOG.info(
        "Reading taxonomic information from file (%s)",
        getattr(file_handle, 'name', repr(file_handle))
    )

    if acc_ids is not None:
        acc_ids = set(acc_ids)

    zero_acc = 0

    idx = 0

    for idx, line in enumerate(file_handle):
        line = line.decode('ascii')
        # skip header
        if line.startswith('accession'):
            continue

        if (num_lines is not None) and ((idx + 1) % num_lines == 0):
            LOG.info("Parsed %d lines", idx + 1)

        line = line.strip().split('\t')

        acc_id = line[key]

        if (acc_ids is not None) and (acc_id not in acc_ids):
            continue

        # this is the case if the column used as key is the fourth (GID) and
        # the GID is not available for that accession. The best case is to skip
        if acc_id.lower() == 'na':
            continue

        taxon_id = int(line[value])

        # in cases where 0 is the organism (doesn't exist in the taxonomy)
        if no_zero and (taxon_id == 0):
            # LOG.warning("accession '%s' has taxon_id 0", acc_id)
            zero_acc += 1
            continue

        yield acc_id, taxon_id

    if no_zero and (zero_acc > 0):
        LOG.warning("%d accessions have taxon_id 0", zero_acc)

    if idx == 0:
        LOG.error("The file passed is empty")
    else:
        LOG.info("Parsed %d lines", idx + 1)