Source code for mgkit.net.uniprot

"""
Contains function and constants for Uniprot access
"""
from __future__ import division
from builtins import range, zip
import mgkit
import logging
from . import url_read

UNIPROT_MAP = 'http://www.uniprot.org/mapping/'
"URL to Uniprot mapping REST API"

UNIPROT_GET = 'http://www.uniprot.org/uniprot/'
"URL to Uniprot REST API"

UNIPROT_TAXONOMY = 'http://www.uniprot.org/taxonomy/'
"URL to Uniprot REST API - Taxonomy"

COLS_TAXON = 'organism-id'
COLS_KO = 'database(KO)'
COLS_EGGNOG = 'database(EGGNOG)'
COLS_EC = 'ec'

LOG = logging.getLogger(__name__)


[docs]def get_sequences_by_ko(ko_id, taxonomy, contact=None, reviewed=True):
    """
    Gets sequences from Uniprot, restricting to the taxon id passed.

    :param str ko_id: KO id of the sequences to download
    :param int taxonomy: id of the taxon
    :param str contact: email address to be passed in the query (requested by
        Uniprot API)
    :param bool reviewed: if the sequences requested must be reviewed

    :return: string with the fasta file downloaded
    """
    params = {
        'query': 'database:(type:ko {0}) AND taxonomy:{1}{2}'.format(
            ko_id, taxonomy, ' reviewed:yes' if reviewed else ''),
        'format': 'fasta',
        'limit': 200,
        'sort': 'score'
    }
    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", UNIPROT_GET, params)
        LOG.debug("request length %d", len(params))

    fasta = url_read(UNIPROT_GET, data=params, agent=contact)

    return fasta


[docs]def get_mappings(entry_ids, db_from='ID', db_to='EMBL', out_format='tab',
                 contact=None):
    """
    Gets mapping of genes using Uniprot REST API. The db_from and db_to values
    are the ones accepted by Uniprot API. The same applies to out_format, the
    only processed formats are 'list', which returns a list of the mappings
    (should be used with one gene only) and 'tab', which returns a dictionary
    with the mapping. All other values returns a string with the newline
    stripped.

    :param iterable entry_ids: iterable of ids to be mapped (there's a limit)
        to the maximum length of a HTTP request, so it should be less than 50
    :param str db_from: string that identify the DB for elements in entry_ids
    :param str db_to: string that identify the DB to which map entry_ids
    :param str out_format: format of the mapping; 'list' and 'tab' are
        processed
    :param str contact: email address to be passed in the query (requested
        Uniprot API)

    :return: tuple, dict or str depending on out_format value
    """

    if isinstance(entry_ids, str):
        entry_ids = [entry_ids]

    data = {
        'from': db_from,
        'to': db_to,
        'query': ' '.join(entry_ids),
        'format': out_format
    }

    mappings = url_read(UNIPROT_MAP, data=data, agent=contact)

    mappings = mappings.strip()

    if out_format == 'list':
        mappings = mappings.split('\n')
    elif out_format == 'tab':
        mapping_dict = {}
        mappings = mappings.split('\n')
        # delete first row 'From to'
        del mappings[0]

        for mapping in mappings:
            id_from, id_to = mapping.split('\t')
            if id_to == 'null':
                continue
            try:
                mapping_dict[id_from].append(id_to)
            except KeyError:
                mapping_dict[id_from] = [id_to]

        mappings = mapping_dict

    return mappings


[docs]def ko_to_mapping(ko_id, query, columns, contact=None):
    """
    Returns the mappings to the supplied KO. Can be used for any id, the
    query format is free as well as the columns returned. The only
    restriction is using a tab format, that is parsed.

    :param str ko_id: id used in the query
    :param str query: query passed to the Uniprot API, ko_id is replaced
        using :func:`str.format`
    :param str column: column used in the results table used to map the ids
    :param str contact: email address to be passed in the query (requested
        Uniprot API)

    .. note::

        each mapping in the column is separated by a ;

    """
    data = {
        'query': query.format(ko_id),
        'format': 'tab',
        'columns': columns
    }

    mappings = url_read(UNIPROT_GET, data=data, agent=contact)

    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", UNIPROT_GET, data)
        LOG.debug("request length %d", len(data))

    mappings = mappings.split('\n')
    del mappings[0]

    categories = set()

    for map_line in mappings:

        mappings = [mapping.strip() for mapping in map_line.split(';')]
        if not mappings:
            continue

        categories.update(mappings)

    # in case an empty line is present
    try:
        categories.remove('')
    except KeyError:
        pass

    return categories


[docs]def get_gene_info(gene_ids, columns, max_req=50, contact=None):
    """
    .. versionadded:: 0.1.12

    Get informations about a list of genes. it uses :func:`query_uniprot` to
    send the request and format the response in a dictionary.

    Arguments:
        gene_ids (iterable, str): gene id(s) to get informations for
        columns (list): list of columns
        max_req (int): number of maximum *gene_ids* per request
        contact (str): email address to be passed in the query (requested
            Uniprot API)

    Returns:
        dict: dictionary where the keys are the *gene_ids* requested and the
        values are dictionaries with the names of the *columns* requested as
        keys and the corresponding values, which can be lists if the values are
        are semicolon separated strings.

    Example:
        To get the taxonomy ids for some genes:

        >>> uniprot.get_gene_info(['Q09575', 'Q8DQI6'], ['organism-id'])
        {'Q09575': {'organism-id': '6239'}, 'Q8DQI6': {'organism-id': '171101'}}

    """
    if isinstance(gene_ids, str):
        gene_ids = [gene_ids]
    elif isinstance(gene_ids, set):
        gene_ids = list(gene_ids)

    if isinstance(columns, str):
        columns = [columns]

    infos = {}

    for index in range(0, len(gene_ids), max_req):

        LOG.info(
            "Querying uniprot ids (%d/%d)",
            index + max_req,
            len(gene_ids)
        )

        info_lines = query_uniprot(
            ' OR '.join('id:{}'.format(gene_id) for gene_id in gene_ids[index:index+max_req]),
            columns=['id'] + columns,
            contact=contact
        )

        info_lines = info_lines.split('\n')

        del info_lines[0]

        for info_line in info_lines:
            info_line = info_line.strip()
            if not info_line:
                continue
            values = info_line.split('\t')

            gene_id = values[0]

            infos[gene_id] = dict(
                (
                    column,
                    value if (not value.endswith(';')) and (not value.endswith('; ')) and ('; ' not in value)
                    else [x.strip() for x in value.split(';') if x.strip()]
                )
                for column, value in zip(columns, values[1:])
            )

    return infos


[docs]def query_uniprot(query, columns=None, format='tab', limit=None, contact=None,
                  baseurl=UNIPROT_GET):
    """
    .. versionadded:: 0.1.12

    .. versionchanged:: 0.1.13
        added *baseurl* and made *columns* a default argument

    Queries Uniprot, returning the raw response in tbe format specified. More
    informations at the `page <http://www.uniprot.org/faq/28>`_

    Arguments:
        query (str): query to submit, as put in the input box
        columns (None, iterable): list of columns to return
        format (str): response format
        limit (int, None): number of entries to return or *None* to request all
            entries
        contact (str): email address to be passed in the query (requested
            Uniprot API)
        baseurl (str): base url for the REST API, can be either
            :data:`UNIPROT_GET` or :data:`UNIPROT_TAXONOMY`
    Returns:
        str: raw response from the query

    Example:
        To get the taxonomy ids for some genes:

        >>> uniprot.query_uniprot('Q09575 OR Q8DQI6', ['id', 'organism-id'])
        'Entry\\tOrganism ID\\nQ8DQI6\\t171101\\nQ09575\\t6239\\n'

    .. warning::

        because of limits in the length of URLs, it's advised to limit the
        length of the query string.

    """
    data = {
        'query': query,
        'format': format
    }

    if limit is not None:
        data['limit'] = limit

    if columns is not None:
        data['columns'] = ','.join(columns)

    if mgkit.DEBUG:
        LOG.debug("query: %s?%s", baseurl, data)
        LOG.debug("request length %d", len(data))

    return url_read(baseurl, data, agent=contact)


[docs]def parse_uniprot_response(data, simple=True):
    """
    .. versionadded:: 0.1.12

    Parses raw response from a Uniprot query (tab format only) from functions
    like :func:`query_uniprot` into a dictionary. It requires that the first
    column is the entry id (or any other unique id).

    Arguments:
        data (str): string response from Uniprot
        simple (bool): if True and the number of columns is 1, the dictionary
            returned has a simplified structure

    Returns:
        dict: The format of the resulting dictionary is
        entry_id -> {column1 -> value, column2 -> value, ..} unless there's
        only one column and *simple* is True, in which case the value is
        equal to the value of the only column.
    """
    data = data.splitlines()

    columns = [x.lower() for x in data[0].split('\t')[1:]]

    del data[0]

    parsed_data = {}

    for line in data:
        line = line.split('\t')
        entry_id = line[0]

        if (len(columns) == 1) and simple:
            parsed_data[entry_id] = line[1]
        else:
            parsed_data[entry_id] = dict(
                zip(columns, line[1:])
            )

    return parsed_data


[docs]def get_ko_to_eggnog_mappings(ko_ids, contact=None):
    """
    .. versionadded:: 0.1.14

    It's not possible to map in one go KO IDs to eggNOG IDs via the API in
    Uniprot. This function uses :func:`query_uniprot` to get all Uniprot IDs
    requested and the return a dictionary with all their eggNOG IDs they map
    to.

    Arguments:
        ko_ids (iterable): an iterable of KO IDs
        contact (str): email address to be passed in the query (requested
            Uniprot API)

    Returns:
        dict: The format of the resulting dictionary is
        ko_id -> {eggnog_id1, ..}
    """

    data = query_uniprot(
        "database:(type:ko AND ({}))".format(' OR '.join(list(ko_ids))),
        columns=['database(KO)', 'database(EGGNOG)'],
        contact=contact
    )

    data = data.splitlines()

    del data[0]

    parsed_data = {}

    for line in data:
        ko_ids, eggnog_ids = line.split('\t')
        ko_ids = ko_ids.split(';')

        for ko_id in ko_ids:
            if not ko_id:
                continue
            if not eggnog_ids:
                continue

            for eggnog_id in eggnog_ids.split(';'):
                if not eggnog_id:
                    continue

                try:
                    parsed_data[ko_id].add(eggnog_id)
                except KeyError:
                    parsed_data[ko_id] = set([eggnog_id])

    return parsed_data


[docs]def get_uniprot_ec_mappings(gene_ids, contact=None):
    """
    .. versionadded:: 0.1.14

    Shortcut to download EC mapping of Uniprot IDs. Uses :func:`get_gene_info`
    passing the correct column (*ec*).

    """
    return get_gene_info(
        gene_ids,
        columns=['ec'],
        contact=contact,
        max_req=100
    )


[docs]def get_gene_info_iter(gene_ids, columns, contact=None, max_req=50):
    """

    .. versionadded:: 0.3.3

    Alternative function to :func:`get_gene_info`, returning an iterator to
    avoid connections timeouts when updating a dictionary

    This funciton's parameters are the same as :func:`get_gene_info`
    """

    gene_ids = list(gene_ids)

    for index in range(0, len(gene_ids), max_req):
        yield get_gene_info(
            gene_ids[index:index+max_req],
            columns,
            contact=contact,
            max_req=max_req
        )