# Source code for mgkit.mappings.enzyme

"""
.. versionadded:: 0.1.14

EC mappings
"""
from future.utils import viewitems
import re
from ..io import open_file

LEVEL1_NAMES = {
    1: 'oxidoreductases',
    2: 'transferases',
    3: 'hydrolases',
    4: 'lyases',
    5: 'isomerases',
    6: 'ligases',
    # class 7 was added to the EC classification in 2018
    7: 'translocases',
}
"""
Top level classification names, keyed by the first number of an EC identifier
"""

# Capture groups:
#   1    - first EC component, a single digit
#   2-4  - remaining EC components, each made of digits and/or '-'
#          (an optional single space is allowed after each separating dot)
#   5    - the description; the match is greedy, so it extends up to (and
#          excludes) the last period found on the line
ENZCLASS_REGEX = r"^(\d)\. ?([\d-]+)\. ?([\d-]+)\. ?([\d-]+) +(.+)\."
"""
Used to get the description for the higher level enzyme classes from the file
*enzclass.txt* on `expasy <http://expasy.org>`_
"""


def parse_expasy_file(file_name):
    """
    .. versionchanged:: 0.4.2
        changed to work on python 3.x

    Used to load enzyme descriptions from the file *enzclass.txt* on
    `expasy <http://expasy.org>`_.

    The FTP url for enzclass.txt is:
    `<ftp://ftp.expasy.org/databases/enzyme/enzclass.txt>`_

    Every line is decoded as ASCII and matched against
    :data:`ENZCLASS_REGEX`; lines that do not match are ignored.

    Arguments:
        file_name: passed to :func:`open_file`, which is asked to open it in
            binary mode

    Returns:
        dict: the keys are the EC identifiers, with the unspecified levels
        (`-`) removed, e.g. `1.1` for `1. 1.-.-`, and the values are the
        descriptions without the final period
    """
    class_pattern = re.compile(ENZCLASS_REGEX)
    labels = {}

    for raw_line in open_file(file_name, mode='rb'):
        match = class_pattern.search(raw_line.decode('ascii'))

        if match is None:
            continue

        # the first 4 groups are the EC levels, '-' marks an unspecified one
        ec = '.'.join(value for value in match.groups()[:4] if value != '-')

        labels[ec] = match.group(5)

    return labels


def get_enzyme_level(ec, level=4):
    """
    .. versionadded:: 0.1.14

    Returns an enzyme class at a specific level, between 1 and 4 (by default
    the most specific, 4)

    Arguments:
        ec (str): a string representing an EC number (e.g. 1.2.4.10)
        level (int): from 1 to 4, the number of leading components of the
            EC number to keep

    Returns:
        str: the EC number at the requested specificity; if *ec* has fewer
        components than *level*, it is returned unchanged

    Example:
        >>> from mgkit.mappings.enzyme import get_enzyme_level
        >>> get_enzyme_level('1.1.3.4', 1)
        '1'
        >>> get_enzyme_level('1.1.3.4', 2)
        '1.1'
        >>> get_enzyme_level('1.1.3.4', 3)
        '1.1.3'
        >>> get_enzyme_level('1.1.3.4', 4)
        '1.1.3.4'
    """
    components = ec.split('.')
    return '.'.join(components[:level])


def change_mapping_level(ec_map, level=3):
    """
    .. versionadded:: 0.1.14

    Given a dictionary, whose values are dictionaries, in which a key is named
    *ec* and its value is an iterable of EC numbers, returns an iterator that
    can be used to build a dictionary with the same top level keys and the
    values are sets of the transformed EC numbers.

    Entries whose dictionary has no *ec* key are skipped. If the value of
    *ec* is a single string, it is treated as one EC number.

    Arguments:
        ec_map (dict): dictionary generated by
            :func:`mgkit.net.uniprot.get_gene_info`
        level (int): number from 1 to 4, to specify the level of the mapping,
            passed to :func:`get_enzyme_level`

    Yields:
        tuple: a tuple (gene_id, set(ECs)), which can be passed to *dict* to
        make a dictionary

    Example:
        >>> from mgkit.net.uniprot import get_gene_info
        >>> from mgkit.mappings.enzyme import change_mapping_level
        >>> ec_map = get_gene_info('Q9HFQ1', columns='ec')
        >>> ec_map
        {'Q9HFQ1': {'ec': '1.1.3.4'}}
        >>> dict(change_mapping_level(ec_map, level=2))
        {'Q9HFQ1': {'1.1'}}

    """
    for gene_id, ecdict in viewitems(ec_map):
        try:
            ec_list = ecdict['ec']
        except KeyError:
            continue

        # a lone string would otherwise be iterated character by character
        if isinstance(ec_list, str):
            ec_list = [ec_list]

        yield gene_id, {
            get_enzyme_level(ec_id, level=level) for ec_id in ec_list
        }


def get_mapping_level(ec_map, level=3):
    """
    .. versionadded:: 0.3.0

    Given a dictionary, whose values are iterable of EC numbers, returns an
    iterator that can be used to build a dictionary with the same top level
    keys and the values are sets of the transformed EC numbers.

    Entries with an empty (falsy) value are skipped. If a value is a single
    string, it is treated as one EC number.

    Arguments:
        ec_map (dict): dictionary genes to EC
        level (int): number from 1 to 4, to specify the level of the mapping,
            passed to :func:`get_enzyme_level`

    Yields:
        tuple: a tuple (gene_id, set(ECs)), which can be passed to *dict* to
        make a dictionary
    """
    for gene_id, ec_list in viewitems(ec_map):

        if not ec_list:
            continue

        # a lone string would otherwise be iterated character by character
        if isinstance(ec_list, str):
            ec_list = [ec_list]

        yield gene_id, {
            get_enzyme_level(ec_id, level=level) for ec_id in ec_list
        }


def get_enzyme_full_name(ec_id, ec_names, sep=', '):
    """
    .. versionadded:: 0.2.1

    From an EC identifier and a dictionary of names builds a comma separated
    name (by default) that identifies the function of the enzyme.

    The identifier is looked up at every level, from the one passed up to the
    top level class, and the names found are joined starting from the most
    general one. Levels that have no entry in *ec_names* are left out.

    Arguments:
        ec_id (str): EC identifier
        ec_names (dict): a dictionary of names that can be produced using
            :func:`parse_expasy_file`
        sep (str): string used to join the names

    Returns:
        str: the enzyme classification name, an empty string if no level of
        *ec_id* is found in *ec_names*
    """

    name_list = []

    while True:
        try:
            name_list.append(ec_names[ec_id])
        except KeyError:
            pass

        # drop the last level; the result is empty once the top level has
        # been processed
        ec_id = ec_id.rpartition('.')[0]

        if not ec_id:
            break

    # names were collected from the most specific to the most general
    return sep.join(reversed(name_list))


def parse_expasy_dat_section(expasy_dat_section, skip_comments=True, skip_codes=None):
    """
    .. versionadded:: 0.4.2

    Parses an entry of the `enzyme.dat` file in expasy, used internally by
    :func:`mgkit.mappings.enzyme.parse_expasy_dat`, with the other arguments
    being passed over from it.

    Each line is split into its code (the first word) and the rest of the
    line. Blank lines are ignored and a line made only of a code is stored
    with an empty string as value.

    The `CA` lines are handled differently: they are merged into a single
    text, which is then split on `'. '` after removing the trailing period(s).

    Arguments:
        expasy_dat_section (iterable): lines (str) of one entry
        skip_comments (bool): if True, lines starting with `CC` are skipped
        skip_codes (set, tuple): codes whose lines are skipped, None to keep
            all of them

    Returns:
        dict: dictionary with the entry, with keys being the codes of the entry
        and the values the lines
    """
    parsed_lines = {}
    for line in expasy_dat_section:
        if skip_comments and line.startswith('CC'):
            continue
        # positional maxsplit: the keyword form is not accepted on python 2
        fields = line.rstrip().split(None, 1)
        if not fields:
            # blank line, nothing to store
            continue
        key = fields[0]
        value = fields[1] if len(fields) > 1 else ''
        if (skip_codes is not None) and (key in skip_codes):
            continue
        parsed_lines.setdefault(key, []).append(value)

    if 'CA' in parsed_lines:
        # The trailing whitespace of each line was stripped above, so wrapped
        # lines need a space put back between them or the words get fused and
        # the split on '. ' cannot separate the parts.
        # NOTE(review): a line ending with '-' is assumed to be a name wrapped
        # at a hyphen and is joined without a space - confirm against the
        # enzyme.dat format
        ca_text = ''
        for chunk in parsed_lines['CA']:
            if ca_text and chunk and not ca_text.endswith('-'):
                ca_text += ' '
            ca_text += chunk
        parsed_lines['CA'] = ca_text.rstrip('.').split('. ')

    return parsed_lines

def parse_expasy_dat(expasy_dat, keep_empty=False, skip_comments=True, skip_codes=None):
    """
    .. versionadded:: 0.4.2

    Parses the information in `enzyme.dat` file in expasy, a flat file
    containing the information about the enzyme classification.

    It can be downloaded at:
    `<ftp://ftp.expasy.org/databases/enzyme/enzyme.dat>`_

    The lines are decoded as ASCII and grouped into sections, each one ended
    by a line starting with `//`. Lines found after the last `//` are not
    yielded.

    Arguments:
        expasy_dat (str): file name or handle to an expasy.dat file
        keep_empty (bool): sections that are empty after parsing are removed
            by default
        skip_comments (bool): used to avoid returning comments (lines starting
            with `CC`) in the file
        skip_codes (set, tuple): set or tuple or list to skip specific parts
            of the file, like `skip_comments`

    Yields:
        dict: dictionary with each entry in the file, where the keys are the
        codes and the values are the lines included in the file
    """
    section = []
    for line in open_file(expasy_dat, 'rb'):
        line = line.decode('ascii')

        # section end
        if line.startswith('//'):
            parsed_section = parse_expasy_dat_section(
                section,
                skip_comments=skip_comments,
                skip_codes=skip_codes
            )
            section = []
            # a section can be empty if all its lines were skipped
            if parsed_section or keep_empty:
                yield parsed_section
            continue

        section.append(line)