Source code for mgkit.mappings.enzyme

"""
.. versionadded:: 0.1.14

EC mappings
"""
from future.utils import viewitems
import re
from ..io import open_file

LEVEL1_NAMES = {
    1: 'oxidoreductases',
    2: 'transferases',
    3: 'hydrolases',
    4: 'lyases',
    5: 'isomerases',
    6: 'ligases',
    # class 7 was added to the EC nomenclature in 2018
    7: 'translocases',
}
"""
Top level classification names, keyed by the first number of an EC identifier
"""

# Pattern for one class line of *enzclass.txt*:
#   group 1    - first EC level, a single digit
#   groups 2-4 - remaining levels, each made of digits and/or '-'; a single
#                optional space is accepted after each separating dot
#   group 5    - text after one or more spaces, up to the last '.' on the line
#                (greedy match; that final dot is left out of the group)
ENZCLASS_REGEX = r"^(\d)\. ?([\d-]+)\. ?([\d-]+)\. ?([\d-]+) +(.+)\."
"""
Used to get the description for the higher level enzyme classes from the file
*enzclass.txt* on `expasy <http://expasy.org>`_
"""


def parse_expasy_file(file_name):
    """
    .. versionchanged:: 0.4.2
        changed to work on python 3.x

    Load the descriptions of the enzyme classes from the file *enzclass.txt*
    on `expasy <http://expasy.org>`_.

    The FTP url for enzclass.txt is:
    `<ftp://ftp.expasy.org/databases/enzyme/enzclass.txt>`_

    Each line is read as bytes, decoded as ASCII and matched against
    :data:`ENZCLASS_REGEX`; lines that do not match are ignored.

    Arguments:
        file_name: passed to `open_file`, which is asked for binary mode

    Returns:
        dict: the keys are EC numbers in which the `-` placeholders have been
        dropped (e.g. `1.1`), the values are the matching descriptions
    """
    descriptions = {}
    for raw_line in open_file(file_name, mode='rb'):
        found = re.search(ENZCLASS_REGEX, raw_line.decode('ascii'))
        if found is None:
            continue
        # only the levels that are actually defined take part in the key
        levels = [level for level in found.groups()[:4] if level != '-']
        descriptions['.'.join(levels)] = found.group(5)
    return descriptions
def get_enzyme_level(ec, level=4):
    """
    .. versionadded:: 0.1.14

    Truncate an EC number to the requested level of the classification,
    between 1 and 4 (by default the most specific, 4).

    Arguments:
        ec (str): a string representing an EC number (e.g. 1.2.4.10)
        level (int): from 1 to 4, how many dot-separated components of *ec*
            are kept

    Returns:
        str: the EC number at the requested specificity

    Example:
        >>> from mgkit.mappings.enzyme import get_enzyme_level
        >>> get_enzyme_level('1.1.3.4', 1)
        '1'
        >>> get_enzyme_level('1.1.3.4', 2)
        '1.1'
        >>> get_enzyme_level('1.1.3.4', 3)
        '1.1.3'
        >>> get_enzyme_level('1.1.3.4', 4)
        '1.1.3.4'
    """
    components = ec.split('.')
    kept = components[:level]
    return '.'.join(kept)
def change_mapping_level(ec_map, level=3):
    """
    .. versionadded:: 0.1.14

    Given a dictionary whose values are dictionaries holding, under the key
    *ec*, either one EC number (a string) or an iterable of EC numbers, yields
    pairs that can be used to build a dictionary with the same top level keys
    and, as values, the sets of EC numbers truncated to *level*. Entries
    without an *ec* key are skipped.

    Arguments:
        ec_map (dict): dictionary generated by
            :func:`mgkit.net.uniprot.get_gene_info`
        level (int): number from 1 to 4, to specify the level of the mapping,
            passed to :func:`get_enzyme_level`

    Yields:
        tuple: a tuple (gene_id, set(ECs)), which can be passed to *dict* to
        make a dictionary

    Example:
        >>> from mgkit.net.uniprot import get_gene_info
        >>> from mgkit.mappings.enzyme import change_mapping_level
        >>> ec_map = get_gene_info('Q9HFQ1', columns='ec')
        >>> ec_map
        {'Q9HFQ1': {'ec': '1.1.3.4'}}
        >>> dict(change_mapping_level(ec_map, level=2))
        {'Q9HFQ1': {'1.1'}}
    """
    for gene_id, gene_info in viewitems(ec_map):
        try:
            ec_numbers = gene_info['ec']
        except KeyError:
            continue
        # a lone EC number is wrapped so it is not iterated by character
        if isinstance(ec_numbers, str):
            ec_numbers = [ec_numbers]
        truncated = {
            get_enzyme_level(ec_number, level=level)
            for ec_number in ec_numbers
        }
        yield gene_id, truncated
def get_mapping_level(ec_map, level=3):
    """
    .. versionadded:: 0.3.0

    Given a dictionary whose values are either one EC number (a string) or an
    iterable of EC numbers, yields pairs that can be used to build a
    dictionary with the same keys and, as values, the sets of EC numbers
    truncated to *level*. Keys whose value is empty are skipped.

    Arguments:
        ec_map (dict): dictionary genes to EC
        level (int): number from 1 to 4, to specify the level of the mapping,
            passed to :func:`get_enzyme_level`

    Yields:
        tuple: a tuple (gene_id, set(ECs)), which can be passed to *dict* to
        make a dictionary
    """
    for gene_id, ec_numbers in viewitems(ec_map):
        if ec_numbers:
            # a lone EC number is wrapped so it is not iterated by character
            if isinstance(ec_numbers, str):
                ec_numbers = [ec_numbers]
            truncated = {
                get_enzyme_level(ec_number, level=level)
                for ec_number in ec_numbers
            }
            yield gene_id, truncated
def get_enzyme_full_name(ec_id, ec_names, sep=', '):
    """
    .. versionadded:: 0.2.1

    Builds the full name of an enzyme class by joining the names of the EC
    identifier and of all its parent levels, from the most general one to the
    most specific. Levels that have no entry in *ec_names* are left out.

    Arguments:
        ec_id (str): EC identifier
        ec_names (dict): a dictionary of names that can be produced using
            :func:`parse_expasy_file`
        sep (str): string used to join the names

    Returns:
        str: the enzyme classification name
    """
    found_names = []
    current = ec_id
    more_levels = True
    while more_levels:
        try:
            label = ec_names[current]
        except KeyError:
            pass
        else:
            found_names.append(label)
        # drop the last component to move to the parent level
        current = current.rpartition('.')[0]
        more_levels = bool(current)
    # names were collected from the most specific level upwards
    return sep.join(found_names[::-1])
def parse_expasy_dat_section(expasy_dat_section, skip_comments=True,
                             skip_codes=None):
    """
    .. versionadded:: 0.4.2

    Parses an entry of the `enzyme.dat` file in expasy, used internally by
    :func:`mgkit.mappings.enzyme.parse_expasy_dat`, with the other arguments
    being passed over from it.

    Each line is split on its first run of whitespace into a code and its
    value. Blank lines are ignored and a line made only of a code is stored
    with an empty string as value, instead of raising a `ValueError`.

    Arguments:
        expasy_dat_section (iterable): text lines of one entry
        skip_comments (bool): if True, lines starting with `CC` are ignored
        skip_codes (set, tuple): codes whose lines are ignored, None to keep
            all of them

    Returns:
        dict: dictionary with the entry, with keys being the codes of the
        entry and the values the lines. The `CA` lines, if present, are
        concatenated, stripped of the trailing dots and split on `'. '`
    """
    parsed_lines = {}
    for line in expasy_dat_section:
        if skip_comments and line.startswith('CC'):
            continue
        # a line may be blank or hold only its code, so two fields cannot be
        # assumed; maxsplit is passed positionally
        fields = line.rstrip().split(None, 1)
        if not fields:
            continue
        key = fields[0]
        value = fields[1] if len(fields) > 1 else ''
        if (skip_codes is not None) and (key in skip_codes):
            continue
        parsed_lines.setdefault(key, []).append(value)
    if 'CA' in parsed_lines:
        # NOTE(review): the lines are joined without a separator, so text
        # wrapped over several CA lines is glued together - confirm whether a
        # space is wanted here
        ca_lines = ''.join(parsed_lines['CA'])
        parsed_lines['CA'] = ca_lines.rstrip('.').split('. ')
    return parsed_lines
def parse_expasy_dat(expasy_dat, keep_empty=False, skip_comments=True,
                     skip_codes=None):
    """
    .. versionadded:: 0.4.2

    Parses the information in the `enzyme.dat` file in expasy, a flat file
    containing the information about the enzyme classification. It can be
    downloaded at: `<ftp://ftp.expasy.org/databases/enzyme/enzyme.dat>`_

    Lines are read as bytes and decoded as ASCII. A line starting with `//`
    closes an entry; lines found after the last `//` are not returned.

    Arguments:
        expasy_dat (str): file name or handle to an enzyme.dat file, passed
            to `open_file`
        keep_empty (bool): sections that are empty are removed by default
        skip_comments (bool): used to avoid returning comments (lines
            starting with `CC`) in the file
        skip_codes (set, tuple): set or tuple or list to skip specific parts
            of the file, like `skip_comments`

    Yields:
        dict: dictionary with each entry in the file, where the keys are the
        codes and the values are the lines included in the file
    """
    pending = []
    for raw_line in open_file(expasy_dat, 'rb'):
        text = raw_line.decode('ascii')
        if not text.startswith('//'):
            pending.append(text)
            continue
        # end of the entry: parse what was collected and start a new one
        entry = parse_expasy_dat_section(
            pending,
            skip_comments=skip_comments,
            skip_codes=skip_codes
        )
        pending = []
        if keep_empty or entry:
            yield entry