Source code for mgkit.io.gff

"""
This modules define classes and function related to manipulation of GFF/GTF
files.
"""
from __future__ import print_function, division
from builtins import object, zip, bytes, range
import sys
from io import IOBase
from future.utils import viewitems
import random
import itertools
import logging
import uuid
import json
from collections import OrderedDict
try:
    from urllib import unquote, quote
except ImportError:
    # Python3
    from urllib.parse import unquote, quote
import mgkit.io
from ..utils import sequence as seq_utils
from ..consts import MIN_COV
from ..utils.common import between, union_range, ranges_length
from ..utils.trans_tables import UNIVERSAL

LOG = logging.getLogger(__name__)


[docs]class AttributeNotFound(Exception): """ Raised if an attribute is not found in a GFF file """ pass
[docs]def write_gff(annotations, file_handle, verbose=True): """ .. versionchanged:: 0.1.12 added *verbose* argument Write a GFF to file Arguments: annotations (iterable): iterable that returns :class:`Annotation` instances file_handle (str, file): file name or file handle to write to verbose (bool): if True, a message is logged """ if isinstance(file_handle, str): file_handle = mgkit.io.open_file(file_handle, 'wb') if verbose: LOG.info( "Writing annotations to file (%s)", getattr(file_handle, 'name', repr(file_handle)) ) for annotation in annotations: annotation.to_file(file_handle)
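# Usage sketch for write_gff/parse_gff: copy only the CDS annotations of a GFF
# file to a new one. The file names below are placeholders, not files shipped
# with mgkit.
def _example_filter_and_write(in_file='input.gff', out_file='cds-only.gff'):
    # parse_gff yields Annotation instances; write_gff accepts any iterable of them
    cds_only = (
        annotation
        for annotation in parse_gff(in_file)
        if annotation.feat_type == 'CDS'
    )
    write_gff(cds_only, out_file)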
[docs]class GenomicRange(object): """ Defines a genomic range .. versionchanged:: 0.2.1 using __slots__ for better memory usage """ __slots__ = ('seq_id', 'strand', 'start', 'end') # seq_id = 'None' # "Sequence ID" # strand = '+' # "Strand" # start = None # "Start of the range, 1-based" # end = None # "End of the range 1-based" def __init__(self, seq_id='None', start=1, end=1, strand='+'): self.seq_id = seq_id self.strand = strand self.start = start self.end = end def __eq__(self, other): if (self.seq_id != other.seq_id) or (self.strand != other.strand): return False if (self.start != other.start) or (self.end != other.end): return False return True def __len__(self): return self.end - self.start + 1 def __str__(self): return "{0}({1}):{2}-{3}".format( self.seq_id, self.strand, self.start, self.end ) def __repr__(self): return str(self)
[docs] def union(self, other): """ Return the union of two :class:`GenomicRange` """ if (self.seq_id == other.seq_id) and (self.strand == other.strand): result = union_range(self.start, self.end, other.start, other.end) if result is not None: gen_range = GenomicRange() gen_range.seq_id = self.seq_id gen_range.strand = self.strand gen_range.start = result[0] gen_range.end = result[1] return gen_range return None
def __or__(self, other): return self.union(other)
[docs] def expand_from_list(self, others): """ Expand the :class:`GenomicRange` range instance with a list of :class:`GenomicRange` Arguments: others (iterable): iterable of :class:`GenomicRange` """ new_range = self for other in others: union = new_range.union(other) if union is None: continue new_range = union self.start = new_range.start self.end = new_range.end
[docs] def intersect(self, other): """ Return an instance of :class:`GenomicRange` that represent the intersection of the current instance and another. """ if (self.seq_id == other.seq_id) and (self.strand == other.strand): if between(other.start, self.start, self.end) or \ between(other.end, self.start, self.end) or \ between(self.start, other.start, other.end) or \ between(self.end, other.start, other.end): gen_range = GenomicRange() gen_range.start = max(self.start, other.start) gen_range.end = min(self.end, other.end) return gen_range return None
def __and__(self, other): return self.intersect(other)
[docs] def __contains__(self, pos): """ .. versionchanged:: 0.2.3 a range or a subclass are accepted .. versionadded:: 0.1.16 Tests if the position is inside the range of the GenomicRange Pos is 1-based as :attr:`GenomicRange.start` and :attr:`GenomicRange.end` """ if isinstance(pos, int): return between(pos, self.start, self.end) elif isinstance(pos, GenomicRange): pos = (pos.start, pos.end) return (between(pos[0], self.start, self.end)) and \ (between(pos[1], self.start, self.end))
[docs] def get_range(self): """ .. versionadded:: 0.1.13 Returns the start and end position as a tuple """ return (self.start, self.end)
[docs] def get_relative_pos(self, pos): """ .. versionadded:: 0.1.16 Given an absolute position (referred to the reference), convert the position to a coordinate relative to the GenomicRange Returns: int: the position relative to the GenomicRange Raises: ValueError: if the position is not in the range """ if pos not in self: raise ValueError("Position {} not in GenomicRange".format(pos)) return pos - self.start + 1
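# Usage sketch for GenomicRange: the union (|) and intersect (&) operators return
# a new GenomicRange (or None when seq_id/strand differ or the ranges do not
# overlap), while `in` works with 1-based positions or another range. The
# coordinates below are made up for illustration.
def _example_genomic_range():
    range1 = GenomicRange(seq_id='contig1', start=10, end=50, strand='+')
    range2 = GenomicRange(seq_id='contig1', start=40, end=80, strand='+')
    merged = range1 | range2        # contig1(+):10-80
    overlap = range1 & range2       # start=40, end=50 (seq_id/strand not set by intersect)
    assert 45 in range1             # positions are 1-based and inclusive
    assert range1.get_relative_pos(10) == 1
    return merged, overlap, len(range1)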
[docs]class Annotation(GenomicRange): """ .. versionadded:: 0.1.12 .. versionchanged:: 0.2.1 using __slots__ for better memory usage Alternative implementation for an Annotation. When initialised, If *uid* is None, a unique id is added using `uuid.uuid4`. """ __slots__ = ('source', 'feat_type', 'score', 'phase', 'attr') # source = 'None' # "Annotation source" # feat_type = 'None' # "Annotation type (e.g. CDS, gene, exon, etc.)" # score = 0.0 # "Score associated to the annotation" # phase = 0 # "Annotation phase, (0, 1, 2)" # attr = None # "Dictionary with the key value pairs in the last column of a GFF/GTF" def __init__(self, seq_id='None', start=1, end=1, strand='+', source='None', feat_type='None', score=0.0, phase=0, uid=None, **kwd): super(Annotation, self).__init__( seq_id=seq_id, start=start, end=end, strand=strand ) self.source = source self.feat_type = feat_type self.score = score self.phase = phase self.attr = kwd if uid is None: self.uid = str(uuid.uuid4()) else: self.uid = uid def __hash__(self): return hash(self.uid)
[docs] def get_ec(self, level=4): """ .. versionadded:: 0.1.13 .. versionchanged:: 0.2.0 returns a *set* instead of a list Returns the EC values associated with the annotation, cutting them at the desired level. Arguments: level (int): level of classification desired (between 1 and 4) Returns: set: list of all EC numbers associated, at the desired level, if none are found an empty set is returned """ ec = self.attr.get('EC', None) if ec is None: return set() ec = ec.split(',') return set(['.'.join(x.split('.')[:level]) for x in ec])
[docs] def get_mapping(self, db): """ .. versionadded:: 0.1.13 Returns the mappings, to a particular db, associated with the annotation. Arguments: db (str): database to which the mappings come from Returns: list: list of all mappings associated, to the specified db, if none are found an empty list is returned """ mappings = self.attr.get('map_{0}'.format(db.upper())) if mappings is None: return [] return mappings.split(',')
[docs] def set_mapping(self, db, values): """ .. versionadded:: 0.1.13 Set mappings to a particular db, associated with the annotation. Arguments: db (str): database to which the mappings come from mappings (iterable): iterable of mappings """ self.set_attr( 'map_{0}'.format(db.upper()), ','.join(values) )
[docs] def get_mappings(self): """ .. versionadded:: 0.2.1 Return a dictionary where the keys are the mapping DBs (lowercase) and and the values are the mapping IDs for that DB """ mappings = {} for key in self.attr: if key.startswith('map_'): db = key.replace('map_', '').lower() mappings[db] = self.get_mapping(db) return mappings
@property def taxon_id(self): """ .. versionchanged:: 0.3.1 if taxon_id is set to "None" as a string, it's converted to *None* taxon_id of the annotation """ value = self.attr.get('taxon_id', None) try: value = int(value) except (TypeError, ValueError): value = None return value @taxon_id.setter def taxon_id(self, value): self.attr['taxon_id'] = int(value) @property def db(self): "db used for the gene_id prediction" return self.attr.get('db', None) @db.setter def db(self, value): self.attr['db'] = value @property def taxon_db(self): "db used for the taxon_id prediction" return self.attr.get('taxon_db', None) @taxon_db.setter def taxon_db(self, value): self.attr['taxon_db'] = value @property def dbq(self): "db quality of the annotation" try: return self.get_attr('dbq', int) except AttributeNotFound: return None @dbq.setter def dbq(self, value): self.attr['dbq'] = value @property def uid(self): """ .. versionadded:: 0.1.13 uid of the annotation """ value = self.attr.get('uid', None) if value is None: # old data where the unique id is marked as ko_idx value = self.attr.get('ko_idx', None) return value @uid.setter def uid(self, value): self.attr['uid'] = value @property def bitscore(self): "bitscore of the annotation" try: return float(self.attr['bitscore']) except KeyError: # legacy for old data bitscore = self.attr.get('bit_score', None) return None if bitscore is None else float(bitscore) @bitscore.setter def bitscore(self, value): self.attr['bitscore'] = float(value) @property def gene_id(self): "gene_id of the annotation, or *ko* if available" try: return self.attr['gene_id'] except KeyError: # legacy for old data return self.attr.get('ko', None) @gene_id.setter def gene_id(self, value): self.attr['gene_id'] = value @property def length(self): """ .. versionchanged:: 0.2.0 Length of the annotation, uses `len(self)` """ return len(self) @property def region(self): """ .. versionadded:: 0.1.13 Return the *region* covered by the annotation, to use in samtools """ return "{0}:{1}:{2}".format(self.seq_id, self.start, self.end) @property def counts(self): """ .. versionadded:: 0.2.2 Returns the sample counts for the annotation """ counts = {} for key, value in viewitems(self.attr): if key.startswith('counts_'): key = key.replace('counts_', '') counts[key] = float(value) return counts @counts.setter def counts(self, counts): """ .. versionadded:: 0.2.2 Sets the sample counts for the annotation Arguments: counts (dict): key is the sample name and the count for it """ for key, value in viewitems(counts): self.attr["counts_{}".format(key)] = value @property def fpkms(self): """ .. versionadded:: 0.2.2 Returns the sample fpkms for the annotation """ fpkms = {} for key, value in viewitems(self.attr): if key.startswith('fpkms_'): key = key.replace('fpkms_', '') fpkms[key] = float(value) return fpkms @fpkms.setter def fpkms(self, fpkms): """ .. versionadded:: 0.2.2 Sets the sample fpkms for the annotation Arguments: fpkms (dict): key is the sample name and the fpmk for it """ for key, value in viewitems(fpkms): self.attr["fpkms_{}".format(key)] = value
[docs] def add_exp_syn_count(self, seq, syn_matrix=None): """ .. versionadded:: 0.1.13 Adds expected synonymous/non-synonymous values for an annotation. Arguments: seq (str): sequence corresponding to the annotation seq_id syn_matrix (None, dict): matrix that determines the return values. Defaults to the one defined in the called function :func:`mgkit.utils.sequence.get_seq_expected_syn_count`. """ seq = self.get_nuc_seq(seq, reverse=self.strand == '-') syn_count, nonsyn_count = seq_utils.get_seq_expected_syn_count( seq, syn_matrix=syn_matrix ) self.set_attr('exp_syn', syn_count) self.set_attr('exp_nonsyn', nonsyn_count)
[docs] def to_gff(self, sep='='): """ Format the Annotation as a GFF string. Arguments: sep (str): separator key -> value Returns: str: annotation formatted as GFF """ var_names = ( 'seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase' ) values = '\t'.join( str(getattr(self, var_name)) for var_name in var_names ) attr_column = ';'.join( '{0}{1}"{2}"'.format( key, sep, quote(str(self.attr[key]), ' ()/') ) for key in sorted(self.attr) ) return "{0}\t{1}\n".format(values, attr_column)
[docs] def to_dict(self, exclude_attr=None): """ .. versionadded:: 0.3.1 Return a dictionary representation of the Annotation. Arguments: exclude_attr (str,list): attributes to exclude from the dictionary, can be either a single attribute (string) or a list of strings Returns: dict: dictionary with the annotation """ var_names = ( 'seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase' ) dictionary = {} for var_name in var_names: dictionary[var_name] = getattr(self, var_name) dictionary.update(self.attr) if exclude_attr is not None: if isinstance(exclude_attr, str): exclude_attr = [exclude_attr] for attr in exclude_attr: del dictionary[attr] return dictionary
[docs] def to_json(self): """ .. versionadded:: 0.2.1 .. versionchanged:: 0.3.1 now :meth:`Annotation.to_dict` is used Returns a json representation of the Annotation """ return json.dumps(self.to_dict(), separators=(',', ':'))
[docs] def to_mongodb(self, lineage_func=None, indent=None, raw=False): """ .. versionadded:: 0.2.1 .. versionchanged:: 0.2.2 added handling of *counts_* and *fpkms_* .. versionchanged:: 0.2.6 added *indent* parameter .. versionchanged:: 0.3.4 added *raw* Returns a MongoDB document that represent the Annotation. Arguments: lineage (func): function used to populate the lineage key, returns a list of taxon_id indent (int): the amount of indent to put in the record, None (the default) is for the most compact - one line for the record raw (bool): if True, the method returns a string, which is the json dump, if False, the value returned is the dictionary Returns: str or dict: the MongoDB document, with Annotation.uid as _id, as a string if *raw* is True, a dictionary if it is False """ # OrderedDict is necessary to keep the order of the keys dictionary = OrderedDict() # _id must be the first element dictionary['_id'] = self.uid var_names = ( 'seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase', 'gene_id', 'taxon_id', 'bitscore', 'exp_nonsyn', 'exp_syn', 'length', 'dbq', 'coverage', 'uid' ) for var_name in var_names: try: dictionary[var_name] = getattr(self, var_name) except AttributeNotFound: pass ec_ids = self.get_ec() mappings = self.get_mappings() # if one at least has values if ec_ids: mappings['ec'] = list(ec_ids) if lineage_func is not None: dictionary['lineage'] = lineage_func(self.taxon_id) dictionary['map'] = mappings counts = self.counts if counts: dictionary['counts'] = counts fpkms = self.fpkms if fpkms: dictionary['fpkms'] = fpkms # the rest of the dictionary should be put, excluding special keys: # uid is used as _id in the document # EC is put as a array, as is any mapping like map_KO dictionary.update( dict( (key, value) for key, value in viewitems(self.attr) if (key not in var_names) and (key not in ('uid', 'EC')) and (not key.startswith('map_')) and (not key.startswith('counts_')) and (not key.startswith('fpkms_')) ) ) if raw: return dictionary return json.dumps(dictionary, indent=indent, separators=(',', ':'))
[docs] def to_file(self, file_handle): """ Writes the GFF annotation to *file_handle* """ file_handle.write(self.to_gff().encode('ascii'))
[docs] def to_gtf(self, gene_id_attr='uid', sep=' '): """ .. versionadded:: 0.1.15 .. versionchanged:: 0.1.16 added *gene_id_attr* parameter .. versionchanged:: 0.2.2 added *sep* argument, defaulting to a space Simple conversion to a valid GTF. gene_id and transcript_id are set to *uid* or the attribute specified using the *gene_id_attr* parameter. It's written to be used with *SNPDat*. """ var_names = ( 'seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase' ) values = '\t'.join( str(getattr(self, var_name)) for var_name in var_names ) # Keys that need to be at the start of the attributes gtf_attr = ['gene_id', 'transcript_id'] attr_keys = sorted(self.attr.keys()) # eliminate gene_id (always present in new ones) try: attr_keys.remove('gene_id') except ValueError: pass # transcript_id is not always there try: attr_keys.remove('transcript_id') except ValueError: pass attr_values = [self.get_attr(gene_id_attr)] * 2 + [ self.attr[attr_key] for attr_key in attr_keys ] attr_keys = gtf_attr + attr_keys attr_column = '; '.join( '{0}{1}"{2}"'.format( key, sep, quote(value, ' ()/') ) for key, value in zip(attr_keys, attr_values) ) return "{0}\t{1}\n".format(values, attr_column)
@property def sample_coverage(self): """ .. versionadded:: 0.1.13 Returns a dictionary with the coverage for each sample: the keys are the sample ids (stripped of the *_cov* suffix) and the values are the coverage (converted via :func:`float`). :return dict: dictionary with the samples' coverage """ attributes = self.attr return dict( (attribute.replace('_cov', ''), float(value)) for attribute, value in viewitems(attributes) if attribute.endswith('_cov') )
[docs] def get_number_of_samples(self, min_cov=MIN_COV): """ .. versionadded:: 0.1.13 Returns the number of samples that have at least a minimum coverage of `min_cov`. :param int min_cov: minimum coverage :return int: number of samples passing the filter :raise AttributeNotFound: if no sample coverage attribute is found """ coverage = self.sample_coverage if not coverage: raise AttributeNotFound( 'No coverage attribute found (ending in "_cov")' ) return sum( 1 for sample, coverage in viewitems(coverage) if coverage >= min_cov )
[docs] def get_attr(self, attr, conv=str): """ .. versionchanged:: 0.3.4 any GFF attribute can be returned .. versionchanged:: 0.3.3 added *seq_id* as special attribute, in addition to *length* .. versionadded:: 0.1.13 Generic method to get an attribute and convert it to a specific datatype. The order for the lookup is: * length * self.attr (dictionary) * getattr(self) of the first 8 columns of a GFF (seq_id, source, ...) """ if attr == 'length': return len(self) value = self.attr.get(attr, None) if value is not None: return conv(value) if attr in ('seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase'): value = getattr(self, attr, None) if value is not None: return conv(value) raise AttributeNotFound('No {0} attribute found'.format(attr))
[docs] def set_attr(self, attr, value): """ .. versionadded:: 0.1.13 .. versionchanged:: 0.4.4 a standard attribute can now be set Generic method to set an attribute """ if attr in ('seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase'): setattr(self, attr, value) return None self.attr[attr] = value
[docs] def has_attr(self, attr): """ .. versionadded:: 0.4.4 Tests if an attribute is present in the Annotation Arguments: attr (str): attribute to test Returns: bool: True if the attribute is present """ if attr in ('seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase'): return True return attr in self.attr
[docs] def del_attr(self, attr): """ .. versionadded:: 0.4.4 Removes attributes from the Annotation """ if attr in ('seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase'): return None try: del self.attr[attr] except KeyError: pass
@property def coverage(self): """ .. versionadded:: 0.1.13 Return the total coverage for the annotation :return float: coverage :raise AttributeNotFound: if no coverage attribute is found """ return self.get_attr('cov', float) @property def exp_syn(self): """ .. versionadded:: 0.1.13 Returns the expected number of synonymous changes """ return self.get_attr('exp_syn', int) @property def exp_nonsyn(self): """ .. versionadded:: 0.1.13 Returns the expected number of non-synonymous changes """ return self.get_attr('exp_nonsyn', int)
[docs] def get_nuc_seq(self, seq, reverse=False, snp=None): """ .. versionadded:: 0.1.13 .. versionchanged:: 0.1.16 added *snp* parameter Returns the nucleotidic sequence that the annotation covers. if the annotation's strand is *-*, and *reverse* is True, the reverse complement is returned. Arguments: seq (seq): chromosome/contig sequence reverse (bool): if True and the strand is '-', a reverse complement is returned snp (tuple): first element is the position of the SNP relative to the Annotation and the second element is the change Returns: str: nucleotide sequence with requested transformations """ ann_seq = seq[self.start - 1:self.end] if snp is not None: ann_seq = seq_utils.get_variant_sequence(ann_seq, snp) if (self.strand == '-') and reverse: ann_seq = seq_utils.reverse_complement(ann_seq) return ann_seq
[docs] def get_aa_seq(self, seq, start=0, tbl=None, snp=None): """ .. versionadded:: 0.1.16 Returns a translated aminoacid sequence of the annotation. The snp parameter is passed to :meth:`Annotation.get_nuc_seq` Arguments: seq (seq): chromosome/contig sequence start (int): position (0-based) from where the translation starts (frame). If None, the phase attribute is used tbl (dict): dictionary with the translation for each codon, passed to :func:`mgkit.utils.sequence.translate_sequence` snp (tuple): first element is the position of the SNP and the second element is the change Returns: str: aminoacid sequence """ if start is None: start = self.phase nuc_seq = self.get_nuc_seq(seq, reverse=True, snp=snp) return seq_utils.translate_sequence( nuc_seq, start=start, tbl=tbl, reverse=False )
[docs] def add_gc_content(self, seq): """ Adds GC content information for an annotation. The formula is: .. math:: :label: gc_content_gff \\frac {(G + C)}{(G + C + A + T)} Modifies the instance of the annotation. gc_cont will be added to its attributes. Arguments: seq (str): nucleotide sequence referred in the GFF """ ann_seq = self.get_nuc_seq( seq, reverse=True if self.strand == '-' else False ) at_sum = (ann_seq.count('A') + ann_seq.count('T')) gc_sum = (ann_seq.count('G') + ann_seq.count('C')) gc_cont = gc_sum / (gc_sum + at_sum) self.set_attr('gc_cont', gc_cont)
[docs] def add_gc_ratio(self, seq): """ Adds GC content information for an annotation. The formula is: .. math:: :label: gc_ratio_gff \\frac {(A + T)}{(G + C)} Modifies the instances of the annotation. gc_ratio will be added to its attributes. Arguments: seq (str): nucleotide sequence referred in the GFF """ ann_seq = self.get_nuc_seq( seq, reverse=True if self.strand == '-' else False ) at_sum = (ann_seq.count('A') + ann_seq.count('T')) gc_sum = (ann_seq.count('G') + ann_seq.count('C')) gc_ratio = at_sum / gc_sum self.set_attr('gc_ratio', gc_ratio)
[docs] def is_syn(self, seq, pos, change, tbl=None, abs_pos=True, start=0, strict=True): """ .. versionadded:: 0.1.16 .. versionchanged:: 0.4.4 added *strict* parameter Return if a SNP is synonymous or non-synonymous. Arguments: seq (seq): reference sequence of the annotation pos (int): position of the SNP on the reference (1-based index) change (str): nucleotidic change tbl (dict): dictionary with the translation table. Defaults to the universal genetic code abs_pos (bool): if True the *pos* is referred to the reference and not a position relative to the annotation start (int or None): phase to be used to get the start position of the codon. If None, the Annotation phase will be used strict (bool): if a variant codon is not found, a KeyError is raised, otherwise *False* is returned Returns: bool: True if the SNP is synonymous, False if it's non-synonymous. Behaviour in case of variant codons not found in the translation table changes based on *strict* Raises: KeyError: if the variant codon is not found and *strict* is True """ if abs_pos: rel_pos = self.get_relative_pos(pos) else: rel_pos = pos if start is None: start = self.phase if tbl is None: tbl = UNIVERSAL # codon number in the sequence codon_index = (rel_pos - start - 1) // 3 # the position to slice the seq to get a codon (0-based). It takes into # account the phase (start) and the codon index seq_start = (self.start + start + (codon_index * 3) - 1) # position in the codon using the relative position and the phase/frame # the modulo will give 1, 2 or 0. -1 will shift the position correctly codon_change = ((rel_pos - start) % 3) - 1 codon = seq[seq_start:seq_start+3] var_codon = list(codon) var_codon[codon_change] = change var_codon = ''.join(var_codon) if self.strand == '-': codon = seq_utils.reverse_complement(codon) var_codon = seq_utils.reverse_complement(var_codon) try: return tbl[codon] == tbl[var_codon] except KeyError: LOG.warning("""Annotation %s has an unrecognised codon: reference %s, variant %s""", self.uid, codon, var_codon) if strict: raise KeyError("Variant codon not found: {}".format(var_codon)) else: return False
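# Usage sketch for Annotation: extra keyword arguments end up in the attribute
# column (the attr dict) and can be read back with the properties and get_attr;
# the contig sequence and coordinates below are made up for illustration.
def _example_annotation():
    contig = 'ATGAAATTTGGGTAA'
    annotation = Annotation(
        seq_id='contig1', start=1, end=15, strand='+', feat_type='CDS',
        gene_id='K00001', taxon_id=2, EC='1.1.1.1,2.2.2.2'
    )
    annotation.set_mapping('ko', ['K00001'])
    ec_level2 = annotation.get_ec(level=2)       # {'1.1', '2.2'}
    mappings = annotation.get_mappings()         # {'ko': ['K00001']}
    gff_line = annotation.to_gff()               # tab-separated GFF line
    nuc = annotation.get_nuc_seq(contig)         # 'ATGAAATTTGGGTAA'
    # is the A->G change at reference position 6 synonymous? (AAA -> AAG, both Lys)
    synonymous = annotation.is_syn(contig, 6, 'G')
    return ec_level2, mappings, gff_line, nuc, synonymous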
[docs]def from_glimmer3(header, line, feat_type='CDS'): """ .. versionadded:: 0.1.12 Parses the line of a GLIMMER3 output and returns an instance of a GFF annotation. Arguments: header (str): the seq_id to which the ORF belongs line (str): the prediction line for the ORF feat_type (str): the feature type to use Returns: Annotation: instance of annotation Example: Assuming a GLIMMER3 output like this:: >sequence0001 orf00001 66 611 +3 6.08 The code used is: >>> header = 'sequence0001' >>> line = 'orf00001 66 611 +3 6.08' >>> from_glimmer3(header, line) """ if isinstance(line, bytes): line = line.decode('ascii') orf_id, start, end, frame, score = line.split() start = int(start) end = int(end) if start > end: start, end = end, start annotation = Annotation( seq_id=header, source='GLIMMER3', feat_type=feat_type, start=start, end=end, score=float(score), strand=frame[0], phase=int(frame[1]) - 1, frame=frame, glimmer_score=float(score), orf_id=orf_id ) return annotation
[docs]class DuplicateKeyError(Exception): """ .. versionadded:: 0.1.12 Raised if a GFF annotation contains duplicate keys """ pass
[docs]def from_gff(line, strict=True, encoding='ascii'): """ .. versionadded:: 0.1.12 .. versionchanged:: 0.2.6 added *strict* parameter .. versionchanged:: 0.4.0 added *encoding* parameter Parse GFF line and returns an :class:`Annotation` instance Arguments: line (str): GFF line strict (bool): if True duplicate keys raise an exception Returns: Annotation: instance of :class:`Annotation` for the line Raises: DuplicateKeyError: if the attribute column has duplicate keys """ if isinstance(line, bytes): line = line.decode(encoding) line = line.rstrip().split('\t') # in case the last column (attributes) is empty if len(line) < 9: values = line # bug in which the phase was not written if len(line[-1]) > 1: line.insert(-1, 0) else: values = line[:-1] var_names = ( 'seq_id', 'source', 'feat_type', 'start', 'end', 'score', 'strand', 'phase' ) # the phase sometimes can be set as unknown, using '-'. We prefer using 0 var_types = (str, str, str, int, int, float, str, lambda x: 0 if x == '' else int(x)) attr = {} for var, value, vtype in zip(var_names, values, var_types): try: attr[var] = vtype(value) except ValueError: attr[var] = value # in case the last column (attributes) is empty if len(line) < 9: return Annotation(**attr) for pair in line[-1].split(';'): try: # by default the key,value separator '=' is assumed to be used var, value = pair.strip().split('=', 1) except ValueError: # in case it doesn't work, it is assumed to be a space if ' ' in pair.strip(): var, value = pair.strip().split(' ', 1) else: # case in which there's an attribute but no value, like a bool var = pair.strip() value = None if (var in attr) and strict: raise DuplicateKeyError("Duplicate attribute: {0}".format(var)) # skips possible key/values generated by the line ending with a ';' if not var: continue if value is not None: value = unquote(value.replace('"', '')) attr[var] = value return Annotation(**attr)
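# Usage sketch for from_gff: parsing a single, tab-separated GFF line; the
# attributes column becomes the attr dictionary of the returned Annotation.
def _example_from_gff():
    line = 'contig1\tBLAST\tCDS\t10\t100\t30.0\t+\t0\tgene_id="K00001";taxon_id="2"'
    annotation = from_gff(line)
    return annotation.gene_id, annotation.taxon_id, annotation.start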
[docs]def from_sequence(name, seq, feat_type='SEQUENCE', **kwd): """ .. versionadded:: 0.1.12 Returns an instance of :class:`Annotation` for the full length of a sequence Arguments: name (str): name of the sequence seq (str): sequence, to get the length of the annotation Keyword Args: feat_type (str): feature type in the GFF **kwd: any additional column Returns: Annotation: instance of :class:`Annotation` """ annotation = Annotation( seq_id=name, source='SEQUENCE', feat_type=feat_type, start=1, end=len(seq), score=0.0, strand='+', phase=0, sequence=name, **kwd ) return annotation
[docs]def from_aa_blast_frag(hit, parent_ann, aa_seqs): frag_id, frame = hit[0].split('-') strand = '+' if frame.startswith('f') else '-' frame = int(frame[1]) identity = hit[2] bitscore = hit[-1] start = hit[3] end = hit[4] if strand == '-': start, end = seq_utils.reverse_aa_coord( start, end, len(aa_seqs[hit[0]]) ) start, end = seq_utils.convert_aa_to_nuc_coord(start, end, frame) annotation = Annotation( seq_id=parent_ann.seq_id, source='BLAST', feat_type='CDS', start=start + parent_ann.start - 1, end=end + parent_ann.start - 1, score=bitscore, strand=strand, phase=frame, db='UNIPROT', gene_id=hit[1], identity=identity, bitscore=bitscore, ID=frag_id ) return annotation
[docs]def from_nuc_blast_frag(hit, parent_ann, db='NCBI-NT'): frag_id = hit[0] strand = '+' identity = hit[2] bitscore = hit[-1] start = hit[3] end = hit[4] annotation = Annotation( seq_id=parent_ann.seq_id, source='BLAST', feat_type='CDS', start=start + parent_ann.start - 1, end=end + parent_ann.start - 1, score=bitscore, strand=strand, phase=0, db=db, gene_id=hit[1], identity=identity, bitscore=bitscore, ID=frag_id ) return annotation
[docs]def annotate_sequence(name, seq, window=None): length = len(seq) if window is None: window = length for index in range(1, length, window): annotation = from_sequence(name, seq) annotation.start = index annotation.end = index + window - 1 if annotation.end > length: annotation.end = length yield annotation
[docs]def from_nuc_blast(hit, db, feat_type='CDS', seq_len=None, to_nuc=False, **kwd): """ .. versionadded:: 0.1.12 .. versionchanged:: 0.1.16 added *to_nuc* parameter .. versionchanged:: 0.2.3 removed *to_nuc*, the hit can include the subject end/start and evalue Returns an instance of :class:`Annotation` Arguments: hit (tuple): a BLAST hit, from :func:`mgkit.io.blast.parse_blast_tab` db (str): db used with BLAST Keyword Args: feat_type (str): feature type in the GFF seq_len (int): sequence length, if supplied, the phase for strand '-' can be assigned, otherwise is assigned a 0 **kwd: any additional column Returns: Annotation: instance of :class:`Annotation` """ seq_id = hit[0] gene_id = hit[1] strand = '+' identity = hit[2] bitscore = hit[-1] start = hit[3] end = hit[4] phase = 0 if start > end: start, end = end, start strand = '-' if seq_len is not None: phase = (seq_len - end) % 3 if strand == '+': phase = (start - 1) % 3 annotation = Annotation( seq_id=seq_id, source='BLAST', feat_type=feat_type, start=start, end=end, score=bitscore, strand=strand, phase=phase, db=db, gene_id=gene_id, identity=identity, bitscore=bitscore, frame="{}{}".format('f' if strand == '+' else 'r', phase), **kwd ) # the hit includes subject end/start and evalue, as per new version of # mgkit.io.blast.parse_uniprot_blast if len(hit) == 9: annotation.attr['evalue'] = hit[-2] annotation.attr['subject_end'] = hit[-3] annotation.attr['subject_start'] = hit[-4] return annotation
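# Usage sketch for from_nuc_blast: the function reads the query and subject IDs,
# the identity, the query start/end and the bitscore (last element) from the hit
# tuple; the values below are invented.
def _example_from_nuc_blast():
    hit = ('contig1', 'gene1', 97.5, 301, 30, 250.0)
    annotation = from_nuc_blast(hit, 'NCBI-NT', seq_len=400)
    # start > end in the hit, so the annotation is placed on the '-' strand
    return annotation.strand, annotation.start, annotation.end, annotation.phase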
[docs]def from_json(line): """ .. versionadded:: 0.2.1 Returns an Annotation from a json representation """ return Annotation(**json.loads(line))
[docs]def from_hmmer(line, aa_seqs, feat_type='gene', source='HMMER', db='CUSTOM', custom_profiles=True, noframe=False): """ .. versionadded:: 0.1.15 first implementation to move old scripts to new GFF specs .. versionchanged:: 0.2.1 removed compatibility with old scripts .. versionchanged:: 0.2.2 taxon_id and taxon_name are not saved for non-custom profiles .. versionchanged:: 0.3.1 added support for non mgkit-translated sequences (*noframe*) Parse HMMER results (one line), it won't parse commented lines (starting with *#*) Arguments: line (str): HMMER domain table line aa_seqs (dict): dictionary with amino-acid sequences (name->seq), used to get the correct nucleotide positions feat_type (str): string to be used in the 'feature type' column source (str): string to be used in the 'source' column custom_profiles (bool): if True, the profile name contains gene, taxonomy and reviewed information in the form KOID_TAXONID_TAXON-NAME(-nr) noframe (bool): if True, the sequence is assumed to be in frame f0 Returns: A :class:`Annotation` instance .. note:: if `custom_profiles` is False, gene_id, taxon_id and taxon_name will be equal to the profile name """ if isinstance(line, bytes): line = line.decode('ascii') line = line.split() if noframe: # no information on the frame is provided (already a protein, so f0) frame = 'f0' contig = line[0] else: contig, frame = line[0].rsplit('-', 1) t_from = int(line[17]) t_to = int(line[18]) # first get coordinate if sequence is reversed if frame.startswith('r'): seq_len = len(aa_seqs[line[0]]) t_from, t_to = seq_utils.reverse_aa_coord(t_from, t_to, seq_len) # necessary only if frame information available if not noframe: # converts in nucleotide coordinates t_from, t_to = seq_utils.convert_aa_to_nuc_coord( t_from, t_to, frame=int(frame[-1]) ) # maintains the aa coordinates aa_from = int(line[17]) aa_to = int(line[18]) profile_name = line[3] score = float(line[6]) if custom_profiles: # KOID_TAXONID_TAXON-NAME(-nr) reviewed = 'False' if profile_name.endswith('-nr') else 'True' gene_id, taxon_id, taxon_name = profile_name.split('_') else: gene_id = profile_name taxon_id = taxon_name = None annotation = Annotation( seq_id=contig, source=source, feat_type=feat_type, start=t_from, end=t_to, score=score, strand='-' if frame.startswith('r') else '+', phase=int(frame[1]), db=db, gene_id=gene_id, taxon_id=taxon_id, bitscore=float(line[7]), # custom for HMMER profiles aa_from=aa_from, aa_to=aa_to, # stores the aa sequence aa_seq=aa_seqs[line[0]][aa_from - 1:aa_to], # evalue evalue=score, # maintains HMMER profile information: # profile name name=profile_name, # both strand/phase (e.g r2) frame=frame, # old version of uid # ko_idx=ko_idx, # used in other old profiles, where the taxon name was used instead # of a taxon ID taxon_name=taxon_name ) try: annotation.attr['reviewed'] = reviewed except UnboundLocalError: pass # removes the None values from non-custom profiles if taxon_id is None: del annotation.attr['taxon_id'] if taxon_name is None: del annotation.attr['taxon_name'] return annotation
[docs]def parse_gff(file_handle, gff_type=from_gff, strict=True, encoding='ascii'): """ .. versionchanged:: 0.4.0 In some cases ASCII decoding is not enough, so it is parametrised now .. versionchanged:: 0.3.4 added decoding from binary for compatibility with Python3 .. versionchanged:: 0.2.6 added *strict* parameter .. versionchanged:: 0.2.3 correct handling of GFF with comments or appended sequences .. versionchanged:: 0.1.12 added *gff_type* parameter Parse a GFF file and returns a generator of :class:`Annotation` instances Accepts a file handle or a string with the file name Arguments: file_handle (str, file): file name or file handle to read from gff_type (class): class/function used to parse a GFF annotation strict (bool): if True duplicate keys raise an exception encoding (str): encoding of the file, if ascii fails, use utf8 Yields: Annotation: an iterator of :class:`Annotation` instances """ file_handle = mgkit.io.open_file(file_handle, 'rb') LOG.info( "Loading GFF from file (%s)", getattr(file_handle, 'name', repr(file_handle)) ) index = 0 for index, line in enumerate(file_handle): try: line = line.decode(encoding) except UnicodeError: raise UnicodeError("Impossible to decode line to {}: {}".format(encoding, line)) # the first is for GFF with comments and the second for # GFF with the fasta file attached if line.startswith('#'): continue if line.startswith('>'): break annotation = gff_type(line, strict=strict) yield annotation LOG.info( "Read %d lines from file (%s)", index + 1, getattr(file_handle, 'name', repr(file_handle)) )
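# Usage sketch for parse_gff: the parser is a generator, so a file can be
# summarised without loading it fully into memory; 'input.gff' is a placeholder.
def _example_count_feat_types(file_name='input.gff'):
    from collections import Counter
    return Counter(annotation.feat_type for annotation in parse_gff(file_name))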
[docs]def diff_gff(files, key_func=None): """ .. versionadded:: 0.1.12 Returns a simple diff made between a list of gff files. The annotations are grouped using *key_func*, so it depends on it to find similar annotations. Arguments: files (iterable): an iterable of file handles, pointing to GFF files key_func (func): function used to group annotations, defaults to this key: *(x.seq_id, x.strand, x.start, x.end, x.gene_id, x.bitscore)* Returns: dict: the returned dictionary keys are determined by key_func and as values lists. The lists elements are tuple whose first element is the index of the file, relative to *files* and the second element is the line number in which the annotation is. Can be used with the :mod:`linecache` module. """ if isinstance(files, str) or len(files) == 1: return if key_func is None: def key_func(x): return (x.seq_id, x.strand, x.start, x.end, x.gene_id, x.bitscore) gff_diff = {} for index, file_handle in enumerate(files): for lineno, annotation in enumerate(parse_gff(file_handle)): key = key_func(annotation) try: gff_diff[key].append((index, lineno)) except KeyError: gff_diff[key] = [(index, lineno)] return gff_diff
[docs]def annotation_elongation(ann1, annotations): """ .. versionadded:: 0.1.12 Given an :class:`Annotation` instance and a list of the instances of the same class, returns the longest overlapping range that can be found and the annotations that are included in it. .. warning:: annotations are not checked for seq_id and strand Arguments: ann1 (Annotation): annotation to elongate annotations (iterable): iterable of :class:`Annotation` instances Returns: tuple: the first element is the longest range found, while the the second element is a set with the annotations used """ used = set([ann1]) union = (ann1.start, ann1.end) for ann2 in annotations: new_union = union_range(union[0], union[1], ann2.start, ann2.end) if new_union is not None: used.add(ann2) union = new_union return union, used
[docs]def elongate_annotations(annotations): """ .. versionadded:: 0.1.12 Given an iterable of :class:`Annotation` instances, tries to find the all possible longest ranges and returns them. .. warning:: annotations are not checked for seq_id and strand Arguments: annotations (iterable): iterable of :class:`Annotation` instances Returns: set: set with the all ranges found """ annotations = sorted(annotations, key=lambda x: x.start) ranges = set() while len(annotations) > 0: ann1 = annotations.pop(0) union, used = annotation_elongation(ann1, annotations) if union is None: ranges.add((ann1.start, ann1.end)) else: annotations = sorted(set(annotations) - used, key=lambda x: x.start) ranges.add(union) return ranges
[docs]def annotation_coverage(annotations, seqs, strand=True): """ .. versionadded:: 0.1.12 Given a list of annotations and a dictionary where the keys are the sequence names referred in the annotations and the values are the sequences themselves, returns a number which indicated how much the sequence length is "covered" in annotations. If *strand* is True the coverage is strand specific. Arguments: annotations (iterable): iterable of :class:`Annotation` instances seqs (dict): dictionary in which the keys are the sequence names and the values are the sequences strand (bool): if True, the values are strand specific (the annotations) are grouped by (seq_id, strand) instead of seq_id Yields: tuple: the first element is the key, (seq_id, strand) if *strand* is True or seq_id if *strand* is False, and the coverage is the second value. """ if strand: def key_func(x): return (x.seq_id, x.strand) else: def key_func(x): return x.seq_id annotations = group_annotations( annotations, key_func=key_func ) for key, key_ann in viewitems(annotations): if isinstance(key, str): seq_len = len(seqs[key]) else: seq_len = len(seqs[key[0]]) covered = ranges_length(elongate_annotations(key_ann)) yield key, covered / seq_len * 100
[docs]def annotation_coverage_sorted(annotations, seqs, strand=True): """ .. versionadded:: 0.3.1 Given a list of annotations and a dictionary where the keys are the sequence names referred in the annotations and the values are the sequences themselves, returns a number which indicates how much the sequence length is "covered" in annotations. If *strand* is True the coverage is strand specific. .. note:: It differs from :func:`annotation_coverage` because it assumes the annotations are correctly sorted and because the values yielded are different Arguments: annotations (iterable): iterable of :class:`Annotation` instances seqs (dict): dictionary in which the keys are the sequence names and the values are the sequences strand (bool): if True, the values are strand specific (the annotations are grouped by (seq_id, strand) instead of seq_id) Yields: tuple: the first element is the seq_id, the second the strand (if strand is True, else it's set to *None*), and the third element is the coverage. """ if strand: def key_func(x): return (x.seq_id, x.strand) else: def key_func(x): return x.seq_id annotations = group_annotations_sorted( annotations, key_func=key_func ) for ann in annotations: seq_id = ann[0].seq_id if strand: ann_strand = ann[0].strand else: ann_strand = None seq_len = len(seqs[seq_id]) covered = ranges_length(elongate_annotations(ann)) yield seq_id, ann_strand, covered / seq_len * 100
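# Usage sketch for the coverage functions: `seqs` maps each sequence name to its
# sequence (e.g. read from a FASTA file); the values yielded are percentages of
# the sequence length covered by annotations.
def _example_coverage(annotations, seqs):
    # strand=False groups the annotations by seq_id only
    return dict(annotation_coverage(annotations, seqs, strand=False))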
[docs]def group_annotations(annotations, key_func=lambda x: (x.seq_id, x.strand)): """ .. versionadded:: 0.1.12 Group :class:`Annotation` instances in a dictionary by using a key function that returns the key to be used in the dictionary. Arguments: annotations (iterable): iterable with :class:`Annotation` instances key_func (func): function used to extract the key used in the dictionary, defaults to a function that returns (ann.seq_id, ann.strand) Returns: dict: dictionary whose keys are returned by *key_func* and the values are lists of annotations Example: >>> ann = [Annotation(seq_id='seq1', strand='+', start=10, end=15), ... Annotation(seq_id='seq1', strand='+', start=1, end=5), ... Annotation(seq_id='seq1', strand='-', start=30, end=100)] >>> group_annotations(ann) {('seq1', '+'): [seq1(+):10-15, seq1(+):1-5], ('seq1', '-'): [seq1(-):30-100]} """ grouped = {} for annotation in annotations: key = key_func(annotation) try: grouped[key].append(annotation) except KeyError: grouped[key] = [annotation] return grouped
[docs]def group_annotations_sorted(annotations, key_func=lambda x: (x.seq_id, x.strand)): """ .. versionadded:: 0.1.13 Group :class:`Annotation` instances by using a key function that returns a key. Assumes that the annotations are already sorted to return an iterator and save memory. One way to sort them is using: `sort -s -k 1,1 -k 7,7` on the file. Arguments: annotations (iterable): iterable with :class:`Annotation` instances key_func (func): function used to extract the key used in the dictionary, defaults to a function that returns (ann.seq_id, ann.strand) Yields: list: a list of the grouped annotations by *key_func* values """ curr_key = '' curr_ann = [] for annotation in annotations: new_key = key_func(annotation) if curr_key == new_key: curr_ann.append(annotation) else: if curr_key == '': curr_ann.append(annotation) curr_key = new_key else: yield curr_ann curr_key = new_key curr_ann = [annotation] else: yield curr_ann
[docs]def extract_nuc_seqs(annotations, seqs, name_func=lambda x: x.uid, reverse=False): """ .. versionadded:: 0.1.13 Extract the nucleotidic sequences from a list of annotations. Internally uses the method :meth:`Annotation.get_nuc_seq`. Arguments: annotations (iterable): iterable of :class:`Annotation` instances seqs (dict): dictionary with the sequences referenced in the annotations name_func (func): function used to extract the sequence name to be used, defaults to the uid of the annotation reverse (bool): if True the annotations on the *-* strand are reverse complemented Yields: tuple: tuple whose first element is the sequence name and the second is the sequence to which the annotation refers. """ for annotation in annotations: name = name_func(annotation) seq = annotation.get_nuc_seq(seqs[annotation.seq_id], reverse=reverse) yield name, seq
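# Usage sketch for extract_nuc_seqs: the generator yields (name, sequence)
# tuples, written here as a minimal FASTA; `seqs` maps each seq_id referenced by
# the annotations to its nucleotide sequence and 'genes.fasta' is a placeholder.
def _example_write_gene_sequences(annotations, seqs, out_file='genes.fasta'):
    with open(out_file, 'w') as handle:
        for name, seq in extract_nuc_seqs(annotations, seqs, reverse=True):
            handle.write(">{}\n{}\n".format(name, seq))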
[docs]def group_annotations_by_ancestor(annotations, ancestors, taxonomy): """ .. versionadded:: 0.1.13 Group annotations by the ancestors provided. Arguments: annotations (iterable): annotations to group ancestors (iterable): list of ancestors accepted taxonomy: taxonomy class Returns: dict: grouped annotations """ ann_dict = dict((ancestor, []) for ancestor in ancestors) unknown = [] for annotation in annotations: anc_found = False for ancestor, anc_ids in viewitems(ancestors): if taxonomy.is_ancestor(annotation.taxon_id, anc_ids): ann_dict[ancestor].append(annotation) anc_found = True break if not anc_found: unknown.append(annotation) return ann_dict, unknown
[docs]def split_gff_file(file_handle, name_mask, num_files=2, encoding='ascii'): """ .. versionadded:: 0.1.14 .. versionchanged:: 0.2.6 now accept a file object as sole input .. versionchanged:: 0.4.0 added *encoding* parameter Splits a GFF, or a list of them, into a number of files. It is assured that annotations for the same sequence are kept in the same file, which is useful for cases like filtering, even when the annotations are from different GFF files. Internally, a structure is kept to check if a sequence ID is already been stored to a file, in which case the annotation is written to that file, otherwise a random file handles (among the open ones) is chosen. Arguments: file_handle (str, list): a single or list of file handles (or file names), from which the GFF annotations are read name_mask (str): a string used as template for the output file names on which the function applies :func:`string.format` num_files (int): the number of files to split the records Example: >>> import glob >>> files = glob.glob('*.gff') >>> name_mask = 'split-file-{0}.gff' >>> split_gff_file(files, name_mask, 5) """ if not isinstance(file_handle, IOBase): if isinstance(file_handle, str): file_handle = [file_handle] file_handle = itertools.chain( *(mgkit.io.open_file(x, 'rb') for x in file_handle) ) out_handles = [ mgkit.io.open_file(name_mask.format(filen), 'wb') for filen in range(num_files) ] seq_ids = {} for line in file_handle: line = line.decode(encoding) if line.startswith('#'): continue if line.startswith('>'): break seq_id = line.split('\t')[0] try: out_handle = out_handles[seq_ids[seq_id]] except KeyError: new_index = random.randint(0, num_files - 1) seq_ids[seq_id] = new_index out_handle = out_handles[new_index] out_handle.write(line.encode('ascii'))
[docs]def load_gff_base_info(files, taxonomy=None, exclude_ids=None, include_taxa=None, encoding='ascii'): """ This function is useful if the number of annotations in a GFF is high or there are memory constraints on the system. It returns a dictionary that can be used with functions like :func:`mgkit.counts.func.load_sample_counts`. Arguments: files (iterable, str): file name or list of paths of GFF files taxonomy: taxonomy pickle file, needed if include_taxa is not None exclude_ids (set, list): a list of gene_id to exclude from the dictionary include_taxa (int, iterable): a taxon_id or list thereof to be passed to :meth:`mgkit.taxon.taxonomy.is_ancestor`, so only the taxa that have the those taxon_id(s) as ancestor(s) are kept encoding (str): passed to :func:`parse_gff` Returns: dict: dictionary where the key is :attr:`Annotation.uid` and the value is a tuple (:attr:`Annotation.gene_id`, :attr:`Annotation.taxon_id`) """ if isinstance(files, str): files = [files] infos = {} for fname in files: for annotation in parse_gff(fname, encoding=encoding): # no information on taxa - exclude if annotation.taxon_id is None: continue # to exclude ribosomial genes or any other kind if exclude_ids is not None: if annotation.gene_id in exclude_ids: continue if (include_taxa is not None) and (taxonomy is not None): if not taxonomy.is_ancestor(annotation.taxon_id, include_taxa): continue infos[annotation.uid] = (annotation.gene_id, annotation.taxon_id) return infos
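# Usage sketch for load_gff_base_info: builds the uid -> (gene_id, taxon_id)
# dictionary used by the counts loading functions; 'assembly.gff' is a
# placeholder file name and no taxonomy filtering is requested in this call.
def _example_gene_taxon_map(file_name='assembly.gff'):
    return load_gff_base_info(file_name)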
[docs]def load_gff_mappings(files, map_db, taxonomy=None, exclude_ids=None, include_taxa=None, encoding='ascii'): """ This function is useful if the number of annotations in a GFF is high or there are memory constraints on the system. It returns a dictionary that can be used with functions like :func:`mgkit.counts.func.load_sample_counts`. Arguments: files (iterable, str): file name or list of paths of GFF files map_db (str): any kind mapping in the GFF, as passed to :meth:`Annotation.get_mapping` taxonomy: taxonomy pickle file, needed if include_taxa is not None exclude_ids (set, list): a list of gene_id to exclude from the dictionary include_taxa (int, iterable): a taxon_id or list thereof to be passed to :meth:`mgkit.taxon.taxonomy.is_ancestor`, so only the taxa that have the those taxon_id(s) as ancestor(s) are kept encoding (str): passed to :func:`parse_gff` Returns: dict: dictionary where the key is :attr:`Annotation.gene_id` and the value is a list of mappings, as returned by :meth:`Annotation.get_mapping` """ infos = {} for fname in files: for annotation in parse_gff(fname, encoding=encoding): # skips genes that are already in the mapping if annotation.gene_id in infos: continue # exclude genes with no taxonomic information if annotation.taxon_id is None: continue if exclude_ids is not None: if annotation.gene_id in exclude_ids: continue # skips non bacterial/achaeal genes if (include_taxa is not None) and (taxonomy is not None): if not taxonomy.is_ancestor(annotation.taxon_id, include_taxa): continue infos[annotation.gene_id] = annotation.get_mapping(map_db) return infos
[docs]def parse_gff_files(files, strict=True): """ .. versionadded:: 0.1.15 .. versionchanged:: 0.2.6 added *strict* parameter Function that returns an iterator of annotations from multiple GFF files. Arguments: files (iterable, str): iterable of file names of GFF files, or a single file name strict (bool): if True duplicate keys raise an exception Yields: :class:`Annotation`: iterator of annotations """ if isinstance(files, str): files = [files] return itertools.chain(*(parse_gff(file_name, strict=strict) for file_name in files))
[docs]def get_annotation_map(annotations, key_func, value_func): """ .. versionadded:: 0.1.15 Applies two functions to an iterable of annotations with an iterator returned with the applied functions. Useful to build a dictionary Arguments: annotations (iterable): iterable of annotations key_func (func): function that accept an annotation as argument and returns one value, the first of the returned tuple value_func (func): function that accept an annotation as argument and returns one value, the second of the returned tuple Yields: tuple: a tuple where the first value is the result of *key_func* on the passed annotation and the second is the value returned by *value_func* on the same annotation """ for annotation in annotations: yield key_func(annotation), value_func(annotation)
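# Usage sketch for get_annotation_map: building a uid -> taxon_id dictionary
# from a stream of annotations.
def _example_uid_taxon_map(annotations):
    return dict(
        get_annotation_map(annotations, lambda x: x.uid, lambda x: x.taxon_id)
    )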
[docs]def convert_gff_to_gtf(file_in, file_out, gene_id_attr='uid'): """ .. versionadded:: 0.1.16 Function that uses :meth:`Annotation.to_gtf` to convert a GFF into GTF. Arguments: file_in (str, file): either file name or file handle of a GFF file file_out (str): file name to which write the converted annotations """ LOG.info("Writing GTF file to %s", file_out) file_out = open(file_out, 'w') for annotation in parse_gff(file_in): file_out.write(annotation.to_gtf())
[docs]def from_mongodb(record, lineage=True): """ .. versionadded:: 0.2.1 .. versionchanged:: 0.2.2 added handling of *counts_* and *fpkms_* .. versionchanged:: 0.2.6 better handling of missing attributes and added *lineage* parameter Returns an :class:`Annotation` instance from a MongoDB record (created using :meth:`Annotation.to_mongodb`). The actual record returned by pymongo is a dictionary that is copied, manipulated and passed to :meth:`Annotation.__init__`. Arguments: record (dict): a dictionary with the full record from a MongoDB query lineage (bool): indicates if the lineage information in the record should be kept in the annotation Returns: Annotation: instance of :class:`Annotation` object """ record = record.copy() record['uid'] = record['_id'] del record['_id'] if 'map' in record: mappings = record['map'].copy() del record['map'] try: record['EC'] = ','.join(mappings['ec']) del mappings['ec'] except KeyError: pass try: for key in mappings: record['map_{}'.format(key.upper())] = ','.join(mappings[key]) except KeyError: pass try: counts = record['counts'].copy() del record['counts'] for key in counts: record['counts_{}'.format(key)] = counts[key] except KeyError: pass try: fpkms = record['fpkms'].copy() del record['fpkms'] for key in fpkms: record['fpkms_{}'.format(key)] = fpkms[key] except KeyError: pass if ('lineage' in record) and (not lineage): del record['lineage'] return Annotation(**record)
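# Usage sketch for from_mongodb: rebuilding Annotation instances from documents
# stored with Annotation.to_mongodb; `collection` is assumed to be a pymongo
# collection and the query below is only an example filter.
def _example_from_mongodb(collection):
    for record in collection.find({'gene_id': 'K00001'}):
        yield from_mongodb(record)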
[docs]def from_prodigal_frag(main_gff, blast_gff, attr='ID', split_func=None): """ .. versionchanged:: 0.3.3 fixed a bug for the strand, also the code is tested .. versionadded:: 0.2.6 *experimental* Reads the GFF given in output by PRODIGAL and the resulting GFF from using BLAST (or other software) on the aa or nucleotide file output by PRODIGAL. It then integrates the two outputs, so to the PRODIGAL GFF is added the information from the the output of the gene prediction software used. Arguments: main_gff (file): GFF file from PRODIGAL blast_gff (file): GFF with the returned annotations attr (str): attribute in the PRODIGAL GFF that is used to identify an annotation split_func (func): function to rename the headers from the predicted sequences back to their parent sequence Yields: annotation: annotation for each *blast_gff* back translated """ if split_func is None: def split_func(x): return tuple(x.rsplit('_', 1)) prodigal_gff = {} for annotation in parse_gff(main_gff, strict=False): key = ( annotation.seq_id, split_func(annotation.get_attr(attr))[1] ) prodigal_gff[key] = ( annotation.start, annotation.end, annotation.strand, annotation.get_attr(attr) ) for annotation in parse_gff(blast_gff): key = split_func(annotation.seq_id) annotation.set_attr('prodigal_start', annotation.start) annotation.set_attr('prodigal_end', annotation.end) annotation.set_attr('prodigal_strand', annotation.strand) start, end, strand, p_id = prodigal_gff[key] annotation.seq_id = key[0] annotation.start = start annotation.end = end annotation.strand = strand annotation.set_attr('prodigal_ID', p_id) yield annotation