Tutorial - Exploring the Data

The following section requires that:

  • the tutorial has been completed
  • the data from it is in the same directory

In alternative the data required to run this example can be download from figshare and uncrompressed.

Imports

[1]:
from __future__ import print_function

#Python Standard Library
import glob
import pickle
import sys
import functools
#External Dependencies (install via pip or anaconda)

# Check if running interactively or not
import matplotlib as mpl # http://matplotlib.org
# from:
# http://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook
# and
# http://stackoverflow.com/questions/15455029/python-matplotlib-agg-vs-interactive-plotting-and-tight-layout
import __main__ as main
if hasattr(main, '__file__'):
    # Non interactive, force the use of Agg backend instead
    # of the default one
    mpl.use('Agg')

import numpy # http://www.numpy.org
import pandas # http://pandas.pydata.org
import seaborn # http://stanford.edu/~mwaskom/software/seaborn/
import scipy # http://www.scipy.org
import matplotlib.pyplot as plt


#MGKit Import
from mgkit.io import gff, fasta
from mgkit.mappings import eggnog
import mgkit.counts, mgkit.taxon, mgkit.snps, mgkit.plots
import mgkit.snps
import mgkit.mappings.enzyme
/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
[2]:
mgkit.logger.config_log()
[3]:
mgkit.cite(sys.stdout)

 _|      _|    _|_|_|  _|    _|  _|    _|
 _|_|  _|_|  _|        _|  _|        _|_|_|_|
 _|  _|  _|  _|  _|_|  _|_|      _|    _|
 _|      _|  _|    _|  _|  _|    _|    _|
 _|      _|    _|_|_|  _|    _|  _|      _|_|


MGKit Version: 0.3.4

Rubino, F. and Creevey, C.J. (2014).
MGkit: Metagenomic Framework For The Study Of Microbial Communities.

Available at: http://figshare.com/articles/MGkit_Metagenomic_Framework_For_The_Study_Of_Microbial_Communities/1269288

[doi:10.6084/m9.figshare.1269288]

Download Complete Data

If the tutorial can’t be completed, the download data can be downloaded from: %%

[4]:
# the following variable is used to indicate where the tutorial data is stored
data_dir = 'tutorial-data/'

Read Necessary Data

[5]:
# Keeps a list of the count data file outputted by
# htseq-count
counts = glob.glob('{}*-counts.txt'.format(data_dir))
[6]:
# This file contains the SNPs information and it is the output
# of the snp_parser script
snp_data = pickle.load(open('{}snp_data.pickle'.format(data_dir), 'r'))
[7]:
# Taxonomy needs to be download beforehand. It is loaded into an an
# instance of mgkit.taxon.UniprotTaxonomy. It is used in filtering
# data and to map taxon IDs to different levels in the taxonomy
taxonomy = mgkit.taxon.UniprotTaxonomy('{}/taxonomy.pickle'.format(data_dir))
2018-05-22 14:06:05,570 - INFO - mgkit.taxon->load_data: Loading taxonomy from file tutorial-data//taxonomy.pickle
[8]:
# Loads all annotations in a dictionary, with the unique ID (uid) as key
# and the mgkit.io.gff.Annotation instance that represent the line in the
# GFF file as value
annotations = {x.uid: x for x in gff.parse_gff('{}assembly.uniprot.gff'.format(data_dir))}
2018-05-22 14:06:14,637 - INFO - mgkit.io.gff->parse_gff: Loading GFF from file (tutorial-data/assembly.uniprot.gff)
2018-05-22 14:06:15,578 - INFO - mgkit.io.gff->parse_gff: Read 9136 line from file (tutorial-data/assembly.uniprot.gff)
[9]:
# Used to extract the sample ID from the count file names
file_name_to_sample = lambda x: x.rsplit('/')[-1].split('-')[0]
[10]:
# Used to rename the DataFrame columns
sample_names = {
    'SRR001326': '50m',
    'SRR001325': '01m',
    'SRR001323': '32m',
    'SRR001322': '16m'
}

Explore Count Data

Load Taxa Table

Build a pandas.DataFrame instance. It is NOT required, but it is easier to manipulate. load_sample_counts_to_taxon returns a pandas.Series instance.

The DataFrame will have the sample names as columns names and the different taxon IDs as rows names. There are 3 different function to map counts and annotations to a pandas.Series instance:

  • mgkit.counts.load_sample_counts
  • mgkit.counts.load_sample_counts_to_taxon
  • mgkit.counts.load_sample_counts_to_genes

The three differs primarly by the index for the pandas.Series they return, which is (gene_id, taxon_id), taxon_id and gene_id, respectively. Another change is the possibility to map a gene_id to another and a taxon_id to a different rank. In this contexts, as it is interesting to assess the abundance of each organism, mgkit.counts.load_sample_counts_to_taxon can be used. It provides a rank parameter that can be changed to map all counts to the order level in this case, but can be changed to any rank in mgkit.taxon.TAXON_RANKS, for example genus, phylum.

[11]:
taxa_counts = pandas.DataFrame({
    # Get the sample names
    file_name_to_sample(file_name): mgkit.counts.load_sample_counts_to_taxon(
        # A function accept a uid as only parameter and returns only the
        # gene_id and taxon_id, so we set it to a lambda that does
        # exactly that
        lambda x: (annotations[x].gene_id, annotations[x].taxon_id),
        # An iterator that yields (uid, count) is needed and MGKit
        # has a function that does that for htseq-count files.
        # This can be adapted to any count data file format
        mgkit.counts.load_htseq_counts(file_name),
        # A mgkit.taxon.UniprotTaxonomy instance is necessary to filter
        # the data and map it to a different rank
        taxonomy,
        # A taxonomic rank to map each taxon_id to. Must be lowercase
        rank='order',
        # If False, any taxon_id that can not be resolved at the taxonomic
        # rank requested is excluded from the results
        include_higher=False
    )
    # iterate over all count files
    for file_name in counts
})
2018-05-22 14:06:15,644 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001322-counts.txt
2018-05-22 14:06:15,733 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001323-counts.txt
2018-05-22 14:06:15,837 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001325-counts.txt
2018-05-22 14:06:15,921 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001326-counts.txt
[12]:
# This is an alternative if the counts are stored in the annotations
# ann_func = functools.partial(
#     taxonomy.get_ranked_id,
#     rank='order',
#     it=True,
#     include_higher=False
# )
# taxa_counts = mgkit.counts.func.from_gff(annotations.values(), sample_names, ann_func=lambda x: ann_func(x.taxon_id))
# taxa_counts = taxa_counts[taxa_counts.index.notnull()]

Scaling (DESeq method) and Rename Rows/Columns

Because each sample has different yields in total DNA from the sequencing, the table should be scaled. The are a few approaches, RPKM, scaling by the minimum. MGKit offers mgkit.counts.scaling.scale_factor_deseq and mgkit.counts.scaling.scale_rpkm that scale using the DESeq method and RPKM respectively.

[12]:
# the DESeq method doesn't require information about the gene length
taxa_counts = mgkit.counts.scale_deseq(taxa_counts)
/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/scipy/stats/stats.py:305: RuntimeWarning: divide by zero encountered in log
  log_a = np.log(np.array(a, dtype=dtype))

One of the powers of pandas data structures is the metadata associated and the possibility to modify them with ease. In this case, the columns are named after the sample IDs from ENA and the row names are the taxon IDs. To make it easier to analyse, columns and rows can be renamed and sorted by name and the rows sorted in descending order by the first colum (1 meter).

To rename the columns the dictionary sample_name can be supplied and for the rows the name of each taxon ID can be accessed through the taxonomy instance, because it works as a dictionary and the returned object has a s_name attribute with the scientific name (lowercase).

[13]:
taxa_counts = taxa_counts.rename(
    index=lambda x: taxonomy[x].s_name,
    columns=sample_names
).sort_index(axis='columns')
[14]:
taxa_counts = taxa_counts.loc[taxa_counts.mean(axis='columns').sort_values(ascending=False).index]
[15]:
# the *describe* method of a pandas.Series or pandas.DataFrame
# gives some insights into the data
taxa_counts.describe()
[15]:
01m 16m 32m 50m
count 194.000000 194.000000 194.000000 194.000000
mean 27.781155 33.622739 30.885116 34.487249
std 69.125920 93.515885 86.123968 92.818806
min 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.713044 0.000000
50% 5.345444 3.198502 4.278267 3.465247
75% 19.711325 19.990636 17.826111 21.441218
max 529.198964 722.861412 713.044453 738.097692
[16]:
#Save a CSV to disk, but Excel and other file formats are available
taxa_counts.to_csv('{}taxa_counts.csv'.format(data_dir))
[17]:
# This will give an idea of the counts for each order
taxa_counts.head(n=10)
[17]:
01m 16m 32m 50m
Methanococcales 529.198964 722.861412 713.044453 738.097692
Thermococcales 354.135670 637.568030 577.566007 595.156237
Bacillales 525.189881 554.406983 440.661472 528.450225
Methanobacteriales 163.036044 358.232204 290.922137 330.931125
Archaeoglobales 208.472319 313.453179 278.087337 274.620855
Dehalococcoidales 203.126875 229.225964 286.643870 253.829370
Clostridiales 225.845012 217.498124 205.356802 206.182219
Enterobacterales 199.117792 165.255928 146.887157 252.963059
Methanosarcinales 146.999712 238.821470 213.913336 129.080465
Desulfurococcales 66.818051 197.240946 249.565558 182.791799

Plots for Top40 Taxa

Distribution of Each Taxon Over Depth

How to visualise the data depends on the question we want to ask and the experimental design. As a starting point, it may be interesting to visualise the variation of a taxonomic order abundance over the samples. This can be done using boxplots, among other methods.

MGKit offers a few functions to make complex plots, with a starting point in mgkit.plots.boxplot.boxplot_dataframe. However, as the data produced is in fact a pandas DataFrame, which is widely supported, a host of different specialised libraries tht offer similar functions can be used.

[18]:
# A matplotlib Figure instance and a single axis can be returned
# by this MGKit function. It is an helper function, the axis is
# needed to plot and the figure object to save the file to disk
fig, ax = mgkit.plots.get_single_figure(figsize=(15, 10))
# The return value of mgkit.plots.boxplot.boxplot_dataframe is
# passed to the **_** special variable, as it is not needed and
# it would be printed, otherwise
_ = mgkit.plots.boxplot.boxplot_dataframe(
    # The full dataframe can be passed
    taxa_counts,
    # this variable is used to tell the function
    # which rows and in which order they need to
    # be plot. In this case only the first 40 are
    # plot
    taxa_counts.index[:40],
    # A matplotlib axis instance
    ax,
    # a dictionary with options related to the labels
    # on both the X and Y axes. In this case it changes
    # the size of the labels
    fonts=dict(fontsize=14),
    # The default is to use the same colors for all
    # boxes. A dictionary can be passed to change this
    # in this case, the 'hls' palette from seaborn is
    # used.
    data_colours={
        x: color
        for x, color in zip(taxa_counts.index[:40], seaborn.color_palette('hls', 40))
    }
)
# Adds labels to the axes
ax.set_xlabel('Order', fontsize=16)
ax.set_ylabel('Counts', fontsize=16)
# Ensure the correct layout before writing to disk
fig.set_tight_layout(True)
# Saves a PDF file, or any other supported format by matplotlib
fig.savefig('{}taxa_counts-boxplot_top40_taxa.pdf'.format(data_dir))
2018-05-22 14:06:28,310 - DEBUG - matplotlib.font_manager->findfont: findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=14.0 to DejaVu Sans (u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000
2018-05-22 14:06:28,357 - DEBUG - matplotlib.font_manager->findfont: findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=16.0 to DejaVu Sans (u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000
2018-05-22 14:06:28,386 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:06:28,537 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:06:28,538 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/figure.py:2267: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "
../_images/pipeline_Exploring-Metagenomic-Data_33_1.png

Distribution of Taxa at Each Depth

Seaborn offers a KDE plot, which is useful to display the distribution of taxa counts for each sampling depth.

[19]:
fig, ax = mgkit.plots.get_single_figure(figsize=(10, 10))
# iterate over the columns, which are the samples and assign a color to each one
for column, color in zip(taxa_counts.columns, seaborn.color_palette('Set1', len(taxa_counts.columns))):
    seaborn.kdeplot(
        # The data can transformed with the sqrt function of numpy
        numpy.sqrt(taxa_counts[column]),
        # Assign the color
        color=color,
        # Assign the label to the sample name to appear
        # in the legend
        label=column,
        # Add a shade under the KDE function
        shade=True
    )
# Adds a legend
ax.legend()
ax.set_xlabel('Counts', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
fig.set_tight_layout(True)
fig.savefig('{}taxa_counts-distribution_top40_taxa.pdf'.format(data_dir))
2018-05-22 14:06:32,367 - DEBUG - matplotlib.font_manager->findfont: findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=10.0 to DejaVu Sans (u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000
2018-05-22 14:06:32,404 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:06:32,461 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:06:32,463 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_36_1.png

Heatmap of the Table

[20]:
# An heatmap can be created to provide information on the table
clfig = seaborn.clustermap(taxa_counts.iloc[:40], cbar=True, cmap='Blues')
clfig.fig.set_tight_layout(True)
for text in clfig.ax_heatmap.get_yticklabels():
    text.set_rotation('horizontal')
clfig.savefig('{}taxa_counts-heatmap-top40.pdf'.format(data_dir))
2018-05-22 14:06:34,833 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:06:34,936 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:06:34,938 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
2018-05-22 14:06:35,020 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:06:35,129 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:06:35,131 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_38_1.png

Functional Categories

Besides looking at specific taxa, it is possible to map each gene_id to functional categories. eggNOG provides this. v3 must be used, as the mappings in Uniprot points to that version.

Load Necessary Data

[21]:
eg = eggnog.NOGInfo()
[39]:
#Build mapping Uniprot IDs -> eggNOG functional categories
fc_map = {
    # An Annotation instance provide a method to access the list of IDs for the
    # specific mapping. For example eggnog mappings are store into the
    # map_EGGNOG attribute
    annotation.gene_id: eg.get_nogs_funccat(annotation.get_mapping('eggnog'))
    for annotation in annotations.itervalues()
}
[41]:
# Just a few to speed up the analysis, but other can be used
# Should have been downloaded by the full tutorial script
eg.load_members('{}COG.members.txt.gz'.format(data_dir))
eg.load_members('{}NOG.members.txt.gz'.format(data_dir))
eg.load_funccat('{}COG.funccat.txt.gz'.format(data_dir))
eg.load_funccat('{}NOG.funccat.txt.gz'.format(data_dir))
2018-05-22 14:09:38,473 - INFO - mgkit.mappings.eggnog->load_members: Reading Members from tutorial-data/COG.members.txt.gz
2018-05-22 14:09:56,921 - INFO - mgkit.mappings.eggnog->load_members: Reading Members from tutorial-data/NOG.members.txt.gz
2018-05-22 14:10:03,884 - INFO - mgkit.mappings.eggnog->load_funccat: Reading Functional Categories from tutorial-data/COG.funccat.txt.gz
2018-05-22 14:10:03,896 - INFO - mgkit.mappings.eggnog->load_funccat: Reading Functional Categories from tutorial-data/NOG.funccat.txt.gz

Build FC Table

As mentioned above, mgkit.counts.load_sample_counts_to_genes works in the same way as mgkit.counts.load_sample_counts_to_taxon, with the difference of giving gene_id as the only index.

In this case, however, as a mapping to functional categories is wanted, to the gene_map parameter a dictionary where for each gene_id an iterable of mappings is assigned. These are the values used in the index of the returned pandas.Series, which ends up as rows in the fc_counts DataFrame.

[45]:
fc_counts = pandas.DataFrame({
    file_name_to_sample(file_name): mgkit.counts.load_sample_counts_to_genes(
        lambda x: (annotations[x].gene_id, annotations[x].taxon_id),
        mgkit.counts.load_htseq_counts(file_name),
        taxonomy,
        gene_map=fc_map
    )
    for file_name in counts
})
2018-05-22 14:10:11,210 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001322-counts.txt
2018-05-22 14:10:11,293 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001323-counts.txt
2018-05-22 14:10:11,370 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001325-counts.txt
2018-05-22 14:10:11,445 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001326-counts.txt

Scale the Table and Rename Rows/Columns

[46]:
fc_counts = mgkit.counts.scale_deseq(fc_counts).rename(
    columns=sample_names,
    index=eggnog.EGGNOG_CAT
)
[47]:
fc_counts.describe()
[47]:
16m 32m 01m 50m
count 23.000000 23.000000 23.000000 23.000000
mean 288.923364 289.089215 255.206073 287.413551
std 288.130525 286.446124 205.412833 283.111728
min 0.000000 0.000000 0.000000 3.471907
25% 66.085234 78.138939 111.201303 69.004160
50% 209.794392 232.510989 220.994995 223.070050
75% 356.650467 388.026535 344.864801 376.701953
max 1103.518503 1147.308320 734.773169 1177.844584
[49]:
fc_counts
[49]:
16m 32m 01m 50m
RNA processing and modification 4.195888 3.049324 25.337006 45.134796
Chromatin structure and dynamics 27.273271 21.345271 12.668503 68.570171
Energy production and conversion 832.883737 735.649521 609.495751 703.061248
Cell cycle control, cell division, chromosome partitioning 73.428037 97.578382 121.054583 52.078611
Amino acid transport and metabolism 676.586915 658.654079 609.495751 574.600674
Nucleotide transport and metabolism 336.719999 374.304575 271.669007 291.640221
Carbohydrate transport and metabolism 333.573084 354.483966 285.745121 275.148661
Coenzyme transport and metabolism 323.083364 304.170113 368.794196 374.098022
Lipid transport and metabolism 232.871775 238.609637 243.516778 243.901495
Translation, ribosomal structure and biogenesis 1103.518503 1147.308320 734.773169 1177.844584
Transcription 290.565233 276.726193 237.886332 365.418254
Replication, recombination and repair 528.681868 554.214717 439.174767 451.347962
Cell wall/membrane/envelope biogenesis 206.647476 174.573824 216.772161 199.634675
Cell motility 27.273271 19.058278 33.782674 33.851097
Posttranslational modification, protein turnover, chaperones 376.580934 401.748495 320.935407 379.305883
Inorganic ion transport and metabolism 186.717009 200.493082 190.027544 173.595370
Secondary metabolites biosynthesis, transport and catabolism 58.742430 86.143415 106.978469 96.345430
General function prediction only 577.983550 563.362690 494.071613 675.285989
Function unknown 209.794392 232.510989 220.994995 223.070050
Signal transduction mechanisms 35.665047 38.116555 115.424138 46.870750
Intracellular trafficking, secretion, and vesicular transport 87.064673 96.816051 121.054583 86.797685
Defense mechanisms 115.386916 70.134462 90.087132 69.438148
Cytoskeleton 0.000000 0.000000 0.000000 3.471907
[48]:
#Save table to disk
fc_counts.to_csv('{}fc_counts.csv'.format(data_dir))

Heatmap to Explore Functional Categories

[50]:
clfig = seaborn.clustermap(fc_counts, cbar=True, cmap='Greens')
clfig.fig.set_tight_layout(True)
for text in clfig.ax_heatmap.get_yticklabels():
    text.set_rotation('horizontal')
clfig.savefig('{}fc_counts-heatmap.pdf'.format(data_dir))
2018-05-22 14:10:17,521 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:17,626 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:17,627 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
2018-05-22 14:10:17,693 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:17,798 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:17,799 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_54_1.png

Enzyme Classification

Enzyme classification number were added the add-gff-info script, so they can be used in a similar way to functional categories. The specificity level requested is 2.

[51]:
ec_map = {
    # EC numbers are store into the EC attribute in a GFF file and
    # an Annotation instance provide a get_ec method that returns
    # a list. A level of specificity can be used to the mapping
    # less specific, as it ranges from 1 to 4 included. Right
    # now a list is returned, so it is a good idea to convert
    # the list into a set so if any duplicate appears (as effect
    # of the change in level) it won't inflate the number later.
    # In later versions (0.2) a set will be returned instead of
    # a list.
    # We also want to remove any hanging ".-" to use the labels
    # from expasy
    annotation.gene_id: set(x.replace('.-', '') for x in annotation.get_ec(level=2))
    for annotation in annotations.itervalues()
}
[52]:
# The only difference with the functional categories is the mapping
# used.
ec_counts = pandas.DataFrame({
    file_name_to_sample(file_name): mgkit.counts.load_sample_counts_to_genes(
        lambda x: (annotations[x].gene_id, annotations[x].taxon_id),
        mgkit.counts.load_htseq_counts(file_name),
        taxonomy,
        gene_map=ec_map
    )
    for file_name in counts
})
2018-05-22 14:10:20,682 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001322-counts.txt
2018-05-22 14:10:20,768 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001323-counts.txt
2018-05-22 14:10:20,844 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001325-counts.txt
2018-05-22 14:10:20,922 - INFO - mgkit.counts.func->load_htseq_counts: Loading HTSeq-count file tutorial-data/SRR001326-counts.txt
[53]:
# This file contains the names of each enzyme class and can be downloaded
# from ftp://ftp.expasy.org/databases/enzyme/enzclass.txt
# It should be downloaded at the end of the tutorial script
ec_names = mgkit.mappings.enzyme.parse_expasy_file('{}enzclass.txt'.format(data_dir))
[54]:
# Rename columns and row. Rows will include the full label the enzyme class
ec_counts = mgkit.counts.scale_deseq(ec_counts).rename(
    index=lambda x: "{} {} [EC {}.-]".format(
        # A name of the second level doesn't include the first level
        # definition, so if it is level 2, we add the level 1 label
        '' if len(x) == 1 else ec_names[x[0]] + " - ",
        # The EC label for the specific class (e.g. 3.2)
        ec_names[x],
        # The EC number
        x
    ),
    columns=sample_names
)
[55]:
plot_order = ec_counts.median(axis=1).sort_values(ascending=True, inplace=False).index
[56]:
ec_counts.describe()
[56]:
16m 32m 01m 50m
count 59.000000 59.000000 59.000000 59.000000
mean 74.276636 78.371254 79.636608 80.406521
std 98.400500 108.265549 108.003831 115.948168
min 0.000000 0.000000 0.000000 0.000000
25% 4.786284 6.764489 5.672632 3.995495
50% 36.375758 42.090153 34.035791 37.291289
75% 96.204307 105.225384 127.229027 121.640634
max 465.226796 537.401066 546.193404 674.794758
[57]:
ec_counts
[57]:
16m 32m 01m 50m
Oxidoreductases [EC 1.-] 30.632217 18.038637 47.001806 64.815812
Oxidoreductases - Acting on the CH-OH group of donors [EC 1.1.-] 159.861883 134.538169 215.560008 161.595587
Oxidoreductases - Acting on a peroxide as acceptor [EC 1.11.-] 6.700797 14.280588 0.000000 3.551551
Oxidoreductases - Acting on hydrogen as donors [EC 1.12.-] 65.093461 81.173867 51.864062 46.170168
Oxidoreductases - Acting on single donors with incorporation of molecular oxygen (oxygenases). The oxygen incorporated need not be derived from O(2) [EC 1.13.-] 0.000000 7.516099 6.483008 3.551551
Oxidoreductases - Acting on paired donors, with incorporation or reduction of molecular oxygen. The oxygen incorporated need not be derived from O(2) [EC 1.14.-] 0.000000 0.000000 0.000000 1.775776
Oxidoreductases - Acting on superoxide as acceptor [EC 1.15.-] 4.786284 3.006440 0.000000 2.663664
Oxidoreductases - Oxidizing metal ions [EC 1.16.-] 14.358852 10.522538 3.241504 11.542542
Oxidoreductases - Acting on CH or CH(2) groups [EC 1.17.-] 54.563637 61.632010 102.107372 43.506504
Oxidoreductases - Acting on iron-sulfur proteins as donors [EC 1.18.-] 36.375758 15.783808 11.345264 37.291289
Oxidoreductases - Acting on the aldehyde or oxo group of donors [EC 1.2.-] 150.289315 145.812317 153.971434 139.398391
Oxidoreductases - Catalyzing the reaction X-H + Y-H = 'X-Y' [EC 1.21.-] 0.957257 6.012879 0.000000 0.000000
Oxidoreductases - Acting on the CH-CH group of donors [EC 1.3.-] 28.717703 42.090153 98.865868 63.927924
Oxidoreductases - Acting on the CH-NH(2) group of donors [EC 1.4.-] 36.375758 71.402939 30.794287 36.403401
Oxidoreductases - Acting on the CH-NH group of donors [EC 1.5.-] 0.957257 9.019319 16.207519 2.663664
Oxidoreductases - Acting on NADH or NADPH [EC 1.6.-] 71.794259 83.428697 53.484814 59.488485
Oxidoreductases - Acting on other nitrogenous compounds as donors [EC 1.7.-] 8.615311 0.000000 9.724512 1.775776
Oxidoreductases - Acting on a sulfur group of donors [EC 1.8.-] 125.400639 102.970554 111.831884 128.743737
Oxidoreductases - Acting on a heme group of donors [EC 1.9.-] 1.914514 0.000000 0.000000 0.000000
Oxidoreductases - Other oxidoreductases [EC 1.97.-] 1.914514 6.012879 4.862256 6.215215
Transferases [EC 2.-] 0.000000 0.000000 9.724512 0.000000
Transferases - Transferring one-carbon groups [EC 2.1.-] 192.408613 223.228135 222.043016 198.886876
Transferases - Transferring molybdenum- or tungsten-containing groups [EC 2.10.-] 0.000000 0.000000 3.241504 0.000000
Transferases - Transferring aldehyde or ketonic groups [EC 2.2.-] 27.760447 23.299906 21.069775 25.748747
Transferases - Acyltransferases [EC 2.3.-] 107.212760 126.270460 87.520605 146.501494
Transferases - Glycosyltransferases [EC 2.4.-] 82.324083 106.728603 129.660155 150.053045
Transferases - Transferring alkyl or aryl groups, other than methyl groups [EC 2.5.-] 79.452313 96.957675 147.488427 114.537531
Transferases - Transferring nitrogenous groups [EC 2.6.-] 85.195854 103.722164 144.246923 66.591588
Transferases - Transferring phosphorus-containing groups [EC 2.7.-] 465.226796 537.401066 546.193404 674.794758
Transferases - Transferring sulfur-containing groups [EC 2.8.-] 53.606380 60.880401 61.588574 71.031027
Hydrolases [EC 3.-] 2.871770 12.777368 3.241504 1.775776
Hydrolases - Acting on ester bonds [EC 3.1.-] 189.536843 239.763553 152.350682 227.299287
Hydrolases - Acting on carbon-sulfur bonds [EC 3.13.-] 0.000000 1.503220 6.483008 0.000000
Hydrolases - Glycosylases [EC 3.2.-] 39.247528 24.051516 47.001806 31.963962
Hydrolases - Acting on ether bonds [EC 3.3.-] 11.487081 9.019319 11.345264 9.766766
Hydrolases - Acting on peptide bonds (peptidases) [EC 3.4.-] 156.990112 190.157300 163.695946 233.514502
Hydrolases - Acting on carbon-nitrogen bonds, other than peptide bonds [EC 3.5.-] 203.895695 153.328416 149.109179 191.783773
Hydrolases - Acting on acid anhydrides [EC 3.6.-] 331.210847 338.224447 398.704977 350.715697
Hydrolases - Acting on carbon-carbon bonds [EC 3.7.-] 8.615311 1.503220 27.552783 4.439439
Hydrolases - Acting on phosphorus-nitrogen bonds [EC 3.9.-] 0.000000 0.000000 0.000000 4.439439
Lyases [EC 4.-] 2.871770 4.509659 12.966016 15.094093
Lyases - Carbon-carbon lyases [EC 4.1.-] 123.486125 131.531730 134.522411 107.434429
Lyases - Carbon-oxygen lyases [EC 4.2.-] 201.981181 171.367053 225.284520 133.183176
Lyases - Carbon-nitrogen lyases [EC 4.3.-] 50.734609 46.599813 27.552783 39.067065
Lyases - Carbon-sulfur lyases [EC 4.4.-] 13.401595 13.528978 0.000000 1.775776
Lyases - Phosphorus-oxygen lyases [EC 4.6.-] 15.316109 9.019319 4.862256 6.215215
Lyases - Other lyases [EC 4.99.-] 0.000000 0.000000 3.241504 0.000000
Isomerases - Racemases and epimerases [EC 5.1.-] 71.794259 51.861082 43.760302 31.076074
Isomerases - Cis-trans-isomerases [EC 5.2.-] 4.786284 9.770928 1.620752 14.206205
Isomerases - Intramolecular oxidoreductases [EC 5.3.-] 76.580543 50.357862 34.035791 78.134130
Isomerases - Intramolecular transferases [EC 5.4.-] 71.794259 92.448016 48.622558 85.237233
Isomerases - Intramolecular lyases [EC 5.5.-] 13.401595 5.261269 19.449023 15.981981
Isomerases - Other isomerases [EC 5.99.-] 84.238597 75.160988 59.967822 55.936934
Ligases [EC 6.-] 0.000000 0.000000 3.241504 0.000000
Ligases - Forming carbon-oxygen bonds [EC 6.1.-] 386.731740 450.214320 333.874900 338.285267
Ligases - Forming carbon-sulfur bonds [EC 6.2.-] 146.460288 129.276900 124.797899 190.895886
Ligases - Forming carbon-nitrogen bonds [EC 6.3.-] 218.254546 222.476525 270.665574 198.886876
Ligases - Forming carbon-carbon bonds [EC 6.4.-] 19.145136 19.541857 32.415039 6.215215
Ligases - Forming phosphoric ester bonds [EC 6.5.-] 44.991069 78.919038 68.071581 107.434429
[58]:
ec_counts.to_csv('{}ec_counts.csv'.format(data_dir))
[59]:
fig, ax = mgkit.plots.get_single_figure(figsize=(15, 12))
_ = mgkit.plots.boxplot.boxplot_dataframe(
    ec_counts,
    plot_order,
    ax,
    # a dictionary with options related to the labels
    # on both the X and Y axes. In this case it changes
    # the size of the labels and the rotation - the default
    # is 'vertical', as the box_vert=True by default
    fonts=dict(fontsize=12, rotation='horizontal'),
    data_colours={
        x: color
        for x, color in zip(plot_order, seaborn.color_palette('hls', len(plot_order)))
    },
    # Changes the direction of the boxplot. The rotation of
    # the labels must be set to 'horizontal' in the *fonts*
    # dictionary
    box_vert=False
)
# Adds labels to the axes
ax.set_xlabel('Counts', fontsize=16)
ax.set_ylabel('Enzyme Class', fontsize=16)
# Ensure the correct layout before writing to disk
fig.set_tight_layout(True)
# Saves a PDF file, or any other supported format by matplotlib
fig.savefig('{}ec_counts-boxplot.pdf'.format(data_dir))
2018-05-22 14:10:25,953 - DEBUG - matplotlib.font_manager->findfont: findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=12.0 to DejaVu Sans (u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000
2018-05-22 14:10:26,264 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:26,958 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:26,960 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_65_1.png

Explore Diversity

Diversity in metagenomic samples can be analysed using pN/pS values. The data required to do this was produced in the tutorial by the snp_parser script. Here are some examples of how to calculate diversity estimates from this data.

The complete toolset to map diversity estimates can be found in the mgkit.snps package, with the mgkit.snps.funcs.combine_sample_snps function building the final pandas DataFrame. As the use of the function requires the initialisation of different functions, a few easier to use ones are available in the mgkit.snps.conv_func module:

  • get_rank_dataframe
  • get_gene_map_dataframe
  • get_full_dataframe
  • get_gene_taxon_dataframe

The first is used to get diversity estimates for taxa, the second for genes/functions. The other two provides functionality to return estimates tied to both taxon and function.

Taxa

[60]:
# Sets the minimum coverage for an annotation to be
# included into the table (defaults to 4)
mgkit.consts.DEFAULT_SNP_FILTER['min_cov'] = 2
[61]:
# To get diversity estimates for taxa *mgkit.snps.conv_func.get_rank_dataframe* can be used
# It is also imported and accesible from the *mgkit.snps* package
pnps = mgkit.snps.get_rank_dataframe(snp_data, taxonomy, min_num=3, rank='order', index_type='taxon')
2018-05-22 14:10:34,437 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001322
2018-05-22 14:10:34,491 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001323
2018-05-22 14:10:34,552 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001325
2018-05-22 14:10:34,610 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001326
[62]:
pnps = pnps.rename(
    columns=sample_names,
    index=lambda x: taxonomy[x].s_name
)
[63]:
pnps.describe()
[63]:
16m 32m 01m 50m
count 83.000000 83.000000 86.0 75.000000
mean 0.027889 0.035167 0.0 0.020063
std 0.086763 0.136137 0.0 0.100393
min 0.000000 0.000000 0.0 0.000000
25% 0.000000 0.000000 0.0 0.000000
50% 0.000000 0.000000 0.0 0.000000
75% 0.000000 0.000000 0.0 0.000000
max 0.360621 0.882326 0.0 0.604127
[64]:
pnps
[64]:
16m 32m 01m 50m
Rubrobacterales 0.000000 0.000000 0.0 0.000000
Sphaerobacterales 0.000000 0.000000 0.0 NaN
Bifidobacteriales 0.000000 0.000000 0.0 0.000000
Micrococcales 0.000000 0.000000 0.0 0.000000
Neisseriales 0.000000 0.000000 0.0 0.000000
Micromonosporales 0.000000 NaN 0.0 0.000000
Pseudonocardiales 0.000000 0.000000 0.0 NaN
Streptomycetales 0.000000 0.000000 0.0 0.000000
Streptosporangiales 0.000000 0.000000 0.0 0.000000
Haloferacales 0.000000 0.000000 0.0 0.000000
Bacteroidales 0.000000 0.000000 0.0 0.000000
Mycoplasmatales 0.000000 0.000000 0.0 0.000000
Corynebacteriales 0.000000 0.000000 0.0 0.000000
Schizosaccharomycetales 0.306546 NaN 0.0 0.000000
Clostridiales NaN 0.000000 0.0 0.000000
Rhodocyclales 0.000000 0.000000 0.0 NaN
Thiotrichales NaN 0.000000 0.0 0.000000
Pseudomonadales 0.000000 0.000000 0.0 0.000000
Legionellales 0.314474 0.000000 0.0 0.000000
Chlamydiales 0.000000 0.000000 0.0 0.000000
Chroococcales 0.000000 0.000000 0.0 NaN
Methanobacteriales 0.301844 0.304830 0.0 0.000000
Planctomycetales 0.000000 0.000000 0.0 0.000000
Synechococcales 0.000000 0.000000 0.0 0.397696
Desulfovibrionales 0.000000 0.000000 0.0 0.000000
Cenarchaeales 0.000000 0.000000 NaN 0.000000
Oscillatoriales 0.000000 0.000000 0.0 0.000000
Methanococcales 0.099162 0.147193 NaN 0.604127
Spirochaetales 0.000000 0.000000 0.0 0.000000
Nostocales 0.000000 0.000000 0.0 NaN
... ... ... ... ...
Herpetosiphonales 0.000000 0.000000 0.0 0.000000
Thermomicrobiales 0.000000 NaN 0.0 0.000000
Nitrospirales 0.000000 0.000000 0.0 0.000000
Campylobacterales 0.000000 0.000000 0.0 0.000000
Acidithiobacillales 0.000000 0.000000 0.0 0.000000
Parachlamydiales 0.000000 0.000000 0.0 NaN
Thermotogales 0.000000 0.000000 0.0 0.000000
Methanopyrales 0.000000 0.000000 0.0 0.000000
Solibacterales 0.000000 0.000000 0.0 NaN
Cellvibrionales 0.000000 0.000000 0.0 NaN
Natranaerobiales 0.000000 0.000000 0.0 0.000000
Gloeobacterales 0.000000 0.000000 0.0 0.000000
Methanocellales 0.000000 0.000000 0.0 0.000000
Aquificales 0.000000 0.000000 0.0 0.000000
Petrotogales 0.000000 0.000000 0.0 NaN
Eurotiales 0.000000 0.000000 0.0 0.000000
Chlorobiales 0.000000 0.000000 0.0 0.000000
Onygenales NaN 0.000000 0.0 0.000000
Chromatiales 0.000000 0.000000 0.0 0.000000
Xanthomonadales 0.000000 0.000000 0.0 0.000000
Oceanospirillales NaN 0.649600 0.0 0.000000
Flavobacteriales 0.000000 0.000000 0.0 0.000000
Alteromonadales 0.269426 NaN 0.0 0.000000
Vibrionales 0.000000 0.000000 0.0 0.000000
Aeromonadales 0.000000 0.000000 0.0 NaN
Pasteurellales 0.299672 0.000000 0.0 0.000000
Syntrophobacterales 0.000000 0.000000 0.0 0.000000
Rickettsiales 0.000000 0.000000 0.0 0.000000
Methanosarcinales 0.059651 NaN 0.0 0.000000
Cytophagales 0.000000 0.000000 0.0 NaN

90 rows × 4 columns

[65]:
pnps.to_csv('{}pnps-taxa.csv'.format(data_dir))
[66]:
#sort the DataFrame to plot them by mean value
plot_order = pnps.mean(axis=1).sort_values(inplace=False, ascending=False).index[:10]

fig, ax = mgkit.plots.get_single_figure(figsize=(15, 10))
_ = mgkit.plots.boxplot.boxplot_dataframe(
    pnps,
    plot_order,
    ax,
    fonts=dict(fontsize=14, rotation='horizontal'),
    data_colours={
        x: color
        for x, color in zip(plot_order, seaborn.color_palette('hls', len(pnps.index)))
    },
    box_vert=False
)
ax.set_xlabel('pN/pS', fontsize=16)
ax.set_ylabel('Order', fontsize=16)
fig.set_tight_layout(True)
fig.savefig('{}pnps-taxa-boxplot.pdf'.format(data_dir))
/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/numpy/core/fromnumeric.py:52: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  return getattr(obj, method)(*args, **kwds)
2018-05-22 14:10:34,970 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:35,015 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:35,016 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_75_1.png

Functional Categories

[67]:
# To get diversity estimates of functions, *mgkit.snps.conv_func.get_gene_map_dataframe* can be used
# This is available in the *mgkit.snps* package as well
fc_pnps = mgkit.snps.get_gene_map_dataframe(snp_data, taxonomy, min_num=3, gene_map=fc_map, index_type='gene')
2018-05-22 14:10:38,138 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001322
2018-05-22 14:10:38,174 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001323
2018-05-22 14:10:38,215 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001325
2018-05-22 14:10:38,256 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001326
[68]:
fc_pnps = fc_pnps.rename(
    columns=sample_names,
    index=eggnog.EGGNOG_CAT
)
[69]:
fc_pnps.describe()
[69]:
16m 32m 01m 50m
count 18.000000 15.000000 18.0 19.000000
mean 0.106591 0.195602 0.0 0.085110
std 0.149325 0.257863 0.0 0.173999
min 0.000000 0.000000 0.0 0.000000
25% 0.000000 0.000000 0.0 0.000000
50% 0.000000 0.076031 0.0 0.000000
75% 0.275874 0.225485 0.0 0.127011
max 0.410372 0.761215 0.0 0.701625
[70]:
fc_pnps
[70]:
16m 32m 01m 50m
RNA processing and modification NaN 0.000000 0.0 0.000000
Energy production and conversion 0.300669 0.214854 NaN 0.201721
Chromatin structure and dynamics 0.000000 0.000000 0.0 0.000000
Amino acid transport and metabolism NaN 0.060911 0.0 0.000000
Cell cycle control, cell division, chromosome partitioning 0.000000 NaN 0.0 0.000000
Nucleotide transport and metabolism 0.060956 0.236117 0.0 0.000000
Lipid transport and metabolism 0.000000 NaN 0.0 0.000000
Coenzyme transport and metabolism 0.410372 0.761215 0.0 0.302295
Transcription 0.306611 0.076031 0.0 0.000000
Translation, ribosomal structure and biogenesis 0.029312 0.048914 NaN 0.701625
Cell wall/membrane/envelope biogenesis 0.000000 0.601078 0.0 NaN
Replication, recombination and repair 0.306580 0.148139 0.0 0.000000
Posttranslational modification, protein turnover, chaperones 0.302645 NaN 0.0 0.102591
Cell motility 0.000000 0.000000 0.0 0.000000
Secondary metabolites biosynthesis, transport and catabolism 0.000000 0.000000 0.0 0.000000
Inorganic ion transport and metabolism 0.000000 NaN 0.0 0.157425
Function unknown 0.000000 NaN 0.0 0.000000
General function prediction only 0.201490 0.149640 0.0 0.151431
Signal transduction mechanisms 0.000000 0.000000 0.0 0.000000
Defense mechanisms 0.000000 0.637127 0.0 0.000000
[71]:
fc_pnps.to_csv('{}pnps-fc.csv'.format(data_dir))
[72]:
#sort the DataFrame to plot them by median value
plot_order = fc_pnps.mean(axis=1).sort_values(inplace=False, ascending=False).index

fig, ax = mgkit.plots.get_single_figure(figsize=(15, 10))
_ = mgkit.plots.boxplot.boxplot_dataframe(
    fc_pnps,
    plot_order,
    ax,
    fonts=dict(fontsize=14, rotation='horizontal'),
    data_colours={
        x: color
        for x, color in zip(plot_order, seaborn.color_palette('hls', len(fc_pnps.index)))
    },
    box_vert=False
)
ax.set_xlabel('pN/pS', fontsize=16)
ax.set_ylabel('Functional Category', fontsize=16)
fig.set_tight_layout(True)
fig.savefig('{}pnps-fc-boxplot.pdf'.format(data_dir))
2018-05-22 14:10:38,683 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:38,777 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:38,778 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_82_1.png

Enzyme Classification

[73]:
ec_map = {
    # Using only the first level
    annotation.gene_id: set(x.replace('.-', '') for x in annotation.get_ec(level=1))
    for annotation in annotations.itervalues()
}
[74]:
ec_pnps = mgkit.snps.get_gene_map_dataframe(snp_data, taxonomy, min_num=3, gene_map=ec_map, index_type='gene')
2018-05-22 14:10:42,201 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001322
2018-05-22 14:10:42,238 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001323
2018-05-22 14:10:42,277 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001325
2018-05-22 14:10:42,319 - INFO - mgkit.snps.funcs->combine_sample_snps: Analysing SNP from sample SRR001326
[75]:
# Rename columns and row. Rows will include the full label the enzyme class
ec_pnps = ec_pnps.rename(
    index=lambda x: "{} {} [EC {}.-]".format(
        # A name of the second level doesn't include the first level
        # definition, so if it is level 2, we add the level 1 label
        '' if len(x) == 1 else ec_names[x[0]] + " - ",
        # The EC label for the specific class (e.g. 3.2)
        ec_names[x],
        # The EC number
        x
    ),
    columns=sample_names
)
[76]:
ec_pnps.describe()
[76]:
16m 32m 01m 50m
count 3.000000 3.000000 2.0 3.000000
mean 0.351120 0.187620 0.0 0.173303
std 0.248257 0.139590 0.0 0.185190
min 0.100278 0.100437 0.0 0.000000
25% 0.228326 0.107119 0.0 0.075732
50% 0.356374 0.113802 0.0 0.151464
75% 0.476541 0.231211 0.0 0.259954
max 0.596708 0.348620 0.0 0.368444
[77]:
ec_pnps
[77]:
16m 32m 01m 50m
Oxidoreductases [EC 1.-] 0.100278 0.113802 NaN 0.151464
Transferases [EC 2.-] 0.356374 0.348620 0.0 0.368444
Isomerases [EC 5.-] 0.596708 0.100437 0.0 0.000000
[78]:
ec_pnps.to_csv('{}pnps-ec.csv'.format(data_dir))
[79]:
#sort the DataFrame to plot them by median value
plot_order = ec_pnps.mean(axis=1).sort_values(inplace=False, ascending=False).index

fig, ax = mgkit.plots.get_single_figure(figsize=(15, 10))
_ = mgkit.plots.boxplot.boxplot_dataframe(
    ec_pnps,
    plot_order,
    ax,
    fonts=dict(fontsize=14, rotation='horizontal'),
    data_colours={
        x: color
        for x, color in zip(plot_order, seaborn.color_palette('hls', len(plot_order)))
    },
    box_vert=False
)
ax.set_xlabel('pN/pS', fontsize=16)
ax.set_ylabel('Enzyme Class', fontsize=16)
fig.set_tight_layout(True)
fig.savefig('{}pnps-ec-boxplot.pdf'.format(data_dir))
2018-05-22 14:10:43,431 - DEBUG - matplotlib.backends.backend_pdf->fontName: Assigning font /F1 = u'/home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf'
2018-05-22 14:10:43,455 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Embedding font /home/frubino/mgkit/dev-env/local/lib/python2.7/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf.
2018-05-22 14:10:43,456 - DEBUG - matplotlib.backends.backend_pdf->writeFonts: Writing TrueType font.
../_images/pipeline_Exploring-Metagenomic-Data_90_1.png