# Source code for mgkit.counts.scaling

"""
Scaling functions for counts
"""

from __future__ import division
from scipy import stats
import numpy
import pandas


def scale_factor_deseq(dataframe):
    """
    .. versionadded:: 0.1.13

    Compute the per-sample size factors described in the DESeq paper.

    Rows of *dataframe* are genes and columns are samples. The size factor
    :math:`\\hat{s}_{j}` of sample *j* is the median, over the genes, of the
    ratio between the gene count in that sample and the geometric mean of
    the gene counts across all samples:

    .. math::

        \\hat{s}_{j} = median_{i} \\left( \\frac {k_{ij}}
        { \\left ( \\prod_{v=1}^{m} k_{iv} \\right )^{1/m} } \\right)

    Genes whose geometric mean is not greater than zero are left out of the
    median.

    Returns a :class:`pandas.Series` with one size factor per column of
    *dataframe*, indexed by the column labels.
    """
    # geometric mean of every gene (row) across all the samples (columns)
    row_gmeans = dataframe.apply(stats.gmean, axis=1)
    # only genes with a strictly positive geometric mean take part in the
    # ratios, which also avoids dividing by zero
    positive_gmeans = row_gmeans[row_gmeans > 0]
    kept_genes = positive_gmeans.index

    # one median-of-ratios value per sample
    return pandas.Series({
        sample: numpy.median(counts.loc[kept_genes] / positive_gmeans)
        for sample, counts in dataframe.items()
    })


def scale_deseq(dataframe):
    """
    .. versionadded:: 0.1.13

    Return *dataframe* scaled with the DESeq size factors.

    Every column (sample) is divided by its own factor, as computed by
    :func:`scale_factor_deseq`.
    """
    # the factors are indexed by sample, so they are matched against the
    # column labels of the dataframe
    return dataframe.div(scale_factor_deseq(dataframe), axis='columns')


def scale_rpkm(dataframe, gene_len):
    """
    .. versionadded:: 0.1.14

    Apply an RPKM scaling to the supplied pandas dataframe/series.

    *gene_len* is a series holding the gene sizes; it is looked up with the
    index of *dataframe*, so it must contain an entry for each of its
    elements.

    .. math::

        RPKM =\\frac {10^{9} \\cdot C} {N \\cdot L}

    Here *C* is a count, *L* the matching gene length and *N* the sum of
    all the values in *dataframe* (every column is included in the total).
    """
    # gene lengths selected and ordered by the index of the counts
    lengths = gene_len[dataframe.index]
    # grand total of the counts: column sums, then the sum of those
    total_reads = dataframe.sum().sum()
    # N * L, one value per gene
    denominator = lengths * total_reads
    return dataframe.div(denominator, axis='index') * (10 ** 9)