Source code for mgkit.counts.glm

"""
.. versionadded:: 0.3.3

GLM models with metagenomes and metatranscriptomes. Experimental
"""

from __future__ import division
from builtins import range
import statsmodels.api as sm
from scipy import interpolate, optimize
import pandas as pd


[docs]def lowess_ci_bootstrap(endog, exog, num=100, frac=.2, it=3, alpha=.05, delta=0., min_value=10**-3, kind='slinear'): """ .. versionchanged: 0.4.0 the interpolation of the confidence intervals uses the passed `kind` Bootstraps a lowess for the dependent (`endog`) and indipendent (`exog`) arguments. Arguments: endog (array): indipendent variable (Y) exog (array): indipendent variable (X) num (int): number of iterations for the bootstrap frac (float): fraction of the array to use when fitting it (int): number of iterations used to fit the lowess alpha (float): confidence intervals for the bootstrap delta (float): passed to :func:`statsmodels.api.nonparametric.lowess` min_value (float): minimum value for the function to avoid out of bounds kind (str): type of interpolation passed to :func:`scipy.interpolate.interp1d` Returns: tuple: the first element is the function describing the lowest confidence interval, the second element is for the highest confidence interval and the last one for the mean .. note:: Performance increase with the value of *delta*. """ data = pd.DataFrame( { 'endog': endog, 'exog': exog } ).sort_values(by=['exog', 'endog'], ascending=True) boots = [] for idx in range(num): sample = data.sample(n=data.index.size, replace=True) lw = sm.nonparametric.lowess( sample.endog, sample.exog, is_sorted=False, frac=frac, it=it, return_sorted=True, delta=delta ) boots.append(pd.DataFrame({ 'endog': lw[:, 1], 'exog': lw[:, 0] })) alpha = alpha / 2 boots = pd.concat(boots) q1 = boots.groupby('exog').quantile( alpha, interpolation='nearest' ).sort_index() q1 = q1.endog[q1.endog > 0].fillna(min_value) q1 = interpolate.interp1d(q1.index, q1, kind=kind, fill_value='extrapolate') q2 = boots.groupby('exog').quantile( 1 - alpha, interpolation='nearest' ).sort_index() q2 = q2.endog[q2.endog > 0].fillna(min_value) q2 = interpolate.interp1d(q2.index, q2, kind=kind, fill_value='extrapolate') m = fit_lowess_interpolate(data.endog, data.exog, frac=frac, it=it, kind=kind) return q1, q2, m
[docs]def fit_lowess_interpolate(endog, exog, frac=.2, it=3, kind='slinear'): """ Fits a lowess for the passed `endog` (Y) and `exog` (X) and returns an interpolated function that describes it. The first 4 arguments are passed to :func:`statsmodels.api.sm.nonparametric.lowess`, while the last one is passed to :func:`scipy.interpolate.interp1d` Arguments: endog (array): array of the dependent variable (Y) exog (array): array of the indipendent variable (X) frac (float): fraction of the number of elements to use when fitting (0.0-1.0) it (int): number of iterations to fit the lowess kind (str): type of interpolation to use Returns: func: interpolated function representing the lowess fitted from the data passed """ data = pd.DataFrame( { 'endog': endog, 'exog': exog } ).sort_values(by=['exog', 'endog'], ascending=True) lw = sm.nonparametric.lowess( data.endog, data.exog, frac=frac, it=it, ) return interpolate.interp1d( lw[:, 0], lw[:, 1], kind=kind, fill_value='extrapolate' )
[docs]def variance_to_alpha(mu, func, min_alpha=10**-3): """ Based on the variance defined in the Negative Binomial in statsmodels var = mu + alpha * (mu ** 2) Arguments: mu (float): mean to calculate the alphas for func (func): function that returns the variace of the mean min_alpha (float): value of alpha if the `func` goes out of bounds Returns: float: value of alpha for the passed mean """ alpha = (func(mu) - mu) / (mu ** 2) return min_alpha if alpha <= 0 else alpha
[docs]def optimise_alpha_scipy_function(args, formula, data, criterion='aic'): """ .. versionadded:: 0.4.0 """ alpha, = args model = sm.formula.glm( formula, data=data, family=sm.families.NegativeBinomial(alpha=alpha) ).fit() return getattr(model, criterion)
[docs]def optimise_alpha_scipy(formula, data, mean_func, q1_func, q2_func): """ .. versionadded:: 0.4.0 Used to find an optimal *alpha* parameter for the Negative Binomial distribution used in `statsmodels`, using the lowess functions from :func:`lowess_ci_bootstrap`. Arguments: formula (str): the formula used for the regression data (DataFrame): DataFrame for regression mean_func (func): function for the mean :func:`lowess_ci_bootstrap` q1_func (func): function for the q1 :func:`lowess_ci_bootstrap` q2_func (func): function for the q2 :func:`lowess_ci_bootstrap` Returns: float: *alpha* value for the Negative Binomial """ data_mean = data.mean()[0] return optimize.minimize( optimise_alpha_scipy_function, [variance_to_alpha(data_mean, mean_func)], args=(formula, data), bounds=[(variance_to_alpha(data_mean, q1_func), variance_to_alpha(data_mean, q2_func))] ).x[0]