# Source code for pymantra.network.enrichment_functions

from warnings import warn
from typing import Set, Dict, List, Union
import pandas as pd
import networkx as nx

from pymantra.statics import Edge
from pymantra.network.ld_estimation import (
    per_sample_ld_estimation, confounder_correction)
from pymantra.network.reaction_associations import associate_multiomics_ld


def compute_reaction_estimates(
    graph: nx.DiGraph,
    metabolite_data: pd.DataFrame,
    sample_groups: pd.Series,
    covariates: pd.DataFrame = None,
    random_effects: Union[str, List[str]] = None,
    lmm_args: dict = None,
    residual_summary: str = "expl_var",
    return_all: bool = False,
    control_group: any = None,
    **kwargs
):
    """Generate reaction estimates

    Compute the linear-model estimates for a given graph and metabolomics
    data

    Parameters
    ----------
    graph : nx.DiGraph
        Metabolite-reaction graph. Metabolites need to be denoted as
        'metabolite' (via node attribute 'node_type') and reactions as
        'reaction'.
    metabolite_data : pd.DataFrame
        Metabolite data with samples in rows and metabolites in columns.
        Metabolite names need to match the metabolite node names in `graph`
        and indices need to match the indices of `sample_groups`.
    sample_groups : pd.Series
        Array indicating sample groups
    covariates : pd.DataFrame, optional
        Confounder variables to correct for. Correction is done using a
        Linear Mixed Model. All variables (i.e. columns) not specified as
        random effect variables in `random_effects` are assumed to be fixed
        effects variables.
        The correction currently only supports simple fixed and random
        effects inclusion. For more complex setups including factor
        interaction, it is recommended to do the correction beforehand and
        only pass the residuals to this function instead of the original
        metabolome data frame.
    random_effects : str | List[str], optional
        Random effects for confounder correction. If `covariates` is None
        this has no effect. Else, this specifies which column(s) of
        `covariates` to include as random effects, all other columns will be
        included as fixed effects. If this is None, all columns of
        `covariates` are assumed to be fixed effects.
    lmm_args : dict, optional
        Keyword arguments for :py:func:`MixedLM.from_formula`. Ignored
        unless `covariates` and `random_effects` are both not None.
    residual_summary : str, "expl_var"
        Which method to use as residual summary statistic. Either
        "expl_var" for explained variance (RSS/TSS) or "norm" for p-norm
    return_all : bool, False
        Whether to return all variables returned by
        :py:func:`per_sample_ld_estimation` or only return the scaled
        residuals
    control_group : any, optional
        Name of the control group. If None, the group of the first sample
        in `sample_groups` is used.
    kwargs
        Keyword arguments. See :py:func:`per_sample_ld_estimation` for
        details.

    Returns
    -------
    Union[
        pd.DataFrame,
        Tuple[Dict[str, LinearModel], Dict[str, np.ndarray], pd.DataFrame]
    ]
        If `return_all` is False a pd.DataFrame with samples as rows and
        reactions in columns. Else a 3-tuple as returned by
        :py:func:`per_sample_ld_estimation` only with `scaled_residuals` as
        a pd.DataFrame generated from the initially returned dictionary

    Examples
    --------
    >>> from pymantra.datasets import example_metabolome_enrichment_data
    >>> metabolite_data, sample_groups, graph = (
    ...     example_metabolome_enrichment_data())
    >>> compute_reaction_estimates(graph, metabolite_data, sample_groups)
    """
    if control_group is None:
        # Take the first sample *positionally*. Plain ``sample_groups[0]`` is
        # a label lookup on a Series and breaks when the index has no label 0
        # (e.g. string sample names). Objects without ``.iloc`` (lists,
        # arrays) keep using ordinary indexing.
        positional = getattr(sample_groups, "iloc", sample_groups)
        control_group = positional[0]

    control_models, case_residuals, scaled_residuals = \
        per_sample_ld_estimation(
            graph, metabolite_data, sample_groups,
            control_group=control_group, covariates=covariates,
            random_effects=random_effects, lmm_args=lmm_args,
            residual_summary=residual_summary, **kwargs
        )

    # dictionary -> data frame; missing estimates are encoded as 0
    res_df = pd.DataFrame.from_dict(scaled_residuals)
    res_df.fillna(value=0, inplace=True)

    if return_all:
        return control_models, case_residuals, res_df
    return res_df
def add_reaction_estimates(
    graph: nx.DiGraph,
    sample_groups: pd.Series = None,
    estimate_data: pd.DataFrame = None,
    metabolite_data: pd.DataFrame = None,
    control_group: any = None,
    return_estimates: bool = True,
    **kwargs
):
    """Add reaction estimates to a metabolite-reaction graph

    Add the linear model estimates to a given metabolite-reaction graph,
    either using pre-computed estimates or computing estimates via
    :py:func:`compute_reaction_estimates` and adding them directly.

    The estimates are stored per reaction in the node attribute
    'vec_group_data' as a dictionary with the keys '0' (control samples)
    and '1' (all other samples).

    Parameters
    ----------
    graph : nx.DiGraph
        Metabolite-reaction graph. Metabolites need to be denoted as
        'metabolite' (via node attribute 'node_type') and reactions as
        'reaction'.
    sample_groups : pd.Series
        Array indicating sample group. Required, despite the None default.
    estimate_data : pd.DataFrame, optional
        Linear-model estimates per reaction and sample as generated by
        :py:func:`compute_reaction_estimates`. If None, estimates will be
        computed from `metabolite_data`, hence must be given.
    metabolite_data : pd.DataFrame, optional
        Metabolite data with samples in rows and metabolites in columns.
        If `estimate_data` is None this becomes a required parameter as it
        will be used to compute the reaction models.
        Metabolite names need to match the metabolite node names in `graph`
        and indices need to match the indices of `sample_groups`.
    control_group : any, optional
        Name of the control group. If None, the group of the first sample
        in `sample_groups` is used.
    return_estimates : bool, True
        Whether to return the linear model-based estimates computed in
        :py:func:`compute_reaction_estimates`. Has no effect when
        `estimate_data` is passed.
    kwargs
        Keyword arguments. See :py:func:`per_sample_ld_estimation` for
        details

    Returns
    -------
    pd.DataFrame | None
        The computed estimates if they were computed inside this function
        and `return_estimates` is True, else None.

    Raises
    ------
    ValueError
        If `sample_groups` is None or if both `estimate_data` and
        `metabolite_data` are None.

    Examples
    --------
    >>> from pymantra.datasets import example_metabolome_enrichment_data
    >>> metabolite_data, sample_groups, graph = (
    ...     example_metabolome_enrichment_data())
    >>> residuals = compute_reaction_estimates(
    ...     graph, metabolite_data, sample_groups)
    >>> add_reaction_estimates(graph, sample_groups, residuals)
    """
    if sample_groups is None:
        # fail early with a clear message instead of an AttributeError below
        raise ValueError("'sample_groups' must be given")

    if control_group is None:
        control_group = sample_groups.iloc[0]
    control_mask = sample_groups == control_group

    if estimate_data is None:
        if metabolite_data is None:
            raise ValueError(
                "'metabolite_data' must be given when 'estimate_data' is "
                "None"
            )
        estimate_data = compute_reaction_estimates(
            graph, metabolite_data, sample_groups,
            control_group=control_group, **kwargs
        )
    else:
        # pre-computed estimates are never handed back to the caller
        return_estimates = False

    # column-wise lists: {reaction: [value per sample]}
    control_data = estimate_data.loc[control_mask, :].to_dict('list')
    case_data = estimate_data.loc[~control_mask, :].to_dict('list')
    node_data = {
        reaction: {'0': control_values, '1': case_data[reaction]}
        for reaction, control_values in control_data.items()
    }
    nx.set_node_attributes(graph, node_data, 'vec_group_data')

    if return_estimates:
        return estimate_data
    return None
def compute_multiomics_associations(
    residuals: pd.DataFrame,
    multi_omics: pd.DataFrame,
    sample_groups: pd.Series,
    covariates: pd.DataFrame = None,
    random_effects: Union[str, List[str]] = None,
    lmm_args: dict = None,
    **kwargs
):
    """Compute multi-omics associations with reaction estimates

    Compute the associations between multi-omics features and the residuals
    of reactions as estimated by the linear models. This is essentially a
    wrapper for :py:func:`associate_multiomics_ld` for interface
    consistency.

    Parameters
    ----------
    residuals : pd.DataFrame
        Linear model residuals matrix with samples in rows and reactions in
        columns
    multi_omics : pd.DataFrame
        Multi-omics measurements with samples in rows and multi-omics
        features in columns.
    sample_groups : pd.Series
        Array of sample groups
    covariates : pd.DataFrame, optional
        Confounder variables to correct for. If given, `multi_omics` is
        passed through :py:func:`confounder_correction` before the
        associations are computed. Correction is done using a Linear Mixed
        Model. All variables (i.e. columns) not specified as random effect
        variables in `random_effects` are assumed to be fixed effects
        variables.
        The correction currently only supports simple fixed and random
        effects inclusion. For more complex setups including factor
        interaction, it is recommended to do the correction beforehand and
        pass the corrected multi-omics data to this function.
    random_effects : str | List[str], optional
        Random effects for confounder correction. If `covariates` is None
        this has no effect. Else, this specifies which column(s) of
        `covariates` to include as random effects, all other columns will be
        included as fixed effects. If this is None, all columns of
        `covariates` are assumed to be fixed effects.
    lmm_args : dict, optional
        Keyword arguments for
        `statsmodels.regression.mixed_linear_model.MixedLM.from_formula`.
        Ignored unless `covariates` and `random_effects` are both not None.
    kwargs
        Keyword arguments passed to :py:func:`associate_multiomics_ld`

    Returns
    -------
    Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]
        2-tuple where the first elements are correlations per group as a
        data frame of multi-omics x reaction and the second element are the
        correlation p-values per group in the same format as the
        correlations

    Examples
    --------
    >>> from pymantra.datasets import example_multiomics_enrichment_data
    >>> from pymantra import (
    ...     compute_reaction_estimates, compute_multiomics_associations)
    >>> metabolite_data, microbiome_data, sample_groups, graph = (
    ...     example_multiomics_enrichment_data())
    >>> residuals = compute_reaction_estimates(
    ...     graph, metabolite_data, sample_groups)
    >>> compute_multiomics_associations(
    ...     residuals, microbiome_data, sample_groups)
    """
    if covariates is None:
        return associate_multiomics_ld(
            residuals, sample_groups, multi_omics, **kwargs)

    # `lmm_args` defaults to None, and ``**None`` raises a TypeError, so
    # fall back to an empty mapping when no extra arguments were given
    corr_mo = confounder_correction(
        multi_omics, covariates, random_effects, **(lmm_args or {})
    )
    return associate_multiomics_ld(
        residuals, sample_groups, corr_mo, **kwargs)
def _add_multiomics_associations(
    graph: nx.DiGraph,
    associations: Dict[str, pd.DataFrame],
    sample_groups: pd.Series,
    edge_type: str,
    reaction_data: pd.DataFrame,
    multi_omics: pd.DataFrame,
    **kwargs
):
    """Store per-group association values on all edges of one edge type

    Shared implementation of :py:func:`add_microbiome_associations` and
    :py:func:`add_gene_associations`.

    Parameters
    ----------
    graph : nx.DiGraph
        Graph whose edges carry an 'edge_type' attribute. Modified in place:
        every edge of type `edge_type` gets a 'data' attribute holding a
        ``{group: association}`` dictionary.
    associations : Dict[str, pd.DataFrame] | None
        Associations per sample group. If None, they are computed with
        :py:func:`compute_multiomics_associations` from `reaction_data` and
        `multi_omics`.
    sample_groups : pd.Series
        Array indicating sample group. Every unique value must be a key of
        `associations`.
    edge_type : str
        Value of the 'edge_type' edge attribute selecting the edges to
        annotate
    reaction_data : pd.DataFrame
        Reaction estimates, only used if `associations` is None
    multi_omics : pd.DataFrame
        Multi-omics data, only used if `associations` is None
    kwargs
        Keyword arguments passed to
        :py:func:`compute_multiomics_associations`
    """
    if associations is None:
        # argument order must follow the signature
        # (residuals, multi_omics, sample_groups)
        computed = compute_multiomics_associations(
            reaction_data, multi_omics, sample_groups, **kwargs)
        # documented return value is (correlations, p-values); only the
        # correlations are written to the graph
        if isinstance(computed, tuple):
            associations = computed[0]
        else:
            associations = computed

    # nested lookup tables: {group: {column: {row: value}}}
    corr_dict = {
        group: associations[group].to_dict()
        for group in sample_groups.unique()
    }
    # edges without a matching column/row entry get an association of 0
    edge_data = {
        edge: {
            group: group_corrs.get(edge[0], {}).get(edge[1], 0)
            for group, group_corrs in corr_dict.items()
        }
        for edge in graph.edges
        if graph.edges[edge]['edge_type'] == edge_type
    }
    nx.set_edge_attributes(graph, edge_data, 'data')
def add_microbiome_associations(
    graph: nx.DiGraph,
    sample_groups: pd.Series,
    associations: Dict[str, pd.DataFrame] = None,
    residuals: pd.DataFrame = None,
    microbiome_data: pd.DataFrame = None,
    **kwargs
):
    """Add microbiome-reaction associations to a multi-omics graph

    Add the association estimates to a given multi-omics-reaction graph,
    either using pre-computed estimates or computing estimates via
    :py:func:`compute_multiomics_associations` and adding them directly.
    Only edges with the edge type "REACTION_ORGANISM" are annotated.

    Parameters
    ----------
    graph : nx.DiGraph
        Metabolite-reaction graph containing additional reaction-organism
        connections. Usually when calling this function reaction estimates
        are already added to the graph.
    sample_groups : pd.Series
        Array indicating sample group
    associations : Dict[str, pd.DataFrame], optional
        Reaction-microbiome associations per group as generated by
        :py:func:`compute_multiomics_associations`
    residuals : pd.DataFrame
        Linear-model estimates per reaction and sample as generated by
        :py:func:`compute_reaction_estimates`. If `associations` is None
        this parameter is required.
    microbiome_data : pd.DataFrame
        Microbiome data with samples in rows and microbes in columns.
        If `associations` is None this becomes a required parameter.
        Microbe names need to match the organism node names in `graph`
        and indices need to match the indices of `sample_groups`.
    kwargs
        Keyword arguments to be passed to
        :py:func:`compute_multiomics_associations`

    Raises
    ------
    ValueError
        If `associations` is None and `residuals` or `microbiome_data` is
        missing.

    Examples
    --------
    >>> from pymantra.datasets import example_multiomics_enrichment_data
    >>> from pymantra import (
    ...     compute_reaction_estimates, compute_multiomics_associations)
    >>> metabolite_data, microbiome_data, sample_groups, graph = (
    ...     example_multiomics_enrichment_data())
    >>> residuals = compute_reaction_estimates(
    ...     graph, metabolite_data, sample_groups)
    >>> corrs, pvals = compute_multiomics_associations(
    ...     residuals, microbiome_data, sample_groups)
    >>> add_microbiome_associations(graph, sample_groups, corrs)
    """
    if associations is None and (microbiome_data is None or residuals is None):
        raise ValueError(
            "Either 'associations' or 'microbiome_data' and 'residuals' is "
            "required. See function documentation for more information."
        )
    _add_multiomics_associations(
        graph, associations, sample_groups, "REACTION_ORGANISM", residuals,
        microbiome_data, **kwargs
    )
def add_gene_associations(
    graph: nx.DiGraph,
    sample_groups: pd.Series,
    associations: Dict[str, pd.DataFrame] = None,
    residuals: pd.DataFrame = None,
    gene_data: pd.DataFrame = None,
    **kwargs
):
    """Add gene-reaction associations to a multi-omics graph

    Add the association estimates to a given multi-omics-reaction graph,
    either using pre-computed estimates or computing estimates via
    :py:func:`compute_multiomics_associations` and adding them directly.
    Only edges with the edge type "REACTION_GENE" are annotated.

    Parameters
    ----------
    graph : nx.DiGraph
        Metabolite-reaction graph containing additional reaction-gene
        connections. Usually when calling this function reaction estimates
        are already added to the graph.
    sample_groups : pd.Series
        Array indicating sample group
    associations : Dict[str, pd.DataFrame], optional
        Reaction-gene associations per group as generated by
        :py:func:`compute_multiomics_associations`
    residuals : pd.DataFrame
        Linear-model estimates per reaction and sample as generated by
        :py:func:`compute_reaction_estimates`. If `associations` is None
        this parameter is required.
    gene_data : pd.DataFrame
        Gene data with samples in rows and genes in columns.
        If `associations` is None this becomes a required parameter.
        Gene names need to match the gene node names in `graph`
        and indices need to match the indices of `sample_groups`.
    kwargs
        Keyword arguments to be passed to
        :py:func:`compute_multiomics_associations`

    Raises
    ------
    ValueError
        If `associations` is None and `residuals` or `gene_data` is
        missing.
    """
    if associations is None and (residuals is None or gene_data is None):
        # message names both requirements, as the check above enforces both
        raise ValueError(
            "Either 'associations' or 'gene_data' and 'residuals' is "
            "required. See function documentation for more information."
        )
    _add_multiomics_associations(
        graph, associations, sample_groups, "REACTION_GENE", residuals,
        gene_data, **kwargs
    )
def enrichment_pvalue(graph: nx.Graph, subnet: Set[Edge]) -> float:
    """Placeholder for the enrichment p-value of a subnetwork

    The computation is not implemented yet. Both arguments are currently
    ignored, a warning is emitted and the sentinel value -1 is returned.

    Parameters
    ----------
    graph : nx.Graph
        Graph the subnetwork belongs to (unused)
    subnet : Set[Edge]
        Edges making up the subnetwork (unused)

    Returns
    -------
    float
        Always -1.0
    """
    # to cpp?
    warn("pvalue computation currently not implemented")
    return -1.