from warnings import warn
from typing import Set, Dict, List, Union
import pandas as pd
import networkx as nx
from pymantra.statics import Edge
from pymantra.network.ld_estimation import (
per_sample_ld_estimation, confounder_correction)
from pymantra.network.reaction_associations import associate_multiomics_ld
[docs]def compute_reaction_estimates(
graph: nx.DiGraph, metabolite_data: pd.DataFrame, sample_groups: pd.Series,
covariates: pd.DataFrame = None,
random_effects: Union[str, List[str]] = None, lmm_args: dict = None,
residual_summary: str = "expl_var", return_all: bool = False,
control_group: any = None, **kwargs
):
"""Generate reaction estimates
Compute the linear-model estimates for a given graph and metabolomics data
Parameters
----------
graph : nx.DiGraph
Metabolite-reaction graph. Metabolites need to be denoted as
'metabolite' (via node attribute 'node_type') and reactions as
'reaction'.
metabolite_data : pd.DataFrame
Metabolite data with samples in rows and metabolites in columns.
Metabolite names need to match the metabolite node names in `graph` and
indices need to match the indices of `sample_groups`.
sample_groups : pd.Series
Array indicating sample groups
covariates : pd.DataFrame, optional
Confounder variables to correct for. Correction is done using a
Linear Mixed Model. All variables (i.e. columns) not specified as
random effect variables in `random_effects` are assumed to be fixed
effects variables.
The correction currently only supports simple fixed and random effects
inclusion. For more complex setups including factor interaction, it is
recommended to do the correction beforehand and only pass the residuals
to this function instead of the original metabolome data frame.
random_effects: str | List[str], optional
Random effects for confounder correction. If `covariates` is None this
has no effect. Else, this specifies which column(s) of `covariates` to
include as random effects, all other columns will be included as fixed
effects. If this is None, all columns of `covariates` are assumed to be
fixed effects.
lmm_args: dict, optional
Keyword arguments for :py:func:`MixedLM.from_formula`.
Ignored unless `covariates` and `random_effects` are both not None.
residual_summary: str, "expl_var"
Which method to use as residual summary statistic. Either "expl_var"
for explained variance (RSS/TSS) or "norm" for p-norm
return_all : bool, False
Whether to return all variables return by
:py:func:`per_sample_ld_estimation` or only return the scaled residuals
control_group : any, optional
Name of the control group
kwargs
Keyword arguments. See :py:func:`per_sample_ld_estimation` for details.
Returns
-------
Union[
pd.DataFrame,
Tuple[Dict[str, LinearModel], Dict[str, np.ndarray], pd.DataFrame]
]
If `control_groups` is False a pd.DataFrame with samples as rows and
reactions in columns. Else a 3-tuple as returned by
:py:func:`per_sample_ld_estimation` only with `scaled_residuals` as
a pd.DataFrame generated from the initially returned dictionary
Examples
--------
>>> from pymantra.datasets import example_metabolome_enrichment_data
>>> metabolite_data, sample_groups, graph = \
... example_metabolome_enrichment_data()
>>> compute_reaction_estimates(graph, metabolite_data, sample_groups)
"""
if control_group is None:
control_group = sample_groups[0]
control_models, case_residuals, scaled_residuals = \
per_sample_ld_estimation(
graph, metabolite_data, sample_groups, control_group=control_group,
covariates=covariates, random_effects=random_effects,
lmm_args=lmm_args, residual_summary=residual_summary, **kwargs
)
res_df = pd.DataFrame.from_dict(scaled_residuals)
res_df.fillna(value=0, inplace=True)
if return_all:
return control_models, case_residuals, res_df
return res_df
[docs]def add_reaction_estimates(
graph: nx.DiGraph, sample_groups: pd.Series = None,
estimate_data: pd.DataFrame = None, metabolite_data: pd.DataFrame = None,
control_group: any = None, return_estimates: bool = True, **kwargs
):
"""Add reaction estimates to a metabolite-reaction graph
Add the linear model estimates to a given metabolite-reaction graph, either
using pre-computed estimates or computing estimates via
:py:func:`compute_reaction_estimates` and adding them directly.
Parameters
----------
graph : nx.DiGraph
Metabolite-reaction graph. Metabolites need to be denoted as
'metabolite' (via node attribute 'node_type') and reactions as
'reaction'.
sample_groups : pd.Series
Array indicating sample group
estimate_data : pd.DataFrame, optional
Linear-model estimates per reaction and sample as generated by
:py:func:`compute_reaction_estimates`. If None, estimates will be
computed from `metabolite_data`, hence must be given.
metabolite_data : pd.DataFrame, optional
Metabolite data with samples in rows and metabolites in columns.
If `estimate_data` is None this becomes a required parameter as it will
be used to compute the reaction models.
Metabolite names need to match the metabolite node names in `graph` and
indices need to match the indices of `sample_groups`.
control_group : any, optional
Name of the control group
return_estimates: bool, False
Whether to return the linear model-base estimates computed in
:py:func:`compute_reaction_estimates`
kwargs
Keyword arguments. See :py:func:`per_sample_ld_estimation` for details
Examples
--------
>>> from pymantra.datasets import example_metabolome_enrichment_data
>>> metabolite_data, sample_groups, graph = \
... example_metabolome_enrichment_data()
>>> residuals = \
... compute_reaction_estimates(graph, metabolite_data, sample_groups)
>>> add_reaction_estimates(graph, sample_groups, residuals)
"""
if control_group is None:
control_group = sample_groups.iloc[0]
control_mask = sample_groups == control_group
if estimate_data is None:
if metabolite_data is None:
raise ValueError(
"'metabolite_data' must be given when 'estimate_data' is "
"None"
)
estimate_data = compute_reaction_estimates(
graph, metabolite_data, sample_groups, control_group=control_group,
**kwargs
)
else:
return_estimates = False
control_data = estimate_data.loc[control_mask, :].to_dict('list')
case_data = estimate_data.loc[~control_mask, :].to_dict('list')
node_data = {
reaction: {'0': control_data[reaction], '1': case_data[reaction]}
for reaction in control_data.keys()
}
nx.set_node_attributes(graph, node_data, 'vec_group_data')
if return_estimates:
return estimate_data
[docs]def compute_multiomics_associations(
residuals: pd.DataFrame, multi_omics: pd.DataFrame,
sample_groups: pd.Series, covariates: pd.DataFrame = None,
random_effects: Union[str, List[str]] = None,
lmm_args: dict = None, **kwargs
):
"""Compute multi-omics associations with reaction estimates
Compute the associations between multi-omics features and the residuals of
reactions as estimated by the linear models.
This is essentially a wrapper for :py:func:`associate_multiomics_ld` for
interface consistency.
Parameters
----------
residuals : pd.DataFrame
Linear model residuals matrix with samples in rows and reactions in
columns
multi_omics : pd.DataFrame
Multi-omics measurements with samples in rows and multi-omics features
in columns.
sample_groups : pd.Series
Array of sample groups
covariates : pd.DataFrame, optional
Confounder variables to correct for. Correction is done using a
Linear Mixed Model. All variables (i.e. columns) not specified as
random effect variables in `random_effects` are assumed to be fixed
effects variables.
The correction currently only supports simple fixed and random effects
inclusion. For more complex setups including factor interaction, it is
recommended to do the correction beforehand and only pass the residuals
to this function instead of the original metabolome data frame.
random_effects: str | List[str], optional
Random effects for confounder correction. If `covariates` is None this
has no effect. Else, this specifies which column(s) of `covariates` to
include as random effects, all other columns will be included as fixed
effects. If this is None, all columns of `covariates` are assumed to be
fixed effects.
lmm_args: dict, optional
Keyword arguments for
`statsmodels.regression.mixed_linear_model.MixedLM.from_formula`.
Ignored unless `covariates` and `random_effects` are both not None.
kwargs
Keyword arguments passed to :py:func:`associate_multiomics_ld`
Returns
-------
Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]
2-tuple where the first elements are correlations per group as a
data frame of multi-omics x reaction and the second element are the
correlation p-values per group in the same format as the correlations
Examples
--------
>>> from pymantra.datasets import example_multiomics_enrichment_data
>>> from pymantra import (
... compute_reaction_estimates, compute_multiomics_associations)
>>> metabolite_data, microbiome_data, sample_groups, graph = \
... example_multiomics_enrichment_data()
>>> residuals = \
... compute_reaction_estimates(graph, metabolite_data, sample_groups)
>>> compute_multiomics_associations(
... residuals, microbiome_data, sample_groups)
"""
if covariates is not None:
corr_mo = confounder_correction(
multi_omics, covariates, random_effects, **lmm_args
)
return associate_multiomics_ld(
residuals, sample_groups, corr_mo, **kwargs)
return associate_multiomics_ld(
residuals, sample_groups, multi_omics, **kwargs)
def _add_multiomics_associations(
graph: nx.DiGraph, associations: Dict[str, pd.DataFrame],
sample_groups: pd.Series, edge_type: str, reaction_data: pd.DataFrame,
multi_omics: pd.DataFrame, **kwargs
):
if associations is None:
associations = compute_multiomics_associations(
reaction_data, sample_groups, multi_omics, **kwargs)
corr_dict = {
group: associations[group].to_dict()
for group in sample_groups.unique()
}
edge_data = {
edge: {
group: corr_dict[group].get(edge[0], {}).get(edge[1], 0)
for group in corr_dict.keys()
}
for edge in graph.edges if
graph.edges[edge]['edge_type'] == edge_type
}
nx.set_edge_attributes(graph, edge_data, 'data')
[docs]def add_microbiome_associations(
graph: nx.DiGraph, sample_groups: pd.Series,
associations: Dict[str, pd.DataFrame] = None,
residuals: pd.DataFrame = None, microbiome_data: pd.DataFrame = None,
**kwargs
):
"""Add microbiome-reaction associations to a multi-omics graph
Add the association estimates to a given multi-omics-reaction graph, either
using pre-computed estimates or computing estimates via
:py:func:`compute_multiomics_associations` and adding them directly.
Parameters
----------
graph : nx.DiGraph
Metabolite-reaction graph containing additional reaction-organism
connections. Usually when calling this function reaction estimates
are already added to the graph.
sample_groups : pd.Series
Array indicating sample group
associations : Dict[str, pd.DataFrame], optional
Reaction-microbiome associations per group as generated by
:py:func:`compute_multiomics_associations`
residuals : pd.DataFrame
Linear-model estimates per reaction and sample as generated by
:py:func:`compute_reaction_estimates`. If `associations` is None
this parameter is required.
microbiome_data : pd.DataFrame
Microbiome data with samples in rows and microbes in columns.
If `associations` is None this becomes a required parameter.
Microbe names need to match the organism node names in `graph` and
indices need to match the indices of `sample_groups`.
kwargs
Keyword arguments to be passed to
:py:func:`compute_multiomics_associations`
Examples
--------
>>> from pymantra.datasets import example_multiomics_enrichment_data
>>> from pymantra import (
... compute_reaction_estimates, compute_multiomics_associations)
>>> metabolite_data, microbiome_data, sample_groups, graph = \
... example_multiomics_enrichment_data()
>>> residuals = \
... compute_reaction_estimates(graph, metabolite_data, sample_groups)
>>> corrs, pvals = compute_multiomics_associations(
... residuals, microbiome_data, sample_groups)
>>> add_microbiome_associations(graph, sample_groups, corrs)
"""
if associations is None and (microbiome_data is None or residuals is None):
raise ValueError(
"Either 'associations' or 'microbiome_data' and 'residuals' is "
"required. See function documentation for more information."
)
_add_multiomics_associations(
graph, associations, sample_groups, "REACTION_ORGANISM", residuals,
microbiome_data, **kwargs
)
[docs]def add_gene_associations(
graph: nx.DiGraph, sample_groups: pd.Series,
associations: Dict[str, pd.DataFrame] = None,
residuals: pd.DataFrame = None, gene_data: pd.DataFrame = None, **kwargs
):
"""Add microbiome-reaction associations to a multi-omics graph
Add the association estimates to a given multi-omics-reaction graph, either
using pre-computed estimates or computing estimates via
:py:func:`compute_multiomics_associations` and adding them directly.
Parameters
----------
graph : nx.DiGraph
Metabolite-reaction graph containing additional reaction-organism
connections. Usually when calling this function reaction estimates
are already added to the graph.
sample_groups : pd.Series
Array indicating sample group
associations : Dict[str, pd.DataFrame], optional
Reaction-microbiome associations per group as generated by
:py:func:`compute_multiomics_associations`
residuals : pd.DataFrame
Linear-model estimates per reaction and sample as generated by
:py:func:`compute_reaction_estimates`. If `associations` is None
this parameter is required.
gene_data : pd.DataFrame
Gene data with samples in rows and genes in columns.
If `associations` is None this becomes a required parameter.
Gene names need to match the gene node names in `graph` and
indices need to match the indices of `sample_groups`.
kwargs
Keyword arguments to be passed to
:py:func:`compute_multiomics_associations`
"""
if associations is None and (residuals is None or gene_data is None):
raise ValueError(
"Either 'associations' or 'gene_data' is required. See function "
"documentation for more information."
)
_add_multiomics_associations(
graph, associations, sample_groups, "REACTION_GENE",
residuals, gene_data, **kwargs
)
def enrichment_pvalue(graph: nx.Graph, subnet: Set[Edge]) -> float:
# to cpp?
warn("pvalue computation currently not implemented")
return -1.