Source code for pymantra.database.APINetworkGenerator

from abc import ABC
from typing import Dict, Set, Tuple
import warnings
import os
import json
import requests
import networkx as nx
from html.parser import HTMLParser

from pymantra.statics import Edge


def _json_safe_dict(py_dict: Dict[str, any]):
    return {
        k: list(v) if isinstance(v, set) else v for k, v in py_dict.items()}


class _HTMLBodyTextReader(HTMLParser, ABC):
    """Helper class to read html to text

    adapted from https://stackoverflow.com/a/55825140 and
    https://stackoverflow.com/questions/16773583
    """
    def __init__(self):
        super().__init__()
        self.text = ""
        self.inbody = False

    def handle_starttag(self, tag, attrs):
        self.inbody = False
        if tag == "body":
            self.inbody = True

    def handle_endtag(self, tag):
        if tag == "body":
            self.inbody = False

    def handle_data(self, data):
        # get data but only after <body> and before </body>
        if self.inbody and data.strip():
            self.text += data


def _get_max_lines(to_print: str, n: int):
    lines = to_print.split(os.linesep)
    return os.linesep.join(lines[:min(len(lines) - 1, n)])


def _parse_html_response(response: requests.Response, n: int = 10):
    hp = _HTMLBodyTextReader()
    hp.feed(response.text)
    return _get_max_lines(hp.text, n)


[docs]class APINetworkGenerator:
    """API mirror for :class:`~NetworkGenerator`

    Querying the mantra online neo4j database containing the reference network
    generated with the Neo4jGenerator class.

    Most query functions depend on the requirements test with the `Verifier`
    class. To ensure that all functions work as expected, only databases test
    for their correctness should be used.

    Attributes
    ----------
    url: str
        Base URL to where requests go
    """
    __slots__ = ["url", "_using_local_api"]

    url: str

[docs]    def __init__(self, base_url: str = "https://exbio.wzw.tum.de/pymantradb"):
        """Initialize a new APINetworkGenerator instance

        Initialize a new instance to run queries to the neo4j mantra-db API.

        Parameters
        ----------
        base_url: str, https://exbio.wzw.tum.de/pymantradb
            Set the root URL where the server is located
        """
        # local API requires different setting to allow for connections
        self._using_local_api: bool = False
        if "127.0.0.1" in base_url:
            self._local_api("127.0.0.1")
        elif "localhost" in base_url:
            self._local_api("localhost")

        # only http/https URIs are allowed
        if not base_url.startswith("http"):
            if not self._using_local_api:
                raise ValueError(
                    "'base_url' needs to be a full web-address it no local "
                    "API is used. Did not find the required starting "
                    "'https://' or 'http://'."
                )
            else:
                # NOTE: local django API cannot handle https
                base_url = f"http://{base_url}"

        # checking whether given base is working
        self.url = base_url
        if not self.verify_connection():
            raise ConnectionError(
                f"Base URL '{base_url}' seems to be invalid. Connection could "
                "not be verified!"
            )

    def _local_api(self, local_option: str):
        """Make local APIs available"""
        warnings.warn(
            f"Setting 'NO_PROXY' to '{local_option}' to ensure that requests "
            "is able to reach API server"
        )
        os.environ['NO_PROXY'] = local_option
        self._using_local_api = True

[docs]    def verify_connection(self) -> bool:
        """Check whether the given base URL is correct

        Returns
        -------
        bool
            True if status code is 200 (connection verified)
        """
        try:
            test_req = requests.get(f"{self.url}/verify-connection")
        except requests.exceptions.RequestException:
            raise ConnectionError(
                f"Connection to {self.url} could not be established! Please "
                "select a valid URL. Aborting verification process..."
            )
        return test_req.status_code == 200

    def _query(self, data: Dict[str, any], subpath: str):
        """Run a query and return the data from json"""
        # dicts are not send properly if passed as python objects
        _prepped_data = {
            k: json.dumps(_json_safe_dict(v)) if isinstance(v, dict) else v
            for k, v in data.items()
        }
        response = requests.post(
            f"{self.url}/{subpath}", data=_prepped_data)
        if response.status_code != 200:
            if response.status_code == 404:
                raise ConnectionError(
                    f"URL '{self.url}/{subpath}' was not found!")
            if response.status_code == 414:
                if self._using_local_api:
                    raise ConnectionError(
                        "Request-URI Too Large. Please change the "
                        "configuration in the APIs nginx container."
                    )
                else:
                    raise ConnectionError(
                        "Request-URI Too Large. Please contact the "
                        "developers to increase the allowed size or find an "
                        "alternative for processing your data."
                    )
            if response.status_code == 422:
                raise ValueError(
                    f"Invalid data reached the server: {response.text}")
            if response.status_code == 500:
                raise ConnectionError(
                    "An internal server error occurred wile processing your "
                    f"request: {_parse_html_response(response)}"
                )
            raise ValueError(
                "Something went wrong while processing your request, failed "
                f"with exit code {response.status_code} ("
                f"{_parse_html_response(response)})."
            )
        return json.loads(response.text)

[docs]    def get_reaction_subgraph(
        self, organisms: Set[str], genes: Set[str], metabolites: Set[str],
        reaction_organism: Tuple[str, str] = None
    ) -> Dict[str, Set[Edge]]:
        """Extract the edges for a given set of entities

        Query a subgraph with all genes, organisms and metabolites given
        and retain the original graph structure with reaction nodes.

        **Important**: gene - organism, organism - reaction and gene - reaction
        edges are of opposite direction outside the database. The database
        structure is made to allow efficient queries, which do not reflect the
        'passing' directions required for quantitative metabolic-network style
        analyses.

        Parameters
        ----------
        organisms: Set[str]
            A set of all organisms to be included in the subgraph.
            The names must correspond to the nodeLabel property in
            the database.
        genes: Set[str]
            A set of all genes to be included in the subgraph.
            The names must correspond to the nodeLabel property in
            the database.
        metabolites: Set[str]
            A set of all metabolites to be included in the subgraph.
            The names must correspond to the nodeLabel property in
            the database.
        reaction_organism: Tuple[str, str], optional
            Specify an organism for which the metabolic reactions should be
            extracted as a 2-tuple of [ID type, ID], where ID type must be
            'Abbreviation_KEGG' or 'KeggID' and ID the KEGG organism code or
            T number, respectively. For human this would thus either be
            ['Abbreviation_KEGG', 'hsa'] or ['KeggID', 'T01001'].
            If `organisms` is not empty the specified organism will be added
            on top.

        Returns
        -------
        Dict[str, Set[Edge]]
            A dictionary, where keys represent edge types as specified in
            `utils.EDGE_TYPES` pointing to a set of :obj:`Edge` representing
            all edges of the respective type contained in the subgraph.

        Examples
        --------
        >>> generator = APINetworkGenerator()
        >>> metabos = {'FDMO3', 'h2o', 'FDMO2', 'fald', 'FDMO6', 'so3',
        ...            'FMNRx', 'nad', 'FMNRx2', 'fmn', 'nadp'}
        >>> gs = {'1576', '1557', '1559'}
        >>> orgs = {"Streptomyces tsukubensis", "Bacillus smithii",
        ...         "Streptomyces fulvissimus"}
        >>> edges_ = generator.get_reaction_subgraph(orgs, gs, metabos)
        """
        data = {
            "organisms": organisms, "genes": genes, "metabolites": metabolites,
            "reaction_organism": reaction_organism
        }
        return {
            edge_type: {Edge(*edge) for edge in edges}
            for edge_type, edges in
            self._query(data, "database/get-reaction-subgraph").items()
        }

[docs]    def get_subgraph(
        self, organisms: Set[str], genes: Set[str], metabolites: Set[str]
    ) -> Dict[str, Set[Edge]]:
        """Returns a subgraph with all nodes given plus the reaction nodes
        required to connect them

        Parameters
        ----------
        organisms: Set[str]
            Set of all organisms to query
            The names must correspond to the nodeLabel property in
            the database.
        genes: Set[str]
            Set of all genes to query
            The names must correspond to the nodeLabel property in
            the database.
        metabolites: Set[str]
            Set of all metabolites to query
            The names must correspond to the nodeLabel property in
            the database.

        Returns
        -------
        Dict[str, Set[Edge]]
            All connections between organisms, genes and metabolites contained
            in the database. organism - metabolite are third order connections
            (via gene and reaction nodes), all other connections are second
            order (via reaction nodes)

        Examples
        --------
        >>> generator = APINetworkGenerator("http://127.0.0.1:8084")
        >>> metabos = {'FDMO3', 'h2o', 'FDMO2', 'fald', 'FDMO6', 'so3',
        ...            'FMNRx', 'nad', 'FMNRx2', 'fmn', 'nadp'}
        >>> gs = {'1576', '1557', '1559'}
        >>> orgs = {"Streptomyces tsukubensis", "Bacillus smithii",
        ...         "Streptomyces fulvissimus"}
        >>> edges_ = generator.get_subgraph(orgs, gs, metabos)
        """
        data = {
            "organisms": organisms, "genes": genes, "metabolites": metabolites}
        return {
            edge_type: {Edge(*edge) for edge in edges}
            for edge_type, edges in
            self._query(data, "database/get-subgraph").items()
        }

[docs]    def as_networkx(
        self, nodes: Dict[str, Set[str]] = None,
        edges: Dict[str, Set[Edge]] = None, include_attributes: bool = True,
        reaction_subgraph: bool = False, reduce: bool = True
    ) -> nx.DiGraph:
        """Convert a set of nodes or edges to a networkx Graph

        Parameters
        ----------
        nodes: Dict[str, Set[str]], Optional
            Nodes to include by node type. Generally optional, but either
            `nodes` or `edges` need to be given.
            Please note: if `edges` is not specified, and `reaction_subgraph`
            is True reaction nodes given in `nodes` will NOT be considered.
        edges: Dict[str, Set[Edge]], Optional
            Edges to include by edge type. Generally optional, but either
            `nodes` or `edges` need to be given. If not specified, edges will
            be queried from the database using the specified nodes using either
            `get_subgraph` or `get_reaction_subgraph` depending on
            `include_attributes`.
            The only edge attribute currently included is `edge_type`.
        include_attributes: bool, default True
            If True, the nx.Graph.nodes contain the attributes specified in the
            database. Else `node_type` will be the only node attribute in the
            output graph. Please be aware that if True and `edges` are None,
            this might make the function much less efficient.
        reaction_subgraph: bool, default False
            Only relevant if edges is None. If True subgraph edges queried
            result in a reaction subgraph (see
            :py:meth:`~NetworkGenerator.get_reaction_subgraph`) else
            the subgraph will not contain reaction nodes
            (:py:meth:`~APINetworkGenerator.get_subgraph`)
        reduce: bool, False
            Whether to reduce the reaction nodes at the end of the

        Returns
        -------
        nx.DiGraph
            Subgraph as a :obj:`nx.DiGraph`

        # TODO: add sample data
        Examples
        --------
        >>> edges_ = {
        >>>     EDGE_TYPE_NAMES['substrate']: {
        >>>         # TODO: example edges
        >>>     },
        >>>     EDGE_TYPE_NAMES['product']: {
        >>>         # TODO: example edges
        >>>     }
        >>> }
        >>> generator = APINetworkGenerator()
        >>> generator.as_networkx(edges=edges_)
        """
        if nodes is None and edges is None:
            raise ValueError(
                "Either 'nodes' or 'edges' must be given to run 'as_networkx")
        data = {
            "nodes": nodes, "edges": edges,
            "include_attributes": include_attributes,
            "reaction_subgraph": reaction_subgraph,
            "reduce": reduce
        }
        graph_data = self._query(data, "database/as-networkx")

        graph = nx.DiGraph()
        for node, node_data in graph_data["nodes"].items():
            graph.add_node(node, **node_data)
        for edge, edge_data in graph_data["edges"].items():
            src, tgt = edge.split(graph_data["split_str"])
            graph.add_edge(src, tgt, **edge_data)
        return graph