Skip to content

Output In-Memory

In-Memory Knowledge Graph Retrieval

Return an instance of the in-memory KG class that matches the requested DBMS.

Returns

_InMemoryKG: an instance of the selected in-memory KG class
Source code in biocypher/output/in_memory/_get_in_memory_kg.py
def get_in_memory_kg(
    dbms: str,
    deduplicator: Deduplicator,
) -> _InMemoryKG:
    """Create the in-memory knowledge graph backend selected by ``dbms``.

    Parameters
    ----------
        dbms: name of the requested backend. "csv", "pandas" and "tabular"
            select ``PandasKG``; "networkx" selects ``NetworkxKG``.
        deduplicator: handed unchanged to the constructor of the selected
            backend.

    Returns
    -------
        _InMemoryKG: a new instance of the selected backend class.

    Raises
    ------
        NotImplementedError: if ``dbms`` is none of the names listed above.
            The message is logged at error level before it is raised.

    """
    if dbms == "networkx":
        return NetworkxKG(deduplicator)

    if dbms in ("csv", "pandas", "tabular"):
        return PandasKG(deduplicator)

    # Unknown backend: report it together with the supported names.
    msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
    logger.error(msg)
    raise NotImplementedError(msg)

In-Memory Knowledge Graph Base Class

Bases: ABC

Abstract class for handling the in-memory Knowledge Graph instance. Specifics of the different in-memory implementations (e.g. csv, networkx) are implemented in the child classes. Any concrete in-memory implementation needs to implement at least the following methods: `add_nodes`, `add_edges`, and `get_kg`.

Raises:

Type Description
NotImplementedError

InMemoryKG implementation must override 'add_nodes'

NotImplementedError

InMemoryKG implementation must override 'add_edges'

NotImplementedError

InMemoryKG implementation must override 'get_kg'

Source code in biocypher/output/in_memory/_in_memory_kg.py
class _InMemoryKG(ABC):
    """Common interface of all in-memory Knowledge Graph backends.

    The backend-specific behaviour (e.g. csv, networkx) lives in the child
    classes. A concrete backend has to override at least:
    - add_nodes
    - add_edges
    - get_kg

    Raises:
        NotImplementedError: InMemoryKG implementation must override 'add_nodes'
        NotImplementedError: InMemoryKG implementation must override 'add_edges'
        NotImplementedError: InMemoryKG implementation must override 'get_kg'
    """

    @abstractmethod
    def add_nodes(self, nodes):
        """Insert nodes into the in-memory knowledge graph.

        Args:
            nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.

        Raises:
            NotImplementedError: always, when this base implementation is
                reached (e.g. through ``super()``).
        """
        msg = "InMemoryKG implementation must override 'add_nodes'"
        raise NotImplementedError(msg)

    @abstractmethod
    def add_edges(self, edges):
        """Insert edges into the in-memory knowledge graph.

        Args:
            edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.

        Raises:
            NotImplementedError: always, when this base implementation is
                reached (e.g. through ``super()``).
        """
        msg = "InMemoryKG implementation must override 'add_edges'"
        raise NotImplementedError(msg)

    @abstractmethod
    def get_kg(self):
        """Hand back the in-memory knowledge graph.

        Raises:
            NotImplementedError: always, when this base implementation is
                reached (e.g. through ``super()``).
        """
        msg = "InMemoryKG implementation must override 'get_kg'"
        raise NotImplementedError(msg)

add_edges(edges) abstractmethod

Add edges to the in-memory knowledge graph.

Parameters:

Name Type Description Default
edges Iterable[BioCypherEdge]

Iterable of BioCypherEdge objects.

required
Source code in biocypher/output/in_memory/_in_memory_kg.py
@abstractmethod
def add_edges(self, edges):
    """Insert edges into the in-memory knowledge graph.

    Args:
        edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.

    Raises:
        NotImplementedError: always, when this base implementation is reached.
    """
    msg = "InMemoryKG implementation must override 'add_edges'"
    raise NotImplementedError(msg)

add_nodes(nodes) abstractmethod

Add nodes to the in-memory knowledge graph.

Parameters:

Name Type Description Default
nodes Iterable[BioCypherNode]

Iterable of BioCypherNode objects.

required
Source code in biocypher/output/in_memory/_in_memory_kg.py
@abstractmethod
def add_nodes(self, nodes):
    """Insert nodes into the in-memory knowledge graph.

    Args:
        nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.

    Raises:
        NotImplementedError: always, when this base implementation is reached.
    """
    msg = "InMemoryKG implementation must override 'add_nodes'"
    raise NotImplementedError(msg)

get_kg() abstractmethod

Return the in-memory knowledge graph.

Source code in biocypher/output/in_memory/_in_memory_kg.py
@abstractmethod
def get_kg(self):
    """Hand back the in-memory knowledge graph.

    Raises:
        NotImplementedError: always, when this base implementation is reached.
    """
    msg = "InMemoryKG implementation must override 'get_kg'"
    raise NotImplementedError(msg)

Pandas Knowledge Graph

Bases: _InMemoryKG

Source code in biocypher/output/in_memory/_pandas.py
class PandasKG(_InMemoryKG):
    """In-memory knowledge graph kept as one Pandas dataframe per type.

    Incoming entities are deduplicated with the supplied deduplicator,
    grouped by the value of their ``get_type()`` and appended to the
    dataframe stored for that type in ``self.dfs``.
    """

    def __init__(self, deduplicator):
        """Remember the deduplicator and start without any dataframes."""
        super().__init__()  # keeping in spite of ABC not having __init__
        self.deduplicator = deduplicator

        # Maps a type (as returned by ``get_type()``) to its dataframe.
        self.dfs = {}

    def get_kg(self):
        """Return the dictionary mapping each type to its dataframe."""
        return self.dfs

    def add_nodes(self, nodes):
        """Add the given nodes; delegates to ``add_tables``."""
        self.add_tables(nodes)

    def add_edges(self, edges):
        """Add the given edges; delegates to ``add_tables``."""
        self.add_tables(edges)

    def _separate_entity_types(self, entities):
        """
        Group a mixed iterable of BioCypher objects by their type.

        Entities the `Deduplicator` instance reports as already seen are
        skipped. A `BioCypherRelAsNode` contributes its node, its source edge
        and its target edge, each under its own type.

        Returns a dict mapping type to the list of objects of that type.
        Raises `TypeError` for anything that is not a `BioCypherNode`,
        `BioCypherEdge` or `BioCypherRelAsNode`.
        """
        grouped = {}
        for entity in entities:
            # Pick the duplicate check by entity kind; unknown kinds are
            # rejected.
            if isinstance(entity, BioCypherNode):
                already_seen = self.deduplicator.node_seen(entity)
            elif isinstance(entity, BioCypherEdge):
                already_seen = self.deduplicator.edge_seen(entity)
            elif isinstance(entity, BioCypherRelAsNode):
                already_seen = self.deduplicator.rel_as_node_seen(entity)
            else:
                raise TypeError(
                    "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
                )

            if already_seen:
                continue

            if isinstance(entity, BioCypherRelAsNode):
                # Unpack into its three components, each filed separately.
                members = (
                    entity.get_node(),
                    entity.get_source_edge(),
                    entity.get_target_edge(),
                )
            else:
                members = (entity,)

            for member in members:
                grouped.setdefault(member.get_type(), []).append(member)

        return grouped

    def add_tables(self, entities):
        """
        Create or extend one Pandas dataframe per node and edge type found
        in the input.
        """
        grouped = self._separate_entity_types(entities)

        for type_name in grouped:
            self._add_entity_df(type_name, grouped[type_name])

    def _add_entity_df(self, _type, _entities):
        """
        Turn the entities' dicts into a dataframe and store it under `_type`,
        appending to an already stored dataframe of that type.

        Returns the dataframe stored for `_type` after the update.
        """
        records = [entity.get_dict() for entity in _entities]
        frame = pd.DataFrame(pd.json_normalize(records))
        # replace "properties." with "" in column names
        frame.columns = [name.replace("properties.", "") for name in frame.columns]
        if _type in self.dfs:
            frame = pd.concat([self.dfs[_type], frame], ignore_index=True)
        self.dfs[_type] = frame
        return self.dfs[_type]

_separate_entity_types(entities)

Given mixed iterable of BioCypher objects, separate them into lists by type. Also deduplicates using the Deduplicator instance.

Source code in biocypher/output/in_memory/_pandas.py
def _separate_entity_types(self, entities):
    """
    Group a mixed iterable of BioCypher objects by their type.

    Entities the `Deduplicator` instance reports as already seen are
    skipped. A `BioCypherRelAsNode` contributes its node, its source edge
    and its target edge, each under its own type.

    Returns a dict mapping type to the list of objects of that type.
    Raises `TypeError` for anything that is not a `BioCypherNode`,
    `BioCypherEdge` or `BioCypherRelAsNode`.
    """
    grouped = {}
    for entity in entities:
        # Pick the duplicate check by entity kind; unknown kinds are rejected.
        if isinstance(entity, BioCypherNode):
            already_seen = self.deduplicator.node_seen(entity)
        elif isinstance(entity, BioCypherEdge):
            already_seen = self.deduplicator.edge_seen(entity)
        elif isinstance(entity, BioCypherRelAsNode):
            already_seen = self.deduplicator.rel_as_node_seen(entity)
        else:
            raise TypeError(
                "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
            )

        if already_seen:
            continue

        if isinstance(entity, BioCypherRelAsNode):
            # Unpack into its three components, each filed separately.
            members = (
                entity.get_node(),
                entity.get_source_edge(),
                entity.get_target_edge(),
            )
        else:
            members = (entity,)

        for member in members:
            grouped.setdefault(member.get_type(), []).append(member)

    return grouped

add_tables(entities)

Add Pandas dataframes for each node and edge type in the input.

Source code in biocypher/output/in_memory/_pandas.py
def add_tables(self, entities):
    """
    Create or extend one Pandas dataframe per node and edge type found in
    the input.
    """
    grouped = self._separate_entity_types(entities)

    for type_name in grouped:
        self._add_entity_df(type_name, grouped[type_name])

NetworkX Knowledge Graph

Bases: _InMemoryKG

Source code in biocypher/output/in_memory/_networkx.py
class NetworkxKG(_InMemoryKG):
    """In-memory knowledge graph exposed as a ``networkx.DiGraph``.

    Nodes and edges are collected in an internal ``PandasKG``. The directed
    graph is built from its dataframes when ``get_kg`` is called and is then
    cached until further nodes or edges are added.
    """

    def __init__(self, deduplicator):
        """Create the backing ``PandasKG``; no graph is built yet."""
        super().__init__()  # keeping in spite of ABC not having __init__
        self.deduplicator = deduplicator
        self._pd = PandasKG(
            deduplicator=self.deduplicator,
        )
        # Cached graph; None means "needs to be (re)built".
        self.KG = None

    def get_kg(self):
        """Return the directed graph, building it if no valid cache exists."""
        # Compare against None explicitly: an empty DiGraph is falsy, so a
        # truthiness test cannot tell "not built" from "built but empty".
        if self.KG is None:
            self.KG = self._create_networkx_kg()
        return self.KG

    def add_nodes(self, nodes):
        """Add nodes to the backing tables and drop the cached graph.

        Returns:
            bool: always True.
        """
        self._pd.add_nodes(nodes)
        # The cached graph no longer reflects the tables; rebuild on demand.
        self.KG = None
        return True

    def add_edges(self, edges):
        """Add edges to the backing tables and drop the cached graph.

        Returns:
            bool: always True.
        """
        self._pd.add_edges(edges)
        # The cached graph no longer reflects the tables; rebuild on demand.
        self.KG = None
        return True

    def _create_networkx_kg(self) -> nx.DiGraph:
        """Build a new ``DiGraph`` from the dataframes of the backing tables.

        Dataframes with a ``node_id`` column are used as node tables, those
        with both ``source_id`` and ``target_id`` columns as edge tables. The
        remaining columns of each row become the node or edge attributes.
        The new graph is also stored in ``self.KG``.
        """
        self.KG = nx.DiGraph()
        all_dfs = self._pd.dfs
        # Match column labels exactly: a column that merely contains the text
        # (e.g. as part of a longer name) would make ``set_index`` fail.
        node_dfs = [df for df in all_dfs.values() if "node_id" in df.columns]
        edge_dfs = [df for df in all_dfs.values() if "source_id" in df.columns and "target_id" in df.columns]
        # Add all node tables before any edge table, as edges would otherwise
        # create their endpoints first.
        for df in node_dfs:
            nodes = df.set_index("node_id").to_dict(orient="index")
            self.KG.add_nodes_from(nodes.items())
        for df in edge_dfs:
            edges = df.set_index(["source_id", "target_id"]).to_dict(orient="index")
            self.KG.add_edges_from((source, target, attrs) for (source, target), attrs in edges.items())
        return self.KG