Skip to content

Commit

Permalink
Merge pull request #34 from y1zhou/feat/route-search
Browse files Browse the repository at this point in the history
  • Loading branch information
y1zhou authored Oct 19, 2022
2 parents f45ab46 + 1688df6 commit f5e138e
Show file tree
Hide file tree
Showing 15 changed files with 1,297 additions and 514 deletions.
116 changes: 116 additions & 0 deletions metabolike/algo/compounds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from difflib import get_close_matches

import pandas as pd

from metabolike.db import Neo4jClient


class CompoundMap:
def __init__(self, db: Neo4jClient):
self.db = db
self.id_table, self.cpds = self.get_all_compounds()

def get_all_compounds(self):
"""
See https://biocyc.org/PGDBConceptsGuide.shtml#TAG:__tex2page_toc_TAG:__tex2page_sec_4.4
for detailed descriptions.
Returns:
A tuple of two ``pandas.DataFrame``.
One with columns ``metaId``, ``compartment``, and ``biocyc``;
One with columns ``biocyc``, ``key_id`` and ``value``.
"""
cpds = self.db.read(
"""
MATCH (c:Compound)
OPTIONAL MATCH (c)-[:hasRDF {bioQualifier: 'is'}]->(rdf:RDF)
OPTIONAL MATCH (c)-[:hasCompartment]->(cpt:Compartment)
RETURN c{.metaId, .name, .commonName, .synonyms, .smiles},
rdf{.biocyc, .chebi, .keggCompound, .inchikey,
.chemspider, .pubchemCompound, .hmdb, .cas,
.metabolights, .lipidmaps, .drugbank, .knapsack,
.umbbdCompound, .keggGlycan
},
cpt.metaId AS compartmentId, cpt.commonName AS compartment;
"""
) # some don't have the RDF/Compartment fields
res = [None] * len(cpds)
for i, cpd in enumerate(cpds):
entry = cpd["c"]
if cpd["rdf"] is not None:
entry = {**entry, **cpd["rdf"]}

# Prefer compartment commonName, fallback to metaId
# CCO: Cell Component Ontology; CCO-CYTOSOL is the default location
if cpd["compartmentId"] is not None:
entry["compartment"] = cpd["compartmentId"]
if cpd["compartment"] is not None:
entry["compartment"] = cpd["compartment"]

res[i] = entry

# Convert wide to long table for easier queries
res = pd.DataFrame(res)
res = res[~res["biocyc"].isna()] # POLYMER-xxx, not important
metaid_to_biocyc = res.filter(["metaId", "biocyc", "compartment"])
res_long = (
res.drop(columns=["metaId", "compartment"])
.melt(id_vars=["biocyc"], var_name="key_id")
.query("value.notna()")
.explode("value")
.drop_duplicates()
.assign(value_lowercase=lambda r: r["value"].str.lower())
)

return metaid_to_biocyc, res_long

def search_compound_biocyc_id(self, query: str, **kwargs):
"""Returns the compound biocyc ID if it exists."""
res = {"query": query, "is_fuzzy_match": False}

# Direct return if query is a biocyc ID
if query in self.id_table["biocyc"]:
res["hits"] = [query]
return res

# Try exact matches first
exact_hits = self.compound_exact_match(query)
if exact_hits:
res["hits"] = exact_hits
return res

# Try fuzzy search if there's no exact biocyc ID matches
value_matches = get_close_matches(
query.lower(), self.cpds["value_lowercase"], **kwargs
)
if value_matches:
hits = [x for v in value_matches for x in self.compound_exact_match(v)]
res["hits"] = list(set(hits))
res["is_fuzzy_match"] = True
return res

res["hits"] = []
return res

def compound_exact_match(self, query: str):
"""Finds biocyc IDs corresponding to the query."""
res = self.cpds.query("value_lowercase == @query")

# Return biocyc ID directly if there's an exact match
if res.shape[0] != 0:
return list(set(res["biocyc"]))

def search_compound_compartment(self, query: str):
"""Given a biocyc ID, return compartments it is in."""
res = self.id_table.loc[self.id_table.biocyc == query, "compartment"]
return list(set(res.values))

def search_compound_metaid_in_compartment(
self, query: str, compartment: str = "cytosol"
):
res = (
self.id_table.query("biocyc == @query")
.query("compartment == @compartment")
.metaId
)
return res.tolist()[0]
195 changes: 194 additions & 1 deletion metabolike/algo/omics.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from typing import Set
import logging
from typing import Dict, Iterable, Sequence, Set

import pandas as pd

from metabolike.db import Neo4jClient

logger = logging.getLogger(__name__)


def get_all_gene_products(db: Neo4jClient) -> Set[str]:
ec = db.read(
Expand All @@ -13,3 +18,191 @@ def get_all_gene_products(db: Neo4jClient) -> Set[str]:
"""
)
return {n["ec"] for n in ec}


def get_table_of_gene_products(db: Neo4jClient, rdf_fields: Dict[str, str] = None):
"""
Retrieves all reaction-associated gene products.
Args:
db: A Neo4j client connected to the graph database.
rdf_fields: Properties of the RDF nodes, and the desired output name.
For example, {"ncbigene": "entrez"} would extract the ``ncbigene``
property from the RDF nodes and output it in the ``entrez`` column.
Returns:
A list of entries with the reaction ID, the metaId of the gene, the
gene symbol, and other fields from RDF nodes whenever available.
"""
query = """
MATCH (r:Reaction)-[:hasGeneProduct|hasMember|hasComponent*]->(gp:GeneProduct)
WHERE NOT gp:GeneProductSet OR gp:GeneProductComplex
WITH r.metaId AS rxn_id, gp
OPTIONAL MATCH (gp)-[:hasRDF {bioQualifier: 'isEncodedBy'}]->(rdf:RDF)
RETURN rxn_id, gp.metaId AS gene_id, gp.name AS gene
"""
if rdf_fields:
for field, col in rdf_fields.items():
query += f", rdf.{field} AS {col}"
return db.read(query)


class ReactionGeneMap:
def __init__(
self,
database_connection: Neo4jClient,
gene_ids: Sequence[str],
expression_levels: Sequence[float],
gene_groups: pd.DataFrame | None = None,
gene_id_map: pd.DataFrame | None = None,
gene_set_reduce_func: callable = max,
gene_complex_reduce_func: callable = min,
):
"""
Attributes:
database_connection: Connection to the Neo4j database.
rxn_exp: expression levels of each reaction.
gene_ids: A list-like of strings contains gene identifiers.
expression_levels: gene expression data.
gene_groups: parent-child mapping of gene product sets and complexes.
gene_id_map: mapping between gene metaId and gene name.
"""

self.db = database_connection
self.gene_exp = {
gene_id: exp_level
for gene_id, exp_level in zip(gene_ids, expression_levels)
}
self.rxn_exp: dict[str, float] = {}

if gene_groups is None:
self.gene_groups = self._get_gene_groups()
else:
self.gene_groups = gene_groups

if gene_id_map is None:
self.gene_id_map = self._get_gene_id_map()
else:
self.gene_id_map = gene_id_map

self._valid_gene_groups = set(self.gene_groups["group_id"].unique())
self._set_func = gene_set_reduce_func
self._complex_func = gene_complex_reduce_func

# Setup reaction expression levels
rxn_genes = self._get_top_level_rxn_gene_mapping()
self.reaction_gp_mapping = {}

for rxn, gene in rxn_genes.items():
# GeneProductSet or GeneProductComplex
if gene in self._valid_gene_groups:
self.reaction_gp_mapping[rxn] = self.get_all_genes_in_group(gene)
self.rxn_exp[rxn] = self.get_gene_expression(gene)
elif gene_name := self.gene_id_map.get(gene): # not None or ""
self.reaction_gp_mapping[rxn] = {gene_name}
self.rxn_exp[rxn] = self.get_gene_expression(gene_name)

def get_all_genes_in_group(self, gene: str):
if gene in self.gene_exp:
return {gene}
elif gene in self._valid_gene_groups:
res = set()
genes = self.gene_groups.loc[self.gene_groups["group_id"] == gene, :]
gene_ids = genes["members"].values
for g in gene_ids:
if gene_name := self.gene_id_map.get(g):
if maybe_gene_name := self.get_all_genes_in_group(gene_name):
res.union(maybe_gene_name)

return res

def get_gene_expression(self, gene_name: str):
if (gene_exp := self.gene_exp.get(gene_name)) is not None:
return gene_exp
elif gene_name in self._valid_gene_groups:
return self.calc_reaction_gene_group_expression(gene_name)
# else:
# # raise ValueError(f"Unknown gene product {gene_name}")
# logger.warning(f"Unknown gene product {gene_name}")

def calc_reaction_gene_group_expression(self, gene_group_id: str):
"""
Calculate the expression level of a given gene product set or
complex node.
"""
# TODO: make use of self.get_all_genes_in_group()
genes = self.gene_groups.loc[self.gene_groups["group_id"] == gene_group_id, :]
gene_ids = genes["members"].values
group_type = genes["group_type"].values[0]

res = [
self.get_gene_expression(gene_name)
for gene_id in gene_ids
if (gene_name := self.gene_id_map.get(gene_id))
]
res = [x for x in res if x is not None]
if not res:
return 0.0

if group_type == "Set":
return self._set_func(res)
elif group_type == "Complex":
return self._complex_func(res)
else:
raise ValueError(f"Unknown gene group type: {group_type}")

def get_route_expression(self, rxn_ids: Iterable[str]) -> float:
"""
Sums up total expression levels of the reaction route, and divide
the value by the number of reactions.
Args:
rxn_ids: The metaId fields of the reactions.
Returns:
The averaged expression level.
"""
res = 0.0
for rxn_id in rxn_ids:
if (gene_exp := self.rxn_exp.get(rxn_id)) is not None:
res += gene_exp

return res / len(rxn_ids)

def _get_top_level_rxn_gene_mapping(self):
"""
Get unique reaction -> gene product mappings. The "gene" column could
be GeneProduct, GeneProductSet, or GeneProductComplex nodes.
"""
rxn2gene = self.db.read(
"""
MATCH (r:Reaction)-[:hasGeneProduct]->(g)
WHERE g:GeneProduct OR g:GeneProductSet OR g:GeneProductComplex
RETURN r.metaId AS rxn, g.metaId AS gene_id;
"""
)
res = {n["rxn"]: n["gene_id"] for n in rxn2gene}
return res

def _get_gene_groups(self):
group2child = self.db.read(
"""
MATCH (gps:GeneProductSet)-[:hasMember]->(gp)
RETURN gps.metaId AS group_id, "Set" AS group_type, COLLECT(gp.metaId) AS members
UNION ALL
MATCH (gpc:GeneProductComplex)-[:hasComponent]->(gp)
RETURN gpc.metaId AS group_id, "Complex" AS group_type, COLLECT(gp.metaId) AS members
"""
)
res = pd.DataFrame([x for x in group2child])
return res.explode("members").reset_index(drop=True)

def _get_gene_id_map(self):
genes = self.db.read(
"""
MATCH (gp:GeneProduct)
WHERE NOT (gp:GeneProductSet OR gp:GeneProductComplex)
RETURN gp.metaId AS gene_id, gp.name AS gene_name;
"""
)
return {n["gene_id"]: n["gene_name"] for n in genes}
Loading

0 comments on commit f5e138e

Please sign in to comment.