"""Implementation of EcModel class.
Extend the XbaModel to become an enzyme constraint model.
The implementation of GECKO is based on the GECKO 1.0 MATLAB package (Sánchez et al., 2017).
The implementation of ccFBA and MOMENTmr is based on the R-package sybilccFBA (Desouki, 2015)
The implementation of MOMENT is based on literature (Adadi et al., 2012)
Peter Schubert, HHU Duesseldorf, October 2023
"""
import re
import numpy as np
import pandas as pd
import f2xba.prefixes as pf
from ..utils.mapping_utils import valid_sbml_sid, load_parameter_file
MAX_CONC_PROT = 100 # mg/gDW maximum protein concentration in kpmf (kilo protein mass fraction)
[docs]
class EcModel:
"""Create an enzyme constraint model by extending a XbaModel instance.
Support is provided for generation of GECKO, MOMENT, MOMENTmr and ccFBA models.
GECKO is implemented based on the GECKO 1.0 MATLAB package (Sánchez et al., 2017),
MOMENT is described in literature (Adadi et al., 2012),
ccFBA and MOMENTmr are based on the R-package sybilccFBA (Desouki, 2015).
Example: Create a GECKO model by extending an existing genome scale metabolic model.
The spreadsheet files `xba_parameters.xlsx` and `ecm_parameters.xlsx`
contain configuration data. See tutorials.
.. code-block:: python
xba_model = XbaModel('iML1515.xml')
xba_model.configure('xba_parameters.xlsx')
ec_model = EcModel(xba_model)
ec_model.configure('ecm_parameters.xlsx')
if ec_model.validate():
ec_model.export('iML1515_GECKO.xml')
"""
def __init__(self, xba_model):
"""Instantiate the EcModel instance.
:param xba_model: a reference to a XbaModel instance
:type xba_model: :class:`XbaModel`
"""
self.model = xba_model
"""Reference to the XbaModel instance."""
self.arm_flag = False
def _get_enzyme_gene_products(self):
"""Determine used genes products for enzymes.
Note: Model can contain additional gene products for
gene product associations where enzyme kcat have not
been provided. Useful for gene deletion studies.
:return: gene product ids used by the enzymes in the model
:rtype: set(str)
"""
used_enzymes = set()
for rid, r in self.model.reactions.items():
for eid in r.enzymes:
used_enzymes.add(eid)
used_enz_gps = set()
for eid in used_enzymes:
e = self.model.enzymes[eid]
for gene_id in e.composition:
used_enz_gps.add(self.model.locus2gp[gene_id])
return used_enz_gps
def _add_gecko_protein_species(self):
"""Add protein species to the GECKO/ccFBA/MOMENTmr model.
Note: Protein compartments in XBA model are derived from
reaction compartments, which in turn is a concatenation of
species compartments being substrates to the reaction.
Here we only need one of the species compartments.
We only create proteins when gene product is used by any of the
enzymes
add protein species to the model (also ensure that protein species name is valid)
add protein drain reactions to the model
"""
used_enz_gps = self._get_enzyme_gene_products()
protein_sids = {}
for gpid in used_enz_gps:
uid = self.model.gps[gpid].uid
p = self.model.proteins[uid]
prot_sid = pf.C_prot_ + valid_sbml_sid(uid)
p.link_sid(prot_sid)
prot_metaid = f'meta_prot_{valid_sbml_sid(uid)}'
prot_annot = f'bqbiol:is, uniprot/{uid}'
protein_sids[prot_sid] = [p.name, p.cid, False, False, False, prot_metaid, prot_annot]
cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition',
'constant', 'metaid', 'miriamAnnotation']
df_add_species = pd.DataFrame(protein_sids.values(), index=list(protein_sids), columns=cols)
print(f'{len(df_add_species):4d} protein constraints to add')
self.model.add_species(df_add_species)
def _add_moment_protein_species(self):
"""Add protein species to MOMENT model.
Execute before making model irreversible.
In MOMENT promiscuous enzymes can catalyze several reactions in parallel
To implement this constraint, we add for each catalyzed (iso)reaction
a specific protein species
add protein species to the model
"""
# add dedicated protein species for each (iso)reaction
protein_sids = {}
for rid, r in self.model.reactions.items():
assert(len(r.enzymes) <= 1)
if len(r.enzymes) == 1:
enz = self.model.enzymes[r.enzymes[0]]
for locus in enz.composition:
uid = self.model.locus2uid[locus]
p = self.model.proteins[uid]
prot_sid = pf.C_prot_ + valid_sbml_sid(uid) + f'_{rid}'
p.link_sid(prot_sid)
prot_metaid = f'meta_prot_{valid_sbml_sid(uid)}_{rid}'
prot_annot = f'bqbiol:is, uniprot/{uid}'
protein_sids[prot_sid] = [p.name, p.cid, False, False, False, prot_metaid, prot_annot]
cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition',
'constant', 'metaid', 'miriamAnnotation']
df_add_species = pd.DataFrame(protein_sids.values(), index=list(protein_sids), columns=cols)
print(f'{len(df_add_species):4d} moment protein constraints to add')
self.model.add_species(df_add_species)
def _select_ccfba_isoenzyme(self):
"""Select least cost enzyme for reactions catalyzed by several isoenzymes.
Consider number of active sites when determining enzyme kcat value
Other isoenzymes get deleted and gpa updated.
Integration with Thermodynamic FBA (TFA):
TFA splits TD related reations in fwd/rev (_REV)
TFA also split reactions that in base model are not reversible
i.e. we may have irreversible '<rid>_REV' with no kcat
Selection is based on molecular weight and turnover number.
Select enzyme with minimal MW/kcat.
Different isoenzymes might be selected for forward/reverse direction.
Therefore, reversible reactions with isoenzymes get split in fwd/bwd reaction.
"""
# Note model.reactions get updated within the loop, therefore fix rids initially
rids = list(self.model.reactions)
for rid in rids:
r = self.model.reactions[rid]
if len(r.enzymes) >= 2 and r.kcatsf is not None and np.isfinite(r.kcatsf[0]):
directional_rs = [r]
if r.reversible:
# split reaction, i.e. create a new irreversible reverse reaction
rev_r = self.model.split_reversible_reaction(r)
directional_rs.append(rev_r)
for dir_r in directional_rs:
valid_kcatsf = (sum([1 for kcat in dir_r.kcatsf if np.isfinite(kcat)])
if dir_r.kcatsf is not None else 0)
# consider reactions with undefined kcatf (due to TD integration)
if len(r.enzymes) == valid_kcatsf:
idx = np.argmin([self.model.enzymes[eid].mw / (kcat * self.model.enzymes[eid].active_sites)
for eid, kcat in zip(dir_r.enzymes, dir_r.kcatsf)])
eid = dir_r.enzymes[idx]
gpa = ' and '.join(sorted([self.model.uid2gp[self.model.locus2uid[locus]]
for locus in self.model.enzymes[eid].composition]))
if ' and ' in gpa:
gpa = '(' + gpa + ')'
dir_r.enzymes = [dir_r.enzymes[idx]]
dir_r.kcatsf = [dir_r.kcatsf[idx]]
dir_r.gene_product_assoc = gpa
else:
dir_r.enzymes = []
dir_r.kcatsf = None
dir_r.kcatsr = None
def _add_total_protein_constraint(self, total_protein):
"""add total modelled protein constraint to the enzyme constraint model.
add protein pool species to the model
add protein drain reactions to the model
add protein pool exchange reaction to the model
protein concentration in model has units of mg/gDW (i.e. 1000.0 * pmf)
:param float total_protein: total modelled protein mass in g/gDW
"""
# add total protein pool species in default compartment
pool_sid = f'{pf.C_prot_pool}'
pool_name = 'total protein pool'
cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition', 'constant']
df_add_species = pd.DataFrame([[pool_name, self.model.main_cid, False, False, False]],
index=[pool_sid], columns=cols)
self.model.add_species(df_add_species)
# add total protein pool exchange reaction
protein_vars = {}
draw_rid = pf.V_PC_total
draw_name = f'total concentration modelled protein'
zero_mg_pid = self.model.get_fbc_bnd_pid(0.0, 'mg_per_gDW', 'zero_mass_conc_mg_per_gDW')
total_prot_mg_pid = self.model.get_fbc_bnd_pid(total_protein * 1000.0, 'mg_per_gDW', 'conc_protein_mg_per_gDW')
protein_vars[draw_rid] = [draw_name, f' => {pool_sid}', zero_mg_pid, total_prot_mg_pid, None, 'protein']
# add protein drain reactions - supporting MOMENT with specific protein split in protein species per reaction
max_conc_mg_pid = self.model.get_fbc_bnd_pid(MAX_CONC_PROT, 'mg_per_gDW', 'max_conc_prot_mg_per_gDW')
# create only for used proteins
used_enz_gps = self._get_enzyme_gene_products()
for gpid in used_enz_gps:
uid = self.model.gps[gpid].uid
p = self.model.proteins[uid]
draw_rid = pf.V_PC_ + valid_sbml_sid(uid)
draw_name = f'conc_prot_{uid}'
# supporting MOMENT model with protein split into protein species per reaction
products = ' + '.join([sid for sid in p.linked_sids])
# reaction_string = f'{p.mw / 1000.0} {pool_sid} => {products}'
reaction_string = f'{pool_sid} => {products}'
protein_vars[draw_rid] = [draw_name, reaction_string, zero_mg_pid, max_conc_mg_pid,
self.model.uid2gp[uid], 'protein']
cols = ['name', 'reactionString', 'fbcLowerFluxBound', 'fbcUpperFluxBound', 'fbcGeneProdAssoc', 'kind']
df_add_rids = pd.DataFrame(protein_vars.values(), index=list(protein_vars), columns=cols)
print(f'{len(df_add_rids):4d} protein variables to add')
self.model.add_reactions(df_add_rids)
def _get_modelled_protein_mass_fraction(self, fname):
"""Determine protein mass fraction based on Pax-db.org download.
Determine which part of organism protein mass fraction is modelled.
Protein abundance file needs first to be downloaded from Pax-db.org
:param str fname: file path/name of protein abundance data collected from Pax-db.org
:return: relative pmf of model based
:rtype: float
"""
# parse PacDb file
name = ''
publication = ''
last_comment_line = ''
gene_idx = None
ppm_abundance = {}
with open(fname) as file:
for line in file:
line = line.rstrip()
if re.match('#', line):
if re.match('#name: ', line):
name = line.split(': ')[1]
elif re.match('#publication_year: ', line):
publication = line.split(': ')[1]
last_comment_line = line
else:
if gene_idx is None:
cols = re.sub('^#', '', last_comment_line).split('\t')
gene_idx = cols.index('string_external_id')
ppm_idx = cols.index('abundance')
record = line.split('\t')
gene = record[gene_idx].split('.')[1]
ppm_abundance[gene] = float(record[ppm_idx])
print(f'PaxDb file {name}, year {publication}, covering {len(ppm_abundance)} proteins '
f'(total {sum(ppm_abundance.values()):.1f} ppm) loaded from {fname}')
used_enz_gps = self._get_enzyme_gene_products()
used_uids = {self.model.gps[gpid].uid for gpid in used_enz_gps}
p_rel_total = 0.0
p_rel_model = 0.0
for locus, ppm in ppm_abundance.items():
if locus in self.model.uniprot_data.locus2uid:
uid = self.model.uniprot_data.locus2uid[locus]
rel_mass = ppm / 1.0e6 * self.model.uniprot_data.proteins[uid].mass
p_rel_total += rel_mass
if uid in used_uids:
p_rel_model += rel_mass
return p_rel_model / p_rel_total
def _reaction_enzyme_coupling(self, avg_enz_sat):
"""Couple reactions with enzyme/protein requirement via kcats.
applicable to enzyme catalyzed reactions.
from reaction get enzyme and corresponding kcat
convert kcat from s-1 to h-1
scaled total enzyme kcat by number of active sites and average enzyme saturation
protein constraint for a specific protein:
reaction flux [mmol/gDWh] ≤ kcat [1/s] * 3600 [s/h] * n_AS * 1/stoic * avg_enz_sat * P_conc [mg/gDW]
* 1/MW [mmol/mg]
- stoic: number of protein copies in the enzyme
- n_AS: number of active sites
- MW: protein molecular weight in Da = g/mol = mg/mmol
Except for MOMENT we assume equality to optimize protein allocation
constraint per protein: C_prot_uid couples reaction flux R_xxx with protein concentration V_PC_uid
C_prot_uid: (-1/(kcat * 3600 * n_AS * avg_enz_sat) * stoic * MW) * R_xxx + 1 * V_PC_uid = (≤) 0
:param float avg_enz_sat: average enzyme saturation applicable to all reactions
"""
for r in self.model.reactions.values():
assert len(r.enzymes) <= 1, ('Something went wrong, at this stage of iso-reactions, we expect a maximum'
'of one enzyme catalyzing this reaction')
if len(r.enzymes) == 1 and r.kcat is not None:
e = self.model.enzymes[r.enzymes[0]]
enz_kcat_per_h = r.kcat * 3600.0 * e.active_sites * avg_enz_sat
for locus, stoic in e.composition.items():
uid = self.model.locus2uid[locus]
p = self.model.proteins[uid]
prot_sid = None
# in MOMENT model, a promiscuous enzyme/protein has specific proteins per reactions
# e.g. b2215, ompC, P06996: this protein is linked to all (reversible) reactions it catalyzes
# reactions R_23CCMPtex_iso4 and R_23CCMPtex_iso4_REV both coupled to constraint
# C_prot_P06996_R_23CAMPtex_iso4
if len(p.linked_sids) == 1:
prot_sid = list(p.linked_sids)[0]
else:
rev_rid = re.sub('_REV$', '', r.id)
for linked_sid in p.linked_sids:
if rev_rid in linked_sid:
prot_sid = linked_sid
break
r.reactants[prot_sid] = (stoic / enz_kcat_per_h) * p.mw
def _rescale_reactions(self, df_rescale):
"""Rescale reactions (e.g. Biomass) as per ecYeast7 (Sanchez, 2017)
biomass dict contains
- biomass reaction id 'rid'
- rescale tasks 'rescale' a list of dict, each with
- either rescale factor 'factor' or an absolute value 'value', float
- 'reactants' and/or 'products', consisting of list of species ids
amino acids get scaled by parameter f_p
carbohydrates get scaled by parameter f_c
GAM related species get set to gam level
:param pandas.DataFrame df_rescale: biomass rescaling parameters
"""
modify_reactants = {}
modify_products = {}
for rid, row in df_rescale.iterrows():
r = self.model.reactions[rid]
if type(row['reactants']) is str:
scale_reacs = {sid.strip() for sid in row['reactants'].split(',')}
if rid not in modify_reactants:
modify_reactants[rid] = r.reactants.copy()
new_srefs = {}
if np.isfinite(row['factor']):
factor = row['factor']
for sid, stoic in modify_reactants[rid].items():
new_srefs[sid] = factor * stoic if sid in scale_reacs else stoic
elif np.isfinite(row['value']):
value = row['value']
for sid, stoic in modify_reactants[rid].items():
new_srefs[sid] = value if sid in scale_reacs else stoic
modify_reactants[rid] = new_srefs
if type(row['products']) is str:
scale_prods = {sid.strip() for sid in row['products'].split(',')}
if rid not in modify_products:
modify_products[rid] = r.products.copy()
new_srefs = {}
if np.isfinite(row['factor']):
factor = row['factor']
for sid, stoic in modify_products[rid].items():
new_srefs[sid] = factor * stoic if sid in scale_prods else stoic
elif np.isfinite(row['value']):
value = row['value']
for sid, stoic in modify_products[rid].items():
new_srefs[sid] = value if sid in scale_prods else stoic
modify_products[rid] = new_srefs
modify_attrs = []
for rid, srefs in modify_reactants.items():
reactants = '; '.join([f'species={sid}, stoic={stoic}' for sid, stoic in srefs.items()])
modify_attrs.append([rid, 'reaction', 'reactants', reactants])
for rid, srefs in modify_products.items():
products = '; '.join([f'species={sid}, stoic={stoic}' for sid, stoic in srefs.items()])
modify_attrs.append([rid, 'reaction', 'products', products])
cols = ['id', 'component', 'attribute', 'value']
df_modify_attrs = pd.DataFrame(modify_attrs, columns=cols)
df_modify_attrs.set_index('id', inplace=True)
print(f'reaction rescaling')
self.model.modify_attributes(df_modify_attrs, 'reaction')
[docs]
def validate(self):
"""Validate compliance with SBML standards, including units configuration.
Validation is an optional task taking time. Validation could be skipped once model
configuration is stable.
Information on non-compliance is printed. Details are written to `tmp/tmp.txt`.
In case of an unsuccessful validation, it is recommended to review `tmp/tmp.txt` and
improve on the model configuration.
Example: Ensure compliance with SBML standards for an EcModel instance prior to its export to file.
.. code-block:: python
if ec_model.validate():
ec_model.export('iML1515_GECKO.xml')
:return: success status
:rtype: bool
"""
return self.model.validate()
[docs]
def export(self, fname):
"""Export EcModel to SBML encoded file (.xml) or spreadsheet (.xlsx).
The spreadsheet (.xlsx) is helpful to inspect model configuration.
Example: Export EcModel instance to SBML encoded file and to spreadsheet format.
.. code-block:: python
if ec_model.validate():
ec_model.export('iML1515_GECKO.xml')
ec_model.export('iML1515_GECKO.xlsx')
:param str fname: filename with extension '.xml' or '.xlsx'
:return: success status
:rtype: bool
"""
return self.model.export(fname)