Source code for f2xba.ec_model.ec_model

"""Implementation of EcModel class.

Extend the XbaModel to become an enzyme constraint model.
The implementation of GECKO is based on the GECKO 1.0 MATLAB package (Sánchez et al., 2017).
The implementation of ccFBA and MOMENTmr is based on the R-package sybilccFBA (Desouki, 2015)
The implementation of MOMENT is based on literature (Adadi et al., 2012)

Peter Schubert, HHU Duesseldorf, October 2023
"""

import re
import numpy as np
import pandas as pd
import f2xba.prefixes as pf
from ..utils.mapping_utils import valid_sbml_sid, load_parameter_file

MAX_CONC_PROT = 100  # mg/gDW maximum protein concentration in kpmf (kilo protein mass fraction)


[docs] class EcModel: """Create an enzyme constraint model by extending a XbaModel instance. Support is provided for generation of GECKO, MOMENT, MOMENTmr and ccFBA models. GECKO is implemented based on the GECKO 1.0 MATLAB package (Sánchez et al., 2017), MOMENT is described in literature (Adadi et al., 2012), ccFBA and MOMENTmr are based on the R-package sybilccFBA (Desouki, 2015). Example: Create a GECKO model by extending an existing genome scale metabolic model. The spreadsheet files `xba_parameters.xlsx` and `ecm_parameters.xlsx` contain configuration data. See tutorials. .. code-block:: python xba_model = XbaModel('iML1515.xml') xba_model.configure('xba_parameters.xlsx') ec_model = EcModel(xba_model) ec_model.configure('ecm_parameters.xlsx') if ec_model.validate(): ec_model.export('iML1515_GECKO.xml') """ def __init__(self, xba_model): """Instantiate the EcModel instance. :param xba_model: a reference to a XbaModel instance :type xba_model: :class:`XbaModel` """ self.model = xba_model """Reference to the XbaModel instance.""" self.arm_flag = False
[docs] def configure(self, fname): """Configure the EcModel instance with information provided in the ECM configuration file. The ECM configuration spreadsheet should contain the table 'general'. See tutorials. Example: Create a GECKO model by extending an existing genome scale metabolic model. The spreadsheet files `xba_parameters.xlsx` and `ecm_parameters.xlsx` contain the configuration data. .. code-block:: python xba_model = XbaModel('iML1515.xml') xba_model.configure('xba_parameters.xlsx') ec_model = EcModel(xba_model) ec_model.configure('ecm_parameters.xlsx') :param str fname: filename of ECM configuration file (.xlsx) :return: success status :rtype: bool """ sheet_names = ['general', 'rescale_reactions'] ecm_params = load_parameter_file(fname, sheet_names) if 'general' not in ecm_params.keys(): print(f'mandatory table "general" not found in the document') raise ValueError general_params = ecm_params['general']['value'].to_dict() ecm_type = general_params.get('ecm_type', 'GECKO') p_total = general_params.get('p_total', 0.5) avg_enz_sat = general_params.get('avg_enz_sat', 0.5) self.arm_flag = general_params.get('arm_flag', False) if 'pm2totpm' in general_params: pm2totpm = general_params['pm2totpm'] else: assert 'pm2totpm_val_or_paxdb' in general_params, 'parameter "pm2totpm_val_or_paxdb" is required' if type(general_params['pm2totpm_val_or_paxdb']) is float: pm2totpm = general_params['pm2totpm_val_or_paxdb'] else: pm2totpm = self._get_modelled_protein_mass_fraction(general_params['pm2totpm_val_or_paxdb']) print(f'modelled protein fraction of total protein mass {pm2totpm:.4f} g/g') if 'rescale_reactions' in ecm_params: self._rescale_reactions(ecm_params['rescale_reactions']) # create enzyme constraint model if ecm_type in ['GECKO', 'MOMENTmr']: self.model.add_isoenzyme_reactions() self.model.make_irreversible() self._add_gecko_protein_species() elif ecm_type == 'MOMENT': self.model.add_isoenzyme_reactions() self._add_moment_protein_species() self.model.make_irreversible() elif ecm_type == 'ccFBA': self._select_ccfba_isoenzyme() self.model.make_irreversible() self.model.remove_unused_gps() self._add_gecko_protein_species() else: print(f'model not constructed, invalid ecm_type {ecm_type}') return False if self.arm_flag is True: self.model.add_arm_reactions() protein_pool = p_total * pm2totpm print(f'protein constraint: {protein_pool*1000.0:.2f} mg/gDW - modelled protein') self._add_total_protein_constraint(protein_pool) self._reaction_enzyme_coupling(avg_enz_sat) # remove model components that are not used in the model self.model.clean() # add some parameter values for reference (after model.clean() so they are not removed) add_params = {'gmodeledP_per_gP': {'value': pm2totpm, 'name': 'protein mass fraction modelled'}, 'gP_per_gDW': {'value': p_total, 'name': 'protein mass fraction of total dry mass'}, 'avg_enz_sat': {'value': avg_enz_sat, 'name': 'average enzyme saturation level'}} for pid, p_data in add_params.items(): self.model.add_parameter(pid, p_data) # modify some model attributes and create L3V2 SBML model self.model.model_attrs['id'] += f'_{ecm_type}' if 'name' in self.model.model_attrs: self.model.model_attrs['name'] = f'{ecm_type} model of ' + self.model.model_attrs['name'] self.model.sbml_container['level'] = 3 self.model.sbml_container['version'] = 2 self.model.print_size() return True
def _get_enzyme_gene_products(self): """Determine used genes products for enzymes. Note: Model can contain additional gene products for gene product associations where enzyme kcat have not been provided. Useful for gene deletion studies. :return: gene product ids used by the enzymes in the model :rtype: set(str) """ used_enzymes = set() for rid, r in self.model.reactions.items(): for eid in r.enzymes: used_enzymes.add(eid) used_enz_gps = set() for eid in used_enzymes: e = self.model.enzymes[eid] for gene_id in e.composition: used_enz_gps.add(self.model.locus2gp[gene_id]) return used_enz_gps def _add_gecko_protein_species(self): """Add protein species to the GECKO/ccFBA/MOMENTmr model. Note: Protein compartments in XBA model are derived from reaction compartments, which in turn is a concatenation of species compartments being substrates to the reaction. Here we only need one of the species compartments. We only create proteins when gene product is used by any of the enzymes add protein species to the model (also ensure that protein species name is valid) add protein drain reactions to the model """ used_enz_gps = self._get_enzyme_gene_products() protein_sids = {} for gpid in used_enz_gps: uid = self.model.gps[gpid].uid p = self.model.proteins[uid] prot_sid = pf.C_prot_ + valid_sbml_sid(uid) p.link_sid(prot_sid) prot_metaid = f'meta_prot_{valid_sbml_sid(uid)}' prot_annot = f'bqbiol:is, uniprot/{uid}' protein_sids[prot_sid] = [p.name, p.cid, False, False, False, prot_metaid, prot_annot] cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition', 'constant', 'metaid', 'miriamAnnotation'] df_add_species = pd.DataFrame(protein_sids.values(), index=list(protein_sids), columns=cols) print(f'{len(df_add_species):4d} protein constraints to add') self.model.add_species(df_add_species) def _add_moment_protein_species(self): """Add protein species to MOMENT model. Execute before making model irreversible. In MOMENT promiscuous enzymes can catalyze several reactions in parallel To implement this constraint, we add for each catalyzed (iso)reaction a specific protein species add protein species to the model """ # add dedicated protein species for each (iso)reaction protein_sids = {} for rid, r in self.model.reactions.items(): assert(len(r.enzymes) <= 1) if len(r.enzymes) == 1: enz = self.model.enzymes[r.enzymes[0]] for locus in enz.composition: uid = self.model.locus2uid[locus] p = self.model.proteins[uid] prot_sid = pf.C_prot_ + valid_sbml_sid(uid) + f'_{rid}' p.link_sid(prot_sid) prot_metaid = f'meta_prot_{valid_sbml_sid(uid)}_{rid}' prot_annot = f'bqbiol:is, uniprot/{uid}' protein_sids[prot_sid] = [p.name, p.cid, False, False, False, prot_metaid, prot_annot] cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition', 'constant', 'metaid', 'miriamAnnotation'] df_add_species = pd.DataFrame(protein_sids.values(), index=list(protein_sids), columns=cols) print(f'{len(df_add_species):4d} moment protein constraints to add') self.model.add_species(df_add_species) def _select_ccfba_isoenzyme(self): """Select least cost enzyme for reactions catalyzed by several isoenzymes. Consider number of active sites when determining enzyme kcat value Other isoenzymes get deleted and gpa updated. Integration with Thermodynamic FBA (TFA): TFA splits TD related reations in fwd/rev (_REV) TFA also split reactions that in base model are not reversible i.e. we may have irreversible '<rid>_REV' with no kcat Selection is based on molecular weight and turnover number. Select enzyme with minimal MW/kcat. Different isoenzymes might be selected for forward/reverse direction. Therefore, reversible reactions with isoenzymes get split in fwd/bwd reaction. """ # Note model.reactions get updated within the loop, therefore fix rids initially rids = list(self.model.reactions) for rid in rids: r = self.model.reactions[rid] if len(r.enzymes) >= 2 and r.kcatsf is not None and np.isfinite(r.kcatsf[0]): directional_rs = [r] if r.reversible: # split reaction, i.e. create a new irreversible reverse reaction rev_r = self.model.split_reversible_reaction(r) directional_rs.append(rev_r) for dir_r in directional_rs: valid_kcatsf = (sum([1 for kcat in dir_r.kcatsf if np.isfinite(kcat)]) if dir_r.kcatsf is not None else 0) # consider reactions with undefined kcatf (due to TD integration) if len(r.enzymes) == valid_kcatsf: idx = np.argmin([self.model.enzymes[eid].mw / (kcat * self.model.enzymes[eid].active_sites) for eid, kcat in zip(dir_r.enzymes, dir_r.kcatsf)]) eid = dir_r.enzymes[idx] gpa = ' and '.join(sorted([self.model.uid2gp[self.model.locus2uid[locus]] for locus in self.model.enzymes[eid].composition])) if ' and ' in gpa: gpa = '(' + gpa + ')' dir_r.enzymes = [dir_r.enzymes[idx]] dir_r.kcatsf = [dir_r.kcatsf[idx]] dir_r.gene_product_assoc = gpa else: dir_r.enzymes = [] dir_r.kcatsf = None dir_r.kcatsr = None def _add_total_protein_constraint(self, total_protein): """add total modelled protein constraint to the enzyme constraint model. add protein pool species to the model add protein drain reactions to the model add protein pool exchange reaction to the model protein concentration in model has units of mg/gDW (i.e. 1000.0 * pmf) :param float total_protein: total modelled protein mass in g/gDW """ # add total protein pool species in default compartment pool_sid = f'{pf.C_prot_pool}' pool_name = 'total protein pool' cols = ['name', 'compartment', 'hasOnlySubstanceUnits', 'boundaryCondition', 'constant'] df_add_species = pd.DataFrame([[pool_name, self.model.main_cid, False, False, False]], index=[pool_sid], columns=cols) self.model.add_species(df_add_species) # add total protein pool exchange reaction protein_vars = {} draw_rid = pf.V_PC_total draw_name = f'total concentration modelled protein' zero_mg_pid = self.model.get_fbc_bnd_pid(0.0, 'mg_per_gDW', 'zero_mass_conc_mg_per_gDW') total_prot_mg_pid = self.model.get_fbc_bnd_pid(total_protein * 1000.0, 'mg_per_gDW', 'conc_protein_mg_per_gDW') protein_vars[draw_rid] = [draw_name, f' => {pool_sid}', zero_mg_pid, total_prot_mg_pid, None, 'protein'] # add protein drain reactions - supporting MOMENT with specific protein split in protein species per reaction max_conc_mg_pid = self.model.get_fbc_bnd_pid(MAX_CONC_PROT, 'mg_per_gDW', 'max_conc_prot_mg_per_gDW') # create only for used proteins used_enz_gps = self._get_enzyme_gene_products() for gpid in used_enz_gps: uid = self.model.gps[gpid].uid p = self.model.proteins[uid] draw_rid = pf.V_PC_ + valid_sbml_sid(uid) draw_name = f'conc_prot_{uid}' # supporting MOMENT model with protein split into protein species per reaction products = ' + '.join([sid for sid in p.linked_sids]) # reaction_string = f'{p.mw / 1000.0} {pool_sid} => {products}' reaction_string = f'{pool_sid} => {products}' protein_vars[draw_rid] = [draw_name, reaction_string, zero_mg_pid, max_conc_mg_pid, self.model.uid2gp[uid], 'protein'] cols = ['name', 'reactionString', 'fbcLowerFluxBound', 'fbcUpperFluxBound', 'fbcGeneProdAssoc', 'kind'] df_add_rids = pd.DataFrame(protein_vars.values(), index=list(protein_vars), columns=cols) print(f'{len(df_add_rids):4d} protein variables to add') self.model.add_reactions(df_add_rids) def _get_modelled_protein_mass_fraction(self, fname): """Determine protein mass fraction based on Pax-db.org download. Determine which part of organism protein mass fraction is modelled. Protein abundance file needs first to be downloaded from Pax-db.org :param str fname: file path/name of protein abundance data collected from Pax-db.org :return: relative pmf of model based :rtype: float """ # parse PacDb file name = '' publication = '' last_comment_line = '' gene_idx = None ppm_abundance = {} with open(fname) as file: for line in file: line = line.rstrip() if re.match('#', line): if re.match('#name: ', line): name = line.split(': ')[1] elif re.match('#publication_year: ', line): publication = line.split(': ')[1] last_comment_line = line else: if gene_idx is None: cols = re.sub('^#', '', last_comment_line).split('\t') gene_idx = cols.index('string_external_id') ppm_idx = cols.index('abundance') record = line.split('\t') gene = record[gene_idx].split('.')[1] ppm_abundance[gene] = float(record[ppm_idx]) print(f'PaxDb file {name}, year {publication}, covering {len(ppm_abundance)} proteins ' f'(total {sum(ppm_abundance.values()):.1f} ppm) loaded from {fname}') used_enz_gps = self._get_enzyme_gene_products() used_uids = {self.model.gps[gpid].uid for gpid in used_enz_gps} p_rel_total = 0.0 p_rel_model = 0.0 for locus, ppm in ppm_abundance.items(): if locus in self.model.uniprot_data.locus2uid: uid = self.model.uniprot_data.locus2uid[locus] rel_mass = ppm / 1.0e6 * self.model.uniprot_data.proteins[uid].mass p_rel_total += rel_mass if uid in used_uids: p_rel_model += rel_mass return p_rel_model / p_rel_total def _reaction_enzyme_coupling(self, avg_enz_sat): """Couple reactions with enzyme/protein requirement via kcats. applicable to enzyme catalyzed reactions. from reaction get enzyme and corresponding kcat convert kcat from s-1 to h-1 scaled total enzyme kcat by number of active sites and average enzyme saturation protein constraint for a specific protein: reaction flux [mmol/gDWh] ≤ kcat [1/s] * 3600 [s/h] * n_AS * 1/stoic * avg_enz_sat * P_conc [mg/gDW] * 1/MW [mmol/mg] - stoic: number of protein copies in the enzyme - n_AS: number of active sites - MW: protein molecular weight in Da = g/mol = mg/mmol Except for MOMENT we assume equality to optimize protein allocation constraint per protein: C_prot_uid couples reaction flux R_xxx with protein concentration V_PC_uid C_prot_uid: (-1/(kcat * 3600 * n_AS * avg_enz_sat) * stoic * MW) * R_xxx + 1 * V_PC_uid = (≤) 0 :param float avg_enz_sat: average enzyme saturation applicable to all reactions """ for r in self.model.reactions.values(): assert len(r.enzymes) <= 1, ('Something went wrong, at this stage of iso-reactions, we expect a maximum' 'of one enzyme catalyzing this reaction') if len(r.enzymes) == 1 and r.kcat is not None: e = self.model.enzymes[r.enzymes[0]] enz_kcat_per_h = r.kcat * 3600.0 * e.active_sites * avg_enz_sat for locus, stoic in e.composition.items(): uid = self.model.locus2uid[locus] p = self.model.proteins[uid] prot_sid = None # in MOMENT model, a promiscuous enzyme/protein has specific proteins per reactions # e.g. b2215, ompC, P06996: this protein is linked to all (reversible) reactions it catalyzes # reactions R_23CCMPtex_iso4 and R_23CCMPtex_iso4_REV both coupled to constraint # C_prot_P06996_R_23CAMPtex_iso4 if len(p.linked_sids) == 1: prot_sid = list(p.linked_sids)[0] else: rev_rid = re.sub('_REV$', '', r.id) for linked_sid in p.linked_sids: if rev_rid in linked_sid: prot_sid = linked_sid break r.reactants[prot_sid] = (stoic / enz_kcat_per_h) * p.mw def _rescale_reactions(self, df_rescale): """Rescale reactions (e.g. Biomass) as per ecYeast7 (Sanchez, 2017) biomass dict contains - biomass reaction id 'rid' - rescale tasks 'rescale' a list of dict, each with - either rescale factor 'factor' or an absolute value 'value', float - 'reactants' and/or 'products', consisting of list of species ids amino acids get scaled by parameter f_p carbohydrates get scaled by parameter f_c GAM related species get set to gam level :param pandas.DataFrame df_rescale: biomass rescaling parameters """ modify_reactants = {} modify_products = {} for rid, row in df_rescale.iterrows(): r = self.model.reactions[rid] if type(row['reactants']) is str: scale_reacs = {sid.strip() for sid in row['reactants'].split(',')} if rid not in modify_reactants: modify_reactants[rid] = r.reactants.copy() new_srefs = {} if np.isfinite(row['factor']): factor = row['factor'] for sid, stoic in modify_reactants[rid].items(): new_srefs[sid] = factor * stoic if sid in scale_reacs else stoic elif np.isfinite(row['value']): value = row['value'] for sid, stoic in modify_reactants[rid].items(): new_srefs[sid] = value if sid in scale_reacs else stoic modify_reactants[rid] = new_srefs if type(row['products']) is str: scale_prods = {sid.strip() for sid in row['products'].split(',')} if rid not in modify_products: modify_products[rid] = r.products.copy() new_srefs = {} if np.isfinite(row['factor']): factor = row['factor'] for sid, stoic in modify_products[rid].items(): new_srefs[sid] = factor * stoic if sid in scale_prods else stoic elif np.isfinite(row['value']): value = row['value'] for sid, stoic in modify_products[rid].items(): new_srefs[sid] = value if sid in scale_prods else stoic modify_products[rid] = new_srefs modify_attrs = [] for rid, srefs in modify_reactants.items(): reactants = '; '.join([f'species={sid}, stoic={stoic}' for sid, stoic in srefs.items()]) modify_attrs.append([rid, 'reaction', 'reactants', reactants]) for rid, srefs in modify_products.items(): products = '; '.join([f'species={sid}, stoic={stoic}' for sid, stoic in srefs.items()]) modify_attrs.append([rid, 'reaction', 'products', products]) cols = ['id', 'component', 'attribute', 'value'] df_modify_attrs = pd.DataFrame(modify_attrs, columns=cols) df_modify_attrs.set_index('id', inplace=True) print(f'reaction rescaling') self.model.modify_attributes(df_modify_attrs, 'reaction')
[docs] def validate(self): """Validate compliance with SBML standards, including units configuration. Validation is an optional task taking time. Validation could be skipped once model configuration is stable. Information on non-compliance is printed. Details are written to `tmp/tmp.txt`. In case of an unsuccessful validation, it is recommended to review `tmp/tmp.txt` and improve on the model configuration. Example: Ensure compliance with SBML standards for an EcModel instance prior to its export to file. .. code-block:: python if ec_model.validate(): ec_model.export('iML1515_GECKO.xml') :return: success status :rtype: bool """ return self.model.validate()
[docs] def export(self, fname): """Export EcModel to SBML encoded file (.xml) or spreadsheet (.xlsx). The spreadsheet (.xlsx) is helpful to inspect model configuration. Example: Export EcModel instance to SBML encoded file and to spreadsheet format. .. code-block:: python if ec_model.validate(): ec_model.export('iML1515_GECKO.xml') ec_model.export('iML1515_GECKO.xlsx') :param str fname: filename with extension '.xml' or '.xlsx' :return: success status :rtype: bool """ return self.model.export(fname)