Source code for f2xba.xba_model.xba_model

"""Implementation of XbaModel class.

XbaModel serves as an in-memory representation of the model during the extension process.
A genome-scale metabolic model is initially loaded from a SBML file, and the in-memory
model is prepared for the extension using the ``xba_model.configure()`` function.


Peter Schubert, HHU Duesseldorf, May 2022
"""

import os
import time
import re
import numpy as np
import pandas as pd
import json
from collections import defaultdict
import sbmlxdf

from .sbml_unit_def import SbmlUnitDef
from .sbml_compartment import SbmlCompartment
from .sbml_parameter import SbmlParameter
from .sbml_species import SbmlSpecies
from .sbml_group import SbmlGroup
from .sbml_function_def import SbmlFunctionDef
from .sbml_init_assign import SbmlInitialAssignment
from .fbc_objective import FbcObjective
from .fbc_gene_product import FbcGeneProduct
from .sbml_reaction import SbmlReaction
from .protein import Protein
from .enzyme import Enzyme
from ..ncbi.ncbi_data import NcbiData
from ..uniprot.uniprot_data import UniprotData
from ..utils.mapping_utils import get_srefs, parse_reaction_string, load_parameter_file
from ..utils.calc_mw import calc_mw_from_formula
from ..biocyc.biocyc_data import BiocycData
import f2xba.prefixes as pf


FBC_BOUND_TOL = '.10e'
DEFAULT_METABOLIC_KCAT = 12.5
DEFAULT_TRANSPORTER_KCAT = 50.0

SBO_GENE = 'SBO:0000243'


[docs]
class XbaModel:
    """Create an in-memory representation of a SBML encoded genome-scale metabolic model loaded from file.

    All extended model creations start with XbaModel. The XbaModel needs to be configured.
    Configuration data, if provided, is loaded from a XBA configuration file (.xlsx).
    The configuration data may include references to other files or online database resources.

    Example: Generate an updated version of a SBML encoded metabolic model. The spreadsheet `xba_parameters.xlsx`
    contains relevant configuration data.

    .. code-block:: python

        xba_model = XbaModel('iML1515.xml')
        xba_model.configure('xba_parameters.xlsx')
        if xba_model.validate():
            xba_model.export('iML1515_updated.xml')

    """


    def __init__(self, sbml_file):
        """Instantiate the XbaModel instance.

        :param str sbml_file: filename of SBML model (FBA model)
        """
        # Load SBML model and extract model data
        if not os.path.exists(sbml_file):
            print(f'{sbml_file} does not exist')
            raise FileNotFoundError
        print(f'loading: {sbml_file} (last modified: {time.ctime(os.path.getmtime(sbml_file))})')
        sbml_model = sbmlxdf.Model(sbml_file)
        if sbml_model.isModel is False:
            print(f'{sbml_file} seems not to be a valid SBML model')
            return

        model_dict = sbml_model.to_df()
        self.sbml_container = model_dict['sbml']
        """SBML container data."""

        self.model_attrs = model_dict['modelAttrs']
        """SBML model attributes."""

        self.unit_defs = {udid: SbmlUnitDef(row)
                          for udid, row in model_dict['unitDefs'].iterrows()}
        """SBML units definition data."""

        self.compartments = {cid: SbmlCompartment(row)
                             for cid, row in model_dict['compartments'].iterrows()}
        """SBML compartments configuration data."""

        self.parameters = {pid: SbmlParameter(row)
                           for pid, row in model_dict['parameters'].iterrows()}
        """SBML parameters configuration data."""

        self.species = {sid: SbmlSpecies(row)
                        for sid, row in model_dict['species'].iterrows()}
        """SBML species configuration data."""

        self.reactions = {rid: SbmlReaction(row, self.species)
                          for rid, row in model_dict['reactions'].iterrows()}
        """SBML reactions configuration data."""

        self.objectives = {oid: FbcObjective(row)
                           for oid, row in model_dict['fbcObjectives'].iterrows()}
        """SBML FBC optimization objective data."""

        self.gps = {gp_id: FbcGeneProduct(row)
                    for gp_id, row in model_dict['fbcGeneProducts'].iterrows()}
        """SBML FBC gene products configuration data."""

        self.groups = {gid: SbmlGroup(row)
                       for gid, row in model_dict['groups'].iterrows()} if 'groups' in model_dict else None
        """SBML GROUPS groups configuration data."""

        self.func_defs = {fd_id: SbmlFunctionDef(row)
                          for fd_id, row in model_dict['funcDefs'].iterrows()} if 'funcDefs' in model_dict else None
        """SBML functions definition data."""

        self.init_assigns = ({symbol_id: SbmlInitialAssignment(row)
                             for symbol_id, row in model_dict['initAssign'].iterrows()}
                             if 'initAssign' in model_dict else None)
        """SBML initial assignments configuration data."""

        # add unit definitions to ensure we can validate SBML wrt units without issuing warnings
        if 'substanceUnits' not in self.model_attrs:
            self.model_attrs['substanceUnits'] = 'mmol_per_gDW'
            u_dict = {'id': 'mmol_per_gDW', 'name': 'millimole per gram (dry weight)',
                      'units': ('kind=mole, exp=1.0, scale=-3, mult=1.0; '
                                'kind=gram, exp=-1.0, scale=0, mult=1.0')}
            self.add_unit_def(u_dict)

        self.cofactor_flag = True
        self.uid2gp = {}
        """Map UniProt identifier to gene product id."""

        self.locus2gp = {}
        """Map gene identifier to gene product id."""

        self.locus2uid = {}
        """Map gene identifier to UniProt identifier."""

        self.locus2rids = {}
        """Map gene identifier to catalyzed reactions."""

        self.update_gp_mappings()

        self.gem_size = {'n_sids': len(self.species), 'n_rids': len(self.reactions),
                         'n_gps': len(self.gps), 'n_pids': len(self.parameters)}

        # determine flux bound unit id used in genome scale model for reuse
        any_fbc_pid = self.reactions[list(self.reactions)[0]].fbc_lower_bound
        self.flux_uid = self.parameters[any_fbc_pid].units

        # collect all flux bound parameters used in the genome scale model for reuse
        val2pid = {data.value: pid for pid, data in self.parameters.items()
                   if data.units == self.flux_uid and data.reuse is True}
        self.fbc_shared_pids = {self.flux_uid: val2pid}

        # determine flux range used in genome scale model
        self.fbc_flux_range = [0.0, 0.0]
        for r in self.reactions.values():
            self.fbc_flux_range[0] = min(self.fbc_flux_range[0], self.parameters[r.fbc_lower_bound].value)
            self.fbc_flux_range[1] = max(self.fbc_flux_range[1], self.parameters[r.fbc_upper_bound].value)

        self.enzymes = {}
        """Enzyme configuration data."""

        self.proteins = {}
        """Protein configuration data."""

        self.user_chebi2sid = {}

        self.ncbi_data = None
        """Reference to NCBI genome data."""

        self.uniprot_data = None
        """Reference to UniProt protein data."""

        # required for RBA models
        self.main_cid = model_dict['species']['compartment'].value_counts().index[0]

        # determine external compartment and identify drain reactions
        self.external_compartment = self._get_external_compartment()
        for rid, r in self.reactions.items():
            if r.kind == 'exchange':
                if r.compartment != self.external_compartment:
                    r.kind = 'drain'

    def update_gp_mappings(self):
        """Update mappings related to genes and gene products

        :meta private:
        """

        self.uid2gp = {gp.uid: gp_id for gp_id, gp in self.gps.items() if gp.uid}
        self.locus2gp = {gp.label: gpid for gpid, gp in self.gps.items()}
        self.locus2uid = {gp.label: gp.uid for gp in self.gps.values() if gp.uid}
        self.locus2rids = self._get_locus2rids()
        uid2locus = defaultdict(list)
        for locus, uid in self.locus2uid.items():
            uid2locus[uid].append(locus)
        for uid, loci in uid2locus.items():
            if len(loci) > 1:
                print(f'WARNING: {loci} gene loci map to same protein {uid}, this needs to be corrected!')

    def _get_external_compartment(self):
        """Determine the external compartment id.

        The external compartment is determined from pseudo `exchange` reactions,
        which include drain and sink reactions. It is the compartment where most of
        the exchanged metabolites are located.

        :return: external compartment id
        :rtype: str
        """
        cids = {}
        for rid, r in self.reactions.items():
            if r.kind == 'exchange':
                cid = r.compartment
                if cid not in cids:
                    cids[cid] = 0
                cids[cid] += 1
        return sorted([(count, cid) for cid, count in cids.items()])[-1][1]

    def _get_locus2rids(self):
        """Determine mapping of locus (gene-id) to reaction ids.

        Based on reaction gpa (Gene Product Association)

        :return: mapping of locus to reaction ids
        :rtype: dict (key: locus, val: list of rids
        """
        locus2rids = {}
        for rid, r in self.reactions.items():
            if r.gene_product_assoc:
                tmp_gpa = re.sub(r'[()]', '', r.gene_product_assoc)
                gpids = {gpid.strip() for gpid in re.sub(r'( and )|( or )', ',', tmp_gpa).split(',')}
                for gpid in gpids:
                    locus = self.gps[gpid].label
                    if locus not in locus2rids:
                        locus2rids[locus] = []
                    locus2rids[locus].append(rid)
        return locus2rids


[docs]
    def configure(self, fname=None):
        """Configure the XbaModel instance.

        Configuration will use default values unless a XBA configuration file (.xlsx) is
        provided with parameter `fname`.

        The spreadsheet may contain the sheets: `general`, `modify_attributes`, `remove_gps`,
        `remove_reactions`, `add_gps`, `add_parameters`, `add_species`, `add_reactions`,
        and `chebi2sid`. See tutorials.

        Example: XbaModel configuration using the configuration data in spreadsheet `xba_parameters.xlsx`.

        .. code-block:: python

            xba_model = XbaModel('iML1515.xml')
            xba_model.configure('xba_parameters.xlsx')

        :param str fname: (optional) filename of XBA configuration file (.xlsx)
        """
        if fname is None:
            xba_params = {}
        else:
            sheet_names = ['general', 'modify_attributes', 'remove_gps', 'remove_reactions',
                           'add_gps', 'add_parameters', 'add_species', 'add_reactions', 'chebi2sid']
            xba_params = load_parameter_file(fname, sheet_names)
        general_params = xba_params['general']['value'].to_dict() if 'general' in xba_params.keys() else {}
        self.cofactor_flag = general_params.get('cofactor_flag', False)

        #####################
        # Update References #
        #####################

        if 'bulk_mappings_fname' in general_params:
            self._update_references(general_params['bulk_mappings_fname'])

        #############################
        # modify/correct components #
        #############################

        protect_ids = []
        if 'remove_gps' in xba_params:
            remove_gps = list(xba_params['remove_gps'].index)
            self.gpa_remove_gps(remove_gps)
        if 'remove_reactions' in xba_params:
            remove_gps = list(xba_params['remove_reactions'].index)
            self.remove_reactions(remove_gps)
        if 'add_gps' in xba_params:
            self.add_gps(xba_params['add_gps'])
            protect_ids.extend(xba_params['add_gps'].index)
        if 'add_species' in xba_params:
            self.add_species(xba_params['add_species'])
            protect_ids.extend(xba_params['add_species'].index)
        if 'add_parameters' in xba_params:
            self.add_parameters(xba_params['add_parameters'])
            protect_ids.extend(xba_params['add_parameters'].index)
        if 'add_reactions' in xba_params:
            self.add_reactions(xba_params['add_reactions'])
            protect_ids.extend(xba_params['add_reactions'].index)
        if 'modify_attributes' in xba_params:
            self.modify_attributes(xba_params['modify_attributes'], 'modelAttrs')
            self.modify_attributes(xba_params['modify_attributes'], 'gp')
            self.modify_attributes(xba_params['modify_attributes'], 'species')
            self.modify_attributes(xba_params['modify_attributes'], 'reaction')
        self.clean(protect_ids)
        # for r in self.reactions.values():
        #     r.correct_reversibility(self.parameters, exclude_ex_reactions=True)
        # update mappings (just in case)
        self.update_gp_mappings()

        #################
        # add NCBI data #
        #################
        if 'chromosome2accids' in general_params and 'organism_dir' in general_params:
            ncbi_dir = os.path.join(general_params['organism_dir'], 'ncbi')
            if not os.path.exists(ncbi_dir):
                os.makedirs(ncbi_dir)
                print(f'{ncbi_dir} created')
            chromosome2accids = {}
            for kv in general_params['chromosome2accids'].split(','):
                k, v = kv.split('=')
                chromosome2accids[k.strip()] = v.strip()
            self.ncbi_data = NcbiData(chromosome2accids, ncbi_dir)
            for chromosome, data in self.ncbi_data.chromosomes.items():
                print(f'{chromosome:15s}: {sum(data.nt_composition.values()):7d} nucleotides, '
                      f'{len(data.mrnas):4d} mRNAs,', f'{len(data.rrnas):2d} rRNAs, {len(data.trnas):2d} tRNAs', )
            if 'modify_attributes' in xba_params:
                self.modify_attributes(xba_params['modify_attributes'], 'ncbi')

        ####################################
        # create proteins based on UniProt #
        ####################################
        if 'organism_id' in general_params and 'organism_dir' in general_params:
            organism_dir = general_params['organism_dir']
            if not os.path.exists(organism_dir):
                os.makedirs(organism_dir)
                print(f'{organism_dir} created')
            self.uniprot_data = UniprotData(general_params['organism_id'], organism_dir)
            if 'modify_attributes' in xba_params:
                self.modify_attributes(xba_params['modify_attributes'], 'uniprot')

        # create proteins for model gene products based on UniProt data (preferred) or NCBI sequence data
        if self.uniprot_data or self.ncbi_data:
            count = self.create_proteins()
            print(f'{count:4d} proteins created')

        #################
        # map cofactors #
        #################
        if 'chebi2sid' in xba_params:
            self.user_chebi2sid = {str(chebi): sid for chebi, sid in xba_params['chebi2sid']['sid'].items()
                                   if sid in self.species}
        if self.cofactor_flag:
            count = self.map_protein_cofactors()
            print(f'{count:4d} cofactors mapped to species ids')

        #####################
        # configure enzymes #
        #####################
        if len(self.proteins) > 0:
            self.update_gp_mappings()

            # ensure we have all required protein data
            missing_proteins = 0
            for locus, uid in self.locus2uid.items():
                if uid not in self.proteins:
                    print(f'{locus} has no assigned protein')
                    missing_proteins += 1
            if missing_proteins > 0:
                raise Exception(f'Cannot continue, {missing_proteins} missing proteins')

            count = self.create_enzymes()
            print(f'{count:4d} enzymes added with default stoichiometry')

            # set specific enzyme composition
            if 'enzyme_comp_fname' in general_params:
                fname = general_params['enzyme_comp_fname']
                count = self.set_enzyme_composition_from_file(fname)
                print(f'{count:4d} enzyme compositions updated from {fname}')
            elif 'biocyc_org_prefix' in general_params:
                biocyc_org_prefix = general_params.get('biocyc_org_prefix')
                biocyc_dir = os.path.join(general_params.get('organism_dir', ''), 'biocyc')
                if not os.path.exists(biocyc_dir):
                    os.makedirs(biocyc_dir)
                    print(f'{biocyc_dir} created')
                count = self.set_enzyme_composition_from_biocyc(biocyc_org_prefix, biocyc_dir)
                print(f'{count:4d} enzyme compositions updated from BioCyc enzyme data')
            else:
                print(f'default enzyme composition with 1 copy of each individual protein')

            # modify enzyme composition, e.g. ABC transporters
            if 'modify_attributes' in xba_params:
                self.modify_attributes(xba_params['modify_attributes'], 'enzyme')

            # print summary information on catalyzed reactions
            n_catalyzed = sum([1 for r in self.reactions.values() if len(r.enzymes) > 0])
            n_enzymes = len({eid for r in self.reactions.values() for eid in r.enzymes})
            print(f'{n_catalyzed} reactions catalyzed by {n_enzymes} enzymes')

        #########################
        # configure kcat values #
        #########################
        if len(self.enzymes) > 0:
            # configure default kcat values provided in parameters
            default_kcats = {'metabolic': general_params.get('default_metabolic_kcat', DEFAULT_METABOLIC_KCAT),
                             'transporter': general_params.get('default_transporter_kcat', DEFAULT_TRANSPORTER_KCAT)}
            self.create_default_kcats(default_kcats)
            print(f'default kcat values configured for {list(default_kcats)} reactions')

            # configure reaction/enzyme specific kcat values
            if 'kcats_fname' in general_params:
                kcats_fname = general_params['kcats_fname']
                count = self.set_reaction_kcats(kcats_fname)
                print(f'{count:4d} kcat values updated from {kcats_fname}')

            # remove enzyme from reactions if kcat values have not been provided
            count = 0
            for rid, r in self.reactions.items():
                if len(r.enzymes) > 0:
                    valid_kcatsf = (sum([1 for kcat in r.kcatsf if np.isfinite(kcat)])
                                    if r.kcatsf is not None else 0)
                    if len(r.enzymes) != valid_kcatsf:
                        r.enzymes = []
                        r.kcatsf = None
                        r.kcatsr = None
                        count += 1
            print(f'{count:4d} enzymes removed due to missing kcat values')

        self.clean(protect_ids)
        self.print_size()
        print('>>> BASELINE XBA model configured!\n')


    def _update_references(self, bulk_mappings_fname):
        """Bulk update of references in the model, mainly MiriamAnnotation data.

        Excel spreadsheet document with bulk mapping tables. Supported tables:
          'fbcGeneProducts', 'species', 'reactions', 'groups'

        Attributes of existing components will be updated if value in supplied table is
        different from None. For updating references in Miriam Annotations, supply new reference
        in data column starting with 'MA:' followed by the database reference, e.g., use 'MA:kegg.compound'
        to update KEGG compound ids on 'species'.

        'fbcGeneProducts': gene products ids newly introduced will be created, allowing replacement
        of gene product ids. Note: 'gene_product_assoc' in 'reactions' need to be updated as well.

        'groups' is special, as it will be used to (re-)create complete GROUPS component.
        Existing data will be overwritten.

        :param str bulk_mappings_fname: file name of Excel spreadsheet document containing mapping tables.
        """

        bulk_mappings = load_parameter_file(bulk_mappings_fname, ['fbcGeneProducts', 'species', 'reactions', 'groups'])

        # create gene products, if they do not yet exist (so we can update attributes subsequently)
        count = 0
        if 'fbcGeneProducts' in bulk_mappings and 'label' in bulk_mappings['fbcGeneProducts'].columns:
            df_mapping = bulk_mappings['fbcGeneProducts']
            for gp_id, row in df_mapping.iterrows():
                if gp_id not in self.gps:
                    self.gps[gp_id] = FbcGeneProduct(pd.Series({'id': gp_id, 'label': row['label']}, name=gp_id))
                    count += 1
            if count > 0:
                print(f'{count:4d} gene products added to the model based on supplied references')

        component2instances = {'reactions': self.reactions, 'species': self.species, 'fbcGeneProducts': self.gps}
        for component, instances in component2instances.items():
            if component in bulk_mappings:
                df_mapping = bulk_mappings[component]
                annot_refs = {col for col in df_mapping.columns if re.match('MA:', col)}
                attributes = set(df_mapping.columns).difference(annot_refs)
                count = 0
                for component_id, row in df_mapping.iterrows():
                    if component_id in instances:
                        instance = instances[component_id]
                        for attr in attributes:
                            if type(row.get(attr)) is str:
                                instance.modify_attribute(attr, row[attr])
                        for annot_ref in annot_refs:
                            if type(row[annot_ref]) is str:
                                instance.miriam_annotation.replace_refs('bqbiol:is', re.sub('^MA:', '', annot_ref),
                                                                        row[annot_ref])
                        count += 1
                print(f'{count:4d} {component} records updated with supplied references')

        # this actually (re-) creates GROUPS component. Any original data will be overwritten
        if 'groups' in bulk_mappings:
            df_groups = bulk_mappings['groups'].reset_index()
            self.groups = {row['id']: SbmlGroup(row) for _, row in df_groups.iterrows()}
            if 'name=groups' not in self.sbml_container.packages:
                self.sbml_container.packages += '; name=groups, version=1, required=False'
            print(f'{len(df_groups):4d} groups with reaction data added to the model')

    def _get_compartments(self, component_type):
        """Get lists of component ids per compartment.

        Supported component types: 'species', 'reactions', 'proteins', 'enzymes'

        :meta private
        :param str component_type: component type to query
        :return: mapping compartment id to component ids
        :rtype: dict(str, list(str))
        """
        component_mapping = {'species': self.species, 'reactions': self.reactions,
                             'proteins': self.proteins, 'enzymes': self.enzymes}
        comp2xid = {}
        for xid, data in component_mapping[component_type].items():
            compartment = data.compartment
            if compartment not in comp2xid:
                comp2xid[compartment] = []
            comp2xid[compartment].append(xid)
        return comp2xid


[docs]
    def print_size(self):
        """Print XbaModel size.

        The difference wrt to the original model is indicated.

        """
        size = {'n_sids': len(self.species), 'n_rids': len(self.reactions),
                'n_gps': len(self.gps), 'n_pids': len(self.parameters)}
        print(f'{size["n_sids"]} constraints ({size["n_sids"] - self.gem_size["n_sids"]:+}); '
              f'{size["n_rids"]} variables ({size["n_rids"] - self.gem_size["n_rids"]:+}); '
              f'{size["n_gps"]} genes ({size["n_gps"] - self.gem_size["n_gps"]:+}); '
              f'{size["n_pids"]} parameters ({size["n_pids"] - self.gem_size["n_pids"]:+})')


    def _get_used_cids(self):
        """Identify compartments used by the model.

        :return: used_cids
        :rtype: dict (key: cid, val: count of species)
        """
        used_cids = {}
        for sid, s in self.species.items():
            cid = s.compartment
            if cid not in used_cids:
                used_cids[cid] = 0
            used_cids[cid] += 1
        return used_cids

    def _get_used_sids_pids_gpids(self):
        """Collect ids used in reactions (species, parameters, gene products)

        :meta private:
        :return: set used ids (species, parameters, gene products)
        :rtype: 3-tuple of sets
        """
        used_sids = set()
        used_pids = set()
        used_gpids = set()
        for rid, r in self.reactions.items():
            used_pids.add(r.fbc_lower_bound)
            used_pids.add(r.fbc_upper_bound)
            for sid in r.reactants:
                used_sids.add(sid)
            for sid in r.products:
                used_sids.add(sid)
            if r.gene_product_assoc:
                gpa = re.sub('and', '', r.gene_product_assoc)
                gpa = re.sub('or', '', gpa)
                gpa = re.sub('[()]', '', gpa)
                for gpx in gpa.split(' '):
                    gp = gpx.strip()
                    if gp != '':
                        used_gpids.add(gp)
        return used_sids, used_pids, used_gpids

    def clean(self, protect_ids=None):
        """Remove unused components from the model.

        Remove species, parameters, gene products not used by reactions.
        Remove compartments not used by species.
        Update reactions in groups component.

        :meta private:
        """
        if protect_ids is None:
            protect_ids = set()
        else:
            protect_ids = set(protect_ids)

        used_sids, used_pids, used_gpids = self._get_used_sids_pids_gpids()
        unused_pids = set(self.parameters).difference(used_pids)
        unused_sids = set(self.species).difference(used_sids)
        unused_gpids = set(self.gps).difference(used_gpids)
        del_components = {'pids': [], 'sids': [], 'gpids': [], 'cids': []}
        for pid in unused_pids:
            if pid not in protect_ids:
                del self.parameters[pid]
                del_components['pids'].append(pid)
        for sid in unused_sids:
            if sid not in protect_ids:
                del self.species[sid]
                del_components['sids'].append(sid)
        for gpid in unused_gpids:
            if gpid not in protect_ids:
                del self.gps[gpid]
                del_components['gpids'].append(gpid)

        used_cids = set(self._get_used_cids())
        unused_cids = set(self.compartments).difference(used_cids)
        for cid in unused_cids:
            if cid not in protect_ids:
                del self.compartments[cid]
                del_components['cids'].append(cid)

        # update reactions in groups component
        orig2rids = defaultdict(set)
        if self.groups:
            for rid in self.reactions:
                rid_fwd = re.sub(r'_REV$', '', rid)
                orig_rid = re.sub(r'_iso\d+$', '', rid_fwd)
                orig2rids[orig_rid].add(rid)
            for group in self.groups.values():
                new_refs = set()
                for ref in group.id_refs:
                    if ref in orig2rids:
                        new_refs |= orig2rids[ref]
                group.id_refs = new_refs

        if len(del_components['pids']) > 0:
            print(f"{len(del_components['pids']):4d} parameters removed following cleanup: {del_components['pids']}")
        if len(del_components['sids']) > 0:
            print(f"{len(del_components['sids']):4d} species removed following cleanup: {del_components['sids']}")
        if len(del_components['gpids']) > 0:
            print(f"{len(del_components['gpids']):4d} parameters removed following cleanup: {del_components['gpids']}")
        if len(del_components['cids']) > 0:
            print(f"{len(del_components['pids']):4d} compartments removed following cleanup: {del_components['cids']}")

    def _create_sbml_model(self):
        """Generate in-memory SBML model."""

        m_dict = {
            'sbml': self.sbml_container,
            'modelAttrs': self.model_attrs,
            'unitDefs': pd.DataFrame([data.to_dict() for data in self.unit_defs.values()]).set_index('id'),
            'compartments': pd.DataFrame([data.to_dict() for data in self.compartments.values()]).set_index('id'),
            'parameters': pd.DataFrame([data.to_dict() for data in self.parameters.values()]).set_index('id'),
            'species': pd.DataFrame([data.to_dict() for data in self.species.values()]).set_index('id'),
            'reactions': pd.DataFrame([data.to_dict() for data in self.reactions.values()]).set_index('id'),
            'fbcObjectives': pd.DataFrame([data.to_dict() for data in self.objectives.values()]).set_index('id'),
            'fbcGeneProducts': pd.DataFrame([data.to_dict() for data in self.gps.values()]).set_index('id'),
        }
        if self.groups:
            m_dict['groups'] = pd.DataFrame([data.to_dict() for data in self.groups.values()])
        if self.func_defs:
            m_dict['funcDefs'] = pd.DataFrame([data.to_dict() for data in self.func_defs.values()]).set_index('id')
        if self.init_assigns:
            m_dict['initAssign'] = pd.DataFrame([data.to_dict()
                                                 for data in self.init_assigns.values()]).set_index('symbol')
        sbml_model = sbmlxdf.Model()
        sbml_model.from_df(m_dict)
        return sbml_model


[docs]
    def validate(self):
        """Validate compliance with SBML standards, including units configuration.

        Validation is an optional task taking time. Validation could be skipped once model
        configuration is stable.

        Information on non-compliance is printed. Details are written to `tmp/tmp.txt`.
        In case of an unsuccessful validation, it is recommended to review `tmp/tmp.txt` and
        improve on the model configuration.

        Example: Ensure compliance with SBML standards for a XbaModel instance prior to its export to file.

        .. code-block:: python

            if xba_model.validate():
                xba_model.export('iML1515_updated.xml')

        :return: success status
        :rtype: bool
        """
        sbml_model = self._create_sbml_model()
        errors = sbml_model.validate_sbml()
        if len(errors) == 0:
            return True
        else:
            print(f'Model not fully compliant to SBML standards, see ./tmp/tmp.txt): ', errors)
            return False


    @staticmethod
    def _json_annotation(data):
        """Extract Miriam Annotation data for JSON export

        :param data: model component data
        :return: annotation data extracted from XBA model
        :rtype: dict
        """
        annotation = {}
        if hasattr(data, 'sboterm'):
            annotation['sbo'] = data.sboterm
        miriam_annot = data.miriam_annotation
        if 'bqbiol:is' in miriam_annot.references:
            for ref, values in miriam_annot.references['bqbiol:is'].items():
                annotation[ref] = list(values) if len(values) > 1 else list(values)[0]
        return annotation

    def _export_to_json(self, fname, gpid2label=None):
        """Export the XBA model to JSON format (which can be used for Escher map creation).

        Escher maps (https://escher.github.io) provide a visual representation of optimization results.
        Either use already existing Escher maps or generate maps compliant to your SBML model using a
        JSON export of the metabolic model.

        Ref: King, Z. A., Dräger, A., Ebrahim, A., Sonnenschein, N., Lewis, N. E., & Palsson, B. O. (2015).
        Escher: A Web Application for Building, Sharing, and Embedding Data-Rich Visualizations of
        Biological Pathways. PLOS Computational Biology, 11(8), e1004321.
        https://doi.org/10.1371/journal.pcbi.1004321

        Optionally, gene product identifiers can be remapped using `gpid2label`.

        Example: Gene essentiality status added to gene ids, which would be displayed accordingly in Escher maps.
        Here `keio_ess` and `keio_red` contain sets of genes that are experimentally considered essential and
        redundant.

        .. code-block:: python

            gpid2label = {}
            for gpid, data in xba_model.gps.items():
                if data.label in keio_ess:
                    sgko_type = 'ess'
                elif data.label in keio_red:
                    sgko_type = 'red'
                else:
                    sgko_type = 'not_in_Keio'
                gpid2label[gpid] = f'{data.label}/{data.name}/{sgko_type}'
            xba_model.export('iML1515.json', gpid2label)

        :param str fname: filename (with extension '.json')
        :param dict(str, str) gpid2label: (optional) remapping of gene ids
        """
        if gpid2label is None:
            gpid2label = {gpid: label for label, gpid in self.locus2gp.items()}

        # create json metabolites data
        metabolites = []
        for mid, data in self.species.items():
             if hasattr(data, 'formula') and type(data.formula) is str and bool(np.isfinite(data.charge)):
                metabolites.append({'id': re.sub('^M_', '', mid), 'name': data.name,
                                    'compartment': data.compartment,
                                    'charge': data.charge, 'formula': data.formula,
                                    'annotation': self._json_annotation(data) })
             else:
                metabolites.append({'id': re.sub('^M_', '', mid), 'name': data.name,
                                    'compartment': data.compartment,
                                    'annotation': self._json_annotation(data)})

        # create json reactions data
        rid2groups = defaultdict(list)
        for _, data in self.groups.items():
            for rid in data.id_refs:
                rid2groups[rid].append(data.name)
        reactions = []
        for rid, data in self.reactions.items():
            substrates = {re.sub('^M_', '', mid): -stoic for mid, stoic in data.reactants.items()}
            substrates.update({re.sub('^M_', '', mid): stoic for mid, stoic in data.products.items()})
            map_gpid2label = {}
            gpr = data.gene_product_assoc if data.gene_product_assoc else ''
            for item in re.findall(r'\b\w+\b', gpr):
                if item in gpid2label:
                    map_gpid2label[item] = gpid2label[item]
            for gpid, label in map_gpid2label.items():
                gpr = re.sub(gpid, label, gpr)
            if rid in rid2groups:
                subsystems = rid2groups[rid] if len(rid2groups[rid]) > 1 else rid2groups[rid][0]
            else:
                subsystems = ''
            reactions.append({'id': re.sub('^R_', '', rid), 'name': data.name, 'metabolites': substrates,
                              'lower_bound': self.parameters[data.fbc_lower_bound].value,
                              'upper_bound': self.parameters[data.fbc_upper_bound].value,
                              'gene_reaction_rule': gpr, 'subsystem': subsystems,
                              'annotation': self._json_annotation(data)})

        # create json genes data
        genes = []
        for gpid, data in self.gps.items():
            genes.append({'id': gpid2label[gpid], 'name': getattr(data, 'name', data.label),
                          'annotation': self._json_annotation(data)})

        # construct model
        annotation = {}
        for ref_val in [ref_val.strip() for ref_val in self.model_attrs.get('miriamAnnotation', '').split(',')]:
            if '/' in ref_val:
                ref, val = ref_val.split('/')
                annotation[ref] = val
        json_model = {'metabolites': metabolites, 'reactions': reactions, 'genes': genes,
                      'id': self.model_attrs.get('id', ''),
                      'name': self.model_attrs.get('name', ''),
                      'compartments': {data.id: getattr(data, 'name', data.id) for data in self.compartments.values()},
                      'annotation': annotation, 'version': '1'}

        with open(fname, 'w') as f:
            json.dump(json_model, f, ensure_ascii=False, indent=0)


[docs]
    def export(self, fname, gpid2label=None):
        """Export XbaModel to SBML encoded file (.xml), spreadsheet (.xlsx) or JSON format (.json).

        The spreadsheet (.xlsx) is helpful to inspect model configuration.

        The optional parameter `gpid2label` is used for export to JSON format.

        Example: Export XbaModel instance to SBML encoded file and to spreadsheet.

        .. code-block:: python

            if xba_model.validate():
                xba_model.export('iML1515_updated.xml')
                xba_model.export('iML1515_updated.xlsx')

        Escher maps (https://escher.github.io) can provide a visual representation of optimization results.
        Escher maps of the modelled organism can be constructed fairly easy using the .json format. This
        file can be imported to the Escher map builder tool using 'Model -> Load COBRA model JSON'.

        Ref: King, Z. A., Dräger, A., Ebrahim, A., Sonnenschein, N., Lewis, N. E., & Palsson, B. O. (2015).
        Escher: A Web Application for Building, Sharing, and Embedding Data-Rich Visualizations of
        Biological Pathways. PLOS Computational Biology, 11(8), e1004321.
        https://doi.org/10.1371/journal.pcbi.1004321

        Example: Generation of a JSON file with gene essentiality status added to gene ids.
        Here `keio_ess` and `keio_red` contain sets of genes that are experimentally
        considered essential and redundant.

        .. code-block:: python

            gpid2label = {}
            for gpid, data in xba_model.gps.items():
                if data.label in keio_ess:
                    sgko_type = 'ess'
                elif data.label in keio_red:
                    sgko_type = 'red'
                else:
                    sgko_type = 'not_in_Keio'
                gpid2label[gpid] = f'{data.label}/{data.name}/{sgko_type}'

                xba_model.export('iML1515.json', gpid2label)

        :param str fname: filename with extension '.xml', '.xlsx' or '.json'
        :param dict(str, str) gpid2label: (optional) remapping of gene product ids for JSON export.
        :return: success status
        :rtype: bool
        """
        extension = fname.split('.')[-1]
        if extension not in ['xml', 'xlsx', 'json']:
            print(f'model not exported, unknown file extension, expected ".xml", ".xlsx" or ".json": {fname}')
            return False

        sbml_model = self._create_sbml_model()
        if extension == 'xml':
            sbml_model.export_sbml(fname)
            print(f'model exported to SBML format: {fname}')
        elif extension == 'xlsx':
            sbml_model.to_excel(fname)
            print(f'model exported to Microsoft Excel Spreadsheet: {fname}')
        elif extension == 'json':
            self._export_to_json(fname, gpid2label)
            print(f'model exported to JSON format: {fname}')
        return True



[docs]
    def export_kcats(self, fname):
        """Export turnover numbers configuration data.

        Enzyme catalyzed reactions in the model have turnover numbers assigned for
        each iso-enzyme in forward and reverse direction. This configuration data can be exported
        to spreadsheet (.xlsx). During XbaModel configuration, turnover number configuration
        data can be loaded from file by configuring the parameter `kcats_fname` in the
        XBA parameter file, sheet `general`.

        Example: Export turnover number configuration to spreadsheet.

        .. code-block:: python

            xba_model = XbaModel('iML1515.xml')
            xba_model.configure('xba_parameters.xlsx')
            xba_model.export_kcats(kcats.xlsx')

        :param str fname: filename with extension '.xlsx'
        """
        notes = 'XBA model export'
        kcats = {}
        for rid, r in self.reactions.items():
            if len(r.enzymes) > 0:
                name = getattr(r, 'name', '')
                ecns = ', '.join(r.miriam_annotation.get_qualified_refs('bqbiol:is', 'ec-code'))
                for idx, enzid in enumerate(sorted(r.enzymes)):
                    key = f'{rid}_iso{idx + 1}' if len(r.enzymes) > 1 else rid
                    e = self.enzymes[enzid]
                    active_sites = e.active_sites
                    genes = ', '.join(sorted(list(e.composition)))
                    fwd_rs = r.get_reaction_string()
                    parts = re.split(r'[-=]>', fwd_rs)
                    arrow = ' -> ' if '->' in fwd_rs else ' => '
                    rev_rs = parts[1].strip() + arrow + parts[0].strip()
                    if r.kcatsf is not None:
                        kcatf = r.kcatsf[idx]
                        kcats[key] = [rid, 1, enzid, kcatf, notes, active_sites, ecns, r.kind, genes, name, fwd_rs]
                    if r.kcatsr is not None:
                        kcatr = r.kcatsr[idx]
                        kcats[f'{key}_REV'] = [rid, -1, enzid, kcatr, notes, active_sites, ecns, r.kind,
                                               genes, name, rev_rs]

        cols = ['rid', 'dirxn', 'enzyme', 'kcat_per_s', 'notes',
                'info_active_sites', 'info_ecns', 'info_type', 'info_genes', 'info_name', 'info_reaction']
        df_rid_kcats = pd.DataFrame(kcats.values(), columns=cols, index=list(kcats))
        df_rid_kcats.index.name = 'key'

        with pd.ExcelWriter(fname) as writer:
            df_rid_kcats.to_excel(writer, sheet_name='kcats')
            print(f'{len(df_rid_kcats)} reaction kcat values exported to', fname)



[docs]
    def export_enz_composition(self, fname):
        """Export enzyme composition data.

        Each enzyme used in the model is composed of proteins with configured copy numbers.
        An enzyme may consist of one or several active sites. Turnover number values
        are defined per active site. Enzyme composition data can be exported
        to spreadsheet (.xlsx). During XbaModel configuration, enzyme configuration
        data can be loaded from file by configuring the parameter `enzyme_comp_fname` in the
        XBA parameter file, sheet `general`.

        Example: Export enzyme composition to spreadsheet.

        .. code-block:: python

            xba_model = XbaModel('iML1515.xml')
            xba_model.configure('xba_parameters.xlsx')
            xba_model.export_enz_composition('enzyme_composition.xlsx')

        :param str fname: filename with extension '.xlsx'
        """
        enz_comp = []
        for eid, e in self.enzymes.items():
            genes = sorted(list(e.composition))
            composition = '; '.join([f'gene={gene}, stoic={e.composition[gene]}' for gene in genes])
            enz_comp.append([eid, e.name, composition, e.active_sites, e.mw / 1000.0, len(e.rids), e.rids[0]])

        cols = ['eid', 'name', 'composition', 'active_sites', 'info_mw_kDa', 'info_n_reactions', 'info_sample_rid']
        df_enz_comp = pd.DataFrame(enz_comp, columns=cols)
        df_enz_comp.set_index('eid', inplace=True)

        with pd.ExcelWriter(fname) as writer:
            df_enz_comp.to_excel(writer, sheet_name='enzymes')
            print(f'{len(df_enz_comp)} enzyme compositions exported to', fname)



[docs]
    def generate_turnup_input(self, orig_kcats_fname, kind='metabolic', mids2ref=None,
                              input_basename='tmp_turnup_input', max_records=500):
        """Generate input files for TurNuP web portal for prediction of turnover numbers.

        Ref: Kroll, et al., 2023, Turnover number predictions for kinetically uncharacterized enzymes using
        machine and deep learning. Nature Communications, 14(1), 4139.
        DOI: https://doi.org/10.1038/s41467-023-39840-4

        The TurNuP web portal (https://turnup.cs.hhu.de/Kcat_multiple_input) predicts turnover
        number values (kcat) for enzyme catalyzed reactions. The web portal accepts input files with
        a maximum of 500 records. Each record contains amino acid sequence information of the proteins
        composing the catalyzing enzyme and references to the reactants of the catalyzed reaction.
        TurNuP predicts the same value for forward and backward direction.
        TurNuP has not yet been trained on transporters.

        Input records are based on the turnover number configuration data loaded from `orig_kcats_fname`.
        By default, all metabolic reactions in forward direction are selected.
        The reaction filter can be changed with the optional parameter `kind`. Amino acid sequence data
        is based on enzyme composition by concatenating sequences of its proteins.
        KEGG compound identifiers are extracted from Miriam annotation data of the SBML model and used
        as reactant references. Optionally, reactant references can be provided with the
        parameter `mids2ref`, mapping metabolite ids (without compartment postfix) to valid
        KEGG compound, InChI or SMILES identifiers.
        The base name for produced input files can be selected using the optional parameter `input_basename`
        and the number of records per input file can be limited by the optional parameter `max_records`.

        Example: Generate input files for upload to TurNuP web portal for subsequent prediction of turnover numbers.

        .. code-block:: python

            xba_model = XbaModel('iML1515.xml')
            xba_model.configure('xba_parameters.xlsx')
            xba_model.export_kcats(kcats.xlsx')
            mids_no_ref = xba_model.generate_turnup_input('kcats.xlsx', 'tmp_turnup_input')

        Metabolites without references can be manually mapped, using parameter `mids2ref` in a second run.
        TurNuP input files need to be uploaded to the TurNuP web server
        (https://turnup.cs.hhu.de/Kcat_multiple_input). The TurNuP output files, containing
        predictions for turnover numbers, can be used to generate an updated turnover number
        configuration file. The update turnover numbers can be referenced in the XBA configuration file
        (parameter `kcats_fname`) and used for XbaModel configuration. See tutorial.

        :param str orig_kcats_fname: turnover number configuration file of baseline model
        :param str kind: (optional) reaction kind: 'metabolic', 'transporter' or 'all' (default: 'metabolic')
        :param dict(str, str) mids2ref: (optional) mapping table of metabolite ids to KEGG, InChI or SMILES references
        :param str input_basename: (optional) base name for TurNuP input files
        :param int max_records: (optional) maximum number of records per input file (default: 500)
        :return: metabolites not mapped to reactant references with species identifiers
        :rtype: dict(str, str)
        """
        # aa_freq used to Kozlowski, 2016, Table 2, pubmed: 27789699, used to filter invalid amino acids ids
        aa_freq = {'A': 8.76, 'C': 1.38, 'D': 5.49, 'E': 6.32, 'F': 3.87,
                   'G': 7.03, 'H': 2.26, 'I': 5.49, 'K': 5.19, 'L': 9.68,
                   'M': 2.32, 'N': 3.93, 'P': 5.02, 'Q': 3.90, 'R': 5.78,
                   'S': 7.14, 'T': 5.53, 'V': 6.73, 'W': 1.25, 'Y': 2.91}
        invalid_aa_ids = '[^' + ''.join(aa_freq.keys()) + ']'

        df_orig_kcats = load_parameter_file(orig_kcats_fname, ['kcats'])['kcats']

        if mids2ref is None:
            mids2ref = {}
        mids_no_ref = {}
        records = []
        for fwd_key, row in df_orig_kcats.iterrows():

            # TurNuP predicts same kcat for fwd and corresponding reverse reaction
            if row['dirxn'] != 1:
                continue
            # TurNuP presently only predicts kcat values for metabolic reactions
            if kind == 'all' or row['info_type'] == kind:
                rev_key = f'{fwd_key}_REV' if f'{fwd_key}_REV' in df_orig_kcats.index else None

                # retrieve KEGG ids for substrates and products
                r = self.reactions[row['rid']]
                missing_ref = False

                substrates = []
                for sid in r.reactants:
                    mid = re.sub('_[^_]*$', '', sid)
                    if mid not in mids2ref:
                        s = self.species[sid]
                        if len(s.kegg_refs) > 0:
                            mids2ref[mid] = s.kegg_refs[0]
                    if mid in mids2ref:
                        substrates.append(mids2ref[mid])
                    else:
                        missing_ref = True
                        mids_no_ref[mid] = sid

                products = []
                for sid in r.products:
                    mid = re.sub('_[^_]*$', '', sid)
                    if mid not in mids2ref:
                        s = self.species[sid]
                        if len(s.kegg_refs) > 0:
                            mids2ref[mid] = s.kegg_refs[0]
                    if mid in mids2ref:
                        products.append(mids2ref[mid])
                    else:
                        missing_ref = True
                        mids_no_ref[mid] = sid

                if missing_ref:
                    subs_keggs = None
                    prod_keggs = None
                else:
                    subs_keggs = ';'.join(substrates)
                    prod_keggs = ';'.join(products)

                # for enzyme complexes, we concatenate aa sequences of involved proteins
                e = self.enzymes[row['enzyme']]
                aa_seq = ''
                for gpid in e.composition:
                    pid = self.locus2uid[gpid]
                    p = self.proteins[pid]
                    aa_seq += p.aa_sequence
                valid_aa_seq = re.sub(invalid_aa_ids, 'L', aa_seq)
                records.append([fwd_key, rev_key, valid_aa_seq, subs_keggs, prod_keggs])

        cols = ['fwd_key', 'rev_key', 'Enzyme', 'Substrates', 'Products']
        df_input = pd.DataFrame(records, columns=cols)
        df_input.set_index('fwd_key', inplace=True)

        n_rev = sum(df_input['rev_key'].notna())
        full = sum(df_input['Substrates'].notna())
        print(f'{len(df_input)} kcat records ({n_rev} reversible), {full} records with sequence and reaction')
        print(f'{len(mids_no_ref)} mids without reference ids, e.g.: {list(mids_no_ref)[:min(len(mids_no_ref), 10)]}')

        # store TurNuP input file(s) - split as per max record length
        if len(df_input) <= max_records:
            fname = f'{input_basename}.xlsx'
            with pd.ExcelWriter(fname) as writer:
                df_input.to_excel(writer)
            print(f'1 TurNuP input file ({len(df_input)} records) stored under {fname}')
            print(f' - upload file to TurNuP Web Server: https://turnup.cs.hhu.de/Kcat_multiple_input')
        else:
            idx = 1
            start = 0
            stop = start + max_records
            fnames = []
            while start < len(df_input):
                fname = f'{input_basename}_{idx}_seq.xlsx'
                with pd.ExcelWriter(fname) as writer:
                    df_input.iloc[start:stop].to_excel(writer)
                fnames.append(fname)
                idx += 1
                start = stop
                stop = min(len(df_input), start + max_records)

            print(f'{len(fnames)} TurNuP input files (max {max_records} records) stored under '
                  f'{fnames[0]} ... {fnames[-1]}')
            print(f' - upload files individually to TurNuP Web Server: https://turnup.cs.hhu.de/Kcat_multiple_input')
        print(f' - retrieve TurNuP kcat predictions and continue with process_turnup_output()')
        return mids_no_ref


    #############################
    # MODIFY GENOME SCALE MODEL #
    #############################

    def gpa_remove_gps(self, del_gps):
        """Remove gene products from Gene Product Rules.

        Used to remove dummy protein gene product and
        gene products related to coenzymes that already
        appear in reaction reactants/products

        :meta private:
        :param del_gps: gene product ids to be removed
        :type del_gps: set or list of str, or str
        """
        n_count = 0
        if type(del_gps) is str:
            del_gps = [del_gps]
        for rid, mr in self.reactions.items():
            mr.gpa_remove_gps(del_gps)
        for gp in del_gps:
            if gp in self.gps:
                n_count += 1
                del self.gps[gp]
        print(f'{n_count:4d} gene product(s) removed from reactions ({len(self.gps)} gene products remaining)')

    def remove_reactions(self, del_rids):
        """remove reactions form the model

        removal of obsolete gene products, species, compartments and groups entries
        will be done during clean().

        :meta private:
        :param del_rids: reaction ids to be removed
        :type del_rids: set or list of str, or str
        """
        n_count = 0
        if type(del_rids) is str:
            del_rids = [del_rids]
        for rid in del_rids:
            if rid in self.reactions:
                del self.reactions[rid]
                n_count += 0
        print(f'{n_count:4d} reaction(s) removed from reactions ({len(self.reactions)} reactions remaining)')

    def add_unit_def(self, u_dict):
        """Add a single unit definition to the model.

        :meta private:
        :param dict u_dict: unit definition
        """
        unit_id = u_dict['id']
        self.unit_defs[unit_id] = SbmlUnitDef(pd.Series(u_dict, name=unit_id))

    def add_parameter(self, pid, p_dict):
        """Add a single parameter to the model.

        parameter id must be SBML compliant
        p_dict must incluce 'value' and may include 'units', 'constant', 'name', 'sboterm', 'metaid'

        :meta private:
        :param str pid: parameter id to be used
        :param dict p_dict: parameter definition
        """
        if pid is None:
            pid = p_dict['id']
        self.parameters[pid] = SbmlParameter(pd.Series(p_dict, name=pid))

    def add_function_def(self, fd_dict):
        """Add a single function definition to the model.

        :meta private:
        :param dict fd_dict: function definition
        """
        if self.func_defs is None:
            self.func_defs = {}
        fd_id = fd_dict['id']
        self.func_defs[fd_id] = SbmlFunctionDef(pd.Series(fd_dict, name=fd_id))

    def add_initial_assignment(self, ia_dict):
        """Add a single initial assignment to the model.

        :meta private:
        :param dict ia_dict: initial assignment definition
        """
        if self.init_assigns is None:
            self.init_assigns = {}
        ia_id = ia_dict['symbol']
        self.init_assigns[ia_id] = SbmlInitialAssignment(pd.Series(ia_dict))

    def add_species(self, df_species):
        """Add species to the model according to species configuration

        species_config contains for each new species id its specific
        configuration. Parameters not provide will be set with default values.
        e.g. {'M_pqq_e': {'name': 'pyrroloquinoline quinone(3−)',
                           'miriamAnnotation': 'bqbiol:is, chebi/CHEBI:1333',
                           'fbcCharge': -3, 'fbcChemicalFormula': 'C14H3N2O8'}, ...}

        :meta private:
        :param pandas.DataFrame df_species: species configurations
        """
        n_count = 0
        for sid, row in df_species.iterrows():
            n_count += 1
            s_data = row
            if hasattr(s_data, 'miriamAnnotation'):
                if type(s_data['miriamAnnotation']) is not str:
                    s_data['miriamAnnotation'] = ''
                # miriam annotation requires a 'metaid'
                if not hasattr(s_data, 'metaid'):
                    s_data['metaid'] = f'meta_{sid}'
            self.species[sid] = SbmlSpecies(s_data)
        print(f'{n_count:4d} constraint ids added to the model ({len(self.species)} total constraints)')

    def add_parameters(self, df_parameters):
        """Add parameters based on supplied definition.

        df_parameters structure:
        - based on sbmlxdf parameters structure

        :meta private:
        :param pandas.DataFrame df_parameters: parameters configurations
        """
        n_count = 0
        for pid, row in df_parameters.iterrows():
            n_count += 1
            p_data = row
            if hasattr(p_data, 'miriamAnnotation'):
                if type(p_data['miriamAnnotation']) is not str:
                    p_data['miriamAnnotation'] = ''
                # miriam annotation requires a 'metaid'
                if not hasattr(p_data, 'metaid'):
                    p_data['metaid'] = f'meta_{pid}'
            self.parameters[pid] = SbmlParameter(p_data)
        print(f'{n_count:4d} parameter ids added to the model ({len(self.parameters)} total parameters)')

    def add_reactions(self, df_reactions):
        """Add reactions based on supplied definition

        df_reactions structure:
        - based on sbmlxdf reactions structure
        - instead of 'fbcLowerFluxBound', 'fbcLowerFluxBound' parameter ids, one can supply
          'fbcLb', 'fbcUb' numerial flux values, in which case parameters are retrieve/created
          parameter unit id can be supplied optionally in 'fbcBndUid', e.g. 'mmol_per_gDW',
          if provided, this will overwrite the unit used as reaction flux
        - instead of providing 'reactants', 'products' and 'reversible', a 'reactionString'
          can be provided to determine 'reactants', 'products' and 'reversible'

        :meta private:
        :param pandas.DataFrame df_reactions: reaction records
        """
        n_count = 0
        for rid, r_data in df_reactions.iterrows():
            n_count += 1
            if hasattr(r_data, 'miriamAnnotation'):
                if type(r_data['miriamAnnotation']) is not str:
                    r_data['miriamAnnotation'] = ''
                # miriam annotation requires a 'metaid'
                if not hasattr(r_data, 'metaid'):
                    r_data['metaid'] = f'meta_{rid}'
            if not hasattr(r_data, 'reactants'):
                assert('reactionString' in r_data)
                reaction_components = parse_reaction_string(str(r_data['reactionString']))
                for key, val in reaction_components.items():
                    r_data[key] = val
            if not hasattr(r_data, 'fbcLowerFluxBound'):
                assert(('fbcLb' in r_data) and ('fbcUb' in r_data))
                unit_id = r_data.get('fbcBndUid', self.flux_uid)
                r_data['fbcLowerFluxBound'] = self.get_fbc_bnd_pid(r_data.loc['fbcLb'], unit_id, f'fbc_{rid}_lb')
                r_data['fbcUpperFluxBound'] = self.get_fbc_bnd_pid(r_data.loc['fbcUb'], unit_id, f'fbc_{rid}_ub')
            self.reactions[rid] = SbmlReaction(r_data, self.species)
        print(f'{n_count:4d} variable ids added to the model ({len(self.reactions)} total variables)')

    def add_compartments(self, compartments_config):
        """Add compartments to the model according to compartments configuration

        compartments_config contains for each new compartment id its specific
        configuration. Parameters not provide will be set with default values.
        e.g. {'c-p': {'name': 'inner membrane', 'spatialDimensions': 2}, ...}

        :meta private:
        :param dict(dict) compartments_config: compartment configurations
        """
        for cid, data in compartments_config.items():
            if 'constant' not in data:
                data['constant'] = True
            if 'units' not in data:
                data['units'] = 'dimensionless'
            if 'metaid' not in data:
                data['metaid'] = f'meta_{cid}'
            self.compartments[cid] = SbmlCompartment(pd.Series(data, name=cid))

    def add_objectives(self, objectives_config):
        """Add (FBC) objectives to the model.

        objective_config contains for each new objetive id its specific
        configuration. Instead of providing 'fluxObjectives' (str) one can provide
        'coefficients' (dict) directly.

        :meta private:
        :param dict(dict) objectives_config: objectives configurations
        """
        for obj_id, data in objectives_config.items():
            self.objectives[obj_id] = FbcObjective(pd.Series(data, name=obj_id))

    def modify_attributes(self, df_modify, component_type):
        """Modify model attributes for specified component ids.

        DataFrame structure:
            index: component id to modify or None
            columns:
                'component': one of the supported model components
                    'modelAttrs', 'gp', 'species', 'reaction', 'protein', 'enzyme',
                    'parameter', 'compartment'
                    'uniprot', 'ncbi'
                'attribute': the attribute name to modify
                'value': value to configure for the attribute

        Examples for modification of reaction stoichiometry
        ---------------------------------------------------
        - change reactants/products stoichiometry (use attributes 'reactants', 'products')
            - e.g.: id='R_BPNT', component='reaction', attribute='reactants',
                value='species=M_h2o_c, stoic=1.0; species=M_pap_c, stoic=1.0'
        - delete a single reactant/product from a reaction string (use attributes 'reactant', 'product')
            - e.g.: id='R_CofactorSynt', component='reaction', attribute='reactant', value='M_hemeA_c=0.0'
        - set change the stoichiometry factor for a single reactant/product
            - e.g.: id='R_CAT', component='reaction', attribute='reactant', value='M_h2o_c=2.0'

        :meta private:
        :param pandas.DataFrame df_modify: structure with modifications
        :param str component_type: type of component, e.g. 'species'
        """
        component_mapping = {'gp': self.gps,
                             'species': self.species, 'reaction': self.reactions,
                             'protein': self.proteins, 'enzyme': self.enzymes,
                             'parameter': self.parameters, 'compartment': self.compartments}

        value_counts = df_modify['component'].value_counts().to_dict()
        if component_type in value_counts:
            df_modify_attrs = df_modify[df_modify['component'] == component_type]
            if component_type == 'modelAttrs':
                for _, row in df_modify_attrs.iterrows():
                    self.model_attrs[row['attribute']] = row['value']
            elif component_type == 'uniprot':
                self.uniprot_data.modify_attributes(df_modify_attrs)
            elif component_type == 'ncbi':
                self.ncbi_data.modify_attributes(df_modify_attrs)
            elif component_type in component_mapping:
                comp_obj = component_mapping[component_type]
                for _id, row in df_modify_attrs.iterrows():
                    comp_obj[_id].modify_attribute(row.loc['attribute'], row.loc['value'])
            else:
                print('unknown component_type {component_type} in "modify_attributes" sheet')
                return
            print(f'{value_counts[component_type]:4d} attributes on {component_type} instances updated')

    def del_components(self, component, ids):
        """Delete / remove components from the model.

        We are not cleaning up yet any leftovers, e.g. species no longer required.

        :meta private:
        :param str component: component type, e.g. 'species', 'reactions', etc.
        :param list(str) ids: list of component ids to be deleted
        """
        component_mapping = {'species': self.species, 'reactions': self.reactions,
                             'proteins': self.proteins, 'enzymes': self.enzymes,
                             'objectives': self.objectives}

        if component in component_mapping:
            count = 0
            for component_id in ids:
                if component_id in component_mapping[component]:
                    del component_mapping[component][component_id]
                    count += 1
            print(f'{count:4d} {component} removed')
        else:
            print(f'component type {component} not yet supported')

    #######################
    # PROTEIN COMPOSITION #
    #######################

    def create_proteins(self):
        """Create proteins related to gene products used in the model.

        For each gene product create a related protein.
        If corresponding UniProt record exists (loaded from UniProt file), use UniProt id
        as reference and configure model protein with data from UniProt record.
        Try extracting the UniProt id from gene product MiriamAnnotation
        Alternatively check mapping of UniProt records to gene labels, in which case we
        add the UniProt id to the gene product data.
        If corresponding UniProt not found, collect protein data from NCBI sequence data.

        We also configure gene id and gene name 'notes'-field of gene product.

        :meta private:
        """
        proteins_not_found = []
        n_created = 0
        for gp in self.gps.values():

            # in case of invalid UniProt id, assign uid based on uniprot_data
            if gp.uid not in self.uniprot_data.proteins:
                if gp.label in self.uniprot_data.locus2uid:
                    gp.miriam_annotation.replace_refs('bqbiol:is', 'uniprot', self.uniprot_data.locus2uid[gp.label])

            # add protein related to gene product to model proteins
            pid = gp.uid
            if pid not in self.proteins:
                cid = None

                # for gene products used in reaction gene product rules, use compartment of first reaction
                if gp.label in self.locus2rids:
                    any_rid = self.locus2rids[gp.label][0]
                    cid = self.reactions[any_rid].compartment
                elif gp.compartment is not None:
                    # gene products used in RBA process machines have compartment already configured
                    cid = gp.compartment

                if cid is not None:
                    p = None
                    # try retrieving protein data from UniProt proteins (normal case)
                    if pid in self.uniprot_data.proteins:
                        p = Protein(self.uniprot_data.proteins[pid], gp.label, cid)

                    # as fallback, retrieve protein data from NCBI proteins
                    elif (self.ncbi_data is not None) and (gp.label in self.ncbi_data.label2locus):
                        ncbi_locus = self.ncbi_data.label2locus[gp.label]
                        p = Protein(self.ncbi_data.locus2protein[ncbi_locus], gp.label, cid)
                    else:
                        proteins_not_found.append(gp.id)
                    if p:
                        self.proteins[pid] = p
                        gp.add_notes(f'[{p.gene_name}], {p.name}')
                        n_created += 1

        # update mapping, in case we modified UniProt information
        self.locus2uid = {gp.label: gp.uid for gp in self.gps.values()}
        self.uid2gp = {gp.uid: gp_id for gp_id, gp in self.gps.items()}
        if len(proteins_not_found) > 0:
            examples = min(5, len(proteins_not_found))
            print(f'{len(proteins_not_found)} proteins not found for gene products, ',
                  f'e.g.: {proteins_not_found[:examples]}')

        return n_created

    def add_gps(self, df_add_gps):
        """Add gene products to the model.

        E.g. required for ribosomal proteins

        df_add_gps pandas DataFrame has the pandas DataFrame structure of sbmlxdf:
            columns:
                'gpid': gene product id, e.g. 'G_b0015'
                'label': gene locus, e.g. 'b0015'
                'compartment': optional compartment id of protein
        optionally we add compartment info to support RBA machinery related gene products
                'compartment': location of gene product, e.g. 'c'

        Add 'name' and 'sboterm' if not provided
        Add MiriamAnnotation with UniProt id, if MiriamAnnotation has not been provided.
        UniProt ID is retrieved with UniProt data

        :meta private:
        :param pandas.DataFrame df_add_gps: configuration data for gene product, see sbmlxdf
        :return: number of added gene products
        :rtype: int
        """
        count = 0
        for gpid, gp_data in df_add_gps.iterrows():
            if gpid not in self.gps:
                gene_id = gp_data['label']
                uid = None
                if self.uniprot_data:
                    uid = self.uniprot_data.locus2uid.get(gene_id)

                if uid and gp_data.get('name') is None:
                    pdata = self.uniprot_data.proteins[uid]
                    gp_data['name'] = pdata.gene_name.split(',')[0] if type(pdata.gene_name) is str else gene_id
                if uid and type(gp_data.get('miriamAnnotation')) is not str:
                    gp_data['metaid'] = f'meta_{gpid}'
                    gp_data['miriamAnnotation'] = f'bqbiol:is, uniprot/{uid}'
                if not hasattr(gp_data, 'sboterm'):
                    gp_data['sboterm'] = SBO_GENE

                self.gps[gpid] = FbcGeneProduct(gp_data)
                count += 1
        print(f'{count:4d} gene products added to the model ({len(self.gps)} total gene products)')
        return count

    def map_protein_cofactors(self):
        """Map cofactors to species ids and add to protein data.

        Cofactors are retrieved from UniProt.
        CHEBI ids are used for mapping to model species.
           - UniProt cofactor names already have mapping to one CHEBI ids.
           - model species have map to zero, one or several CHEBI ids
           - species in different compartments can map to same CHEBI id
           - based on protein location we identify a matching species
        - mapping table in df_chebi2sid provides a direct mapping to sid,
          required for cofactor CHEBI ids that cannot be mapped to model species

        :meta private:
        :return: number of mapped cofactors
        :rtype: int
        """
        # get mapping chebi id to species for model species
        chebi2sid = {}
        n_sids_per_cid = defaultdict(int)
        for sid, s in self.species.items():
            n_sids_per_cid[s.compartment] += 1
            for chebi in s.chebi_refs:
                if chebi not in chebi2sid:
                    chebi2sid[chebi] = []
                chebi2sid[chebi].append(sid)
        main_cid = sorted([(count, cid) for cid, count in n_sids_per_cid.items()], reverse=True)[0][1]

        cof_not_mapped = {}
        n_cof_not_mapped = 0
        n_cof_mapped = 0
        for pid, p in self.proteins.items():
            if len(p.up_cofactors) > 0:
                # UniProt cofactors with stoichiometry
                for up_cf_name, cf_data in p.up_cofactors.items():
                    stoic = cf_data['stoic']
                    chebi = cf_data['chebi']
                    selected_sid = None
                    if chebi is not None:
                        if chebi in self.user_chebi2sid:
                            # selected species is based on data provided by user in XBA parameters file
                            selected_sid = self.user_chebi2sid[chebi]
                        elif chebi in chebi2sid:
                            sids = chebi2sid[chebi]
                            cid2sids = {self.species[sid].compartment: sid for sid in sids}
                            # preferrably use a cofactor from cytosol (compartment with most species, assumed)
                            if main_cid in cid2sids:
                                selected_sid = cid2sids[main_cid]
                            else:
                                # 00 use a cofactor that overlaps with any of the compartments of protein
                                pcids = {pcid for pcid in p.compartment.split('-')}
                                cids_intersect = list(pcids.intersection(cid2sids))
                                if len(cids_intersect) > 0:
                                    selected_sid = cid2sids[cids_intersect[0]]
                                else:
                                    selected_sid = sids[0]
                    if selected_sid is not None:
                        p.cofactors[selected_sid] = stoic
                        n_cof_mapped += 1
                    else:
                        cof_not_mapped[chebi] = up_cf_name
                        n_cof_not_mapped += 1

        if n_cof_not_mapped > 0:
            print(f'{n_cof_not_mapped} cofactors used in proteins could not be mapped (are not considered). ' 
                  f'These correspond to {len(cof_not_mapped)} CHEBI ids, which could be added '
                  f'to the parameter spreadsheet (chebi2sid):')
            print(cof_not_mapped)
        return n_cof_mapped

    def get_protein_compartments(self):
        """Get the compartments where proteins are located with their count.

        :meta private:
        :return: protein compartments used
        :rtype: dict (key: compartment id, value: number of proteins
        """
        compartments = {}
        for p in self.proteins.values():
            if p.compartment not in compartments:
                compartments[p.compartment] = 0
            compartments[p.compartment] += 1
        return compartments

    ######################
    # ENZYME COMPOSITION #
    ######################

    def create_enzymes(self):
        """Create model enzymes based on reaction gene product associations.

        Default stoichiometry of 1.0 for gene products
        Enzyme stoichiometries can be updated in a subsequent step.
        Cofactors and stoichiometry are retrieved from proteins

        Enzymes are connected to reactions

        Enzyme ids are formatted based on model reaction gpa as follows:
            'enz_' followed by sorted list of gene loci ids separated by '_'
            e.g. 'enz_b1234_b3321'

        :meta private:
        :return: number of created enzymes
        :rtype: int
        """
        n_created = 0
        for rid, r in self.reactions.items():
            eids = []
            if r.gene_product_assoc:
                gpa = re.sub(' and ', '_and_', r.gene_product_assoc)
                gpa = re.sub(r'[()]', '', gpa)
                gp_sets = [item.strip() for item in gpa.split('or')]
                for gp_set in gp_sets:
                    loci = sorted([self.gps[item.strip()].label for item in gp_set.split('_and_')])
                    enz_composition = {locus: 1.0 for locus in loci}
                    eid = 'enz_' + '_'.join(loci)
                    eids.append(eid)
                    if eid not in self.enzymes:
                        self.enzymes[eid] = Enzyme(eid, eid, self, enz_composition, r.compartment)
                        n_created += 1
                    self.enzymes[eid].rids.append(rid)
            r.set_enzymes(eids)
        return n_created

    def set_enzyme_composition_from_biocyc(self, org_prefix, biocyc_dir):
        """Configure enzyme composition from BioCyc.

        BioCyc enzyme related data is read from files in directory biocyc_dir, if files exist,
        alternatively, BioCyc data is downloaded from BiBioCyc (assuming access for given
        database is available)

        Enzyme active site number is configured based on heuristics.
        - for transporters we assume number of active sites = 1.0
        - for metabolic enzymes we assume we take the minimum of the composition stoichiometry
          as number of active sites.

        :meta private:
        :param str org_prefix: BioCyc organism prefix (when using BioCyc enzyme composition)
        :param str biocyc_dir: directory where to retrieve/store BioCyc organism data
        :return: number of updates
        :rtype: int
        """
        biocyc = BiocycData(org_prefix, biocyc_dir)

        cols = ['name', 'synonyms', 'composition', 'enzyme']
        enz_comp = {}
        for enz_id, enz in biocyc.proteins.items():
            # exclude monomeric enzymes. Either enzyme is used in reactions or it is not included in complexes
            if len(enz.gene_composition) > 0 and (len(enz.enzrxns) > 0 or len(enz.complexes) == 0):
                gene_comp = '; '.join([f'gene={gene}, stoic={stoic}'
                                       for gene, stoic in enz.gene_composition.items()])
                eid = 'enz_' + '_'.join(sorted(enz.gene_composition.keys()))
                enz_comp[enz_id] = [enz.name, enz.synonyms, gene_comp, eid]
        df = pd.DataFrame(enz_comp.values(), index=list(enz_comp), columns=cols)
        df.drop_duplicates(['enzyme'], inplace=True)
        df = df.reset_index().set_index('enzyme')
        print(f'{len(df)} enzymes extracted from Biocyc')

        # update model enzymes with BioCyc composition data
        # enzymes in the model are based on reaction gene product associations.
        #  These enzymes might be composed of one or several BioCyc enzyme sub-complexes.
        #  We try to identify such sub-complexes and update that part of the enzyme composition.
        biocyc_eid2comp = {eid: biocyc.proteins[row['index']].gene_composition for eid, row in df.iterrows()}
        count = 0
        for eid, e in self.enzymes.items():
            if eid in biocyc_eid2comp:
                e.name = df.at[eid, 'name']
                e.composition = biocyc_eid2comp[eid].copy()
                count += 1
            else:
                updates = False
                gpa_genes = set(e.composition)
                if len(gpa_genes) > 1:
                    updates = False
                    for genes_stoic in biocyc_eid2comp.values():
                        if set(genes_stoic).intersection(gpa_genes) == set(genes_stoic):
                            e.composition.update(genes_stoic)
                            updates = True
                if updates:
                    count += 1
            # set active sites (based on heuristics) - for metabolic enzymes (not transporters)
            e.active_sites = 1.0
            min_stoic = min(e.composition.values())
            any_rkind = self.reactions[e.rids[0]].kind
            if min_stoic > 1.0 and any_rkind == 'metabolic':
                e.active_sites = min_stoic
        return count

    def set_enzyme_composition_from_file(self, fname):
        """Configure enzyme composition from file

        Excel document requires an index column (not used) and column 'composition'
        Only column 'composition' is processed.
            'composition' contains a sref presentation of enzyme composition,
        e.g. 'gene=b2222, stoic=2.0; gene=b2221, stoic=2.0'

        Model enzyme id is determined by concatenating the locus ids (after sorting)
        e.g. 'gene=b2222, stoic=2.0; gene=b2221, stoic=2.0' - > 'enz_b2221_b2222'

        :meta private:
        :param str fname: name of Excel document specifying enzyme composition
        :return: number of updates
        :rtype: int
        """
        # load enzyme composition data from file
        df = load_parameter_file(fname, ['enzymes'])['enzymes']

        count = 0
        for _, row in df.iterrows():
            enz_comp = get_srefs(re.sub('gene', 'species', row['composition']))
            eid = 'enz_' + '_'.join(sorted(enz_comp.keys()))
            if eid in self.enzymes:
                e = self.enzymes[eid]
                e.composition = enz_comp
                e.active_sites = row.get('active_sites', 1)
                count += 1
        return count

    def get_catalyzed_reaction_kinds(self):
        """retrieve reaction ids for each kind/type of catalyzed reaction

        collect reaction ids that are reversible,

        :meta private:
        :return: model reaction ids under reaction kinds
        :rtype: dict (key: reaction type, val: set of reaction ids
        """
        rxnkind2rids = {'reversible': set()}
        for rid, r in self.reactions.items():
            if r.gene_product_assoc:
                if r.reversible is True:
                    rxnkind2rids['reversible'].add(rid)
                if r.kind not in rxnkind2rids:
                    rxnkind2rids[r.kind] = set()
                rxnkind2rids[r.kind].add(rid)
        return rxnkind2rids

    ######################
    # KCAT CONFIGURATION #
    ######################

    def create_default_kcats(self, default_kcats, subtypes=None):
        """Create a default mapping for reactions to default turnover numbers.

        Only set kcat values if default value for reaction type is finite.
        Both forward and reverse kcat values are set to the same default kcat.
        Default kcat values for reaction types 'metabolic' and 'transporter' may be provided,
        as these are the reaction types configured by default.

        Default kcat value for specific subtypes can be provided, e.g. for 'ABC transporter'.
        Subtypes specific values will be used, if reaction id is assigned to
        specified subtype in subtypes dict.

        :meta private:
        :param dict(str, float) default_kcats: default values for selected kinds
        :param dict(str, str) subtypes: subtypes of specific reactions
        """
        if type(subtypes) is not dict:
            subtypes = {}
        for rid, r in self.reactions.items():
            if len(r.enzymes) > 0:
                default_kcat = default_kcats.get(r.kind, np.nan)
                if rid in subtypes:
                    if subtypes[rid] in default_kcats:
                        default_kcat = default_kcats[subtypes[rid]]
                if np.isfinite(default_kcat):
                    r.set_kcat('unspec', 1, default_kcat)
                    if r.reversible is True:
                        r.set_kcat('unspec', -1, default_kcat)

    def set_reaction_kcats(self, fname):
        """Set kcat values with enzyme-reaction specific kcat values.

        We only update kcats, if respective reaction exists in given direction
        Spreadsheet contains following columns:
            'key': a unique key used as index, e.g. R_ANS_iso1_REV (first column)
            'rid', model reaction id (str), e.g. 'R_ANS'
            'dirxn': (1, -1) reaction direction forward/reverse
            'genes': enzyme composition in terms of gene loci, comma separated
                e.g. 'b1263, b1264', or np.nan/empty, if kcat is not isoenzyme specific
            'kcat': kcat value in per second (float)

        :meta private:
        :param str fname: file name of Excel document with specific kcat values
        :return: number of kcat updates
        :rtype: int
        """
        df_reaction_kcats = load_parameter_file(fname, ['kcats'])['kcats']

        n_updates = 0
        for idx, row in df_reaction_kcats.iterrows():
            rid = row['rid']
            eid = row.get('enzyme', 'unspec')
            # if type(row['genes']) is str and len(row['genes']) > 1:
            #     eid = 'enz_' + '_'.join(sorted([locus.strip() for locus in row['genes'].split(',')]))
            # else:
            #     eid = 'unspec'
            if rid in self.reactions:
                r = self.reactions[rid]
                n_updates += r.set_kcat(eid, row['dirxn'], row['kcat_per_s'])
        return n_updates

    def set_enzyme_kcats(self, df_enz_kcats):
        """Set isoenzyme specific kcat values.

        sets / overwrites isoenzyme specific kcat values on reactions.

        We only update kcats, if respective reaction exists in given direction
        df_enz_kcats contains kcat for selected enzymatic reactions:
            index: 'loci', enzyme composition in terms of gene loci, e.g. 'b1263, b1264'
                comma separated
            columns:
                'dirxn': (1, -1) reaction direction forward/reverse
                'rid': specific reaction id, e.g. 'R_ANS' or np.nan/empty
                'kcat': kcat value in per second (float)

        :meta private:
        :param pandas.DataFrame df_enz_kcats: enzyme specific kcat values
        :return: number of kcat updates
        :rtype: int
        """
        n_updates = 0
        for loci, row in df_enz_kcats.iterrows():
            loci = str(loci)
            eid = 'enz_' + '_'.join(sorted([locus.strip() for locus in loci.split(',')]))
            if eid in self.enzymes:
                if type(row['rid']) is str and len(row['rid']) > 1:
                    rids = [row['rid']]
                else:
                    rids = self.enzymes[eid].rids
                for rid in rids:
                    r = self.reactions[rid]
                    n_updates += r.set_kcat(eid, row['dirxn'], row['kcat'])
        return n_updates

    def scale_enzyme_costs(self, df_scale_costs):
        """Scale isoenzyme cost for selected reactions.

        We only update kcats, if respective reaction exists in given direction
        If specified enzyme kcat value has not been defined yet,
         'unspec' value is used as reference

        df_scale_costs contains scale values for selected enzymatic reactions:
            index: 'rid', model reaction id (str), e.g. 'R_ANS'
            columns:
                'dirxn': (1, -1) reaction direction forward/reverse
                'enzyme': enzyme composition in terms of gene loci, comma separated
                    e.g. 'b1263, b1264', or np.nan/empty, if kcat is not isoenzyme specific
                'scale': scale value (float) (used to divide kcat)
        scale factor will be used as divisor for existing kcat value

        :meta private:
        :param pandas.DataFrame df_scale_costs: table with cost scale factor
        :return: number of kcat updates
        :rtype: int
        """
        n_updates = 0
        for rid, row in df_scale_costs.iterrows():
            if type(row['enzyme']) is str and len(row['enzyme']) > 1:
                eid = 'enz_' + '_'.join(sorted([locus.strip() for locus in row['enzyme'].split(',')]))
            else:
                eid = 'unspec'
            if rid in self.reactions:
                r = self.reactions[rid]
                n_updates += r.scale_kcat(eid, row['dirxn'], row['scale'])
        return n_updates

    # MODEL CONVERSIONS
    def get_fbc_bnd_pid(self, val, unit_id, proposed_pid, reuse=True):
        """Get parameter id for a given fbc bound value und unit type.

        Construct a new fbc bnd parameter, if the value is
        not found among existing parameters.

        If 'reuse' is set to False, a new parameter is created that will not be shared.

        Extends uids.

        :meta private:
        :param float val:
        :param str unit_id: unit id
        :param str proposed_pid: proposed parameter id that would be created
        :param bool reuse: (optional, default: True) Flag if existing parameter id with same value can be reused
        :return: parameter id for setting flux bound
        :rtype: str
        """
        valstr = val
        # if units have not been defined, we create them
        if unit_id not in self.fbc_shared_pids:
            if unit_id == 'umol_per_gDW':
                u_dict = {'id': unit_id, 'name': 'micromole per gram (dry weight)',
                          'units': ('kind=mole, exp=1.0, scale=-6, mult=1.0; '
                                    'kind=gram, exp=-1.0, scale=0, mult=1.0')}
            elif unit_id == 'mmol_per_gDW':
                u_dict = {'id': unit_id, 'name': 'millimole per gram (dry weight)',
                          'units': ('kind=mole, exp=1.0, scale=-3, mult=1.0; '
                                    'kind=gram, exp=-1.0, scale=0, mult=1.0')}
            elif unit_id == 'umol_per_gDWh':
                u_dict = {'id': unit_id,
                          'name': 'micromole per gram (dry weight) per hour',
                          'units': ('kind=mole, exp=1.0, scale=-6, mult=1.0; '
                                    'kind=gram, exp=-1.0, scale=0, mult=1.0; '
                                    'kind=second, exp=-1.0, scale=0, mult=3600.0')}
            elif unit_id == 'kJ_per_mol':
                u_dict = {'id': unit_id, 'name': 'kilo joule per mole',
                          'units': ('kind=joule, exp=1.0, scale=3, mult=1.0; '
                                    'kind=mole, exp=-1.0, scale=0, mult=1.0')}
            elif unit_id == 'mg_per_gDW':
                u_dict = {'id': unit_id, 'name': 'milligram per gram (dry weight)',
                          'units': ('kind=gram, exp=1.0, scale=-3, mult=1.0; '
                                    'kind=gram, exp=-1.0, scale=0, mult=1.0')}
            elif unit_id == 'fbc_dimensionless':
                u_dict = {'id': unit_id, 'name': 'dimensionless',
                          'units': 'kind=dimensionless, exp=1.0, scale=0, mult=1.0'}
            else:
                print('unsupported flux bound unit type, create unit in unit definition')
                return None
            self.add_unit_def(u_dict)
            self.fbc_shared_pids[unit_id] = {}

        if not reuse:
            p_dict = {'id': proposed_pid, 'name': proposed_pid, 'value': val, 'constant': True,
                      'sboterm': 'SBO:0000626', 'units': unit_id}
            self.parameters[proposed_pid] = SbmlParameter(pd.Series(p_dict, name=proposed_pid))
            self.parameters[proposed_pid].modify_attribute('reuse', False)
            return proposed_pid

        elif valstr in self.fbc_shared_pids[unit_id]:
            return self.fbc_shared_pids[unit_id][valstr]
        else:
            p_dict = {'id': proposed_pid, 'name': proposed_pid, 'value': val, 'constant': True,
                      'sboterm': 'SBO:0000626', 'units': unit_id}
            self.parameters[proposed_pid] = SbmlParameter(pd.Series(p_dict, name=proposed_pid))
            self.fbc_shared_pids[unit_id][valstr] = proposed_pid
            return proposed_pid

    def split_reversible_reaction(self, reaction):
        """Split reversible reaction into irreversible fwd and rev reactions.

        Original reaction becomes the forward reaction. Flux bounds
        are checked and reaction is made irreversible.
        Reverse reaction is newly created with original name + '_REV'.
        reactants/products are swapped and flux bounds inverted.

        Add newly created reverse reaction to the model reactions

        :meta private:
        :return: reverse reaction with reactants/bounds inverted
        :rtype: Reaction
        """
        r_dict = reaction.to_dict()
        rev_rid = f'{reaction.id}_REV'
        rev_r = SbmlReaction(pd.Series(r_dict, name=rev_rid), self.species)
        rev_r.orig_rid = r_dict['id']

        if hasattr(rev_r, 'name'):
            rev_r.name += ' (rev)'
        if hasattr(rev_r, 'metaid'):
            rev_r.metaid = f'meta_{rev_rid}'

        rev_r.reverse_substrates()
        rev_r.enzymes = reaction.enzymes
        rev_r.kcatsf = reaction.kcatsr
        rev_lb = -min(0.0, self.parameters[reaction.fbc_upper_bound].value)
        rev_ub = -min(0.0, self.parameters[reaction.fbc_lower_bound].value)
        lb_pid = self.get_fbc_bnd_pid(rev_lb, self.flux_uid, f'fbc_{rev_rid}_lb')
        ub_pid = self.get_fbc_bnd_pid(rev_ub, self.flux_uid, f'fbc_{rev_rid}_ub')
        rev_r.modify_bounds({'lb': lb_pid, 'ub': ub_pid})
        rev_r.reversible = False
        self.reactions[rev_rid] = rev_r

        # make forward reaction irreversible and check flux bounds
        zero_flux_bnd_pid = self.get_fbc_bnd_pid(0.0, self.flux_uid, f'zero_flux_bnd')
        if self.parameters[reaction.fbc_lower_bound].value < 0.0:
            reaction.modify_bounds({'lb': zero_flux_bnd_pid})
        if self.parameters[reaction.fbc_upper_bound].value < 0.0:
            reaction.modify_bounds({'ub': zero_flux_bnd_pid})
        reaction.reversible = False
        reaction.kcatsr = None

        return rev_r

    def _create_arm_reaction_old(self, orig_r):
        """Create optional arm reactions and connecting pseudo metabolites.

        Arm reactions is in series to several isoenzyme reactions. It controls
        total flux through these isoenzyme reactions that all originate from same
        original reaction.
        Isoenzyme reactions consume original reactant and produce a common pseudo metabolite
        Arm reactions consume the pseudo metabolite and produces original product.

        :param orig_r: reaction for which to produce an arm reaction and a pseudo metabolite
        :type orig_r: :class:`SbmlReaction`
        :return: arm reaction
        :rtype: :class:`SbmlReaction`
        """
        # add pseudo metabolite to connect iso reactions with arm reaction, exclude other constraints
        product_srefs = {sid: stoic for sid, stoic in orig_r.products.items()
                         if re.match('C_', sid) is None}
        cid = self.species[sorted(list(product_srefs))[0]].compartment
        ridx = re.sub('^R_', '', orig_r.orig_rid)
        # support reaction names having compartment id as postfix
        pmet_sid = f'M_pmet_{ridx}'
        if '_' not in ridx or ridx.rsplit('_', 1)[1] != cid:
            pmet_sid += f'_{cid}'
        pmet_name = f'pseudo metabolite for arm reaction of {ridx}'
        pmet_dict = {'id': pmet_sid, 'name': pmet_name, 'compartment': cid,
                     'hasOnlySubstanceUnits': False, 'boundaryCondition': False, 'constant': False}
        self.species[pmet_sid] = SbmlSpecies(pd.Series(pmet_dict, name=pmet_sid))

        # add arm reaction to control overall flux, based on original reaction
        if re.search('_REV$', orig_r.id):
            arm_rid = f"{re.sub('_REV$', '', orig_r.id)}_arm_REV"
        else:
            arm_rid = f"{orig_r.id}_arm"
        arm_r = SbmlReaction(pd.Series(orig_r.to_dict(), name=arm_rid), self.species)
        arm_r.orig_rid = orig_r.id

        if hasattr(arm_r, 'name'):
            arm_r.name += ' (arm)'
        if hasattr(arm_r, 'metaid'):
            arm_r.metaid = f'meta_{arm_rid}'

        # arm reaction is connected behind the enzyme reactions. It produces the original products
        arm_r.reactants = {pmet_sid: 1.0}
        arm_r.products = product_srefs
        arm_r.gene_product_assoc = None

        return arm_r

    def add_arm_reactions(self):
        """Add arm reactions to control combined flux of iso-reactions.

        Arm reactions can be added to combine flux of iso-reactions in forward/reverse direction.
        This introduces new reactions 'R_<rid>_arm' and 'R_<rid>_arm_REV'
        This introduces new pseudo metabolites 'M_pmet_<rid>_<cid>' and 'M_pmet_<rid>_REV<cid>'

        :meta private:
        """
        arm_data = defaultdict(list)
        for rid, r in self.reactions.items():
            if re.match(pf.V_, rid) is None:
                if re.match(r'.*_iso\d+', rid):
                    arm_rid = re.sub(r'_iso\d+', '_arm', rid)
                    arm_data[arm_rid].append(rid)

        reactions_to_add = {}
        for arm_rid, iso_rids in arm_data.items():
            iso_r = self.reactions[iso_rids[0]]

            # add pseudo metabolite to connect arm reaction to iso reactions
            product_srefs = {sid: stoic for sid, stoic in iso_r.products.items()
                             if re.match('C_', sid) is None}
            cid = self.species[sorted(list(product_srefs))[0]].compartment
            ridx = re.sub(f'(^{pf.R_})|(_arm)', '', arm_rid)

            # support reaction names having compartment id as postfix
            pmet_sid = f'M_pmet_{ridx}'
            if '_' not in ridx or ridx.rsplit('_', 1)[1] != cid:
                pmet_sid += f'_{cid}'
            pmet_name = f'pseudo metabolite for arm reaction of {ridx}'
            pmet_dict = {'id': pmet_sid, 'name': pmet_name, 'compartment': cid,
                         'hasOnlySubstanceUnits': False, 'boundaryCondition': False, 'constant': False}
            self.species[pmet_sid] = SbmlSpecies(pd.Series(pmet_dict, name=pmet_sid))

            # add arm reaction to control overall flux, based on original reaction
            arm_r = SbmlReaction(pd.Series(iso_r.to_dict(), name=arm_rid), self.species)
            arm_r.orig_rid = re.sub(r'_iso\d+', '', iso_r.id)
            if hasattr(arm_r, 'name'):
                arm_r.name = re.sub(r'iso\d+', 'arm', arm_r.name)
            if hasattr(arm_r, 'metaid'):
                arm_r.metaid = f'meta_{arm_rid}'

            # arm reaction is connected behind sub-reactions. It produces the original products.
            arm_r.reactants = {pmet_sid: 1.0}
            arm_r.products = product_srefs
            arm_r.gene_product_assoc = None
            reactions_to_add[arm_rid] = arm_r

            # connect iso reactions to arm reactions
            for iso_rid in iso_rids:
                iso_r = self.reactions[iso_rid]
                iso_r.products = {pmet_sid: 1.0}

        # add arm reactions to the model
        for new_rid, new_r in reactions_to_add.items():
            self.reactions[new_rid] = new_r
        print(f'{len(reactions_to_add):4d} arm reactions added, together with pseudo metabolites')

    def add_isoenzyme_reactions(self):
        """Add reactions catalyzed by isoenzymes.

        This will only affect reactions catalyzed by isoenzymes.
        We only split a reaction into iso-reactions (one reaction per catalyzing enzyme), if kcat value are defined.

        in non-TD constraint models, e.g. GECKO, a reversible FBA reaction catalyzed by several iso-enzymes will
        be split in reversible sub-reactions, one per isoenzyme. A subsequent call in ECM configuration can
        split these reversible sub-reactions in irreversible forward/reverse sub reactions

        TD configuration may split reversible FBA reactions catalyzed by several iso-enzymes in irreversible
        forward/reverse reactions. Subsequently, this function (add_isoenzyme_reactions) will split
        irreversible forward/reverse reactions in irreversible forward/reverse sub-reactions

        Integration with Thermodynamic FBA (TFA):
            TFA splits TD related reations in fwd/rev (_REV)
            TFA may also split reactions that in base model are not reversible
            TFA adds forward flux coupling constraint (C_FFC_<rid>) as product to fwd reaction
            TFA adds reverse flux coupling constraint (C_RFC_<rid>) as product or rev reaction
            new reaction ids: for reverse direction add _isox_ prior to _REV
            keep flux coupling constraints as products in _isox_ reactions

        reactions catalyzed by isoenzymes get split up
        - new reactions are created based on original reaction
        - several new iso reactions, each catalyzed by a single isoenzyme
        - reaction parameters are updated accordingly
        - original reaction is removed

        :meta private:
        """
        reactions_to_add = {}
        rids_to_del = []
        for rid, orig_r in self.reactions.items():
            if len(orig_r.enzymes) > 1:
                valid_kcatsf = sum([1 for kcat in orig_r.kcatsf if np.isfinite(kcat)])
                valid_kcatsr = (sum([1 for kcat in orig_r.kcatsr if np.isfinite(kcat)])
                                if orig_r.kcatsr is not None else 0)
                if valid_kcatsf + valid_kcatsr > 0:
                    orig_r_dict = orig_r.to_dict()
                    rids_to_del.append(rid)
                    # add iso reactions, one per isoenzyme
                    # support iso reactions after reaction has been split by TD configuration
                    for idx, eid in enumerate(orig_r.enzymes):
                        if re.search('_REV$', rid):
                            iso_rid = re.sub('_REV$', f'_iso{idx+1}_REV', rid)
                        else:
                            iso_rid = f"{rid}_iso{idx + 1}"
                        iso_r = SbmlReaction(pd.Series(orig_r_dict, name=iso_rid), self.species)
                        iso_r.orig_rid = rid
                        if hasattr(iso_r, 'name'):
                            iso_r.name += f' (iso{idx+1})'
                        if hasattr(iso_r, 'metaid'):
                            iso_r.metaid = f'meta_{iso_rid}'
                        enz = self.enzymes[eid]
                        gpa = ' and '.join(sorted([self.uid2gp[self.locus2uid[locus]] for locus in enz.composition]))
                        if ' and ' in gpa:
                            gpa = '(' + gpa + ')'
                        iso_r.gene_product_assoc = gpa
                        iso_r.enzymes = [eid]
                        iso_r.kcatsf = [orig_r.kcatsf[idx]] if orig_r.kcatsf is not None else None
                        if orig_r.reversible:
                            iso_r.kcatsr = [orig_r.kcatsr[idx]] if orig_r.kcatsr is not None else None
                        reactions_to_add[iso_rid] = iso_r

        # add iso reactions to the model
        for new_rid, new_r in reactions_to_add.items():
            self.reactions[new_rid] = new_r

        # remove original reaction no longer required
        for orig_rid in rids_to_del:
            del self.reactions[orig_rid]

    def make_irreversible(self):
        """Split enzyme catalyzed reactions into irreversible reactions.

        for enzyme catalyzed reactions that are already irreversible,
        check that flux bounds are >= 0.0 and configure kcat value from
        kcatsf

        For enzyme catalyzed reversible reactions add a new reaction
        with postfix '_REV', switch reactants/products and inverse flux
        bounds (create new flux bound parameters if required).

        Result: all enzyme catalyzed reactions are made irreversible.
        Reversible reactions of the original model will be split in fwd/rev
        Flux bound are checked and reversible flag set to 'False'

        :meta private:
        """
        # Note: self.reactions gets modified in the loop, therefore iterate though initial list of reactions
        zero_flux_bnd_pid = self.get_fbc_bnd_pid(0.0, self.flux_uid, f'zero_flux_bnd')
        rids = list(self.reactions)
        for rid in rids:
            r = self.reactions[rid]
            if len(r.enzymes) > 0:
                if r.reversible and r.kcatsf is not None and np.isfinite(r.kcatsf[0]):
                    rev_r = self.split_reversible_reaction(r)
                    r.kcat = r.kcatsf[0] if r.kcatsf is not None and np.isfinite(r.kcatsf[0]) else None
                    rev_r.kcat = rev_r.kcatsf[0] if rev_r.kcatsf is not None and np.isfinite(rev_r.kcatsf[0]) else None
                else:
                    # ensure that the irreversible reaction has correct flux bounds
                    if self.parameters[r.fbc_lower_bound].value < 0.0:
                        r.modify_bounds({'lb': zero_flux_bnd_pid})
                    if self.parameters[r.fbc_upper_bound].value < 0.0:
                        r.modify_bounds({'ub': zero_flux_bnd_pid})
                    r.kcat = None if r.kcatsf is None or not np.isfinite(r.kcatsf[0]) else r.kcatsf[0]

    def remove_unused_gps(self):
        """Remove unused genes, proteins, enzymes from the model

        :meta private:
        """
        _, _, used_gpids = self._get_used_sids_pids_gpids()
        unused_gpids = set(self.gps).difference(used_gpids)

        used_eids = set()
        for r in self.reactions.values():
            for eid in r.enzymes:
                used_eids.add(eid)
        unused_eids = set(self.enzymes).difference(used_eids)

        used_uids = set()
        for eid in used_eids:
            for locus in self.enzymes[eid].composition:
                used_uids.add(self.locus2uid[locus])
        unused_uids = set(self.proteins).difference(used_uids)

        for gpid in unused_gpids:
            del self.gps[gpid]
        for eid in unused_eids:
            del self.enzymes[eid]
        for uid in unused_uids:
            del self.proteins[uid]

    def get_sref_data(self, srefs):
        """For given species references of a reaction extract weight data

        Determine molecular weight, based on species formula
        Determine mg_per_gDW based on stoichiometry

        :meta private:
        :param dict(str, float) srefs: species references for either reaction reactants or products
        :return: species reference data collected
        :rtype: pandas.DataFrame
        """
        sref_data = {}
        for sid, mmol_per_gDW in srefs.items():
            s = self.species[sid]
            name = s.name
            formula = s.formula
            g_per_mol = np.nan
            mg_per_gdw = np.nan
            if type(formula) is str:
                g_per_mol = calc_mw_from_formula(formula)
                mg_per_gdw = mmol_per_gDW * g_per_mol
            sref_data[sid] = [name, formula, g_per_mol, mmol_per_gDW, mg_per_gdw]
        cols = ['name', 'formula', 'g_per_mol', 'mmol_per_gDW', 'mg_per_gDW']
        df_sref_data = pd.DataFrame(sref_data.values(), index=list(sref_data), columns=cols)
        df_sref_data.index.name = 'id'
        return df_sref_data

    def get_biomass_data(self, rid):
        """For given (biomass) reaction extract reactant/product data.

        Determine molecular weight, based on species formula
        Determine mg_per_gDW based on stoichiometry

        :meta private:
        :param str rid: (biomass) reaction id of the model
        :return: reactant and product data
        :rtype: two pandas DataFrames
        """
        r = self.reactions[rid]
        df_reactants = self.get_sref_data(r.reactants)
        df_products = self.get_sref_data(r.products)
        return df_reactants, df_products
Source code for f2xba.xba_model.xba_model

f2xba

Navigation

Related Topics