Source code for f2xba.biocyc.biocyc_data

"""Implementation of BiocycData class.

Peter Schubert, CCB, HHU Duesseldorf, November 2022
"""

import os
import pandas as pd
import urllib.parse
import urllib.request

from .biocyc_gene import BiocycGene
from .biocyc_protein import BiocycProtein
from .biocyc_rna import BiocycRNA

api_url = 'https://websvc.biocyc.org/xmlquery'



[docs]
class BiocycData:
    """Access to enzyme related data from BioCyc online database.

    Access BioCyc data via BioVelo query, see https://biocyc.org/web-services.shtml

    Enzyme constraint and resource balance constraint models require information on enzyme composition
    during model construction. By default, enzymes will be composed of one copy per gene product, derived
    for the reaction gene product reaction rule configuration of the SBML model. Alternatively, enzyme
    composition can be retrieved from BioCyc online resource or loaded from an
    enzyme composition configuration file.

    Enzyme composition derived from BioCyc would be suitable for highly curated organism databases,
    like E. coli K-12. Access to BioCyc resources requires a paid BioCyc subscription, depending on organism.

    Initially, enzyme composition data could be retrieved from BioCyc and exported to file
    using XbaModel.export_enz_composition(). The enzyme composition could be manually adjusted and
    used for subsequent model creations.

    The organism in the BioCyc database is referenced by `org_prefix`.
    In order to use enzyme composition data from Biocyc, use configuration data in the XBA configuration file,
    sheet `general`. The parameter `biocyc_org_prefix` references the organism in BioCyc, the parameter
    `organism_dir` specifies the local download directory.
    Delete locally stored BioCyc files to enforce a retrieval from the online database.
    Enzyme composition can be loaded from file by configuring the parameter `enzyme_comp_fname` in the
    XBA parameter file, sheet `general`.

    Example: Access enzyme configuration for E. coli K-12 MG1655 strain (org_prefix: ecoli).

        .. code-block:: python

            from f2xba.biocyc.biocyc_data import BiocycData

            biocyc_data = BiocycData('ecoli', 'data_refs/biocyc')

            gene = 'b0928'
            bc_gene = biocyc_data.locus2gene[gene]
            bc_protein = biocyc_data.genes[bc_gene].proteins[0]
            biocyc_data.proteins[bc_protein].__dict__

    :param str org_prefix: BioCyc organism reference
    :param str biocyc_dir: directory name, where downloads of BioCyc are stored
    """

    def __init__(self, org_prefix, biocyc_dir):
        """Instantiate BiocycData Instance

        For a given organism, query BioCyc on-line database for specific components in
         suitable detail level. Subsequently, extract relevant information.
        Use already downloaded BioCyc exports in case they exist in biocyc_dir.

        :param str org_prefix: BioCyc organism reference
        :param str biocyc_dir: directory name, where downloads of BioCyc are stored
        """

        self.prefix_lc = org_prefix.lower()
        self.prefix_uc = org_prefix.upper() + ':'
        self.url = api_url
        self.biocyc_dir = biocyc_dir

        # Exports with required level of detail.
        self.biocyc_data = {'Gene': ['genes', 'full'],
                            'Protein': ['proteins', 'low'],
                            'RNA': ['RNAs', 'low']}

        if not self._is_complete_biocyc_data():
            self._retrieve_biocyc_data()

        self.genes = BiocycGene.get_genes(self._biocyc_data_fname('Gene'))
        """BioCyc gene related data, referenced by BioCyc gene id."""

        self.locus2gene = {gene.locus: bc_id for bc_id, gene in self.genes.items()}
        """Map gene id (locus) to BioCyc gene id."""

        self.proteins = BiocycProtein.get_proteins(self._biocyc_data_fname('Protein'))
        """BioCyc protein related data, referenced by BioCyc protein id."""

        self.rnas = BiocycRNA.get_rnas(self._biocyc_data_fname('RNA'))
        """BioCyc RNA related data, referenced by BioCyc RNA id."""

        # set gene locus on direct gene product and rnas
        for protein in self.proteins.values():
            if type(protein.gene) is str and protein.gene in self.genes:
                protein.gene_composition = {self.genes[protein.gene].locus: 1.0}
        for rna in self.rnas.values():
            if type(rna.gene) is str and rna.gene in self.genes:
                rna.gene_composition = {self.genes[rna.gene].locus: 1.0}

        # iteratively configure gene composition
        for protein_id, p in self.proteins.items():
            if len(p.protein_parts) > 0:
                p.gene_composition = self.get_gene_composition(protein_id)

    @staticmethod
    def _add_composition(tot_composition, composition, p_stoic):
        for item, stoic in composition.items():
            if item not in tot_composition:
                tot_composition[item] = 0
            tot_composition[item] += stoic * p_stoic
        return tot_composition

    def add_simple_proteins(self, add_proteins):
        """Add simple proteins (i.e. direct gene products).

        Proteins have a unique Protein id, e.g. 'GATC-MONOMER'
        and parameters, supplied in a dict
        Protein instance is created, configured and added to dict of
        existing proteins in BioCyc model.
        Gene needs to exist.
        Gene is identified by gene locus, e.g. 'b2092'
        Gene record is updated with new protein id

        :meta private:
        :param dict(str, dict) add_proteins: proteins with configuration data
        :return: number of added proteins
        :rtype: int
        """
        n_updates = 0
        for pid, data in add_proteins.items():
            if 'locus' in data:
                gid = self.locus2gene[data['locus']]
                p = BiocycProtein(pid)
                p.gene = gid
                p.gene_composition = {data['locus']: 1.0}
                self.proteins[pid] = p
                self.modify_proteins({pid: data})
                if pid not in self.genes[gid].proteins:
                    self.genes[gid].proteins.append(pid)
                n_updates += 1
        return n_updates

    def modify_proteins(self, protein_updates):
        """Modify configuration of proteins.

        Proteins have a unique Protein id, e.g. 'CPLX0-231'

        :meta private:
        :param dict(str, dict) protein_updates: proteins with configuration data
        :return: number of added proteins
        :rtype: int
        """
        n_updates = 0
        for pid, data in protein_updates.items():
            if pid in self.proteins:
                p = self.proteins[pid]
                for key, val in data.items():
                    if hasattr(p, key):
                        setattr(p, key, val)
                n_updates += 1
        return n_updates


[docs]
    def get_gene_composition(self, protein_id):
        """Retrieve gene composition of an enzyme (BioCyc protein).

        .. code-block:: python

            biocyc_data.get_gene_composition('ASPAMINOTRANS-MONOMER')

        :param str protein_id: BioCyc protein identifier
        :return: gene composition of enzyme
        :rtype: dict(str, float)
        """
        gene_composition = {}
        p = self.proteins[protein_id]

        if len(p.gene_composition) > 0:
            # in case a protein is a direct gene product, its composition contains the gene reference.
            # create new dict with gene composition to avoid side effects
            gene_composition = {gene: stoic for gene, stoic in p.gene_composition.items()}
        else:
            # retrieve composition information for a protein complex (iteratively)
            for p_part_id, p_stoic in p.protein_parts.items():
                composition = self.get_gene_composition(p_part_id)
                gene_composition = self._add_composition(gene_composition, composition, p_stoic)
            # add rna gene composition
            for rna_id, p_stoic in p.rna_parts.items():
                composition = self.rnas[rna_id].gene_composition
                gene_composition = self._add_composition(gene_composition, composition, p_stoic)
        return gene_composition


    def _biocyc_data_fname(self, component):
        class_name, detail = self.biocyc_data[component]
        return os.path.join(self.biocyc_dir, f'biocyc_{class_name}_{detail}.xml')

    def _is_complete_biocyc_data(self):
        exports_available = True
        for component in self.biocyc_data:
            file_name = self._biocyc_data_fname(component)
            if not os.path.exists(file_name):
                exports_available = False
                print(f'{file_name} does not exist.')
        return exports_available

    def _retrieve_biocyc_data(self):
        """Retrieve data from BioCyc site using REST API

        using BioVelo query, see https://biocyc.org/web-services.shtml
        """
        for component in self.biocyc_data:
            file_name = self._biocyc_data_fname(component)
            if not os.path.exists(file_name):
                class_name, detail = self.biocyc_data[component]
                print('retrieve ' + class_name + ' from BioCyc at level ' + detail)
                base_url = urllib.parse.urlparse(self.url)
                biovelo_qry = f'[x: x<-{self.prefix_lc}^^{class_name}]'
                query = urllib.parse.quote(biovelo_qry) + '&detail=' + detail
                split_url = base_url._replace(query=query)
                url = urllib.parse.urlunparse(split_url)
                req = urllib.request.Request(url)

                with urllib.request.urlopen(req) as response, open(file_name, 'w') as file:
                    file.write(response.read().decode('utf-8'))
                print(f'{file_name} from BioCyc retrieved')

    def _get_protein_components(self, protein_id):
        """Determine components of a given protein/enzyme.

        recursively parse through proteins and extract compounds
        and loci for proteins, RNAs. All with stoichiometry
        """
        components = {'proteins': {}, 'rnas': {}}   # 'compounds': {}}
        gene = self.proteins[protein_id].gene
        if gene is not None:
            locus = self.genes[gene].locus
            components['proteins'] = {locus: 1.0}
        else:
            for rna_part in self.proteins[protein_id].rna_parts:
                rna, stoic_str = rna_part.split(':')
                gene = self.rnas[rna].gene
                locus = self.genes[gene].locus
                stoic = int(stoic_str)
                if locus not in components['rnas']:
                    components['rnas'][locus] = stoic
                else:
                    components['rnas'][locus] += stoic
            for protein_part in self.proteins[protein_id].protein_parts:
                protein, stoic_str = protein_part.split(':')
                stoic = int(stoic_str)
                sub_components = self._get_protein_components(protein)
                # update components:
                for part_type in sub_components:
                    for component in sub_components[part_type]:
                        if component not in components[part_type]:
                            components[part_type][component] = stoic * sub_components[part_type][component]
                        else:
                            components[part_type][component] += stoic * sub_components[part_type][component]
        return components


[docs]
    def export_enzyme_composition(self, fname):
        """Export BioCyc enzyme composition to Excel spreadsheet.

        .. code-block:: python

            biocyc_data.export_enzyme_composition('BioCyc_enz_composition.xlsx')

        :param str fname: file name of export file with extension '.xlsx'
        """
        enz_comp = {}
        for enz_id, enz in self.proteins.items():
            if len(enz.gene_composition) > 0 and len(enz.enzrxns) > 0:
                gene_comp = '; '.join([f'gene={gene}, stoic={stoic}'
                                       for gene, stoic in enz.gene_composition.items()])
                enz_comp[enz_id] = [enz.name, enz.synonyms, gene_comp]

        df_enz_comp = pd.DataFrame(enz_comp.values(), index=list(enz_comp), columns=['name', 'synonyms', 'genes'])
        df_enz_comp.index.name = 'biocyc_id'

        with pd.ExcelWriter(fname) as writer:
            df_enz_comp.to_excel(writer, sheet_name='composition')
            print(f'{len(df_enz_comp)} enzyme compositions written to {fname}')
Source code for f2xba.biocyc.biocyc_data

f2xba

Navigation

Related Topics