Source code for f2xba.biocyc.biocyc_data

"""Implementation of BiocycData class.

Peter Schubert, CCB, HHU Duesseldorf, November 2022
"""

import os
import pandas as pd
import urllib.parse
import urllib.request

from .biocyc_gene import BiocycGene
from .biocyc_protein import BiocycProtein
from .biocyc_rna import BiocycRNA

api_url = 'https://websvc.biocyc.org/xmlquery'


[docs] class BiocycData: """Access to enzyme related data from BioCyc online database. Access BioCyc data via BioVelo query, see https://biocyc.org/web-services.shtml Enzyme constraint and resource balance constraint models require information on enzyme composition during model construction. By default, enzymes will be composed of one copy per gene product, derived for the reaction gene product reaction rule configuration of the SBML model. Alternatively, enzyme composition can be retrieved from BioCyc online resource or loaded from an enzyme composition configuration file. Enzyme composition derived from BioCyc would be suitable for highly curated organism databases, like E. coli K-12. Access to BioCyc resources requires a paid BioCyc subscription, depending on organism. Initially, enzyme composition data could be retrieved from BioCyc and exported to file using XbaModel.export_enz_composition(). The enzyme composition could be manually adjusted and used for subsequent model creations. The organism in the BioCyc database is referenced by `org_prefix`. In order to use enzyme composition data from Biocyc, use configuration data in the XBA configuration file, sheet `general`. The parameter `biocyc_org_prefix` references the organism in BioCyc, the parameter `organism_dir` specifies the local download directory. Delete locally stored BioCyc files to enforce a retrieval from the online database. Enzyme composition can be loaded from file by configuring the parameter `enzyme_comp_fname` in the XBA parameter file, sheet `general`. Example: Access enzyme configuration for E. coli K-12 MG1655 strain (org_prefix: ecoli). .. code-block:: python from f2xba.biocyc.biocyc_data import BiocycData biocyc_data = BiocycData('ecoli', 'data_refs/biocyc') gene = 'b0928' bc_gene = biocyc_data.locus2gene[gene] bc_protein = biocyc_data.genes[bc_gene].proteins[0] biocyc_data.proteins[bc_protein].__dict__ :param str org_prefix: BioCyc organism reference :param str biocyc_dir: directory name, where downloads of BioCyc are stored """ def __init__(self, org_prefix, biocyc_dir): """Instantiate BiocycData Instance For a given organism, query BioCyc on-line database for specific components in suitable detail level. Subsequently, extract relevant information. Use already downloaded BioCyc exports in case they exist in biocyc_dir. :param str org_prefix: BioCyc organism reference :param str biocyc_dir: directory name, where downloads of BioCyc are stored """ self.prefix_lc = org_prefix.lower() self.prefix_uc = org_prefix.upper() + ':' self.url = api_url self.biocyc_dir = biocyc_dir # Exports with required level of detail. self.biocyc_data = {'Gene': ['genes', 'full'], 'Protein': ['proteins', 'low'], 'RNA': ['RNAs', 'low']} if not self._is_complete_biocyc_data(): self._retrieve_biocyc_data() self.genes = BiocycGene.get_genes(self._biocyc_data_fname('Gene')) """BioCyc gene related data, referenced by BioCyc gene id.""" self.locus2gene = {gene.locus: bc_id for bc_id, gene in self.genes.items()} """Map gene id (locus) to BioCyc gene id.""" self.proteins = BiocycProtein.get_proteins(self._biocyc_data_fname('Protein')) """BioCyc protein related data, referenced by BioCyc protein id.""" self.rnas = BiocycRNA.get_rnas(self._biocyc_data_fname('RNA')) """BioCyc RNA related data, referenced by BioCyc RNA id.""" # set gene locus on direct gene product and rnas for protein in self.proteins.values(): if type(protein.gene) is str and protein.gene in self.genes: protein.gene_composition = {self.genes[protein.gene].locus: 1.0} for rna in self.rnas.values(): if type(rna.gene) is str and rna.gene in self.genes: rna.gene_composition = {self.genes[rna.gene].locus: 1.0} # iteratively configure gene composition for protein_id, p in self.proteins.items(): if len(p.protein_parts) > 0: p.gene_composition = self.get_gene_composition(protein_id) @staticmethod def _add_composition(tot_composition, composition, p_stoic): for item, stoic in composition.items(): if item not in tot_composition: tot_composition[item] = 0 tot_composition[item] += stoic * p_stoic return tot_composition def add_simple_proteins(self, add_proteins): """Add simple proteins (i.e. direct gene products). Proteins have a unique Protein id, e.g. 'GATC-MONOMER' and parameters, supplied in a dict Protein instance is created, configured and added to dict of existing proteins in BioCyc model. Gene needs to exist. Gene is identified by gene locus, e.g. 'b2092' Gene record is updated with new protein id :meta private: :param dict(str, dict) add_proteins: proteins with configuration data :return: number of added proteins :rtype: int """ n_updates = 0 for pid, data in add_proteins.items(): if 'locus' in data: gid = self.locus2gene[data['locus']] p = BiocycProtein(pid) p.gene = gid p.gene_composition = {data['locus']: 1.0} self.proteins[pid] = p self.modify_proteins({pid: data}) if pid not in self.genes[gid].proteins: self.genes[gid].proteins.append(pid) n_updates += 1 return n_updates def modify_proteins(self, protein_updates): """Modify configuration of proteins. Proteins have a unique Protein id, e.g. 'CPLX0-231' :meta private: :param dict(str, dict) protein_updates: proteins with configuration data :return: number of added proteins :rtype: int """ n_updates = 0 for pid, data in protein_updates.items(): if pid in self.proteins: p = self.proteins[pid] for key, val in data.items(): if hasattr(p, key): setattr(p, key, val) n_updates += 1 return n_updates
[docs] def get_gene_composition(self, protein_id): """Retrieve gene composition of an enzyme (BioCyc protein). .. code-block:: python biocyc_data.get_gene_composition('ASPAMINOTRANS-MONOMER') :param str protein_id: BioCyc protein identifier :return: gene composition of enzyme :rtype: dict(str, float) """ gene_composition = {} p = self.proteins[protein_id] if len(p.gene_composition) > 0: # in case a protein is a direct gene product, its composition contains the gene reference. # create new dict with gene composition to avoid side effects gene_composition = {gene: stoic for gene, stoic in p.gene_composition.items()} else: # retrieve composition information for a protein complex (iteratively) for p_part_id, p_stoic in p.protein_parts.items(): composition = self.get_gene_composition(p_part_id) gene_composition = self._add_composition(gene_composition, composition, p_stoic) # add rna gene composition for rna_id, p_stoic in p.rna_parts.items(): composition = self.rnas[rna_id].gene_composition gene_composition = self._add_composition(gene_composition, composition, p_stoic) return gene_composition
def _biocyc_data_fname(self, component): class_name, detail = self.biocyc_data[component] return os.path.join(self.biocyc_dir, f'biocyc_{class_name}_{detail}.xml') def _is_complete_biocyc_data(self): exports_available = True for component in self.biocyc_data: file_name = self._biocyc_data_fname(component) if not os.path.exists(file_name): exports_available = False print(f'{file_name} does not exist.') return exports_available def _retrieve_biocyc_data(self): """Retrieve data from BioCyc site using REST API using BioVelo query, see https://biocyc.org/web-services.shtml """ for component in self.biocyc_data: file_name = self._biocyc_data_fname(component) if not os.path.exists(file_name): class_name, detail = self.biocyc_data[component] print('retrieve ' + class_name + ' from BioCyc at level ' + detail) base_url = urllib.parse.urlparse(self.url) biovelo_qry = f'[x: x<-{self.prefix_lc}^^{class_name}]' query = urllib.parse.quote(biovelo_qry) + '&detail=' + detail split_url = base_url._replace(query=query) url = urllib.parse.urlunparse(split_url) req = urllib.request.Request(url) with urllib.request.urlopen(req) as response, open(file_name, 'w') as file: file.write(response.read().decode('utf-8')) print(f'{file_name} from BioCyc retrieved') def _get_protein_components(self, protein_id): """Determine components of a given protein/enzyme. recursively parse through proteins and extract compounds and loci for proteins, RNAs. All with stoichiometry """ components = {'proteins': {}, 'rnas': {}} # 'compounds': {}} gene = self.proteins[protein_id].gene if gene is not None: locus = self.genes[gene].locus components['proteins'] = {locus: 1.0} else: for rna_part in self.proteins[protein_id].rna_parts: rna, stoic_str = rna_part.split(':') gene = self.rnas[rna].gene locus = self.genes[gene].locus stoic = int(stoic_str) if locus not in components['rnas']: components['rnas'][locus] = stoic else: components['rnas'][locus] += stoic for protein_part in self.proteins[protein_id].protein_parts: protein, stoic_str = protein_part.split(':') stoic = int(stoic_str) sub_components = self._get_protein_components(protein) # update components: for part_type in sub_components: for component in sub_components[part_type]: if component not in components[part_type]: components[part_type][component] = stoic * sub_components[part_type][component] else: components[part_type][component] += stoic * sub_components[part_type][component] return components
[docs] def export_enzyme_composition(self, fname): """Export BioCyc enzyme composition to Excel spreadsheet. .. code-block:: python biocyc_data.export_enzyme_composition('BioCyc_enz_composition.xlsx') :param str fname: file name of export file with extension '.xlsx' """ enz_comp = {} for enz_id, enz in self.proteins.items(): if len(enz.gene_composition) > 0 and len(enz.enzrxns) > 0: gene_comp = '; '.join([f'gene={gene}, stoic={stoic}' for gene, stoic in enz.gene_composition.items()]) enz_comp[enz_id] = [enz.name, enz.synonyms, gene_comp] df_enz_comp = pd.DataFrame(enz_comp.values(), index=list(enz_comp), columns=['name', 'synonyms', 'genes']) df_enz_comp.index.name = 'biocyc_id' with pd.ExcelWriter(fname) as writer: df_enz_comp.to_excel(writer, sheet_name='composition') print(f'{len(df_enz_comp)} enzyme compositions written to {fname}')