# Source code for f2xba.utils.sgko_utils

"""Implementation of utility for single gene knockout analysis.

Peter Schubert, HHU Duesseldorf, November 2025
"""

import numpy as np
import pandas as pd

# Lookup table used by confusion_matrix():
# pred_outcome[actual_class][predicted_class] -> confusion matrix category.
pred_outcome = {True: {True: 'tp', False: 'fn'},
                False: {True: 'fp', False: 'tn'}}


def confusion_matrix(act_classification, pred_classification):
    """Create a 2D confusion matrix based on actual and predicted classifications.

    Statistics, sets of items and the confusion matrix are returned in a dictionary.
    Only items that appear in both `pred_classification` and `act_classification`
    are evaluated; all other items are ignored.

    The returned dictionary contains:

    - ``'tp'``, ``'fn'``, ``'fp'``, ``'tn'``: sets of item ids per category
    - ``'recall'``, ``'precision'``, ``'specificity'``, ``'accuracy'``, ``'mcc'``:
      statistics, set to ``np.nan`` whenever the respective denominator is zero
    - ``'cm'``: confusion matrix as pandas DataFrame with rows ``act_pos``/``act_neg``
      and columns ``pred_pos``/``pred_neg``

    Example: Perform single gene deletion simulation (using gurobipy interface) and
    plot confusion matrix. `keio_ess` and `keio_red` hold lists of genes that are
    considered essential/redundant for selected condition.

    .. code-block:: python

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()

        pred = confusion_matrix(act_classification, pred_classification)
        print('recall:', pred['recall'])
        pred['cm']

    :param dict(str, bool) act_classification: actual classifications
    :param dict(str, bool) pred_classification: predicted classifications
    :return: prediction results
    :rtype: dict
    """
    pred_results = {'tp': set(), 'fn': set(), 'fp': set(), 'tn': set(),
                    'recall': 0.0, 'precision': 0.0, 'specificity': 0.0,
                    'accuracy': 0.0, 'mcc': 0.0, 'cm': pd.DataFrame()}

    # assign each item having both an actual and a predicted class to its category
    for gene, pred_class in pred_classification.items():
        if gene in act_classification:
            prediction = pred_outcome[act_classification[gene]][pred_class]
            pred_results[prediction].add(gene)

    tp = len(pred_results['tp'])
    fn = len(pred_results['fn'])
    fp = len(pred_results['fp'])
    tn = len(pred_results['tn'])
    total = tp + fn + fp + tn

    pred_results['recall'] = tp / (tp + fn) if (tp + fn) > 0 else np.nan
    pred_results['precision'] = tp / (tp + fp) if (tp + fp) > 0 else np.nan
    pred_results['specificity'] = tn / (fp + tn) if (fp + tn) > 0 else np.nan
    pred_results['accuracy'] = (tp + tn) / total if total > 0 else np.nan

    # Matthews correlation coefficient. The product of the four margins is an
    # arbitrary-precision Python int; it is converted to float before np.sqrt(),
    # because np.sqrt() fails on Python ints beyond the 64-bit integer range
    # (reached with roughly 66k items per margin).
    mcc_denom = (tp + fp) * (fn + tn) * (tp + fn) * (fp + tn)
    if mcc_denom > 0:
        pred_results['mcc'] = (tp * tn - fp * fn) / np.sqrt(float(mcc_denom))
    else:
        pred_results['mcc'] = np.nan

    cm = [[tp, fn], [fp, tn]]
    pred_results['cm'] = pd.DataFrame(cm, index=['act_pos', 'act_neg'],
                                      columns=['pred_pos', 'pred_neg'], dtype=int)
    return pred_results
def export_gene_predictions(pred_results, exp_fitness, pred_fitness, pred_status,
                            uniprot_data, exp_mpmf, fname=None):
    """Export gene predictions with additional information.

    Using the structure returned by `confusion_matrix()` a table is generated,
    indexed by gene id and sorted by gene id. The table will be written to an
    Excel file, if `fname` is provided. Table contains additional data, extracted
    from information provided in the parameters.
    For gene essentiality analysis, set parameter `exp_fitness` to {}.

    Genes without a value in `exp_fitness`, `pred_fitness`, `pred_status` or
    `exp_mpmf` get an empty cell in the respective column. Genes without a record
    in `uniprot_data` are still exported; their protein related columns
    (gene_name, uid, aa_len, description, go_processes) are left empty.

    Example: Perform single gene deletion simulation (using gurobipy interface) and
    export prediction results. `keio_ess` and `keio_red` hold lists of genes that
    are considered essential/redundant for selected condition. `df_mpmf` contains
    proteomics data for reference. Uniprot data is collected for the organism in
    question.

    .. code-block:: python

        from f2xba.uniprot.uniprot_data import UniprotData

        uniprot_data = UniprotData(83333, 'data_refs')

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()
        pred = confusion_matrix(act_classification, pred_classification)

        pred_fitness = df_sgko['fitness'].to_dict()
        pred_status = df_sgko['status'].to_dict()
        exp_mpmf = df_mpmf['LB'].to_dict()
        fname = 'essentiality_predictions.xlsx'
        df_predictions = export_gene_predictions(pred, {}, pred_fitness, pred_status,
                                                 uniprot_data, exp_mpmf, fname)

    :param dict pred_results: SGKO prediction results generated by confusion_matrix()
    :param dict(str, float) exp_fitness: fitness data from experiment, if available, otherwise {}
    :param dict(str, float) pred_fitness: fitness data determined from SGKO analysis
    :param dict(str, str) pred_status: optimization status of SGKO predictions
    :param uniprot_data: instance containing UniProt protein data for given model/organism
    :type uniprot_data: :class:`UniprotData`
    :param dict(str, float) exp_mpmf: experimental values of protein mass fractions in mg/g
    :param str fname: (optional) Excel file name of spreadsheet with `.xlsx`
    :return: table with detailed prediction data
    :rtype: pandas.DataFrame
    """
    cols = ['gene_name', 'prediction', 'pred_fitness', 'pred_status', 'exp_fitness',
            'mpmf', 'uid', 'aa_len', 'description', 'go_processes']
    data = {}
    for pred_cat in ['fp', 'fn', 'tp', 'tn']:
        for gene in pred_results[pred_cat]:
            # A gene without UniProt record must not abort the export of all
            # other genes; its protein related fields simply stay empty.
            uid = None
            prot = None
            try:
                uid = uniprot_data.locus2uid[gene]
                prot = uniprot_data.proteins[uid]
            except KeyError:
                pass

            if prot is None:
                gene_name = aa_len = description = go_processes = None
            else:
                gene_name = prot.gene_name
                aa_len = prot.length
                description = prot.protein_name
                go_processes = '; '.join(prot.go_processes)

            data[gene] = [gene_name, pred_cat, pred_fitness.get(gene), pred_status.get(gene),
                          exp_fitness.get(gene), exp_mpmf.get(gene), uid, aa_len,
                          description, go_processes]

    df_predictions = pd.DataFrame(list(data.values()), index=list(data), columns=cols)
    df_predictions = df_predictions.sort_index()
    df_predictions.index.name = 'gene'

    # write predictions results to file
    if fname:
        with pd.ExcelWriter(fname) as writer:
            df_predictions.to_excel(writer, sheet_name='SGKO predictions')
            print(f'prediction results written to {fname}')
    return df_predictions