"""Implementation of utility for single gene knockout analysis.
Peter Schubert, HHU Duesseldorf, November 2025
"""
import numpy as np
import pandas as pd
# Maps (actual class, predicted class) to the confusion-matrix category.
# Outer key: actual classification, inner key: predicted classification.
pred_outcome = {True: {True: 'tp', False: 'fn'},
                False: {True: 'fp', False: 'tn'}}


def confusion_matrix(act_classification, pred_classification):
    """Create a 2D confusion matrix based on actual and predicted classifications.

    Statistics, sets of items and the confusion matrix are returned in a dictionary.
    Only items present in both `pred_classification` and `act_classification` are
    evaluated; all other items are ignored.

    Returned dictionary keys:

    - 'tp', 'fn', 'fp', 'tn': sets of item ids per prediction category
    - 'recall', 'precision', 'specificity', 'accuracy', 'mcc': statistics,
      `numpy.nan` when the respective denominator is zero
    - 'cm': confusion matrix as pandas DataFrame (rows: actual, columns: predicted)

    Example: Perform single gene deletion simulation (using gurobipy interface) and plot confusion matrix.
    `keio_ess` and `keio_red` hold lists of genes that are considered essential/redundant for selected condition.

    .. code-block:: python

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()

        pred = confusion_matrix(act_classification, pred_classification)
        print('recall:', pred['recall'])
        pred['cm']

    :param dict(str, bool) act_classification: actual classifications
    :param dict(str, bool) pred_classification: predicted classifications
    :return: prediction results
    :rtype: dict
    """
    pred_results = {'tp': set(), 'fn': set(), 'fp': set(), 'tn': set(),
                    'recall': 0.0, 'precision': 0.0, 'specificity': 0.0, 'accuracy': 0.0, 'mcc': 0.0,
                    'cm': pd.DataFrame()}

    # assign each item that has an actual classification to its category
    for gene, pred_class in pred_classification.items():
        if gene in act_classification:
            prediction = pred_outcome[act_classification[gene]][pred_class]
            pred_results[prediction].add(gene)

    tp = len(pred_results['tp'])
    fn = len(pred_results['fn'])
    fp = len(pred_results['fp'])
    tn = len(pred_results['tn'])
    total = tp + fn + fp + tn

    pred_results['recall'] = tp / (tp + fn) if (tp + fn) > 0 else np.nan
    pred_results['precision'] = tp / (tp + fp) if (tp + fp) > 0 else np.nan
    pred_results['specificity'] = tn / (fp + tn) if (fp + tn) > 0 else np.nan
    pred_results['accuracy'] = (tp + tn) / total if total > 0 else np.nan

    # Matthews correlation coefficient. The denominator is an arbitrary-size
    # Python int; it is converted to float before np.sqrt, because np.sqrt
    # raises a TypeError for Python ints beyond the 64-bit integer range.
    mcc_denominator = (tp + fp) * (fn + tn) * (tp + fn) * (fp + tn)
    if mcc_denominator > 0:
        pred_results['mcc'] = (tp * tn - fp * fn) / np.sqrt(float(mcc_denominator))
    else:
        pred_results['mcc'] = np.nan

    cm = [[tp, fn],
          [fp, tn]]
    pred_results['cm'] = pd.DataFrame(cm, index=['act_pos', 'act_neg'], columns=['pred_pos', 'pred_neg'], dtype=int)
    return pred_results
def export_gene_predictions(pred_results, exp_fitness, pred_fitness, pred_status, uniprot_data, exp_mpmf, fname=None):
    """Export gene predictions with additional information.

    Using the structure returned by `confusion_matrix()` a table is generated, indexed by gene id
    and sorted by gene id. The table will be written to an Excel file, if `fname` is provided.

    Table contains additional data, extracted from information provided in the parameters.
    Values not available for a gene in `pred_fitness`, `pred_status`, `exp_fitness` or `exp_mpmf`
    are left empty. Genes without a protein record in `uniprot_data` are still exported, with
    empty UniProt related columns.

    For gene essentiality analysis, set parameter `exp_fitness` to {}.

    Example: Perform single gene deletion simulation (using gurobipy interface) and export prediction results.
    `keio_ess` and `keio_red` hold lists of genes that are considered essential/redundant for selected condition.
    `df_mpmf` contains proteomics data for reference. Uniprot data is collected for the organism in question.

    .. code-block:: python

        from f2xba.uniprot.uniprot_data import UniprotData

        uniprot_data = UniprotData(83333, 'data_refs')

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()
        pred = confusion_matrix(act_classification, pred_classification)

        pred_fitness = df_sgko['fitness'].to_dict()
        pred_status = df_sgko['status'].to_dict()
        exp_mpmf = df_mpmf['LB'].to_dict()
        fname = 'essentiality_predictions.xlsx'
        df_predictions = export_gene_predictions(pred, {}, pred_fitness, pred_status, uniprot_data, exp_mpmf, fname)

    :param dict pred_results: SGKO prediction results generated by confusion_matrix()
    :param dict(str, float) exp_fitness: fitness data from experiment, if available, otherwise {}
    :param dict(str, float) pred_fitness: fitness data determined from SGKO analysis
    :param dict(str, str) pred_status: optimization status of SGKO predictions
    :param uniprot_data: instance containing UniProt protein data for given model/organism
    :type uniprot_data: :class:`UniprotData`
    :param dict(str, float) exp_mpmf: experimental values of protein mass fractions in mg/g
    :param str fname: (optional) Excel file name of spreadsheet with `.xlsx` extension
    :return: table with detailed prediction data
    :rtype: pandas.DataFrame
    """
    cols = ['gene_name', 'prediction', 'pred_fitness', 'pred_status', 'exp_fitness', 'mpmf',
            'uid', 'aa_len', 'description', 'go_processes']

    data = {}
    for pred_cat in ['fp', 'fn', 'tp', 'tn']:
        for gene in pred_results[pred_cat]:
            # a gene without UniProt record must not abort the export;
            # its UniProt related columns are left empty instead
            uid = uniprot_data.locus2uid.get(gene)
            prot = uniprot_data.proteins.get(uid)
            if prot is None:
                gene_name = aa_len = description = go_processes = None
            else:
                gene_name = prot.gene_name
                aa_len = prot.length
                description = prot.protein_name
                go_processes = '; '.join(prot.go_processes)

            data[gene] = [gene_name, pred_cat, pred_fitness.get(gene), pred_status.get(gene),
                          exp_fitness.get(gene), exp_mpmf.get(gene),
                          uid, aa_len, description, go_processes]

    df_predictions = pd.DataFrame(data.values(), index=list(data), columns=cols)
    df_predictions.sort_index(inplace=True)
    df_predictions.index.name = 'gene'

    # write predictions results to file
    if fname:
        with pd.ExcelWriter(fname) as writer:
            df_predictions.to_excel(writer, sheet_name='SGKO predictions')
        print(f'prediction results written to {fname}')
    return df_predictions