Source code for f2xba.utils.sgko_utils

"""Implementation of utility for single gene knockout analysis.

Peter Schubert, HHU Duesseldorf, November 2025
"""

import numpy as np
import pandas as pd

# Lookup table for the confusion-matrix category of a single item:
# the outer key is the actual classification, the inner key the predicted one.
pred_outcome = {
    True: {True: 'tp', False: 'fn'},
    False: {True: 'fp', False: 'tn'},
}



[docs]
def confusion_matrix(act_classification, pred_classification):
    """Build a 2x2 confusion matrix from actual and predicted classifications.

    Only items present in both dictionaries are evaluated; items that appear in
    just one of them are ignored. The returned dictionary holds the sets of items
    per category (`tp`, `fn`, `fp`, `tn`), the derived statistics (`recall`,
    `precision`, `specificity`, `accuracy`, `mcc`) and the confusion matrix `cm`
    as a pandas DataFrame (rows: actual class, columns: predicted class).
    A statistic whose denominator is zero is reported as NaN.

    Example: Perform single gene deletion simulation (using gurobipy interface) and plot confusion matrix.
    `keio_ess` and `keio_red` hold lists of genes that are considered essential/redundant for selected condition.

    .. code-block:: python

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()
        pred = confusion_matrix(act_classification, pred_classification)

        print('recall:', pred['recall'])
        pred['cm']

    :param dict(str, bool) act_classification: actual classifications
    :param dict(str, bool) pred_classification: predicted classifications
    :return: prediction results
    :rtype: dict
    """
    categories = ('tp', 'fn', 'fp', 'tn')
    items_by_cat = {cat: set() for cat in categories}

    # sort every commonly known item into its confusion-matrix category
    for item, is_pred_pos in pred_classification.items():
        if item not in act_classification:
            continue
        is_act_pos = act_classification[item]
        items_by_cat[pred_outcome[is_act_pos][is_pred_pos]].add(item)

    tp, fn, fp, tn = (len(items_by_cat[cat]) for cat in categories)

    # marginal totals of the confusion matrix
    n_act_pos = tp + fn
    n_act_neg = fp + tn
    n_pred_pos = tp + fp
    n_pred_neg = fn + tn
    n_total = tp + fn + fp + tn
    mcc_denom_squared = n_pred_pos * n_pred_neg * n_act_pos * n_act_neg

    # statistics default to NaN and are only computed for non-zero denominators
    recall = precision = specificity = accuracy = mcc = np.nan
    if n_act_pos > 0:
        recall = tp / n_act_pos
    if n_pred_pos > 0:
        precision = tp / n_pred_pos
    if n_act_neg > 0:
        specificity = tn / n_act_neg
    if n_total > 0:
        accuracy = (tp + tn) / n_total
    if mcc_denom_squared > 0:
        mcc = (tp * tn - fp * fn) / np.sqrt(mcc_denom_squared)

    counts = [[tp, fn],
              [fp, tn]]
    df_cm = pd.DataFrame(counts, index=['act_pos', 'act_neg'], columns=['pred_pos', 'pred_neg'], dtype=int)

    pred_results = dict(items_by_cat)
    pred_results.update({'recall': recall, 'precision': precision, 'specificity': specificity,
                         'accuracy': accuracy, 'mcc': mcc, 'cm': df_cm})
    return pred_results




[docs]
def export_gene_predictions(pred_results, exp_fitness, pred_fitness, pred_status, uniprot_data, exp_mpmf, fname=None):
    """Export gene predictions with additional information.

    Using the structure returned by `confusion_matrix()` a table is generated, indexed by gene id
    and sorted by gene id. Each gene listed in the `tp`, `fn`, `fp` and `tn` sets of `pred_results`
    becomes one row. The table will be written to an Excel file, if `fname` is provided.
    Table contains additional data, extracted from information provided in the parameters.
    Values missing in `pred_fitness`, `pred_status`, `exp_fitness` or `exp_mpmf` are left empty.

    For gene essentiality analysis, set parameter `exp_fitness` to {} (or None).

    Example: Perform single gene deletion simulation (using gurobipy interface) and export prediction results.
    `keio_ess` and `keio_red` hold lists of genes that are considered essential/redundant for selected condition.
    `df_mpmf` contains proteomics data for reference. Uniprot data is collected for the organism in question.

    .. code-block:: python

        from f2xba.uniprot.uniprot_data import UniprotData

        uniprot_data = UniprotData(83333, 'data_refs')

        eo = EcmOptimization('iML1515_GECKO.xml')
        eo.medium = {rid: 1000.0 for rid in lb_medium}
        df_sgko = eo.single_gene_deletion()

        act_classification = {gene: False for gene in keio_red}
        act_classification.update({gene: True for gene in keio_ess})
        pred_classification = (df_sgko['fitness'] < 0.05).to_dict()
        pred = confusion_matrix(act_classification, pred_classification)

        pred_fitness = df_sgko['fitness'].to_dict()
        pred_status = df_sgko['status'].to_dict()
        exp_mpmf = df_mpmf['LB'].to_dict()
        fname = 'essentiality_predictions.xlsx'
        df_predictions = export_gene_predictions(pred, {}, pred_fitness, pred_status, uniprot_data, exp_mpmf, fname)

    :param dict pred_results: SGKO prediction results generated by confusion_matrix()
    :param dict(str, float) exp_fitness: fitness data from experiment, if available, otherwise {} or None
    :param dict(str, float) pred_fitness: fitness data determined from SGKO analysis
    :param dict(str, str) pred_status: optimization status of SGKO predictions
    :param uniprot_data: instance containing UniProt protein data for given model/organism
    :type uniprot_data: :class:`UniprotData`
    :param dict(str, float) exp_mpmf: experimental values of protein mass fractions in mg/g, or None
    :param str fname: (optional) file name of the Excel spreadsheet (`.xlsx`) to be written
    :return: table with detailed prediction data
    :rtype: pandas.DataFrame
    :raises KeyError: if a gene has no entry in `uniprot_data.locus2uid` or its UniProt id
        has no entry in `uniprot_data.proteins` (assuming both are plain dicts)
    """
    # Optional experimental data: accept None as "not available". An explicit None check is used
    # (instead of truthiness) so that any mapping-like object offering `.get()` keeps working.
    if exp_fitness is None:
        exp_fitness = {}
    if exp_mpmf is None:
        exp_mpmf = {}

    cols = ['gene_name', 'prediction', 'pred_fitness', 'pred_status', 'exp_fitness', 'mpmf',
            'uid', 'aa_len', 'description', 'go_processes']
    data = {}
    for pred_cat in ['fp', 'fn', 'tp', 'tn']:
        for gene in pred_results[pred_cat]:
            # map gene locus to its UniProt record to collect protein annotation
            uid = uniprot_data.locus2uid[gene]
            prot = uniprot_data.proteins[uid]
            data[gene] = [prot.gene_name, pred_cat, pred_fitness.get(gene), pred_status.get(gene),
                          exp_fitness.get(gene),
                          exp_mpmf.get(gene), uid, prot.length, prot.protein_name, '; '.join(prot.go_processes)]
    df_predictions = pd.DataFrame(list(data.values()), index=list(data), columns=cols)
    df_predictions.sort_index(inplace=True)
    df_predictions.index.name = 'gene'

    # write predictions results to file
    if fname:
        with pd.ExcelWriter(fname) as writer:
            df_predictions.to_excel(writer, sheet_name='SGKO predictions')
        # report success only after the writer has been closed, i.e. the file has actually been saved
        print(f'prediction results written to {fname}')
    return df_predictions
Source code for f2xba.utils.sgko_utils

f2xba

Navigation

Related Topics