Source code for a3fe.analyse.analyse_set

"""Functionality to analyse a set of calculations and compare the result with experiment"""

__all__ = [
    "compute_stats",
    "get_bootstrapped_results",
    "compute_statistic",
]

from typing import Dict as _Dict
from typing import List as _List
from typing import Tuple as _Tuple

import numpy as _np
import pandas as _pd
from scipy import stats as _stats
from sklearn import metrics as _metrics


[docs]def compute_stats(all_results: _pd.DataFrame) -> _Dict[str, _List[float]]: """ Compute statistics for the passed results, generating 95 % C.I.s by bootstrapping. Parameters ---------- all_results : pd.DataFrame The dataframe containing all results. Returns ------- Dict[str, List[float]] A dictionary of the computed statistics, and their upper and lower confidence bounds. """ # This will hold metric: [value, upper_err, lower_er] results = {"r": [], "r2": [], "mue": [], "rmse": [], "rho": [], "tau": []} # Get the bootstrapped results n_bootstrap = 1000 bootstrapped_exp_dg, bootstrapped_calc_dg = get_bootstrapped_results( all_results=all_results, n_bootstrap=n_bootstrap ) # For each metric, calculate i) over the actual data ii) overall bootstrapped data and extract stats for metric in results: results[metric].append( compute_statistic(all_results["exp_dg"], all_results["calc_dg"], metric) ) bootstrapped_metric = _np.zeros([n_bootstrap]) for i in range(n_bootstrap): bootstrapped_metric[i] = compute_statistic( bootstrapped_exp_dg[i], bootstrapped_calc_dg[i], metric ) percentiles = _np.percentile(bootstrapped_metric, [5, 95]) # 95 % C.I.s results[metric].append(percentiles[0]) results[metric].append(percentiles[1]) return results
[docs]def get_bootstrapped_results( all_results: _pd.DataFrame, n_bootstrap: int = 1000 ) -> _Tuple[_np.ndarray, _np.ndarray]: """ Return n_bootstrap bootstrapped versions of the original experimental and calculated free energies. Parameters ---------- all_results : pd.DataFrame The dataframe containing all results. n_bootstrap : int, optional, default = 1000 Number of boostrap iterations to perform Returns ------- boostrapped_exp_dg: np.ndarray The bootstrapped experimental free energy changes bootstrapped_calc_dg: np_ndarray The bootstrapped calculated free energy changes """ exp_dg = all_results["exp_dg"] calc_dg = all_results["calc_dg"] exp_sem = all_results["exp_er"] / 1.96 calc_sem = all_results["calc_er"] / 1.96 # Check that the data passed are of the same length if len(exp_dg) != len(calc_dg): raise ValueError( "The lengths of the calculated and experimental free energy values must match" ) n_samples = len(exp_dg) bootstrapped_exp_dg = _np.zeros([n_bootstrap, n_samples]) bootstrapped_calc_dg = _np.zeros([n_bootstrap, n_samples]) for i in range(n_bootstrap): # Ensure we use same indices for the experimental and calculated results to avoid mixing # results indices = _np.random.choice(_np.arange(n_samples), size=n_samples, replace=True) bootstrapped_exp_dg[i] = _np.array( [_np.random.normal(loc=exp_dg[i], scale=exp_sem[i]) for i in indices] ) bootstrapped_calc_dg[i] = _np.array( [_np.random.normal(loc=calc_dg[i], scale=calc_sem[i]) for i in indices] ) return bootstrapped_exp_dg, bootstrapped_calc_dg
[docs]def compute_statistic(exp_dg: _pd.Series, calc_dg: _pd.Series, statistic: str) -> float: """ Compute the desired statistic for one set of experimental and calculated values. Parameters ---------- exp_dg : pd.Series The experimental free energies calc_dg : pd.Series The calculated free energies statistic : str The desired statistic to be calculated, from "r", "mue", "rmse" "rho", or "tau". Returns ------- float The desired statistic. """ # Check that requested metric is implemented allowed_stats = ["r", "r2", "mue", "rmse", "rho", "tau"] if statistic not in allowed_stats: raise ValueError( f"Statistic must be one of {allowed_stats} but was {statistic}" ) if statistic == "r": return _stats.pearsonr(exp_dg, calc_dg)[0] if statistic == "r2": m, c, r, p, sem = _stats.linregress(exp_dg, calc_dg) return r**2 if statistic == "mue": return _metrics.mean_absolute_error(exp_dg, calc_dg) if statistic == "rmse": return _np.sqrt(_metrics.mean_squared_error(exp_dg, calc_dg)) if statistic == "rho": return _stats.spearmanr(exp_dg, calc_dg)[0] if statistic == "tau": return _stats.kendalltau(exp_dg, calc_dg)[0]