Source code for a3fe.run.calculation

"""Functionality for setting up and running an entire ABFE calculation,
consisting of two legs (bound and unbound) and multiple stages."""

__all__ = ["Calculation"]

import logging as _logging
import os as _os
import time as _time
from pathlib import Path as _Path
from typing import List as _List
from typing import Optional as _Optional

from ..configuration import EngineType as _EngineType
from ..configuration import SlurmConfig as _SlurmConfig
from ..configuration import _BaseSystemPreparationConfig, _EngineConfig
from ..configuration.enums import PreparationStage as _PreparationStage
from ._simulation_runner import SimulationRunner as _SimulationRunner
from .leg import Leg as _Leg


class Calculation(_SimulationRunner):
    """
    Class to set up and run an entire ABFE calculation, consisting of two
    legs (bound and unbound) and multiple stages.

    The legs are stored as the sub-simulation runners of this object and are
    created by :meth:`setup`.
    """

    required_input_files = [
        "protein.pdb",
        "ligand.sdf",
    ]  # Waters.pdb is optional

    def __init__(
        self,
        equil_detection: str = "multiwindow",
        runtime_constant: _Optional[float] = 0.0005,
        relative_simulation_cost: float = 1,
        ensemble_size: int = 5,
        input_dir: _Optional[str] = None,
        base_dir: _Optional[str] = None,
        stream_log_level: int = _logging.INFO,
        slurm_config: _Optional[_SlurmConfig] = None,
        analysis_slurm_config: _Optional[_SlurmConfig] = None,
        engine_config: _Optional[_EngineConfig] = None,
        engine_type: _EngineType = _EngineType.SOMD,
        update_paths: bool = True,
    ) -> None:
        """
        Instantiate a calculation based on files in the input dir. If
        calculation.pkl exists in the base directory, the calculation will be
        loaded from this file and any arguments supplied will be overwritten.

        Parameters
        ----------
        equil_detection : str, Optional, default: "multiwindow"
            Method to use for equilibration detection. Options are:
            - "multiwindow": Use the multiwindow paired t-test method to
              detect equilibration. This is applied on a per-stage basis.
            - "chodera": Use Chodera's method to detect equilibration.
        runtime_constant : float, Optional, default: 0.0005
            The runtime_constant (kcal**2 mol**-2 ns**-1) only affects
            behaviour if running adaptively, and must be supplied if running
            adaptively. This is used to calculate how long to run each
            simulation for based on the current uncertainty of the per-window
            free energy estimate, as discussed in the docstring of the run()
            method.
        relative_simulation_cost : float, Optional, default: 1
            The relative cost of the simulation for a given runtime. This is
            used to calculate the predicted optimal runtime during adaptive
            simulations. The recommended use is to set this to 1 for the bound
            leg and to (speed of bound leg / speed of free leg) for the free
            leg.
        ensemble_size : int, Optional, default: 5
            Number of simulations to run in the ensemble.
        input_dir : str, Optional, default: None
            Path to directory containing input files for the simulations. If
            None, this is set to `current_working_directory/input`.
        base_dir : str, Optional, default: None
            Path to the base directory in which to set up the legs and stages.
            If None, this is set to the current working directory.
        stream_log_level : int, Optional, default: logging.INFO
            Logging level to use for the stream handlers for the calculation
            object and its child objects.
        slurm_config : SlurmConfig, default: None
            Configuration for the SLURM job scheduler. If None, the default
            partition is used.
        analysis_slurm_config : SlurmConfig, default: None
            Configuration for the SLURM job scheduler for the analysis. This
            is helpful e.g. if you want to submit analysis to the CPU
            partition, but the main simulation to the GPU partition. If None,
            the choice is left to the base class (not visible in this module).
        engine_config : EngineConfig, default: None
            Configuration for the engine. If None, the default configuration
            is used. A copy is passed on, so the caller's object is not
            shared with the calculation.
        engine_type : EngineType, default: EngineType.SOMD
            The type of engine to use for the production simulations.
        update_paths : bool, Optional, default: True
            If True, if the simulation runner is loaded by unpickling, then
            update_paths() is called.

        Returns
        -------
        None
        """
        # dump=False: the state is saved below, once the attributes specific
        # to this class have been set.
        super().__init__(
            base_dir=base_dir,
            input_dir=input_dir,
            output_dir=None,
            stream_log_level=stream_log_level,
            ensemble_size=ensemble_size,
            update_paths=update_paths,
            slurm_config=slurm_config,
            analysis_slurm_config=analysis_slurm_config,
            engine_config=engine_config.copy() if engine_config else None,
            engine_type=engine_type,
            dump=False,
        )

        if not self.loaded_from_pickle:
            self.equil_detection = equil_detection
            self.runtime_constant = runtime_constant
            self.relative_simulation_cost = relative_simulation_cost
            self.setup_complete: bool = False

            # Save the state and update log.
            # NOTE(review): the extracted source lost its indentation; these
            # two calls are assumed to belong to the fresh-instance branch.
            # Confirm against the upstream file.
            self._update_log()
            self._dump()

    @property
    def legs(self) -> _List[_Leg]:
        """The legs of the calculation (stored as the sub-simulation runners)."""
        return self._sub_sim_runners

    @legs.setter
    def legs(self, value) -> None:
        self._logger.info("Modifying/ creating legs")
        self._sub_sim_runners = value

    def _validate_input(
        self,
        sysprep_config: _BaseSystemPreparationConfig,
    ) -> None:
        """
        Check that the required input files are present in the input
        directory, and record the most advanced preparation stage for which
        every required leg has all of its files in ``self._prep_stage``.

        Parameters
        ----------
        sysprep_config : BaseSystemPreparationConfig
            Configuration whose ``required_legs`` determine which files are
            looked for.

        Raises
        ------
        ValueError
            If no preparation stage has all of its required files present.
        """
        # Check backwards, as we care about the most advanced preparation stage
        for prep_stage in reversed(_PreparationStage):
            needed_files = [
                file
                for leg_type in sysprep_config.required_legs
                for file in _Leg.required_input_files[leg_type][prep_stage]
            ]
            # We have the required files for this prep stage for all required
            # legs, and this is the most advanced prep stage that files are
            # present for
            if all(
                _os.path.isfile(f"{self.input_dir}/{file}") for file in needed_files
            ):
                self._prep_stage = prep_stage
                self._logger.info(
                    f"Found all required input files for preparation stage {prep_stage.name.lower()}"
                )
                return

        # We didn't find all required files for any of the prep stages
        raise ValueError(
            f"Could not find all required input files for "
            f"any preparation stage. Required files are: "
            f"{[_Leg.required_input_files[leg] for leg in sysprep_config.required_legs]}"
        )

    @property
    def prep_stage(self) -> _PreparationStage:
        """
        The preparation stage of the calculation.

        If legs exist, this is the least advanced stage (lowest enum value)
        among the legs, capped at PREEQUILIBRATED. Otherwise the stage is
        inferred from the input files the first time it is requested. The
        result is cached in ``self._prep_stage``.
        """
        if self.legs:
            candidates = [_PreparationStage.PREEQUILIBRATED]
            candidates.extend(leg.prep_stage for leg in self.legs)
            self._prep_stage = min(candidates, key=lambda stage: stage.value)

        # Lazy initialization: if not set, infer from input files
        if not hasattr(self, "_prep_stage"):
            self._validate_input(self.engine_type.system_prep_config())

        return self._prep_stage

    @property
    def is_complete(self) -> bool:
        """Whether the calculation has completed.

        Completion is signalled by the presence of ``overall_stats.dat`` in
        the output directory.
        """
        # A plain string is used above on purpose: an f-string is never
        # treated as a docstring.
        return _Path(f"{self.output_dir}/overall_stats.dat").is_file()

    def setup(
        self,
        sysprep_config: _Optional[_BaseSystemPreparationConfig] = None,
    ) -> None:
        """
        Set up the calculation. This involves parametrising, equilibrating,
        and deriving restraints for the bound leg. Most of the work is done by
        the Leg class.

        Parameters
        ----------
        sysprep_config : BaseSystemPreparationConfig, optional, default = None
            The system preparation configuration to use for all legs. The
            required legs and stages will be determined from this
            configuration. If None, the default configuration for the engine
            type is used.

        Raises
        ------
        ValueError
            If the required input files cannot be found for any preparation
            stage.
        """
        if self.setup_complete:
            self._logger.info("Setup already complete. Skipping...")
            return

        # Validate the input
        if sysprep_config is None:
            sysprep_config = self.engine_type.system_prep_config()
        # NOTE(review): assumed to run for user-supplied configs too (the
        # extracted source lost its indentation). Confirm against upstream.
        self._validate_input(sysprep_config)

        self._logger.info("Starting calculation setup...")
        setup_start = _time.time()

        # Set up the legs
        self.legs = []
        for leg_type in reversed(sysprep_config.required_legs):
            leg_name = leg_type.name.lower()
            self._logger.info(f"Setting up {leg_name} leg...")
            leg_start = _time.time()
            leg = _Leg(
                leg_type=leg_type,
                equil_detection=self.equil_detection,
                runtime_constant=self.runtime_constant,
                relative_simulation_cost=self.relative_simulation_cost,
                ensemble_size=self.ensemble_size,
                input_dir=self.input_dir,
                base_dir=_os.path.join(self.base_dir, leg_name),
                stream_log_level=self.stream_log_level,
                slurm_config=self.slurm_config,
                analysis_slurm_config=self.analysis_slurm_config,
                engine_config=self.engine_config,
                engine_type=self.engine_type,
            )
            # Register the leg before setting it up, so that it is tracked by
            # the calculation even if its setup fails part-way through.
            self.legs.append(leg)
            leg.setup(sysprep_config)
            self._logger.debug(
                f"Completed {leg_name} leg setup in {_time.time() - leg_start:.2f}s"
            )

        total_time = _time.time() - setup_start
        self._logger.info(f"Calculation setup completed in {total_time:.2f}s")

        # Save the state
        self.setup_complete = True
        self._dump()

    def get_optimal_lam_vals(
        self,
        simtime: float = 0.1,
        er_type: str = "root_var",
        delta_er: float = 2,
        set_relative_sim_cost: bool = True,
        reference_sim_cost: float = 0.21,
        run_nos: _Optional[_List[int]] = None,
    ) -> None:
        """
        Determine the optimal lambda windows for each stage of the calculation
        by running short simulations at each lambda value and analysing them.
        This also sets the relative simulation cost of the free leg simulation
        runners (relative to the bound leg, which is set to 1).

        Parameters
        ----------
        simtime : float, Optional, default: 0.1
            The length of the short simulations to run, in ns.
        er_type : str, optional, default="root_var"
            Whether to integrate the standard error of the mean ("sem") or
            root variance of the gradients ("root_var") to calculate the
            optimal lambda values.
        delta_er : float, default=2
            If er_type == "root_var", the desired integrated root variance of
            the gradients between each lambda value, in kcal mol^(-1). If
            er_type == "sem", the desired integrated standard error of the
            mean of the gradients between each lambda value, in
            kcal mol^(-1) ns^(1/2). A sensible default for root_var is
            2 kcal mol-1, and 0.1 kcal mol-1 ns^(1/2) for sem. This is
            referred to as 'thermodynamic speed' in the publication.
        set_relative_sim_cost : bool, optional, default=True
            Whether to recursively set the relative simulation cost for the
            leg and all sub simulation runners according to the mean
            simulation cost of the leg.
        reference_sim_cost : float, optional, default=0.21
            The reference simulation cost to use if set_relative_sim_cost is
            True, in hr / ns. The default of 0.21 is the average bound leg
            simulation cost from a test set of ligands of a range of system
            sizes on RTX 2080s. This is used to set the relative simulation
            cost according to average_sim_cost / reference_sim_cost.
        run_nos : List[int], optional, default=None
            The run numbers to use for the calculation. If None, only run 1
            is used (only 1 is run by default, so by default we only analyse
            1). If using er_type == "sem", more than one run must be
            specified.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If er_type is "sem" and fewer than two runs are specified.
        """
        # None stands in for the historical default of [1]; this avoids
        # sharing one mutable default list between calls.
        if run_nos is None:
            run_nos = [1]

        # Check that we have more than one run if using er_type == "sem"
        if er_type == "sem" and len(run_nos) < 2:
            raise ValueError(
                "If using er_type = 'sem', more than one run must be specified, as the "
                "SEM is calculated using between-run errors by default."
            )

        # First, run all the simulations for simtime ns and wait for them
        self._logger.info(
            f"Running simulations for {simtime} ns to determine optimal lambda values..."
        )
        self.run(adaptive=False, runtime=simtime, run_nos=run_nos)
        self.wait()

        # Then, determine the optimal lambda windows
        self._logger.info(
            f"Determining optimal lambda values for each leg with er_type = {er_type} and delta_er = {delta_er}..."
        )
        for leg in self.legs:
            # Set simtime = None to avoid running any more simulations
            leg.get_optimal_lam_vals(
                simtime=None,
                er_type=er_type,
                delta_er=delta_er,
                set_relative_sim_cost=set_relative_sim_cost,
                reference_sim_cost=reference_sim_cost,
                run_nos=run_nos,
            )

        # Save state
        self._dump()

    def run(
        self,
        run_nos: _Optional[_List[int]] = None,
        adaptive: bool = True,
        runtime: _Optional[float] = None,
        runtime_constant: _Optional[float] = None,
        parallel: bool = True,
    ) -> None:
        """
        Run all stages and perform analysis once finished. If running
        adaptively, cycles of short runs then optimal runtime estimation are
        performed, where the optimal runtime is estimated according to

        .. math::

            t_{\\mathrm{Optimal, k}} = \\sqrt{\\frac{t_{\\mathrm{Current}, k}}{C}}\\sigma_{\\mathrm{Current}}(\\Delta \\widehat{F}_k)

        where:

        - :math:`t_{\\mathrm{Optimal, k}}` is the calculated optimal runtime
          for lambda window :math:`k`
        - :math:`t_{\\mathrm{Current}, k}` is the current runtime for lambda
          window :math:`k`
        - :math:`C` is the runtime constant
        - :math:`\\sigma_{\\mathrm{Current}}(\\Delta \\widehat{F}_k)` is the
          current uncertainty in the free energy change contribution for
          lambda window :math:`k`. This is estimated from inter-run
          deviations.
        - :math:`\\Delta \\widehat{F}_k` is the free energy change
          contribution for lambda window :math:`k`

        Parameters
        ----------
        run_nos : List[int], Optional, default: None
            List of run numbers to run. If None, all runs will be run.
        adaptive : bool, Optional, default: True
            If True, the stages will run until the simulations are
            equilibrated and perform analysis afterwards. If False, the stages
            will run for the specified runtime and analysis will not be
            performed.
        runtime : float, Optional, default: None
            If adaptive is False, runtime must be supplied and stage will run
            for this number of nanoseconds.
        runtime_constant : float, Optional, default: None
            The runtime_constant (kcal**2 mol**-2 ns**-1) only affects
            behaviour if running adaptively. This is used to calculate how
            long to run each simulation for based on the current uncertainty
            of the per-window free energy estimate. If None (or 0), the
            currently stored value is left unchanged.
        parallel : bool, Optional, default: True
            If True, the stages will run in parallel. If False, the stages
            will run sequentially.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If setup() has not been completed.
        """
        if not self.setup_complete:
            raise ValueError(
                "The calculation has not been set up yet. Please call setup() first."
            )

        # Truthiness check kept deliberately: a falsy constant (None or 0)
        # leaves the existing value on all simulation runners untouched.
        if runtime_constant:
            self.recursively_set_attr("runtime_constant", runtime_constant)

        super().run(
            run_nos=run_nos, adaptive=adaptive, runtime=runtime, parallel=parallel
        )