"""Functionality for setting up and running an entire ABFE calculation,
consisting of two legs (bound and unbound) and multiple stages."""
__all__ = ["Calculation"]
import logging as _logging
import os as _os
import time as _time
from pathlib import Path as _Path
from typing import List as _List
from typing import Optional as _Optional
from ..configuration import EngineType as _EngineType
from ..configuration import SlurmConfig as _SlurmConfig
from ..configuration import _BaseSystemPreparationConfig, _EngineConfig
from ..configuration.enums import PreparationStage as _PreparationStage
from ._simulation_runner import SimulationRunner as _SimulationRunner
from .leg import Leg as _Leg
[docs]class Calculation(_SimulationRunner):
"""
Class to set up and run an entire ABFE calculation, consisting of two legs
(bound and unbound) and multiple stages.
"""
required_input_files = [
"protein.pdb",
"ligand.sdf",
] # Waters.pdb is optional
[docs] def __init__(
self,
equil_detection: str = "multiwindow",
runtime_constant: _Optional[float] = 0.0005,
relative_simulation_cost: float = 1,
ensemble_size: int = 5,
input_dir: _Optional[str] = None,
base_dir: _Optional[str] = None,
stream_log_level: int = _logging.INFO,
slurm_config: _Optional[_SlurmConfig] = None,
analysis_slurm_config: _Optional[_SlurmConfig] = None,
engine_config: _Optional[_EngineConfig] = None,
engine_type: _EngineType = _EngineType.SOMD,
update_paths: bool = True,
) -> None:
"""
Instantiate a calculation based on files in the input dir. If calculation.pkl exists in the
base directory, the calculation will be loaded from this file and any arguments
supplied will be overwritten.
Parameters
----------
equil_detection : str, Optional, default: "multiwindow"
Method to use for equilibration detection. Options are:
- "multiwindow": Use the multiwindow paired t-test method to detect equilibration.
This is applied on a per-stage basis.
- "chodera": Use Chodera's method to detect equilibration.
runtime_constant: float, Optional, default: 0.0005
The runtime_constant (kcal**2 mol**-2 ns*-1) only affects behaviour if running adaptively, and must
be supplied if running adaptively. This is used to calculate how long to run each simulation for based on
the current uncertainty of the per-window free energy estimate, as discussed in the docstring of the run() method.
relative_simlation_cost : float, Optional, default: 1
The relative cost of the simulation for a given runtime. This is used to calculate the
predicted optimal runtime during adaptive simulations. The recommended use
is to set this to 1 for the bound leg and to (speed of bound leg / speed of free leg)
for the free leg.
ensemble_size : int, Optional, default: 5
Number of simulations to run in the ensemble.
base_dir : str, Optional, default: None
Path to the base directory in which to set up the legs and stages. If None,
this is set to the current working directory.
input_dir : str, Optional, default: None
Path to directory containing input files for the simulations. If None, this
is set to `current_working_directory/input`.
stream_log_level : int, Optional, default: logging.INFO
Logging level to use for the steam file handlers for the
calculation object and its child objects.
slurm_config: SlurmConfig, default: None
Configuration for the SLURM job scheduler. If None, the
default partition is used.
analysis_slurm_config: SlurmConfig, default: None
Configuration for the SLURM job scheduler for the analysis.
This is helpful e.g. if you want to submit analysis to the CPU
partition, but the main simulation to the GPU partition. If None,
engine_config: EngineConfig, default: None
Configuration for the engine. If None, the default configuration is used.
engine_type: EngineType, default: EngineType.SOMD
The type of engine to use for the production simulations.
update_paths: bool, Optional, default: True
If True, if the simulation runner is loaded by unpickling, then
update_paths() is called.
Returns
-------
None
"""
super().__init__(
base_dir=base_dir,
input_dir=input_dir,
output_dir=None,
stream_log_level=stream_log_level,
ensemble_size=ensemble_size,
update_paths=update_paths,
slurm_config=slurm_config,
analysis_slurm_config=analysis_slurm_config,
engine_config=engine_config.copy() if engine_config else None,
engine_type=engine_type,
dump=False,
)
if not self.loaded_from_pickle:
self.equil_detection = equil_detection
self.runtime_constant = runtime_constant
self.relative_simulation_cost = relative_simulation_cost
self.setup_complete: bool = False
# Save the state and update log
self._update_log()
self._dump()
@property
def legs(self) -> _List[_Leg]:
return self._sub_sim_runners
@legs.setter
def legs(self, value) -> None:
self._logger.info("Modifying/ creating legs")
self._sub_sim_runners = value
def _validate_input(
self,
sysprep_config: _BaseSystemPreparationConfig,
) -> None:
"""Check that the required input files are present in the input directory."""
# Check backwards, as we care about the most advanced preparation stage
for prep_stage in reversed(_PreparationStage):
files_absent = False
for leg_type in sysprep_config.required_legs:
for file in _Leg.required_input_files[leg_type][prep_stage]:
if not _os.path.isfile(f"{self.input_dir}/{file}"):
files_absent = True
# We have the required files for this prep stage for both legs, and this is the most
# advanced prep stage that files are present for
if not files_absent:
self._prep_stage = prep_stage
self._logger.info(
f"Found all required input files for preparation stage {prep_stage.name.lower()}"
)
return
# We didn't find all required files for any of the prep stages
raise ValueError(
f"Could not find all required input files for "
f"any preparation stage. Required files are: "
f"{[_Leg.required_input_files[leg] for leg in sysprep_config.required_legs]}"
)
@property
def prep_stage(self) -> _PreparationStage:
if self.legs:
min_prep_stage = _PreparationStage.PREEQUILIBRATED
for leg in self.legs:
min_prep_stage = min(
[min_prep_stage, leg.prep_stage], key=lambda x: x.value
)
self._prep_stage = min_prep_stage
# Lazy initialization: if not set, infer from input files
if not hasattr(self, "_prep_stage"):
self._validate_input(self.engine_type.system_prep_config())
return self._prep_stage
@property
def is_complete(self) -> bool:
f"""Whether the {self.__class__.__name__} has completed."""
# Check if the overall_stats.dat file exists
if _Path(f"{self.output_dir}/overall_stats.dat").is_file():
return True
return False
[docs] def setup(
self,
sysprep_config: _Optional[_BaseSystemPreparationConfig] = None,
) -> None:
"""
Set up the calculation. This involves parametrising, equilibrating, and
deriving restraints for the bound leg. Most of the work is done by the
Leg class.
Parameters
----------
sysprep_config: BaseSystemPreparationConfig, optional, default = None
The system preparation configuration to use for all legs. The required legs
and stages will be determined from this configuration. If None, the default
configuration is used.
"""
if self.setup_complete:
self._logger.info("Setup already complete. Skipping...")
return
# Validate the input
if sysprep_config is None:
sysprep_config = self.engine_type.system_prep_config()
self._validate_input(sysprep_config)
self._logger.info("Starting calculation setup...")
setup_start = _time.time()
# Set up the legs
self.legs = []
for leg_type in reversed(sysprep_config.required_legs):
self._logger.info(f"Setting up {leg_type.name.lower()} leg...")
leg_start = _time.time()
leg = _Leg(
leg_type=leg_type,
equil_detection=self.equil_detection,
runtime_constant=self.runtime_constant,
relative_simulation_cost=self.relative_simulation_cost,
ensemble_size=self.ensemble_size,
input_dir=self.input_dir,
base_dir=_os.path.join(self.base_dir, leg_type.name.lower()),
stream_log_level=self.stream_log_level,
slurm_config=self.slurm_config,
analysis_slurm_config=self.analysis_slurm_config,
engine_config=self.engine_config,
engine_type=self.engine_type,
)
self.legs.append(leg)
leg.setup(sysprep_config)
self._logger.debug(
f"Completed {leg_type.name.lower()} leg setup in {_time.time() - leg_start:.2f}s"
)
total_time = _time.time() - setup_start
self._logger.info(f"Calculation setup completed in {total_time:.2f}s")
# Save the state
self.setup_complete = True
self._dump()
[docs] def get_optimal_lam_vals(
self,
simtime: float = 0.1,
er_type: str = "root_var",
delta_er: float = 2,
set_relative_sim_cost: bool = True,
reference_sim_cost: float = 0.21,
run_nos: _List[int] = [1],
) -> None:
"""
Determine the optimal lambda windows for each stage of the calculation
by running short simulations at each lambda value and analysing them. This
also sets the relative_simulation_effieciency of the free leg simulation
runners (relative to the bound leg, which is set to 1).
Parameters
----------
simtime : float, Optional, default: 0.1
The length of the short simulations to run, in ns.
er_type: str, optional, default="root_var"
Whether to integrate the standard error of the mean ("sem") or root
variance of the gradients ("root_var") to calculate the optimal
lambda values.
delta_er : float, default=2
If er_type == "root_var", the desired integrated root variance of the gradients
between each lambda value, in kcal mol^(-1). If er_type == "sem", the
desired integrated standard error of the mean of the gradients between each lambda
value, in kcal mol^(-1) ns^(1/2). A sensible default for root_var is 2 kcal mol-1,
and 0,1 kcal mol-1 ns^(1/2) for sem. This is referred to as 'thermodynamic speed'
in the publication.
set_relative_sim_cost: bool, optional, default=True
Whether to recursively set the relative simulation cost for the leg and all
sub simulation runners according to the mean simulation cost of the leg.
reference_sim_cost: float, optional, default=0.16
The reference simulation cost to use if set_relative_sim_cost is True, in hr / ns.
The default of 0.21 is the average bound leg simulation cost from a test set of ligands
of a range of system sizes on RTX 2080s. This is used to set the relative simulation
cost according to average_sim_cost / reference_sim_cost.
run_nos : List[int], optional, default=[1]
The run numbers to use for the calculation. Only 1 is run by default, so by default
we only analyse 1. If using delta_er == "sem", more than one run must be specified.
Returns
-------
None
"""
# Check that we have more than one run if using delta_er == "sem"
if er_type == "sem" and len(run_nos) == 1:
raise ValueError(
"If using er_type = 'sem', more than one run must be specified, as the "
"SEM is calculated using between-run errors by default."
)
# First, run all the simulations for a 100 ps
self._logger.info(
f"Running simulations for {simtime} ns to determine optimal lambda values..."
)
self.run(adaptive=False, runtime=simtime, run_nos=run_nos)
self.wait()
# Then, determine the optimal lambda windows
self._logger.info(
f"Determining optimal lambda values for each leg with er_type = {er_type} and delta_er = {delta_er}..."
)
costs = {}
for leg in self.legs:
# Set simtime = None to avoid running any more simulations
cost = leg.get_optimal_lam_vals(
simtime=None,
er_type=er_type,
delta_er=delta_er,
set_relative_sim_cost=set_relative_sim_cost,
reference_sim_cost=reference_sim_cost,
run_nos=run_nos,
)
costs[leg.leg_type] = cost
# Save state
self._dump()
[docs] def run(
self,
run_nos: _Optional[_List[int]] = None,
adaptive: bool = True,
runtime: _Optional[float] = None,
runtime_constant: _Optional[float] = None,
parallel: bool = True,
) -> None:
"""
Run all stages and perform analysis once finished. If running adaptively,
cycles of short runs then optimal runtime estimation are performed, where the optimal
runtime is estimated according to
.. math::
t_{\\mathrm{Optimal, k}} = \\sqrt{\\frac{t_{\\mathrm{Current}, k}}{C}}\\sigma_{\\mathrm{Current}}(\\Delta \\widehat{F}_k)
where:
- :math:`t_{\\mathrm{Optimal, k}}` is the calculated optimal runtime for lambda window :math:`k`
- :math:`t_{\\mathrm{Current}, k}` is the current runtime for lambda window :math:`k`
- :math:`C` is the runtime constant
- :math:`\\sigma_{\\mathrm{Current}}(\\Delta \\widehat{F}_k)` is the current uncertainty in the free energy change contribution for lambda window :math:`k`. This is estimated from inter-run deviations.
- :math:`\\Delta \\widehat{F}_k` is the free energy change contribution for lambda window :math:`k`
Parameters
----------
run_nos : List[int], Optional, default: None
List of run numbers to run. If None, all runs will be run.
adaptive : bool, Optional, default: True
If True, the stages will run until the simulations are equilibrated and perform analysis afterwards.
If False, the stages will run for the specified runtime and analysis will not be performed.
runtime : float, Optional, default: None
If adaptive is False, runtime must be supplied and stage will run for this number of nanoseconds.
runtime_constant: float, Optional, default: None
The runtime_constant (kcal**2 mol**-2 ns*-1) only affects behaviour if running adaptively. This is used
to calculate how long to run each simulation for based on the current uncertainty of the per-window
free energy estimate.
parallel : bool, Optional, default: True
If True, the stages will run in parallel. If False, the stages will run sequentially.
Returns
-------
None
"""
if not self.setup_complete:
raise ValueError(
"The calculation has not been set up yet. Please call setup() first."
)
if runtime_constant:
self.recursively_set_attr("runtime_constant", runtime_constant)
super().run(
run_nos=run_nos, adaptive=adaptive, runtime=runtime, parallel=parallel
)