"""
Functionality for running mbar on SOMD output files. This uses
pymbar through SOMD
"""
__all__ = ["run_mbar"]
import glob as _glob
import os as _os
import subprocess as _subprocess
from time import sleep as _sleep
from typing import Dict as _Dict
from typing import List as _List
from typing import Tuple as _Tuple
import numpy as _np
from ..configuration import SlurmConfig as _SlurmConfig
from ..read._process_somd_files import read_mbar_gradients as _read_mbar_gradients
from ..read._process_somd_files import read_mbar_result as _read_mbar_result
from ..read._process_somd_files import (
write_truncated_sim_datafile as _write_truncated_sim_datafile,
)
from ..run._virtual_queue import Job as _Job
from ..run._virtual_queue import VirtualQueue as _VirtualQueue
[docs]def run_mbar(
output_dir: str,
run_nos: _List[int],
percentage_end: float = 100,
percentage_start: float = 0,
subsampling: bool = False,
delete_outfiles: bool = False,
equilibrated: bool = True,
) -> _Tuple[_np.ndarray, _np.ndarray, _List[str], _Dict[str, _Dict[str, _np.ndarray]]]:
"""
Run MBAR on SOMD output files.
Parameters
----------
output_dir : str
The path to the output directory
run_nos : List[int]
The run numbers to use for MBAR.
percentage_end : float, Optional, default: 100
The percentage of data after which to truncate the datafiles.
For example, if 100, the full datafile will be used. If 50, only
the first 50% of the data will be used.
percentage_start : float, Optional, default: 0
The percentage of data before which to truncate the datafiles.
For example, if 0, the full datafile will be used. If 50, only
the last 50% of the data will be used.
subsampling : bool, Optional, default: False
Whether to use subsampling for MBAR.
delete_outfiles : bool, Optional, default: False
Whether to delete the MBAR analysis output files after the free
energy change and errors have been extracted.
equilibrated : bool, Optional, default: True
Whether to use the equilibrated datafiles or the full datafiles.
If true, the files name simfile_equilibrated.dat will be used,
otherwise simfile.dat will be used.
Returns
-------
free_energies : np.ndarray
The free energies from each run, in kcal mol-1.
errors : np.ndarray
The mbar errors on the free energies from each run, in kcal mol-1.
mbar_out_files : List[str]
The paths to the MBAR output files.
mbar_grads: Dict[str, Dict[str, np.ndarray]]
The gradients of the free energies obtained from the MBAR results (not TI),
for each run.
"""
tmp_files = _prepare_simfiles(
output_dir=output_dir,
run_nos=run_nos,
percentage_end=percentage_end,
percentage_start=percentage_start,
equilibrated=equilibrated,
)
# Run MBAR using pymbar through SOMD
mbar_out_files = []
for run_no in run_nos:
outfile = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat"
mbar_out_files.append(outfile)
with open(outfile, "w") as ofile:
cmd_list = [
"analyse_freenrg",
"mbar",
"-i",
f"{output_dir}/lambda*/run_{str(run_no).zfill(2)}/simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
"-p",
"100",
"--overlap",
]
if subsampling:
cmd_list.append("--subsampling")
_subprocess.run(cmd_list, stdout=ofile)
free_energies = _np.array([_read_mbar_result(ofile)[0] for ofile in mbar_out_files])
errors = _np.array([_read_mbar_result(ofile)[1] for ofile in mbar_out_files])
# Get the gradients from the MBAR results
mbar_grads = {}
for i, run_no in enumerate(run_nos):
mbar_grads[f"run_{run_no}"] = {}
lam_vals, grads, grad_errors = _read_mbar_gradients(mbar_out_files[i])
mbar_grads[f"run_{run_no}"]["lam_vals"] = lam_vals
mbar_grads[f"run_{run_no}"]["grads"] = grads
mbar_grads[f"run_{run_no}"]["grad_errors"] = grad_errors
if delete_outfiles:
for ofile in mbar_out_files:
_subprocess.run(["rm", ofile])
mbar_out_files = []
# Clean up temporary simfiles
for tmp_file in tmp_files:
_subprocess.run(["rm", tmp_file])
return free_energies, errors, mbar_out_files, mbar_grads
def submit_mbar_slurm(
output_dir: str,
virtual_queue: _VirtualQueue,
slurm_config: _SlurmConfig,
run_nos: _List[int],
percentage_end: float = 100,
percentage_start: float = 0,
subsampling: bool = False,
equilibrated: bool = True,
wait: bool = False,
) -> _Tuple[_List[_Job], _List[str], _List[str]]:
"""
Submit slurm jobs to run MBAR on SOMD output files.
Parameters
----------
output_dir : str
The path to the output directory
virtual_queue : VirtualQueue
The virtual queue to submit the MBAR jobs to.
slurm_config: SlurmConfig
The SLURM configuration to use for the jobs.
run_nos : List[int]
The run numbers to use for MBAR.
percentage_end : float, Optional, default: 100
The percentage of data after which to truncate the datafiles.
For example, if 100, the full datafile will be used. If 50, only
the first 50% of the data will be used.
percentage_start : float, Optional, default: 0
The percentage of data before which to truncate the datafiles.
For example, if 0, the full datafile will be used. If 50, only
the last 50% of the data will be used.
subsampling : bool, Optional, default: False
Whether to use subsampling for MBAR.
equilibrated : bool, Optional, default: True
Whether to use the equilibrated datafiles or the full datafiles.
If true, the files name simfile_equilibrated.dat will be used,
otherwise simfile.dat will be used.
wait: bool, default: False
Whether to wait for the job to complete or not.
Returns
-------
jobs : List[Job]
The jobs submitted to the virtual queue.
mbar_out_files : List[str]
The paths to the MBAR output files, which will be created
once the jobs complete.
tmp_files : List[str]
The paths to temporary files (truncated simfiles and submission scripts),
so that they can be cleaned up later.
"""
tmp_files = _prepare_simfiles(
output_dir=output_dir,
run_nos=run_nos,
percentage_end=percentage_end,
percentage_start=percentage_start,
equilibrated=equilibrated,
)
# Add MBAR command and run for each run.
mbar_out_files = []
jobs = []
for run_no in run_nos:
# Get the name of the output files - the first for the MBAR output, and the second for the SLURM output
outfile = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat"
mbar_out_files.append(outfile)
slurm_outfile = f"slurm_freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.out"
slurm_config.output = slurm_outfile
# Create the command.
cmd_list = [
"analyse_freenrg",
"mbar",
"-i",
f"{output_dir}/lambda*/run_{str(run_no).zfill(2)}/simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
"-p",
"100",
"--overlap",
"--output",
outfile,
]
if subsampling:
cmd_list.append("--subsampling")
slurm_cmd = " ".join(cmd_list)
# Create and submit the job
script_name = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start"
submission_args = slurm_config.get_submission_cmds(
slurm_cmd, output_dir, script_name
)
job = virtual_queue.submit(
submission_args,
slurm_file_base=slurm_config.get_slurm_output_file_base(output_dir),
)
tmp_files += [script_name + ".sh", _os.path.join(output_dir, slurm_outfile)]
# Update the virtual queue to submit the job to the real queue
virtual_queue.update()
jobs.append(job)
# Wait for the job to complete if we've specified wait
if wait:
for job in jobs:
while job in virtual_queue.queue:
_sleep(slurm_config.job_submission_wait)
virtual_queue.update()
return jobs, mbar_out_files, tmp_files
def collect_mbar_slurm(
output_dir: str,
run_nos: _List[int],
jobs: _List[_Job],
mbar_out_files: _List[str],
virtual_queue: _VirtualQueue,
delete_outfiles: bool = False,
tmp_files: _List[str] = [],
) -> _Tuple[_np.ndarray, _np.ndarray, _List[str], _Dict[str, _Dict[str, _np.ndarray]]]:
"""
Collect the results from MBAR slurm jobs.
Parameters
----------
output_dir : str
The path to the output directory
run_nos : List[int]
The run numbers to use for MBAR.
jobs : List[Job]
The jobs submitted to the virtual queue.
mbar_out_files : List[str]
The paths to the MBAR
virtual_queue : VirtualQueue
The virtual queue to submit the MBAR jobs to.
delete_outfiles : bool, Optional, default: False
Whether to delete the MBAR analysis output files after the free
energy change and errors have been extracted.
tmp_files : List[str], Optional, default: []
The paths to temporary files (truncated simfiles and submission scripts),
so that they can be cleaned up later.
Returns
-------
free_energies : np.ndarray
The free energies from each run, in kcal mol-1.
errors : np.ndarray
The mbar errors on the free energies from each run, in kcal mol-1.
mbar_out_files : List[str]
The paths to the MBAR output files. Returned for consistency with the non-
slurm function.
mbar_grads: Dict[str, Dict[str, np.ndarray]]
The gradients of the free energies obtained from the MBAR results (not TI),
for each run.
"""
# Wait for the jobs to finish.
for job in jobs:
while job in virtual_queue.queue:
_sleep(30)
virtual_queue.update()
# Try to read the results
try:
free_energies = _np.array(
[_read_mbar_result(ofile)[0] for ofile in mbar_out_files]
)
errors = _np.array([_read_mbar_result(ofile)[1] for ofile in mbar_out_files])
except FileNotFoundError:
raise FileNotFoundError(
"The MBAR output files could not be found. Checkk the output of the slurm jobs in"
f" {output_dir} to see if there were any errors."
)
# Get the gradients from the MBAR results
mbar_grads = {}
for i, run_no in enumerate(run_nos):
mbar_grads[f"run_{run_no}"] = {}
lam_vals, grads, grad_errors = _read_mbar_gradients(mbar_out_files[i])
mbar_grads[f"run_{run_no}"]["lam_vals"] = lam_vals
mbar_grads[f"run_{run_no}"]["grads"] = grads
mbar_grads[f"run_{run_no}"]["grad_errors"] = grad_errors
if delete_outfiles:
for ofile in mbar_out_files:
_subprocess.run(["rm", ofile])
mbar_out_files = []
# Clean up temporary simfiles
for tmp_file in tmp_files:
_subprocess.run(["rm", tmp_file])
return free_energies, errors, mbar_out_files, mbar_grads
def _prepare_simfiles(
output_dir: str,
run_nos: _List[int],
percentage_end: float = 100,
percentage_start: float = 0,
equilibrated: bool = True,
) -> _List[str]:
"""
Helper function to prepare the simfiles for MBAR analysis. Returns
the paths to the temporary truncated simfiles, so that they can be
cleaned up later.
"""
# Check that the simfiles actually exist
file_name = "simfile_equilibrated.dat" if equilibrated else "simfile.dat"
simfiles = _glob.glob(f"{output_dir}/lambda*/run_*/{file_name}")
# Filter by run numbers
if run_nos is not None:
simfiles = [
simfile
for simfile in simfiles
if int(simfile.split("/")[-2].split("_")[-1]) in run_nos
]
if len(simfiles) == 0:
raise FileNotFoundError(
"No equilibrated simfiles found. Have you run the simulations "
"and checked for equilibration?"
)
# Create temporary truncated simfiles
tmp_files = [] # Clean these up afterwards
for simfile in simfiles:
tmp_file = _os.path.join(
_os.path.dirname(simfile),
f"simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
)
tmp_files.append(tmp_file)
_write_truncated_sim_datafile(
simfile,
tmp_file,
fraction_final=percentage_end / 100,
fraction_initial=percentage_start / 100,
)
return tmp_files