Source code for a3fe.analyse.mbar

"""
Functionality for running mbar on SOMD output files. This uses
pymbar through SOMD
"""

__all__ = ["run_mbar"]

import glob as _glob
import os as _os
import subprocess as _subprocess
from time import sleep as _sleep
from typing import Dict as _Dict
from typing import List as _List
from typing import Tuple as _Tuple

import numpy as _np

from ..configuration import SlurmConfig as _SlurmConfig
from ..read._process_somd_files import read_mbar_gradients as _read_mbar_gradients
from ..read._process_somd_files import read_mbar_result as _read_mbar_result
from ..read._process_somd_files import (
    write_truncated_sim_datafile as _write_truncated_sim_datafile,
)
from ..run._virtual_queue import Job as _Job
from ..run._virtual_queue import VirtualQueue as _VirtualQueue


[docs]def run_mbar(
    output_dir: str,
    run_nos: _List[int],
    percentage_end: float = 100,
    percentage_start: float = 0,
    subsampling: bool = False,
    delete_outfiles: bool = False,
    equilibrated: bool = True,
) -> _Tuple[_np.ndarray, _np.ndarray, _List[str], _Dict[str, _Dict[str, _np.ndarray]]]:
    """
    Run MBAR on SOMD output files.

    Parameters
    ----------
    output_dir : str
        The path to the output directory
    run_nos : List[int]
        The run numbers to use for MBAR.
    percentage_end : float, Optional, default: 100
        The percentage of data after which to truncate the datafiles.
        For example, if 100, the full datafile will be used. If 50, only
        the first 50% of the data will be used.
    percentage_start : float, Optional, default: 0
        The percentage of data before which to truncate the datafiles.
        For example, if 0, the full datafile will be used. If 50, only
        the last 50% of the data will be used.
    subsampling : bool, Optional, default: False
        Whether to use subsampling for MBAR.
    delete_outfiles : bool, Optional, default: False
        Whether to delete the MBAR analysis output files after the free
        energy change and errors have been extracted.
    equilibrated : bool, Optional, default: True
        Whether to use the equilibrated datafiles or the full datafiles.
        If true, the files name simfile_equilibrated.dat will be used,
        otherwise simfile.dat will be used.

    Returns
    -------
    free_energies : np.ndarray
        The free energies from each run, in kcal mol-1.
    errors : np.ndarray
        The mbar errors on the free energies from each run, in kcal mol-1.
    mbar_out_files : List[str]
        The paths to the MBAR output files.
    mbar_grads: Dict[str, Dict[str, np.ndarray]]
        The gradients of the free energies obtained from the MBAR results (not TI),
        for each run.
    """
    tmp_files = _prepare_simfiles(
        output_dir=output_dir,
        run_nos=run_nos,
        percentage_end=percentage_end,
        percentage_start=percentage_start,
        equilibrated=equilibrated,
    )

    # Run MBAR using pymbar through SOMD
    mbar_out_files = []
    for run_no in run_nos:
        outfile = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat"
        mbar_out_files.append(outfile)
        with open(outfile, "w") as ofile:
            cmd_list = [
                "analyse_freenrg",
                "mbar",
                "-i",
                f"{output_dir}/lambda*/run_{str(run_no).zfill(2)}/simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
                "-p",
                "100",
                "--overlap",
            ]
            if subsampling:
                cmd_list.append("--subsampling")
            _subprocess.run(cmd_list, stdout=ofile)

    free_energies = _np.array([_read_mbar_result(ofile)[0] for ofile in mbar_out_files])
    errors = _np.array([_read_mbar_result(ofile)[1] for ofile in mbar_out_files])

    # Get the gradients from the MBAR results
    mbar_grads = {}
    for i, run_no in enumerate(run_nos):
        mbar_grads[f"run_{run_no}"] = {}
        lam_vals, grads, grad_errors = _read_mbar_gradients(mbar_out_files[i])
        mbar_grads[f"run_{run_no}"]["lam_vals"] = lam_vals
        mbar_grads[f"run_{run_no}"]["grads"] = grads
        mbar_grads[f"run_{run_no}"]["grad_errors"] = grad_errors

    if delete_outfiles:
        for ofile in mbar_out_files:
            _subprocess.run(["rm", ofile])
        mbar_out_files = []

    # Clean up temporary simfiles
    for tmp_file in tmp_files:
        _subprocess.run(["rm", tmp_file])

    return free_energies, errors, mbar_out_files, mbar_grads


def submit_mbar_slurm(
    output_dir: str,
    virtual_queue: _VirtualQueue,
    slurm_config: _SlurmConfig,
    run_nos: _List[int],
    percentage_end: float = 100,
    percentage_start: float = 0,
    subsampling: bool = False,
    equilibrated: bool = True,
    wait: bool = False,
) -> _Tuple[_List[_Job], _List[str], _List[str]]:
    """
    Submit slurm jobs to run MBAR on SOMD output files.

    Parameters
    ----------
    output_dir : str
        The path to the output directory
    virtual_queue : VirtualQueue
        The virtual queue to submit the MBAR jobs to.
    slurm_config: SlurmConfig
        The SLURM configuration to use for the jobs.
    run_nos : List[int]
        The run numbers to use for MBAR.
    percentage_end : float, Optional, default: 100
        The percentage of data after which to truncate the datafiles.
        For example, if 100, the full datafile will be used. If 50, only
        the first 50% of the data will be used.
    percentage_start : float, Optional, default: 0
        The percentage of data before which to truncate the datafiles.
        For example, if 0, the full datafile will be used. If 50, only
        the last 50% of the data will be used.
    subsampling : bool, Optional, default: False
        Whether to use subsampling for MBAR.
    equilibrated : bool, Optional, default: True
        Whether to use the equilibrated datafiles or the full datafiles.
        If true, the files name simfile_equilibrated.dat will be used,
        otherwise simfile.dat will be used.
    wait: bool, default: False
        Whether to wait for the job to complete or not.

    Returns
    -------
    jobs : List[Job]
        The jobs submitted to the virtual queue.

    mbar_out_files : List[str]
        The paths to the MBAR output files, which will be created
        once the jobs complete.

    tmp_files : List[str]
        The paths to temporary files (truncated simfiles and submission scripts),
        so that they can be cleaned up later.
    """

    tmp_files = _prepare_simfiles(
        output_dir=output_dir,
        run_nos=run_nos,
        percentage_end=percentage_end,
        percentage_start=percentage_start,
        equilibrated=equilibrated,
    )

    # Add MBAR command and run for each run.
    mbar_out_files = []
    jobs = []

    for run_no in run_nos:
        # Get the name of the output files - the first for the MBAR output, and the second for the SLURM output
        outfile = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat"
        mbar_out_files.append(outfile)
        slurm_outfile = f"slurm_freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.out"
        slurm_config.output = slurm_outfile

        # Create the command.
        cmd_list = [
            "analyse_freenrg",
            "mbar",
            "-i",
            f"{output_dir}/lambda*/run_{str(run_no).zfill(2)}/simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
            "-p",
            "100",
            "--overlap",
            "--output",
            outfile,
        ]
        if subsampling:
            cmd_list.append("--subsampling")
        slurm_cmd = " ".join(cmd_list)

        # Create and submit the job
        script_name = f"{output_dir}/freenrg-MBAR-run_{str(run_no).zfill(2)}_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start"
        submission_args = slurm_config.get_submission_cmds(
            slurm_cmd, output_dir, script_name
        )
        job = virtual_queue.submit(
            submission_args,
            slurm_file_base=slurm_config.get_slurm_output_file_base(output_dir),
        )
        tmp_files += [script_name + ".sh", _os.path.join(output_dir, slurm_outfile)]

        # Update the virtual queue to submit the job to the real queue
        virtual_queue.update()
        jobs.append(job)

    # Wait for the job to complete if we've specified wait
    if wait:
        for job in jobs:
            while job in virtual_queue.queue:
                _sleep(slurm_config.job_submission_wait)
                virtual_queue.update()

    return jobs, mbar_out_files, tmp_files


def collect_mbar_slurm(
    output_dir: str,
    run_nos: _List[int],
    jobs: _List[_Job],
    mbar_out_files: _List[str],
    virtual_queue: _VirtualQueue,
    delete_outfiles: bool = False,
    tmp_files: _List[str] = [],
) -> _Tuple[_np.ndarray, _np.ndarray, _List[str], _Dict[str, _Dict[str, _np.ndarray]]]:
    """
    Collect the results from MBAR slurm jobs.

    Parameters
    ----------
    output_dir : str
        The path to the output directory
    run_nos : List[int]
        The run numbers to use for MBAR.
    jobs : List[Job]
        The jobs submitted to the virtual queue.
    mbar_out_files : List[str]
        The paths to the MBAR
    virtual_queue : VirtualQueue
        The virtual queue to submit the MBAR jobs to.
    delete_outfiles : bool, Optional, default: False
        Whether to delete the MBAR analysis output files after the free
        energy change and errors have been extracted.
    tmp_files : List[str], Optional, default: []
        The paths to temporary files (truncated simfiles and submission scripts),
        so that they can be cleaned up later.

    Returns
    -------
    free_energies : np.ndarray
        The free energies from each run, in kcal mol-1.
    errors : np.ndarray
        The mbar errors on the free energies from each run, in kcal mol-1.
    mbar_out_files : List[str]
        The paths to the MBAR output files. Returned for consistency with the non-
        slurm function.
    mbar_grads: Dict[str, Dict[str, np.ndarray]]
        The gradients of the free energies obtained from the MBAR results (not TI),
        for each run.
    """
    # Wait for the jobs to finish.
    for job in jobs:
        while job in virtual_queue.queue:
            _sleep(30)
            virtual_queue.update()

    # Try to read the results
    try:
        free_energies = _np.array(
            [_read_mbar_result(ofile)[0] for ofile in mbar_out_files]
        )
        errors = _np.array([_read_mbar_result(ofile)[1] for ofile in mbar_out_files])
    except FileNotFoundError:
        raise FileNotFoundError(
            "The MBAR output files could not be found. Checkk the output of the slurm jobs in"
            f" {output_dir} to see if there were any errors."
        )

    # Get the gradients from the MBAR results
    mbar_grads = {}
    for i, run_no in enumerate(run_nos):
        mbar_grads[f"run_{run_no}"] = {}
        lam_vals, grads, grad_errors = _read_mbar_gradients(mbar_out_files[i])
        mbar_grads[f"run_{run_no}"]["lam_vals"] = lam_vals
        mbar_grads[f"run_{run_no}"]["grads"] = grads
        mbar_grads[f"run_{run_no}"]["grad_errors"] = grad_errors

    if delete_outfiles:
        for ofile in mbar_out_files:
            _subprocess.run(["rm", ofile])
        mbar_out_files = []

    # Clean up temporary simfiles
    for tmp_file in tmp_files:
        _subprocess.run(["rm", tmp_file])

    return free_energies, errors, mbar_out_files, mbar_grads


def _prepare_simfiles(
    output_dir: str,
    run_nos: _List[int],
    percentage_end: float = 100,
    percentage_start: float = 0,
    equilibrated: bool = True,
) -> _List[str]:
    """
    Helper function to prepare the simfiles for MBAR analysis. Returns
    the paths to the temporary truncated simfiles, so that they can be
    cleaned up later.
    """
    # Check that the simfiles actually exist
    file_name = "simfile_equilibrated.dat" if equilibrated else "simfile.dat"
    simfiles = _glob.glob(f"{output_dir}/lambda*/run_*/{file_name}")
    # Filter by run numbers
    if run_nos is not None:
        simfiles = [
            simfile
            for simfile in simfiles
            if int(simfile.split("/")[-2].split("_")[-1]) in run_nos
        ]

    if len(simfiles) == 0:
        raise FileNotFoundError(
            "No equilibrated simfiles found. Have you run the simulations "
            "and checked for equilibration?"
        )

    # Create temporary truncated simfiles
    tmp_files = []  # Clean these up afterwards
    for simfile in simfiles:
        tmp_file = _os.path.join(
            _os.path.dirname(simfile),
            f"simfile_truncated_{round(percentage_end, 3)}_end_{round(percentage_start, 3)}_start.dat",
        )
        tmp_files.append(tmp_file)
        _write_truncated_sim_datafile(
            simfile,
            tmp_file,
            fraction_final=percentage_end / 100,
            fraction_initial=percentage_start / 100,
        )

    return tmp_files