OpenFreeEnergy · hannahbaumann · Jan 23, 2026 · Jan 29, 2026 · Feb 6, 2026 · Feb 6, 2026
diff --git a/src/openfe_analysis/reader.py b/src/openfe_analysis/reader.py
@@ -13,17 +13,18 @@
 
 def _determine_iteration_dt(dataset) -> float:
     """
-    Find out the timestep between each frame in the trajectory.
+    Determine the time increment between successive iterations
+    in a MultiStateReporter trajectory.
 
     Parameters
     ----------
     dataset : nc.Dataset
-      Dataset holding the MultiStateReporter generated NetCDF file.
+      NetCDF dataset produced by ``openmmtools.multistate.MultiStateReporter``.
 
     Returns
     -------
     float
-      The timestep in units of picoseconds.
+      The time between successive iterations, in picoseconds.
 
     Raises
     ------
@@ -35,7 +36,8 @@ def _determine_iteration_dt(dataset) -> float:
     -----
     This assumes an MCMC move which serializes in a manner similar
     to `openmmtools.mcmc.LangevinDynamicsMove`, i.e. it must have
-    both a `timestep` and `n_steps` defined.
+    both a `timestep` and `n_steps` defined, such that
+        dt_iteration = n_steps * timestep
     """
     # Deserialize the MCMC move information for the 0th entry.
     mcmc_move_data = yaml.load(
@@ -149,16 +151,31 @@ def index_method(self) -> str:
 
     @staticmethod
     def parse_n_atoms(filename, **kwargs) -> int:
+        """
+        Determine the number of atoms stored in a MultiStateReporter NetCDF file.
+
+        Parameters
+        ----------
+        filename : path-like
+            Path to the NetCDF file.
+
+        Returns
+        -------
+        int
+            Number of atoms in the system.
+        """
         with nc.Dataset(filename) as ds:
             n_atoms = ds.dimensions["atom"].size
         return n_atoms
 
     def _read_next_timestep(self, ts=None) -> Timestep:
+        # Advance the trajectory by one frame.
         if (self._frame_index + 1) >= len(self):
             raise EOFError
         return self._read_frame(self._frame_index + 1)
 
     def _read_frame(self, frame: int) -> Timestep:
+        # Read a single trajectory frame.
         self._frame_index = frame
 
         frame = self._frames[self._frame_index]
@@ -197,6 +214,7 @@ def _read_frame(self, frame: int) -> Timestep:
 
     @property
     def dt(self) -> float:
+        # Time difference between successive trajectory frames.
         return self._dt
 
     def _reopen(self):

diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py
@@ -16,22 +16,48 @@
 
 
 def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Universe:
-    """Makes a Universe and applies some transformations
+    """
+    Construct an MDAnalysis Universe from a MultiState NetCDF trajectory
+    and apply standard analysis transformations.
+
+    The Universe is created using the custom ``FEReader`` to extract a
+    single state from a multistate simulation.
+
+    Parameters
+    ----------
+    top : pathlib.Path or Topology
+        Path to a topology file (e.g. PDB) or an already-loaded MDAnalysis
+        topology object.
+    trj : nc.Dataset
+        Open NetCDF dataset produced by
+        ``openmmtools.multistate.MultiStateReporter``.
+    state : int
+        Thermodynamic state index to extract from the multistate trajectory.
+
+    Returns
+    -------
+    MDAnalysis.Universe
+        A Universe with trajectory transformations applied.
 
+    Notes
+    -----
     Identifies two AtomGroups:
     - protein, defined as having standard amino acid names, then filtered
       down to CA
     - ligand, defined as resname UNK
 
-    Then applies some transformations.
+    Depending on whether a protein is present, a sequence of trajectory
+    transformations is applied:
 
     If a protein is present:
     - prevents the protein from jumping between periodic images
-    - moves the ligand to the image closest to the protein
-    - aligns the entire system to minimise the protein RMSD
+      (class:`NoJump`)
+    - moves the ligand to the image closest to the protein (:class:`Minimiser`)
+    - aligns the entire system to minimise the protein RMSD (:class:`Aligner`)
 
-    If only a ligand:
+    If only a ligand is present:
     - prevents the ligand from jumping between periodic images
+    - Aligns the ligand to minimize its RMSD
     """
     u = mda.Universe(
         top,
@@ -76,23 +102,39 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers
 def gather_rms_data(
     pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None
 ) -> dict[str, list[float]]:
-    """Generate structural analysis of RBFE simulation
+    """
+    Compute structural RMSD-based metrics for a multistate BFE simulation.
 
     Parameters
     ----------
     pdb_topology : pathlib.Path
-      path to pdb topology
+      Path to the PDB file defining system topology.
     dataset : pathlib.Path
-      path to nc trajectory
+      Path to the NetCDF trajectory file produced by a multistate simulation.
     skip : int, optional
-      step at which to progress through the trajectory.  by default, selects a
-      step that produces roughly 500 frames of analysis per replicate
-
-    Produces, for each lambda state:
-    - 1D protein RMSD timeseries 'protein_RMSD'
-    - ligand RMSD timeseries
-    - ligand COM motion 'ligand_wander'
-    - 2D protein RMSD plot
+      Frame stride for analysis. If ``None``, a stride is chosen such that
+      approximately 500 frames are analyzed per state.
+
+    Returns
+    -------
+    dict[str, list]
+        Dictionary containing per-state analysis results with keys:
+        ``protein_RMSD``, ``ligand_RMSD``, ``ligand_wander``,
+        ``protein_2D_RMSD``, and ``time(ps)``.
+
+    Notes
+    -----
+    For each thermodynamic state (lambda), this function:
+      - Loads the trajectory using ``FEReader``
+      - Applies standard PBC-handling and alignment transformations
+      - Computes protein and ligand structural metrics over time
+
+    The following analyses are produced per state:
+      - 1D protein CA RMSD time series
+      - 1D ligand RMSD time series
+      - Ligand center-of-mass displacement from its initial position
+        (``ligand_wander``)
+      - Flattened 2D protein RMSD matrix (pairwise RMSD between frames)
     """
     output = {
         "protein_RMSD": [],
@@ -187,19 +229,25 @@ def gather_rms_data(
 
 
 def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]:
-    """2 dimensions RMSD
+    """
+    Compute a flattened 2D RMSD matrix from a trajectory.
+
+    For all unique frame pairs ``(i, j)`` with ``i < j``, this function
+    computes the RMSD between atomic coordinates after optimal alignment.
 
     Parameters
     ----------
     positions : np.ndarray
-      the protein positions for the entire trajectory
+      Atomic coordinates for all frames in the trajectory.
     w : np.ndarray, optional
-      weights array
+      Per-atom weights to use in the RMSD calculation. If ``None``,
+      all atoms are weighted equally.
 
     Returns
     -------
-    rmsd_matrix : list
-      a flattened version of the 2d
+    list of float
+      Flattened list of RMSD values corresponding to all frame pairs
+      ``(i, j)`` with ``i < j``.
     """
     nframes, _, _ = positions.shape
 

diff --git a/src/openfe_analysis/transformations.py b/src/openfe_analysis/transformations.py
@@ -14,11 +14,26 @@
 
 
 class NoJump(TransformationBase):
-    """Stops an AtomGroup from moving more than half a box length between frames
-
-    This transformation prevents an AtomGroup "teleporting" across the box
-    border between two subsequent frames.  This then simplifies the calculation
-    of motion over time.
+    """
+    Prevent an AtomGroup from jumping between periodic images.
+    This transformation removes large apparent
+    COM displacements caused by periodic boundary conditions.
+
+    Parameters
+    ----------
+    ag : MDAnalysis.AtomGroup
+        AtomGroup whose center-of-mass motion should be made continuous.
+
+    Notes
+    -----
+    - This transformation assumes an orthorhombic unit cell.
+    - Only translations are applied; no rotations or scaling.
+    - The correction is based on center-of-mass motion and is therefore
+      most appropriate for compact groups (e.g. proteins, ligands).
+    - Must be applied before any alignment transformations to avoid
+      mixing reference frames.
+    - Is intended to be applied before analyses that rely on smooth
+      time evolution (e.g. RMSD, COM motion).
     """
 
     ag: mda.AtomGroup
@@ -49,12 +64,13 @@ class ClosestImageShift(TransformationBase):
     """
     PBC-safe transformation that shifts one or more target AtomGroups
     so that their COM is in the closest image relative to a reference AtomGroup.
-    Works for any box type (triclinic or orthorhombic).
 
-    CAVEAT:
-    This Transformation requires the AtomGroups to be unwrapped!
+    CAVEAT: This Transformation requires the AtomGroups to be unwrapped!
 
-    Inspired from:
+    Notes
+    -----
+    - Works for any box type (triclinic or orthorhombic).
+    - Inspired from:
     https://github.com/wolberlab/OpenMMDL/blob/main/openmmdl/openmmdl_simulation/scripts/post_md_conversions.py
     """
 
@@ -75,10 +91,15 @@ def _transform(self, ts):
 
 
 class Aligner(TransformationBase):
-    """On-the-fly transformation to align a trajectory to minimise RMSD
-
-    centers all coordinates onto origin
-    rotates **entire universe** to minimise rmsd relative to **ref_ag**
+    """
+    Align a trajectory to a reference AtomGroup by minimizing RMSD.
+
+    Notes
+    -----
+    Performs an on-the-fly least-squares alignment
+    of the entire universe to a reference AtomGroup.
+    At each frame, the coordinates are translated and rotated to minimize the
+    RMSD of the atoms relative to their positions in the reference.
     """
 
     ref_pos: npt.NDArray

diff --git a/src/openfe_analysis/utils/multistate.py b/src/openfe_analysis/utils/multistate.py
@@ -24,6 +24,11 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray:
     indices : NDArray[int]
       An ordered array of iteration indices which hold positions.
 
+    Raises
+    ------
+    ValueError
+        If positions are not written at a consistent interval.
+
     Note
     ----
     This assumes that the indices are equally spaced by a given
@@ -55,11 +60,12 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray:
 
 
 def _state_to_replica(dataset: nc.Dataset, state_num: int, frame_num: int) -> int:
-    """Convert a state index to replica index at a given Dataset frame
+    """
+    Map a thermodynamic state index to the corresponding replica index.
 
     Parameters
     ----------
-    dataset : netCDF4.Dataset
+    dataset : nc.Dataset
         Dataset containing the MultiState reporter generated NetCDF file
         with information about all the frames and replica in the system.
     state_num : int
@@ -86,7 +92,7 @@ def _replica_positions_at_frame(
 
     Parameters
     ----------
-    dataset : netCDF4.Dataset
+    dataset : nc.Dataset
         Dataset containing the MultiState information.
     replica_index : int
         Replica index to extract positions for.
@@ -125,7 +131,7 @@ def _create_new_dataset(filename: Path, n_atoms: int, title: str) -> nc.Dataset:
 
     Returns
     -------
-    netCDF4.Dataset
+    nc.Dataset
         AMBER Conventions compliant NetCDF dataset to store information
         contained in MultiState reporter generated NetCDF file.
     """
@@ -170,13 +176,13 @@ def _get_unitcell(
     dataset: nc.Dataset, replica_index: int, frame_num: int
 ) -> Optional[Tuple[unit.Quantity]]:
     """
-    Helper method to extract a unit cell from the stored
+    Helper method to extract unit cell dimensions from the stored
     box vectors in a MultiState reporter generated NetCDF file
     at a given state and Dataset frame.
 
     Parameters
     ----------
-    dataset : netCDF4.Dataset
+    dataset : nc.Dataset
         Dataset of MultiState reporter generated NetCDF file.
     replica_index : int
         Replica for which to get the unit cell for.