Simulation-Two-Chain-PAA/python/probability_exchange.py at main · alec-glisman/Simulation-Two-Chain-PAA · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
This file parses gmx mdrun log files and extracts the exchange probabilities
between neighboring replicas. It then outputs the exchange probabilities in a
txt file.

:Author: Alec Glisman (GitHub: @alec-glisman)
:Date: 2022-10-10
:Copyright: 2022 Alec Glisman
"""

# Standard library
import datetime
from pathlib import Path
import re
import warnings

# Third-party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def find_logs(dr: Path) -> list[Path]:
    """
    Find all log files in the given directory matching internal pattern.

    Args:
        dr (Path): Path to directory containing log files.

    Returns:
        list: List of log files.
    """
    log_files = sorted(
        list(Path(dr).rglob("**/replica_00/nvt_hremd_*.log")))
    log_files.extend(sorted(
        list(Path(dr).rglob("**/replica_00/2-mdrun-output/nvt_hremd_*.log"))))
    return log_files


def parse_probabilities(f_log: Path) -> pd.DataFrame:
    """
    Load text file and parse exchange probabilities

    Args:
        f_log (Path): Input gmx mdrun log file

    Returns:
        pd.DataFrame: Exchange probabilities at each step
    """

    # parse text file and extract exchange probabilities
    text = f_log.read_text()
    prob_txt = re.findall(r"(?<=Repl pr).*", text)
    prob_flt = []
    tf = []

    # prepare data for numpy array
    for i in range(len(prob_txt)):

        # remove alphabetic characters
        prob_txt[i] = re.sub(r"[a-zA-Z]", "", prob_txt[i])

        # remove leading and trailing whitespace
        prob_txt[i] = prob_txt[i].strip()

        # convert whitespace to comma
        prob_txt[i] = re.sub(r"\s+", ",", prob_txt[i])

        # convert comma separated string to list of floats
        lst_float = []
        for val in prob_txt[i].split(","):
            try:
                x = float(val)
                lst_float.append(x)
            except ValueError:
                warnings.warn(f"ValueError: {val}")
                # lst_float.append(np.nan)

        lst_zeros = [np.nan] * len(lst_float)

        # place zero elements between non-zero elements
        lst_complete = [x for l in list(
            zip(lst_float, lst_zeros)) for x in l][:-1]

        # prepend and append to list if necessary
        if len(prob_flt) == 0:
            pass
        elif (len(lst_complete) != len(prob_flt[-1])):
            lst_complete = [np.nan] + lst_complete + [np.nan]

        # check that list is the same length as previous list
        if len(prob_flt) > 0:
            if len(lst_complete) != len(prob_flt[-1]):
                warnings.warn(
                    f"List length mismatch: {len(lst_complete)} vs {len(prob_flt[-1])}")
                continue

        # output results
        prob_flt.append(lst_complete)

    # calculate final simulation time
    times = re.findall(r"(?<=time\s)\d+.\d+", text)
    times = [float(x) for x in times]
    try:
        tf = max(times) / 1000.
    except ValueError:
        tf = np.nan

    # convert to pandas dataframe
    if len(prob_flt) > 0:
        idxs = range(len(prob_flt[0]))
        cols = [f"{i}-{i+1}" for i in idxs]
        return pd.DataFrame(prob_flt, columns=cols), tf
    else:
        return pd.DataFrame(), tf


plt.rcParams["axes.formatter.use_mathtext"] = True
plt.rcParams["font.family"] = "serif"
plt.rcParams["font.serif"] = "cmr10"


if __name__ == "__main__":
    # Input path
    d_data = f"{Path.cwd()}/data"

    # Find all log files in the current directory
    f_logs = find_logs(d_data)
    print(f"Found {len(f_logs)} log files")

    # Parse log files and extract exchange probabilities
    for (idx, log_file) in enumerate(f_logs):
        print(f"Reading log file {idx+1}/{len(f_logs)}: {log_file}")
        df, t = parse_probabilities(log_file)

        # calculate average exchange probability
        avg = pd.DataFrame(df.mean(axis=0))
        try:
            stats = pd.DataFrame(df.describe())
        except:
            stats = pd.DataFrame()

        # output probabilities to file
        if "prod" in log_file.stem:
            d_out = log_file.parents[1] / "analysis" / "replica_exchange"
        else:
            d_out = log_file.parents[1] / "analysis" / "replica_exchange"

        d_out.mkdir(parents=True, exist_ok=True)

        # output probabilities to terminal
        print(f"Output path: {d_out}")
        print(f"Average Exchange probabilities (tf = {t:.1f} ns):")
        print(avg.to_string(float_format="{:.2f}".format))
        print()

        f_out = d_out / "exchange_probability_dynamics.csv"
        df.to_csv(f_out)

        f_out = d_out / "exchange_probability_average.txt"
        with open(f_out, "w", encoding="utf-8") as f:
            # date and time
            f.write(f"Datetime: {datetime.datetime.now()}")
            f.write("\n\n")
            # exchange probabilities
            f.write(f"Average Exchange probabilities (tf: {t:.1f} ns):\n")
            f.write(avg.round(decimals=2).to_string(header=False))
            f.write("\n")
            f.write(f"Average of averages: {avg.mean().values[0]:.2f}\n")
            f.write(f"Standard deviation: {avg.std().values[0]:.2f}\n")
            # whitespace
            f.write("\n\n")
            # statistics
            f.write("Statistics:\n")
            f.write(stats.round(decimals=2).to_string())

        # convert probabilities to percentage
        avg *= 100
        avg = avg.round(0).astype(int)

        # continue loop if no data
        if len(avg) == 0:
            continue

        # output probabilities as table to file
        f_out = d_out / "exchange_probability_average.png"
        fig = plt.figure(figsize=(1.5, 7))
        ax = fig.add_subplot(111, frame_on=False)
        ax.axis("off")
        ax.axis("tight")
        ax.set_title(r"$t_f = $ " + f"{t:.1f} [ns]")
        ax.table(cellText=avg.values, rowLabels=avg.index,
                 colLabels=[r"$P(\mathrm{Exch.})$ [%]"],
                 fontsize=32, loc="center")
        fig.tight_layout()
        fig.savefig(f_out, dpi=600)