-
+
Residue confidence - pLDDT
-
-
+
+
Residue-pair alignment error - PAE
-
+
@@ -539,7 +468,7 @@
'
+ html = html.replace('', f'{config_script}\n', 1)
+
+ # Generate sequence coverage plot from first MSA file
+ seq_cov_html = None
+ if msa_files:
+ # Filter out tools that don't generate MSAs (e.g. ESMFold) - if MSA file is a dummy placeholder, skip the section entirely
+
+ valid_msa = [(m, _tool_program_label(m)) for m in msa_files if not os.path.basename(m).startswith("DUMMY_")]
+
+ if valid_msa:
+ seq_cov_sections = []
+ for msa_file, tool_label in valid_msa:
+ seq_cov_fig = generate_sequence_coverage_plot(msa_file)
+ # In comparison mode, label each coverage plot with its tool name
+ if report_type == "comparison" and len(valid_msa) > 1:
+ seq_cov_fig.update_layout(
+ title=dict(text=f"Sequence Coverage — {tool_label}")
+ )
+ seq_cov_sections.append(
+ seq_cov_fig.to_html(
+ full_html=False,
+ include_plotlyjs="cdn",
+ config=PLOTLY_CONFIG,
+ )
+ )
+ seq_cov_html = "\n".join(seq_cov_sections)
+
+ # Replace or remove optional sections
+ if seq_cov_html:
+ html = html.replace('
', seq_cov_html, 1)
else:
- for i, plddt_values_str in enumerate(output_data):
- plddt_per_model[i] = []
- plddt_per_model[i] = [float(x) for x in plddt_values_str.strip().split()]
+ html = re.sub(r'.*?', '', html, flags=re.DOTALL)
- fig = go.Figure()
- for idx, (model_name, value_plddt) in enumerate(plddt_per_model.items()):
- rank_label = os.path.splitext(pdb[idx])[0]
- fig.add_trace(
- go.Scatter(
- x=list(range(len(value_plddt))),
- y=value_plddt,
- mode="lines",
- name=rank_label,
- text=[f"({i}, {value:.2f})" for i, value in enumerate(value_plddt)],
- hoverinfo="text",
- )
- )
- fig.update_layout(
- title=dict(text="Predicted LDDT per position", x=0.5, xanchor="center"),
- xaxis=dict(
- title="Positions", showline=True, linecolor="black", gridcolor="WhiteSmoke", minallowed=0, maxallowed=len(value_plddt)-1
- ),
- yaxis=dict(
- title="Predicted LDDT",
- range=[0, 100],
- fixedrange=True,
- showline=True,
- linecolor="black",
- gridcolor="WhiteSmoke",
- ),
- legend=dict(yanchor="bottom", y=0.02, xanchor="right", x=1, bordercolor="Black", borderwidth=1),
- plot_bgcolor="white",
- width=600,
- height=600,
- modebar_remove=["toImage", "zoomIn", "zoomOut"],
- )
- html_content = fig.to_html(
+ # Generate the pLDDT plot and convert to HTML
+ plddt_fig = generate_plddt_plot(parsed_structures, labels=model_labels)
+ plddt_html = plddt_fig.to_html(
full_html=False,
include_plotlyjs="cdn",
- config={"displayModeBar": True, "displaylogo": False, "scrollZoom": True},
+ config=PLOTLY_CONFIG,
)
+ html = html.replace('
', plddt_html, 1)
- with open(
- f"{out_dir}/{name+('_' if name else '')}coverage_LDDT.html", "w"
- ) as out_file:
- out_file.write(html_content)
-
- if args.pae and not args.pae.endswith('NO_FILE_PAE'):
- pae_fig = generate_pae_plot(args.pae, out_dir, name)
- pae_html_content = pae_fig.to_html(
+ # Generate PAE plot from first PAE file (TODO: toggle PAE with model selection), Not used in comparison report
+ if pae_files:
+ pae_fig = generate_pae_plot(pae_files[0])
+ pae_html = pae_fig.to_html(
full_html=False,
include_plotlyjs="cdn",
- config={"displayModeBar": True, "displaylogo": False, "scrollZoom": True},
- )
- with open(
- f"{out_dir}/{name+('_' if name else '')}PAE.html", "w"
- ) as pae_out_file:
- pae_out_file.write(pae_html_content)
-
-def generate_plots(msa_path, plddt_paths, name, out_dir):
- msa = []
- with open(msa_path, "r") as in_file:
- for line in in_file:
- msa.append([int(x) for x in line.strip().split()])
-
- seqid = []
- for sequence in msa:
- matches = [
- 1.0 if first == other else 0.0 for first, other in zip(msa[0], sequence)
- ]
- seqid.append(sum(matches) / len(matches))
-
- seqid_sort = sorted(range(len(seqid)), key=seqid.__getitem__)
-
- non_gaps = []
- for sequence in msa:
- non_gaps.append(
- [float(num != 21) if num != 21 else float("nan") for num in sequence]
- )
-
- sorted_non_gaps = [non_gaps[i] for i in seqid_sort]
- final = []
- for sorted_seq, identity in zip(sorted_non_gaps, [seqid[i] for i in seqid_sort]):
- final.append(
- [
- value * identity if not isinstance(value, str) else value
- for value in sorted_seq
- ]
+ config=PLOTLY_CONFIG,
)
-
- # Plotting Sequence Coverage using Plotly
- fig = go.Figure()
- fig.add_trace(
- go.Heatmap(
- z=final,
- colorscale="Rainbow",
- zmin=0,
- zmax=1,
- )
- )
- fig.update_layout(
- title="Sequence coverage", xaxis_title="Positions", yaxis_title="Sequences"
- )
- # Save as interactive HTML instead of an image
- fig.savefig(f"{out_dir}/{name+('_' if name else '')}seq_coverage.png")
-
- # Plotting Predicted LDDT per position using Plotly
- plddt_per_model = OrderedDict()
- plddt_paths.sort()
- for plddt_path in plddt_paths:
- with open(plddt_path, "r") as in_file:
- plddt_per_model[os.path.basename(plddt_path)[:-4]] = [
- float(x) for x in in_file.read().strip().split()
- ]
-
- i = 0
- for model_name, value_plddt in plddt_per_model.items():
- fig = go.Figure()
- fig.add_trace(
- go.Scatter(
- x=list(range(len(value_plddt))),
- y=value_plddt,
- mode="lines",
- name=model_name,
- )
- )
- fig.update_layout(title="Predicted LDDT per Position")
- fig.savefig(f"{out_dir}/{name+('_' if name else '')}coverage_LDDT_{i}.png")
- i += 1
-
-
-
-def align_structures(structures):
- parser = PDB.PDBParser(QUIET=True)
- structures = [
- parser.get_structure(f"Structure_{i}", pdb) for i, pdb in enumerate(structures)
- ]
- ref_structure = structures[0]
-
- common_atoms = set(
- f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}"
- for atom in ref_structure.get_atoms() if not atom.element == 'H'
- )
- #print(common_atoms)
- for i, structure in enumerate(structures[1:], start=1):
- common_atoms = common_atoms.intersection(
- set(
- f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}"
- for atom in structure.get_atoms()
- )
- )
-
- ref_atoms = [
- atom
- for atom in ref_structure.get_atoms()
- if f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" in common_atoms
- ]
- # print(ref_atoms)
- super_imposer = PDB.Superimposer()
- aligned_structures = [structures[0]] # Include the reference structure in the list
-
- for i, structure in enumerate(structures[1:], start=1):
- target_atoms = [
- atom
- for atom in structure.get_atoms()
- if f"{atom.get_parent().get_parent().get_id()}-{atom.get_parent().get_id()[1]}-{atom.name}" in common_atoms
- ]
-
- super_imposer.set_atoms(ref_atoms, target_atoms)
- super_imposer.apply(structure.get_atoms())
-
- aligned_structure = f"aligned_structure_{i}.pdb"
- io = PDB.PDBIO()
- io.set_structure(structure)
- io.save(aligned_structure)
- aligned_structures.append(aligned_structure)
-
- return aligned_structures
-
-
-def pdb_to_lddt(struct_files, generate_tsv):
- struct_files_sorted = struct_files
- struct_files_sorted.sort()
-
- output_lddt = []
- averages = []
-
- for struct_file in struct_files_sorted:
- plddt_values = []
-
- if struct_file.endswith('.pdb'):
- parser = PDB.PDBParser(QUIET=True)
- suffix = ".pdb"
- elif struct_file.endswith('.cif'):
- parser = PDB.MMCIFParser(QUIET=True)
- suffix = ".cif"
- else:
- raise NotImplementedError("Reporting only supported for .pdb and .cif filetypes")
- structure = parser.get_structure("", struct_file)
-
- for residue in structure.get_residues():
- res_pLDDT_tot = 0
- res_atom_count = 0
-
- for atom in residue.get_atoms():
- res_atom_count +=1
- res_pLDDT_tot += atom.get_bfactor()
-
- # Residue-level mean for ESMfold atom-level pLDDT
- res_pLDDT_ave = res_pLDDT_tot/res_atom_count
-
- if res_pLDDT_ave < 1.0:
- res_pLDDT_ave *= 100
- plddt_values.append(res_pLDDT_ave)
-
- # Calculate the average PLDDT value for the current file
- if plddt_values:
- avg_plddt = sum(plddt_values) / len(plddt_values)
- averages.append(round(avg_plddt, 3))
- else:
- averages.append(0.0)
-
- if generate_tsv == "y":
- output_file = f"{struct_file.replace(suffix, '')}_plddt.tsv"
- with open(output_file, "w") as outfile:
- outfile.write(" ".join(map(str, plddt_values)) + "\n")
- output_lddt.append(output_file)
- else:
- plddt_values_string = " ".join(map(str, plddt_values))
- output_lddt.append(plddt_values_string)
-
- return output_lddt, averages
-
-
-print("Starting...")
-
-version = "1.0.0"
-model_name = {
- "esmfold": "ESMFold",
- "alphafold2": "AlphaFold2",
- "alphafold3": "Alphafold3",
- "colabfold": "ColabFold",
- "rosettafold_all_atom": "RosettaFold All-Atom",
- "helixfold3": "HelixFold3",
- "rosettafold2na": "RoseTTAFold2NA",
- "boltz": "Boltz"
-}
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--type", dest="in_type")
-parser.add_argument(
- "--generate_tsv", choices=["y", "n"], default="n", dest="generate_tsv"
-)
-parser.add_argument("--msa", dest="msa", default="NO_FILE")
-parser.add_argument("--pdb", dest="pdb", required=True, nargs="+")
-parser.add_argument("--pae", dest="pae", default="NO_FILE")
-parser.add_argument("--name", dest="name")
-parser.add_argument("--output_dir", dest="output_dir")
-parser.add_argument("--html_template", dest="html_template")
-parser.add_argument("--version", action="version", version=f"{version}")
-parser.set_defaults(output_dir="")
-parser.set_defaults(in_type="esmfold")
-parser.set_defaults(name="")
-args = parser.parse_args()
-
-lddt_data, lddt_averages = pdb_to_lddt(args.pdb, args.generate_tsv)
-
-generate_output_images(
- args.msa, lddt_data, args.name, args.output_dir, args.in_type, args.generate_tsv, args.pdb
-)
-
-print("generating html report...")
-structures = args.pdb
-structures.sort()
-aligned_structures = align_structures(structures)
-
-io = PDB.PDBIO()
-ref_structure_path = "aligned_structure_0.pdb"
-io.set_structure(aligned_structures[0])
-io.save(ref_structure_path)
-aligned_structures[0] = ref_structure_path
-
-proteinfold_template = open(args.html_template, "r").read()
-proteinfold_template = proteinfold_template.replace("*sample_name*", args.name)
-proteinfold_template = proteinfold_template.replace(
- "*prog_name*", model_name[args.in_type.lower()]
-)
-
-args_pdb_array_js = ",\n".join([f'"{model}"' for model in structures])
-proteinfold_template = re.sub(
- r"const MODELS = \[.*?\];", # Match the existing MODELS array in HTML template
- f"const MODELS = [\n {args_pdb_array_js}\n];", # Replace with the new array
- proteinfold_template,
- flags=re.DOTALL,
-)
-
-averages_js_array = f"const LDDT_AVERAGES = {lddt_averages};"
-proteinfold_template = proteinfold_template.replace(
- "const LDDT_AVERAGES = [];", averages_js_array
-)
-
-i = 0
-for structure in aligned_structures:
- proteinfold_template = proteinfold_template.replace(
- f"*_data_ranked_{i}.pdb*", open(structure, "r").read().replace("\n", "\\n")
- )
- i += 1
-
-if not args.msa.endswith("NO_FILE"):
- image_path = f"{args.output_dir}/{args.name}_{args.in_type}_seq_coverage.png"
- with open(image_path, "rb") as in_file:
- proteinfold_template = proteinfold_template.replace(
- "seq_coverage.png",
- f"data:image/png;base64,{base64.b64encode(in_file.read()).decode('utf-8')}",
- )
-else:
- pattern = r'
.*?(.*?)*?
\s*