Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 51 additions & 10 deletions dpdata/formats/cp2k/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,9 @@ def get_frames(fname):
fp.close()
return [], [], [], [], [], [], [], None

# Check if this is CP2K 2025 format
is_cp2k_2025 = "energy [hartree]" in content
Comment on lines +405 to +406

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Description: Verify that "energy [hartree]" does not appear in CP2K 2023 format files

# Search for the detection string in CP2K 2023 test fixtures
rg -n 'energy \[hartree\]' tests/cp2k/ -g '!*2025*'

Repository: deepmodeling/dpdata

Length of output: 45


🏁 Script executed:

# Verify the detection string appears in CP2K 2025 format files
rg -n 'energy \[hartree\]' tests/cp2k/ -g '*2025*'

Repository: deepmodeling/dpdata

Length of output: 180


🏁 Script executed:

# Check ruff linting compliance for the file
cd dpdata && ruff check formats/cp2k/output.py

Repository: deepmodeling/dpdata

Length of output: 1286


🏁 Script executed:

# Inspect the code context around lines 405-406
sed -n '400,490p' dpdata/formats/cp2k/output.py | cat -n

Repository: deepmodeling/dpdata

Length of output: 4362


Fix ruff linting violations before committing.

The format detection via substring search works as intended—the string "energy [hartree]" correctly discriminates CP2K 2025 format from 2023 format and appears only in the former. However, the file has linting violations that must be resolved per the coding guidelines:

  • Line 118: Rename unused loop variable `ii` to `_ii` (B007)
  • Line 534: Prefix unused unpacked variable `tmp_names` with an underscore (RUF059)

Run `ruff check --fix dpdata/formats/cp2k/output.py` to address these issues.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@dpdata/formats/cp2k/output.py` around lines 405 - 406, Fix the ruff linting
violations in the file dpdata/formats/cp2k/output.py by renaming the unused loop
variable ii to _ii on line 118 to comply with the B007 rule, and by prefixing
the unused unpacked variable tmp_names with an underscore to become _tmp_names
on line 534 to comply with the RUF059 rule. These changes follow the convention
of marking unused variables with a leading underscore to satisfy linting
requirements.


# search duplicated header
fp.seek(0)
header_idx = []
Expand Down Expand Up @@ -430,16 +433,54 @@ def get_frames(fname):
atom_symbol_idx_list.append(ii.split()[1])

if "ENERGY|" in ii:
energy = ii.split()[8]
if " Atom Kind " in ii:
force_flag = True
force_idx = idx
if force_flag:
if idx > force_idx:
if "SUM OF ATOMIC FORCES" in ii:
force_flag = False
else:
force.append(ii.split()[3:6])
# CP2K 2025 format: ENERGY| Total FORCE_EVAL ( QS ) energy [hartree] -7.364190264587725
# CP2K 2023 format: ENERGY| Total FORCE_EVAL ( QS ) energy (a.u.): -1766.225653832774242
if is_cp2k_2025:
# Find the energy value after "[hartree]"
parts = ii.split()
energy_2025 = None
try:
hartree_idx = parts.index("[hartree]")
energy_2025 = parts[hartree_idx + 1]
except (ValueError, IndexError):
# Fallback: try to find energy value in the line
for part in reversed(parts):
try:
float(part)
energy_2025 = part
break
except ValueError:
continue
if energy_2025 is None:
raise RuntimeError(
f"Cannot parse energy from CP2K 2025 output line: {ii.strip()}"
)
energy = energy_2025
else:
energy = ii.split()[8]

# CP2K 2025 force format: FORCES| prefix lines
if is_cp2k_2025:
if (
"FORCES|" in ii
and "Atom x y z" not in ii
and "Atomic forces" not in ii
):
parts = ii.split()
# FORCES| 1 -5.73440344E-02 2.95274914E-02 -1.50988167E-02 6.62433792E-02
if len(parts) >= 5 and parts[1].isdigit():
force.append(parts[2:5])
else:
# CP2K 2023 format
if " Atom Kind " in ii:
force_flag = True
force_idx = idx
if force_flag:
if idx > force_idx:
if "SUM OF ATOMIC FORCES" in ii:
force_flag = False
else:
force.append(ii.split()[3:6])
# add reading stress tensor
if "STRESS TENSOR [GPa" in ii:
stress_flag = True
Expand Down
75 changes: 75 additions & 0 deletions tests/cp2k/cp2k_2025_output/cp2k_2025_output
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
DBCSR| Multiplication driver XSMM
DBCSR| Multrec recursion limit 512

**** **** ****** ** PROGRAM STARTED AT 2025-01-15 10:30:00.000
***** ** *** *** ** PROGRAM STARTED ON test-machine
** **** ****** PROGRAM STARTED BY test
***** ** ** ** ** PROGRAM PROCESS ID 12345
**** ** ******* ** PROGRAM STARTED IN /test

CP2K| version string: CP2K version 2025.1
CP2K| source code revision number: git:abc123

GLOBAL| Force Environment number 1
GLOBAL| Run type ENERGY_FORCE
GLOBAL| Global print level MEDIUM
GLOBAL| Total number of message passing processes 1

MEMORY| system memory details [Kb]

CELL| Vector a [angstrom]: 5.000 0.000 0.000 |a| = 5.000
CELL| Vector b [angstrom]: 0.000 5.000 0.000 |b| = 5.000
CELL| Vector c [angstrom]: 0.000 0.000 5.000 |c| = 5.000

ATOMIC KIND INFORMATION

1. Atomic kind: H Number of atoms: 2

MOLECULE KIND INFORMATION

TOTAL NUMBERS AND MAXIMUM NUMBERS

Total number of - Atomic kinds: 1
- Atoms: 2


SCF WAVEFUNCTION OPTIMIZATION

Step Update method Time Convergence Total energy Change
------------------------------------------------------------------------------

1 OT DIIS 0.80E-01 0.1 0.00000001 -7.3641902645877 -7.36E+00

*** SCF run converged in 1 steps ***


*******************************************************************************
MODULE QUICKSTEP: ATOMIC COORDINATES IN angstrom

Atom Kind Element X Y Z Z(eff) Mass

1 1 H 1 0.000000 0.000000 0.000000 1.00 1.0079
2 1 H 1 0.760000 0.000000 0.000000 1.00 1.0079

ENERGY| Total FORCE_EVAL ( QS ) energy [hartree] -7.364190264587725

FORCES| Atomic forces [hartree/bohr]

FORCES| Atom x y z |f|

FORCES| 1 -5.73440344E-02 2.95274914E-02 -1.50988167E-02 6.62433792E-02
FORCES| 2 7.92269287E-02 3.84670665E-02 -3.41478833E-02 9.44600412E-02

STRESS TENSOR [GPa]

X Y Z
X 0.12345678 0.00000000 0.00000000
Y 0.00000000 0.12345678 0.00000000
Z 0.00000000 0.00000000 0.12345678

-------------------------------------------------------------------------------
- -
- T I M I N G -
-------------------------------------------------------------------------------

**** **** ****** ** PROGRAM ENDED AT 2025-01-15 10:30:05.000
Binary file added tests/cp2k/cp2k_2025_output/deepmd/set.000/box.npy
Binary file not shown.
Binary file added tests/cp2k/cp2k_2025_output/deepmd/set.000/coord.npy
Binary file not shown.
Binary file not shown.
Binary file added tests/cp2k/cp2k_2025_output/deepmd/set.000/force.npy
Binary file not shown.
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/cp2k/cp2k_2025_output/deepmd/type.raw
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
0
0
1 change: 1 addition & 0 deletions tests/cp2k/cp2k_2025_output/deepmd/type_map.raw
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
H
216 changes: 216 additions & 0 deletions tests/test_cp2k_2025_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
from __future__ import annotations

import os
import tempfile
import unittest

from comp_sys import CompLabeledSys
from context import dpdata


class TestCp2k2025Output(unittest.TestCase, CompLabeledSys):
    """Test CP2K 2025 output format parsing.

    ``setUp`` loads the CP2K 2025 fixture as ``system_1`` and the stored
    deepmd/npy data as ``system_2``; the ``*places`` attributes are
    presumably the comparison precisions consumed by ``CompLabeledSys``
    (defined outside this file -- verify there).
    """

    def setUp(self):
        # System parsed from the raw CP2K 2025 output fixture.
        parsed = dpdata.LabeledSystem(
            "cp2k/cp2k_2025_output/cp2k_2025_output", fmt="cp2k/output"
        )
        # Reference system stored next to the fixture in deepmd/npy format.
        reference = dpdata.LabeledSystem(
            "cp2k/cp2k_2025_output/deepmd", fmt="deepmd/npy"
        )
        self.system_1 = parsed
        self.system_2 = reference
        self.places = self.e_places = self.f_places = 6
        self.v_places = 4

    def test_energy_extraction(self):
        """Test that energy is correctly extracted from CP2K 2025 format."""
        # The fixture reports -7.364190264587725 hartree; the literal below
        # is that value in eV using dpdata's conversion factor.
        expected_energy = -200.3898256786414
        parsed_energy = self.system_1.data["energies"][0]
        self.assertAlmostEqual(parsed_energy, expected_energy, places=5)

    def test_forces_extraction(self):
        """Test that forces are correctly extracted from CP2K 2025 format."""
        # Forces are expected in eV/angstrom (converted from hartree/bohr).
        forces = self.system_1.data["forces"]
        self.assertEqual(forces.shape, (1, 2, 3))
        # x-component of the force on the first atom of the only frame.
        first_fx = forces[0][0][0]
        self.assertAlmostEqual(first_fx, -2.94874881, places=5)


class TestCp2k2023FormatStillWorks(unittest.TestCase, CompLabeledSys):
    """Test that CP2K 2023 format still works (regression test).

    Only ``setUp`` is defined here; the actual comparisons are presumably
    inherited from ``CompLabeledSys`` (defined outside this file).
    """

    def setUp(self):
        # System parsed from the pre-2025 CP2K output fixture.
        parsed = dpdata.LabeledSystem(
            "cp2k/cp2k_normal_output/cp2k_output", fmt="cp2k/output"
        )
        # Matching reference data stored in deepmd/npy format.
        reference = dpdata.LabeledSystem(
            "cp2k/cp2k_normal_output/deepmd", fmt="deepmd/npy"
        )
        self.system_1 = parsed
        self.system_2 = reference
        self.places = self.e_places = self.f_places = 6
        self.v_places = 4


class TestCp2k2025EdgeCases(unittest.TestCase):
    """Test edge cases for CP2K 2025 format parsing to improve coverage."""

    # Header required for parsing: everything written before the energy line.
    # Entries carry no trailing newline; "" stands for a blank line.
    _PREAMBLE_LINES = (
        " DBCSR| Multiplication driver XSMM",
        "",
        " **** **** ****** ** PROGRAM STARTED AT 2025-01-15 10:30:00.000",
        " CP2K| version string: CP2K version 2025.1",
        "",
        " CELL| Vector a [angstrom]: 5.000 0.000 0.000 |a| = 5.000",
        " CELL| Vector b [angstrom]: 0.000 5.000 0.000 |b| = 5.000",
        " CELL| Vector c [angstrom]: 0.000 0.000 5.000 |c| = 5.000",
        "",
        " ATOMIC KIND INFORMATION",
        "",
        " 1. Atomic kind: H Number of atoms: 2",
        "",
        " SCF WAVEFUNCTION OPTIMIZATION",
        "",
        " *** SCF run converged in 1 steps ***",
        "",
        " MODULE QUICKSTEP: ATOMIC COORDINATES IN angstrom",
        "",
        " Atom Kind Element X Y Z Z(eff) Mass",
        "",
        " 1 1 H 1 0.000000 0.000000 0.000000 1.00 1.0079",
        " 2 1 H 1 0.760000 0.000000 0.000000 1.00 1.0079",
        "",
    )

    # Energy line used when the caller does not supply one.
    _DEFAULT_ENERGY_LINE = (
        " ENERGY| Total FORCE_EVAL ( QS ) energy [hartree] -7.364190264587725"
    )

    # Lines between the energy line and the per-atom force rows.
    _FORCES_HEADER_LINES = (
        "",
        " FORCES| Atomic forces [hartree/bohr]",
        "",
        " FORCES| Atom x y z |f|",
    )

    # Per-atom force rows used when the caller does not supply any.
    _DEFAULT_FORCES_LINES = (
        " FORCES| 1 -5.73440344E-02 2.95274914E-02 -1.50988167E-02 6.62433792E-02",
        " FORCES| 2 7.92269287E-02 3.84670665E-02 -3.41478833E-02 9.44600412E-02",
    )

    # Stress tensor block and program footer written after the force rows.
    _EPILOGUE_LINES = (
        "",
        " STRESS TENSOR [GPa]",
        "",
        " X Y Z",
        " X 0.12345678 0.00000000 0.00000000",
        " Y 0.00000000 0.12345678 0.00000000",
        " Z 0.00000000 0.00000000 0.12345678",
        "",
        " **** **** ****** ** PROGRAM ENDED AT 2025-01-15 10:30:05.000",
    )

    def create_cp2k_output_2025(self, energy_line=None, forces_lines=None):
        """Create a minimal CP2K 2025 output file for testing.

        Parameters
        ----------
        energy_line : str, optional
            Line written in place of the default ``ENERGY|`` line (without a
            trailing newline). ``None`` selects the default.
        forces_lines : iterable of str, optional
            Lines written after the ``FORCES|`` header in place of the two
            default force rows. ``None`` selects the defaults; an empty
            iterable writes no force rows.

        Returns
        -------
        str
            Path of the temporary ``.out`` file. It is created with
            ``delete=False``, so the caller is responsible for removing it.
        """
        if energy_line is None:
            energy_line = self._DEFAULT_ENERGY_LINE
        if forces_lines is None:
            forces_lines = self._DEFAULT_FORCES_LINES

        lines = [
            *self._PREAMBLE_LINES,
            energy_line,
            *self._FORCES_HEADER_LINES,
            *forces_lines,
            *self._EPILOGUE_LINES,
        ]
        with tempfile.NamedTemporaryFile(mode="w", suffix=".out", delete=False) as f:
            # Each entry becomes exactly one newline-terminated line.
            f.writelines(line + "\n" for line in lines)
        return f.name

    def _load_labeled_system(self, **kwargs):
        """Write a temporary CP2K 2025 output and parse it as a LabeledSystem.

        ``kwargs`` are forwarded to ``create_cp2k_output_2025``. Removal of the
        temporary file is registered with ``addCleanup`` before parsing, so the
        file is deleted even if parsing or a later assertion fails.
        """
        fname = self.create_cp2k_output_2025(**kwargs)
        self.addCleanup(os.unlink, fname)
        return dpdata.LabeledSystem(fname, fmt="cp2k/output")

    def test_cp2k2025_format_with_labeled_system(self):
        """Test CP2K 2025 format using LabeledSystem (integration test for coverage)."""
        system = self._load_labeled_system()
        self.assertIsNotNone(system.data["energies"])
        self.assertIsNotNone(system.data["forces"])
        self.assertEqual(system.data["forces"].shape[1], 2)

    def test_cp2k2025_energy_parsing_with_extra_whitespace(self):
        """Test energy parsing with extra whitespace around value (coverage for parsing robustness)."""
        system = self._load_labeled_system(
            energy_line=" ENERGY| Total FORCE_EVAL ( QS ) energy [hartree] -7.364190264587725 "
        )
        self.assertIsNotNone(system.data["energies"])
        self.assertAlmostEqual(
            system.data["energies"][0], -200.3898256786414, places=5
        )
        self.assertEqual(system.data["forces"].shape[1], 2)

    def test_cp2k2025_force_parsing_with_header_lines(self):
        """Test that FORCES| header lines are correctly skipped (coverage for filtering)."""
        system = self._load_labeled_system(
            forces_lines=[
                " FORCES| Atom x y z |f|",  # Should be skipped - contains "Atom x y z"
                " FORCES| 1 -5.73440344E-02 2.95274914E-02 -1.50988167E-02 6.62433792E-02",
                " FORCES| 2 7.92269287E-02 3.84670665E-02 -3.41478833E-02 9.44600412E-02",
            ]
        )
        self.assertEqual(system.data["forces"].shape[1], 2)
        self.assertAlmostEqual(
            system.data["forces"][0][0][0], -2.94874881, places=5
        )

    def test_cp2k2025_force_parsing_with_atomic_forces_line(self):
        """Test that 'Atomic forces' line is correctly skipped (coverage for filtering)."""
        system = self._load_labeled_system(
            forces_lines=[
                " FORCES| Atomic forces [hartree/bohr]",  # Should be skipped
                " FORCES| 1 -5.73440344E-02 2.95274914E-02 -1.50988167E-02 6.62433792E-02",
                " FORCES| 2 7.92269287E-02 3.84670665E-02 -3.41478833E-02 9.44600412E-02",
            ]
        )
        self.assertEqual(system.data["forces"].shape[1], 2)
        self.assertAlmostEqual(
            system.data["forces"][0][0][0], -2.94874881, places=5
        )


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading