From 32d4397157d3cc657abaf3c7219b23c010c6a25e Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Wed, 17 Dec 2025 17:15:14 +0000 Subject: [PATCH 1/6] add error percentage to cpu and ram monitor --- diagnostic_common_diagnostics/README.md | 2 ++ .../cpu_monitor.py | 21 ++++++++++--------- .../ram_monitor.py | 11 ++++++++-- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index 959adc85f..6b8fe687d 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -14,6 +14,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "cpu_monitor_" + hostname. * Uses the following args: * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. + * error_percentage: If the CPU usage is > error_percentage, a ERROR status will be published. * window: the maximum length of the used collections.deque for queuing CPU readings. ### Published Topics @@ -97,6 +98,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "ram_monitor_" + hostname. * Uses the following args: * warning_percentage: If the RAM usage is > warning_percentage, a WARN status will be published. + * error_percentage: If the RAM usage is > error_percentage, a ERROR status will be published. * window: the maximum length of the used collections.deque for queuing RAM readings. ### Published Topics diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py index 32dd60eb3..ccc74616a 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -51,10 +51,11 @@ class CpuTask(DiagnosticTask): - def __init__(self, warning_percentage=90, window=1): + def __init__(self, warning_percentage=90, error_percentage=100, window=1): DiagnosticTask.__init__(self, 'CPU Information') self._warning_percentage = int(warning_percentage) + self._error_percentage = int(error_percentage) self._readings = deque(maxlen=window) def _get_average_reading(self): @@ -71,15 +72,12 @@ def run(self, stat): stat.add('CPU Load Average', f'{cpu_average:.2f}') - warn = False - for idx, cpu_percentage in enumerate(cpu_percentages): - stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}') - if cpu_percentage > self._warning_percentage: - warn = True - - if warn: + if cpu_average > self._error_percentage: + stat.summary(DiagnosticStatus.ERROR, + f'CPU Average exceeds {self._error_percentage} percent') + elif cpu_average > self._warning_percentage: stat.summary(DiagnosticStatus.WARN, - f'At least one CPU exceeds {self._warning_percentage} percent') + f'CPU Average exceeds {self._warning_percentage} percent') else: stat.summary(DiagnosticStatus.OK, f'CPU Average {cpu_average:.2f} percent') @@ -100,16 +98,19 @@ def main(args=None): # Declare and get parameters node.declare_parameter('warning_percentage', 90) + node.declare_parameter('error_percentage', 100) node.declare_parameter('window', 1) warning_percentage = node.get_parameter( 'warning_percentage').get_parameter_value().integer_value + error_percentage = node.get_parameter( + 'error_percentage').get_parameter_value().integer_value window = node.get_parameter('window').get_parameter_value().integer_value # Create diagnostic updater with default updater rate of 1 hz updater = Updater(node) updater.setHardwareID(hostname) - updater.add(CpuTask(warning_percentage=warning_percentage, window=window)) + updater.add(CpuTask(warning_percentage=warning_percentage, error_percentage=error_percentage, window=window)) rclpy.spin(node) diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py index da59a6d25..92eebe3c9 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py @@ -48,9 +48,10 @@ class RamTask(DiagnosticTask): - def __init__(self, warning_percentage, window): + def __init__(self, warning_percentage, error_percentage, window): DiagnosticTask.__init__(self, 'RAM Information') self._warning_percentage = int(warning_percentage) + self._error_percentage = int(error_percentage) self._readings = collections.deque(maxlen=window) def run(self, stat): @@ -59,7 +60,12 @@ def run(self, stat): stat.add('RAM Load Average', f'{ram_average:.2f}') - if ram_average > self._warning_percentage: + if ram_average > self._error_percentage: + stat.summary( + DiagnosticStatus.ERROR, + f'RAM Average exceeds {self._error_percentage:d} percent', + ) + elif ram_average > self._warning_percentage: stat.summary( DiagnosticStatus.WARN, f'RAM Average exceeds {self._warning_percentage:d} percent', @@ -84,6 +90,7 @@ def main(): updater.add( RamTask( node.declare_parameter('warning_percentage', 90).value, + node.declare_parameter('error_percentage', 100).value, node.declare_parameter('window', 1).value, ) ) From f20f0ca5b85a7ee4e4ce88ed85e72912d2352b1d Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Wed, 17 Dec 2025 17:23:20 +0000 Subject: [PATCH 2/6] fix doc --- diagnostic_common_diagnostics/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index 6b8fe687d..d2b5bd6de 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -14,7 +14,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "cpu_monitor_" + hostname. * Uses the following args: * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. - * error_percentage: If the CPU usage is > error_percentage, a ERROR status will be published. + * error_percentage: If the CPU usage is > error_percentage, an ERROR status will be published. * window: the maximum length of the used collections.deque for queuing CPU readings. ### Published Topics @@ -98,7 +98,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "ram_monitor_" + hostname. * Uses the following args: * warning_percentage: If the RAM usage is > warning_percentage, a WARN status will be published. - * error_percentage: If the RAM usage is > error_percentage, a ERROR status will be published. + * error_percentage: If the RAM usage is > error_percentage, an ERROR status will be published. * window: the maximum length of the used collections.deque for queuing RAM readings. ### Published Topics From d42401c1937e72734cc6d99b3a9fdcd9a855604d Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Wed, 17 Dec 2025 17:33:13 +0000 Subject: [PATCH 3/6] fix format --- .../diagnostic_common_diagnostics/cpu_monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py index ccc74616a..b94af3289 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -110,7 +110,8 @@ def main(args=None): # Create diagnostic updater with default updater rate of 1 hz updater = Updater(node) updater.setHardwareID(hostname) - updater.add(CpuTask(warning_percentage=warning_percentage, error_percentage=error_percentage, window=window)) + updater.add(CpuTask(warning_percentage=warning_percentage, error_percentage=error_percentage, + window=window)) rclpy.spin(node) From fe701e998e07abc182e48346c869018bc99d73dd Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Wed, 17 Dec 2025 17:44:03 +0000 Subject: [PATCH 4/6] fix test --- .../test/systemtest/test_cpu_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py index 28430c482..4dc83f0c1 100644 --- a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py +++ b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py @@ -85,7 +85,7 @@ def test_warn(self): print(f'Raw readings: {task._readings}') self.assertEqual(task.name, 'CPU Information') self.assertEqual(stat.level, DiagnosticStatus.WARN) - self.assertIn(str('At least one CPU exceeds'), stat.message) + self.assertIn(str('CPU Average exceeds'), stat.message) # Check for at least 1 CPU Load Average and 1 CPU Load self.assertGreaterEqual(len(stat.values), 2) From 374e5ac2d555dd775a995907952d6092f9d398e8 Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Wed, 17 Dec 2025 17:46:41 +0000 Subject: [PATCH 5/6] revert each cpu load stat --- .../diagnostic_common_diagnostics/cpu_monitor.py | 3 +++ .../diagnostic_common_diagnostics/param_decl.yaml | 0 2 files changed, 3 insertions(+) create mode 100644 diagnostic_common_diagnostics/diagnostic_common_diagnostics/param_decl.yaml diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py index b94af3289..a997ce633 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -72,6 +72,9 @@ def run(self, stat): stat.add('CPU Load Average', f'{cpu_average:.2f}') + for idx, cpu_percentage in enumerate(cpu_percentages): + stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}') + if cpu_average > self._error_percentage: stat.summary(DiagnosticStatus.ERROR, f'CPU Average exceeds {self._error_percentage} percent') diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/param_decl.yaml b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/param_decl.yaml new file mode 100644 index 000000000..e69de29bb From 6f7b33252e546f7372f1433a28fbb5b6fb235a48 Mon Sep 17 00:00:00 2001 From: Sangtaek Lee Date: Tue, 26 May 2026 17:47:20 +0100 Subject: [PATCH 6/6] address comments --- diagnostic_common_diagnostics/README.md | 1 + .../cpu_monitor.py | 22 +++++++++++++------ .../ram_monitor.py | 2 +- .../test/systemtest/test_cpu_monitor.py | 8 +++---- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index d2b5bd6de..f1ae46ead 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -13,6 +13,7 @@ It publishes the usage percentage in a diagnostic message. * Name of the node is "cpu_monitor_" + hostname. * Uses the following args: + * use_average: If true, the average CPU usage over all cores will be used to determine the status. If false, the maximum CPU usage among all cores will be used. * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. * error_percentage: If the CPU usage is > error_percentage, an ERROR status will be published. * window: the maximum length of the used collections.deque for queuing CPU readings. diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py index a997ce633..03faf84c4 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -51,12 +51,13 @@ class CpuTask(DiagnosticTask): - def __init__(self, warning_percentage=90, error_percentage=100, window=1): + def __init__(self, warning_percentage, error_percentage, window, use_average): DiagnosticTask.__init__(self, 'CPU Information') self._warning_percentage = int(warning_percentage) self._error_percentage = int(error_percentage) self._readings = deque(maxlen=window) + self._use_average = use_average def _get_average_reading(self): def avg(lst): @@ -75,12 +76,17 @@ def run(self, stat): for idx, cpu_percentage in enumerate(cpu_percentages): stat.add(f'CPU {idx} Load', f'{cpu_percentage:.2f}') - if cpu_average > self._error_percentage: + if self._use_average: + cpu_usage = cpu_average + else: + cpu_usage = max(cpu_percentages) + + if cpu_usage > self._error_percentage: stat.summary(DiagnosticStatus.ERROR, - f'CPU Average exceeds {self._error_percentage} percent') - elif cpu_average > self._warning_percentage: + f'CPU usage exceeds {self._error_percentage} percent') + elif cpu_usage > self._warning_percentage: stat.summary(DiagnosticStatus.WARN, - f'CPU Average exceeds {self._warning_percentage} percent') + f'CPU usage exceeds {self._warning_percentage} percent') else: stat.summary(DiagnosticStatus.OK, f'CPU Average {cpu_average:.2f} percent') @@ -101,20 +107,22 @@ def main(args=None): # Declare and get parameters node.declare_parameter('warning_percentage', 90) - node.declare_parameter('error_percentage', 100) + node.declare_parameter('error_percentage', 95) + node.declare_parameter('use_average', False) node.declare_parameter('window', 1) warning_percentage = node.get_parameter( 'warning_percentage').get_parameter_value().integer_value error_percentage = node.get_parameter( 'error_percentage').get_parameter_value().integer_value + use_average = node.get_parameter('use_average').get_parameter_value().bool_value window = node.get_parameter('window').get_parameter_value().integer_value # Create diagnostic updater with default updater rate of 1 hz updater = Updater(node) updater.setHardwareID(hostname) updater.add(CpuTask(warning_percentage=warning_percentage, error_percentage=error_percentage, - window=window)) + window=window, use_average=use_average)) rclpy.spin(node) diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py index 92eebe3c9..e50d78b89 100755 --- a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/ram_monitor.py @@ -90,7 +90,7 @@ def main(): updater.add( RamTask( node.declare_parameter('warning_percentage', 90).value, - node.declare_parameter('error_percentage', 100).value, + node.declare_parameter('error_percentage', 95).value, node.declare_parameter('window', 1).value, ) ) diff --git a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py index 4dc83f0c1..33e66f1d6 100644 --- a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py +++ b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py @@ -67,7 +67,7 @@ def diagnostics_callback(self, msg): def test_ok(self): warning_percentage = 100 - task = CpuTask(warning_percentage) + task = CpuTask(warning_percentage=warning_percentage, error_percentage=100, window=1, use_average=False) stat = DiagnosticStatusWrapper() task.run(stat) self.assertEqual(task.name, 'CPU Information') @@ -79,13 +79,13 @@ def test_ok(self): def test_warn(self): warning_percentage = -1 - task = CpuTask(warning_percentage) + task = CpuTask(warning_percentage=warning_percentage, error_percentage=100, window=1, use_average=False) stat = DiagnosticStatusWrapper() task.run(stat) print(f'Raw readings: {task._readings}') self.assertEqual(task.name, 'CPU Information') self.assertEqual(stat.level, DiagnosticStatus.WARN) - self.assertIn(str('CPU Average exceeds'), stat.message) + self.assertIn(str('CPU usage exceeds'), stat.message) # Check for at least 1 CPU Load Average and 1 CPU Load self.assertGreaterEqual(len(stat.values), 2) @@ -96,7 +96,7 @@ def test_updater(self): node = Node('cpu_monitor_test') updater = Updater(node) updater.setHardwareID('test_id') - updater.add(CpuTask()) + updater.add(CpuTask(warning_percentage=95, error_percentage=100, window=1, use_average=False)) node.create_subscription( DiagnosticArray, '/diagnostics', self.diagnostics_callback, 10)