diff --git a/.gitignore b/.gitignore index 6603f927d..ee7e248e6 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,9 @@ dist .ruff_cache .env .tox + +# Ignore charts directory +simplyblock_core/scripts/charts/charts/ + +# Ignore Helm requirements lock file +simplyblock_core/scripts/charts/requirements.lock diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..37d1834ca --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2025 simplyblock GmbH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/docker/Dockerfile b/docker/Dockerfile index ce1a83ae1..c8999b47d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,12 +1,29 @@ # syntax=docker/dockerfile:1 FROM simplyblock/simplyblock:base_image +LABEL name="simplyblock" +LABEL vendor="Simplyblock" +LABEL version="1.0.0" +LABEL release="1" +LABEL summary="Simplyblock control plane component" +LABEL description="Simplyblock control plane container" +LABEL maintainer="developers@simplyblock.io" + +COPY LICENSE /licenses/LICENSE + WORKDIR /app COPY requirements.txt . -RUN pip3 install -r requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt + COPY . /app RUN python setup.py install + +RUN if [ -d /usr/share/terminfo ]; then \ + find /usr/share/terminfo -lname '*ncr260vt300wpp*' -exec rm -f {} + ; \ + rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \ + fi + diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base index 226188c96..735d331b1 100644 --- a/docker/Dockerfile_base +++ b/docker/Dockerfile_base @@ -38,3 +38,4 @@ RUN pip3 install setuptools --upgrade COPY requirements.txt requirements.txt RUN pip3 install -r requirements.txt + diff --git a/docs/talos.md b/docs/talos.md index 47ff817d5..f1406ef38 100644 --- a/docs/talos.md +++ b/docs/talos.md @@ -19,26 +19,12 @@ kubectl label namespace simplyblock \ --overwrite ``` - -Patch the host machine so that OpenEBS could work - Create a machine config patch with the contents below and save as patch.yaml ``` cat > patch.yaml <<'EOF' machine: sysctls: vm.nr_hugepages: "1024" - nodeLabels: - openebs.io/engine: mayastor - kubelet: - extraMounts: - - destination: /var/openebs/local - type: bind - source: /var/openebs/local - options: - - rbind - - rshared - - rw EOF talosctl -e -n patch mc -p @patch.yaml diff --git a/e2e/__init__.py b/e2e/__init__.py index e8cae33f7..31164238e 100644 --- a/e2e/__init__.py +++ b/e2e/__init__.py @@ -55,6 +55,7 @@ from stress_test.continuous_failover_ha_geomtery import 
RandomMultiGeometryFailoverTest from stress_test.continuous_failover_ha_2node import RandomMultiClient2NodeFailoverTest from stress_test.continuous_failover_ha_rdma import RandomRDMAFailoverTest +from stress_test.continuous_failover_ha_multi_client_quick_outage import RandomRapidFailoverNoGap from e2e_tests.upgrade_tests.major_upgrade import TestMajorUpgrade @@ -96,8 +97,8 @@ def get_all_tests(custom=True, ha_test=False): TestLvolFioNpcs0, TestLvolFioNpcs1, TestLvolFioNpcs2, - TestLvolFioQOSBW, - TestLvolFioQOSIOPS, + # TestLvolFioQOSBW, + # TestLvolFioQOSIOPS, TestSingleNodeOutage, # TestSingleNodeReboot, # TestHASingleNodeReboot, @@ -147,6 +148,7 @@ def get_stress_tests(): RandomMultiGeometryFailoverTest, RandomMultiClient2NodeFailoverTest, RandomRDMAFailoverTest, + RandomRapidFailoverNoGap, ] return tests @@ -161,4 +163,4 @@ def get_load_tests(): tests = [ TestLvolOutageLoadTest ] - return tests \ No newline at end of file + return tests diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py index 48f06fd80..d1ea68c38 100644 --- a/e2e/continuous_log_collector.py +++ b/e2e/continuous_log_collector.py @@ -1,6 +1,5 @@ import os from datetime import datetime -from pathlib import Path from utils.ssh_utils import SshUtils, RunnerK8sLog from logger_config import setup_logger @@ -22,7 +21,7 @@ def __init__(self,docker_logs_path=None): def get_log_directory(self): timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - return os.path.join(Path.home(), "container-logs", f"manual-logs-{timestamp}") + return os.path.join('/mnt/nfs_share/', f"snapshot-repliction-from-replicated-clone-{timestamp}") def collect_logs(self, test_name): all_nodes = set() @@ -75,4 +74,4 @@ def collect_logs(self, test_name): if __name__ == "__main__": collector = ContinuousLogCollector() - collector.collect_logs(test_name="Manual") + collector.collect_logs(test_name="snapshot-repliction-from-replicated-clone") diff --git a/e2e/e2e_tests/cluster_test_base.py 
b/e2e/e2e_tests/cluster_test_base.py index 5077544b0..d37222c88 100644 --- a/e2e/e2e_tests/cluster_test_base.py +++ b/e2e/e2e_tests/cluster_test_base.py @@ -401,13 +401,17 @@ def collect_management_details(self, post_teardown=False): cmd = f"{self.base_cmd} sn check {result['uuid']} >& {base_path}/node{node}_check{suffix}.txt" self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) + cmd = f"{self.base_cmd} sn get {result['uuid']} >& {base_path}/node{node}_get{suffix}.txt" + self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) + node+=1 - for node in self.fio_node: + all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines + for node in all_nodes: base_path = os.path.join(self.docker_logs_path, node) - cmd = f"journalctl -k >& {base_path}/jounalctl_{node}.txt" + cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt" self.ssh_obj.exec_command(node, cmd) - cmd = f"dmesg -T >& {base_path}/dmesg_{node}.txt" + cmd = f"dmesg -T >& {base_path}/dmesg_{node}-final.txt" self.ssh_obj.exec_command(node, cmd) def teardown(self, delete_lvols=True, close_ssh=True): diff --git a/e2e/e2e_tests/single_node_multi_fio_perf.py b/e2e/e2e_tests/single_node_multi_fio_perf.py index 86a75c4d5..681cc1742 100644 --- a/e2e/e2e_tests/single_node_multi_fio_perf.py +++ b/e2e/e2e_tests/single_node_multi_fio_perf.py @@ -187,10 +187,11 @@ def cleanup_lvols(self, lvol_configs): self.logger.info("Starting cleanup of LVOLs") for config in lvol_configs: lvol_name = config['lvol_name'] - self.ssh_obj.unmount_path(node=self.client_machines[0], - device=self.lvol_devices[lvol_name]['MountPath']) - self.ssh_obj.remove_dir(node=self.client_machines[0], - dir_path=self.lvol_devices[lvol_name]['MountPath']) + if config['mount']: + self.ssh_obj.unmount_path(node=self.client_machines[0], + device=self.lvol_devices[lvol_name]['MountPath']) + self.ssh_obj.remove_dir(node=self.client_machines[0], + dir_path=self.lvol_devices[lvol_name]['MountPath']) lvol_id = 
self.sbcli_utils.get_lvol_id(lvol_name=lvol_name) subsystems = self.ssh_obj.get_nvme_subsystems(node=self.client_machines[0], nqn_filter=lvol_id) diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py index a2869482d..0f0c9f94e 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client.py @@ -42,6 +42,7 @@ def __init__(self, **kwargs): self.sn_nodes = [] self.current_outage_node = None self.snapshot_names = [] + self.current_outage_nodes = [] self.disconnect_thread = None self.outage_start_time = None self.outage_end_time = None @@ -60,8 +61,7 @@ def __init__(self, **kwargs): # self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt", # "interface_partial_network_interrupt", # "partial_nw"] - self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt", - "interface_partial_network_interrupt"] + self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt"] # self.outage_types = ["partial_nw"] self.blocked_ports = None self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log") @@ -111,7 +111,26 @@ def create_lvols_with_fio(self, count): lvol_name = f"{self.lvol_name}_{i}" if not is_crypto else f"c{self.lvol_name}_{i}" self.logger.info(f"Creating lvol with Name: {lvol_name}, fs type: {fs_type}, crypto: {is_crypto}") try: - if self.current_outage_node: + self.logger.info(f"Current Outage Node: {self.current_outage_nodes}") + if self.current_outage_nodes: + self.logger.info(f"Primary vs secondary: {self.sn_primary_secondary_map}") + skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes] + self.logger.info(f"Skip Nodes: {skip_nodes}") + for node in self.current_outage_nodes: + skip_nodes.append(node) + 
self.logger.info(f"Skip Nodes: {skip_nodes}") + self.logger.info(f"Storage Nodes with sec: {self.sn_nodes_with_sec}") + host_id = [node for node in self.sn_nodes_with_sec if node not in skip_nodes] + self.sbcli_utils.add_lvol( + lvol_name=lvol_name, + pool_name=self.pool_name, + size=self.lvol_size, + crypto=is_crypto, + key1=self.lvol_crypt_keys[0], + key2=self.lvol_crypt_keys[1], + host_id=host_id[0] + ) + elif self.current_outage_node: skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node] skip_nodes.append(self.current_outage_node) skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node]) @@ -276,7 +295,7 @@ def create_lvols_with_fio(self, count): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.lvol_mount_details[lvol_name]["iolog_base_path"], }, @@ -306,11 +325,11 @@ def perform_random_outage(self): node_ip = node_details[0]["mgmt_ip"] node_rpc_port = node_details[0]["rpc_port"] - sleep_n_sec(120) + sleep_n_sec(5) for node in self.sn_nodes_with_sec: - self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], - storage_node_id=node) - + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=node) + self.logger.info("Skipping lvstore dump!!") for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -417,7 +436,7 @@ def perform_random_outage(self): self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600), + args=(node_ip, active_interfaces, 300), ) self.disconnect_thread.start() elif outage_type == "interface_partial_network_interrupt": @@ -430,7 +449,7 @@ def perform_random_outage(self): self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, 
active_interfaces, 600), + args=(node_ip, active_interfaces, 300), ) self.disconnect_thread.start() elif outage_type == "partial_nw": @@ -478,12 +497,12 @@ def perform_random_outage(self): self.ssh_obj.disconnect_lvol_node_device(node=self.lvol_mount_details[lvol]["Client"], device=self.lvol_mount_details[lvol]["Device"]) if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(120) + sleep_n_sec(10) return outage_type - def restart_nodes_after_failover(self, outage_type): + def restart_nodes_after_failover(self, outage_type, restart=False): """Perform steps for node restart.""" node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) node_ip = node_details[0]["mgmt_ip"] @@ -543,14 +562,48 @@ def restart_nodes_after_failover(self, outage_type): self.ssh_obj.exec_command(node=self.lvol_mount_details[lvol]["Client"], command=connect) elif outage_type == "container_stop": - self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) - # Log the restart event - self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=1) + if restart: + max_retries = 10 + retry_delay = 10 # seconds + + # Retry mechanism for restarting the node + for attempt in range(max_retries): + try: + force=False + if attempt == max_retries - 1: + force=True + self.logger.info("[CHECK] Restarting Node via CLI with Force flag as via API Fails.") + else: + self.logger.info("[CHECK] Restarting Node via CLI as via API Fails.") + self.ssh_obj.restart_node(node=self.mgmt_nodes[0], + node_id=self.current_outage_node, + force=force) + # else: + # self.sbcli_utils.restart_node(node_uuid=self.current_outage_node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to 
restart node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to restart node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. Failed to restart node.") + raise # Rethrow the last exception + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + # Log the restart event + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=0) + else: + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + # Log the restart event + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=2) elif "network_interrupt" in outage_type: self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) # Log the restart event - self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=11) + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=6) if not self.k8s_test: for node in self.storage_nodes: @@ -608,9 +661,9 @@ def restart_nodes_after_failover(self, outage_type): # sleep_n_sec(30) for node in self.sn_nodes_with_sec: - self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], - storage_node_id=node) - + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=node) + self.logger.info("Skipping lvstore dump!!") def create_snapshots_and_clones(self): """Create snapshots and clones during an outage.""" @@ -777,7 +830,7 @@ def create_snapshots_and_clones(self): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], }, @@ -786,22 +839,23 @@ def create_snapshots_and_clones(self): self.fio_threads.append(fio_thread) 
self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.") - self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], - new_size=f"{self.int_lvol_size}G") + if self.lvol_mount_details[lvol]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], + new_size=f"{self.int_lvol_size}G") sleep_n_sec(10) - self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], - new_size=f"{self.int_lvol_size}G") - + if self.clone_mount_details[clone_name]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], + new_size=f"{self.int_lvol_size}G") + def delete_random_lvols(self, count): """Delete random lvols during an outage.""" skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node] skip_nodes.append(self.current_outage_node) skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node]) - skip_nodes_lvol = [] - self.logger.info(f"Skipping Nodes: {skip_nodes_lvol}") + self.logger.info(f"Skipping Nodes: {skip_nodes}") available_lvols = [ - lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes_lvol for lvol in lvols + lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes for lvol in lvols ] self.logger.info(f"Available Lvols: {available_lvols}") if len(available_lvols) < count: @@ -922,7 +976,7 @@ def perform_failover_during_outage(self): storage_node_id=node, logs_path=self.docker_logs_path ) - self.create_lvols_with_fio(3) + self.create_lvols_with_fio(5) if not self.k8s_test: for node in self.storage_nodes: self.ssh_obj.restart_docker_logging( @@ -1041,7 +1095,7 @@ def restart_fio(self, iteration): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.lvol_mount_details[lvol]["iolog_base_path"], }, @@ -1150,7 +1204,7 @@ def run(self): storage_node_id=node, 
logs_path=self.docker_logs_path ) - self.create_lvols_with_fio(5) + self.create_lvols_with_fio(3) if not self.k8s_test: for node in self.storage_nodes: self.ssh_obj.restart_docker_logging( @@ -1175,7 +1229,7 @@ def run(self): else: self.logger.info(f"Current outage node: {self.current_outage_node} is secondary node. Skipping delete and create") if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(280) + sleep_n_sec(100) for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -1195,7 +1249,7 @@ def run(self): ) self.logger.info("Waiting for fallback.") if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(100) + sleep_n_sec(15) time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time @@ -1213,23 +1267,24 @@ def run(self): no_task_ok = outage_type in {"partial_nw", "partial_nw_single_port", "lvol_disconnect_primary"} if not self.sbcli_utils.is_secondary_node(self.current_outage_node): self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok) + # pass for clone, clone_details in self.clone_mount_details.items(): self.common_utils.validate_fio_test(clone_details["Client"], log_file=clone_details["Log"]) - # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) - # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) for lvol, lvol_details in self.lvol_mount_details.items(): self.common_utils.validate_fio_test(lvol_details["Client"], log_file=lvol_details["Log"]) - # self.ssh_obj.delete_files(lvol_details["Client"], 
[f"{self.log_path}/local-{lvol}_fio*"]) - # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) # Perform failover and manage resources during outage outage_type = self.perform_failover_during_outage() if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(100) + sleep_n_sec(15) time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py new file mode 100644 index 000000000..c2c1051a2 --- /dev/null +++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py @@ -0,0 +1,534 @@ +# stress_test/continuous_failover_ha_multi_client_quick_outage.py +# Fast outages with long-running FIO, no churn beyond initial setup. 
+# - Create lvols, snapshots, clones ONCE at the beginning +# - Start 60min FIO on all mounts (lvols + clones) +# - Run fast outages (as soon as node is ONLINE again) +# - Every 5 outages: wait for all FIO to complete, validate, then (optionally) wait for migration window +# - Graceful shutdown: suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE -> keep offline ~8 min (500s) -> restart +# - After any restart: 15–30s idle then immediately next outage + +import os +import random +import string +import threading +from datetime import datetime +from utils.common_utils import sleep_n_sec +from exceptions.custom_exception import LvolNotConnectException +from stress_test.lvol_ha_stress_fio import TestLvolHACluster + + +def _rand_id(n=15, first_alpha=True): + letters = string.ascii_uppercase + digits = string.digits + allc = letters + digits + if first_alpha: + return random.choice(letters) + ''.join(random.choices(allc, k=n-1)) + return ''.join(random.choices(allc, k=n)) + + +class RandomRapidFailoverNoGap(TestLvolHACluster): + """ + - Minimal churn (only bootstrap creates) + - Long FIO (60 mins) on every lvol/clone + - Outage pacing: next outage right after ONLINE; add 15–30s buffer post-restart + - Validate FIO and pause for migration every 5 outages + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Base knobs + self.total_lvols = 20 + self.lvol_size = "40G" + self.fio_size = "15G" + + # Validation cadence & FIO runtime + self.validate_every = 5 + self._iter = 0 + self._per_wave_fio_runtime = 3600 # 60 minutes + self._fio_wait_timeout = 5000 # wait for all to finish + + # Internal state + self.fio_threads = [] + self.lvol_mount_details = {} + self.clone_mount_details = {} + self.sn_nodes = [] + self.sn_nodes_with_sec = [] + self.sn_primary_secondary_map = {} + self.node_vs_lvol = {} + self.snapshot_names = [] + self.snap_vs_node = {} + self.current_outage_node = None + self.outage_start_time = None + self.outage_end_time = None + self.first_outage_ts 
= None # track the first outage for migration window + self.test_name = "longfio_nochurn_rapid_outages" + + self.outage_types = [ + "graceful_shutdown", + "container_stop", + # "interface_full_network_interrupt", + ] + + # Names + self.lvol_base = f"lvl{_rand_id(12)}" + self.clone_base = f"cln{_rand_id(12)}" + self.snap_base = f"snap{_rand_id(12)}" + + # Logging file for outages + self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log") + self._init_outage_log() + + # ---------- small utilities ---------- + + def _init_outage_log(self): + os.makedirs(os.path.dirname(self.outage_log_file), exist_ok=True) + with open(self.outage_log_file, "w") as f: + f.write("Timestamp,Node,Outage_Type,Event\n") + + def _log_outage_event(self, node, outage_type, event): + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with open(self.outage_log_file, "a") as f: + f.write(f"{ts},{node},{outage_type},{event}\n") + + def _short_bs(self): + # return f"{2 ** random.randint(2, 7)}K" # 4K–128K + return f"{2 ** 6}K" + + def _pick_outage(self): + random.shuffle(self.outage_types) + return self.outage_types[0] + + # ---------- cluster bootstrap ---------- + + def _wait_cluster_active(self, timeout=900, poll=5): + """ + Poll `sbctl cluster list` until status ACTIVE. + Avoids 400 in_activation when creating lvol/snap/clone during bring-up. 
+ """ + end = datetime.now().timestamp() + timeout + while datetime.now().timestamp() < end: + try: + info = self.ssh_obj.cluster_list(self.mgmt_nodes[0], self.cluster_id) # must wrap "sbctl cluster list" + self.logger.info(info) + # Expect a single row with Status + status = str(info).upper() + if "ACTIVE" in status: + return + except Exception as e: + self.logger.info(f"ERROR: {e}") + sleep_n_sec(poll) + raise RuntimeError("Cluster did not become ACTIVE within timeout") + + def _bootstrap_cluster(self): + # Ensure Cluster is ACTIVE + self._wait_cluster_active() + + # create pool + self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + + # discover storage nodes + storage_nodes = self.sbcli_utils.get_storage_nodes() + for res in storage_nodes['results']: + self.sn_nodes.append(res["uuid"]) + self.sn_nodes_with_sec.append(res["uuid"]) + self.sn_primary_secondary_map[res["uuid"]] = res["secondary_node_id"] + + self.logger.info(f"[LFNG] SN sec map: {self.sn_primary_secondary_map}") + + # initial lvols + mount + then later clone from snapshots + self._create_lvols(count=self.total_lvols) # start_fio=False → we launch after clones + self._seed_snapshots_and_clones() # also mounts clones + + # Start 30 min FIO on all (lvols + clones) + self._kick_fio_for_all(runtime=self._per_wave_fio_runtime) + + # start container logs + if not self.k8s_test: + for node in self.storage_nodes: + self.ssh_obj.restart_docker_logging( + node_ip=node, + containers=self.container_nodes[node], + log_dir=os.path.join(self.docker_logs_path, node), + test_name=self.test_name + ) + else: + self.runner_k8s_log.restart_logging() + + # ---------- lvol / fio helpers ---------- + + def _create_lvols(self, count=1): + for _ in range(count): + fs_type = random.choice(["ext4", "xfs"]) + is_crypto = random.choice([True, False]) + name_core = f"{self.lvol_base}_{_rand_id(6, first_alpha=False)}" + lvol_name = name_core if not is_crypto else f"c{name_core}" + + kwargs = dict( + lvol_name=lvol_name, + 
pool_name=self.pool_name, + size=self.lvol_size, + crypto=is_crypto, + key1=self.lvol_crypt_keys[0], + key2=self.lvol_crypt_keys[1], + ) + + # Avoid outage node & partner during initial placement + if self.current_outage_node: + skip_nodes = [self.current_outage_node, self.sn_primary_secondary_map.get(self.current_outage_node)] + skip_nodes += [p for p, s in self.sn_primary_secondary_map.items() if s == self.current_outage_node] + host_id = [n for n in self.sn_nodes_with_sec if n not in skip_nodes] + if host_id: + kwargs["host_id"] = host_id[0] + + # Ensure cluster ACTIVE before creating + self._wait_cluster_active() + + try: + self.sbcli_utils.add_lvol(**kwargs) + except Exception as e: + self.logger.warning(f"[LFNG] lvol create failed ({lvol_name}) → {e}; retry once after ACTIVE gate") + self._wait_cluster_active() + self.sbcli_utils.add_lvol(**kwargs) + + # record + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name) + self.lvol_mount_details[lvol_name] = { + "ID": lvol_id, + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{lvol_name}.log", + "snapshots": [], + "iolog_base_path": f"{self.log_path}/{lvol_name}_fio_iolog", + } + + # refresh list + self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=f"{self.base_cmd} lvol list", supress_logs=True) + + # track node placement + lvol_node_id = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["node_id"] + self.node_vs_lvol.setdefault(lvol_node_id, []).append(lvol_name) + + # connect + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name) + self.lvol_mount_details[lvol_name]["Command"] = connect_ls + + client_node = random.choice(self.fio_node) + self.lvol_mount_details[lvol_name]["Client"] = client_node + + initial = self.ssh_obj.get_devices(node=client_node) + for c in connect_ls: + _, err = self.ssh_obj.exec_command(node=client_node, command=c) + if err: + nqn = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["nqn"] + 
self.ssh_obj.disconnect_nvme(node=client_node, nqn_grep=nqn) + self.logger.info(f"[LFNG] connect error → clean lvol {lvol_name}") + self.sbcli_utils.delete_lvol(lvol_name=lvol_name, max_attempt=20, skip_error=True) + sleep_n_sec(3) + del self.lvol_mount_details[lvol_name] + self.node_vs_lvol[lvol_node_id].remove(lvol_name) + break + + final = self.ssh_obj.get_devices(node=client_node) + new_dev = None + for d in final: + if d not in initial: + new_dev = f"/dev/{d.strip()}" + break + if not new_dev: + raise LvolNotConnectException("LVOL did not connect") + + self.lvol_mount_details[lvol_name]["Device"] = new_dev + self.ssh_obj.format_disk(node=client_node, device=new_dev, fs_type=fs_type) + + mnt = f"{self.mount_path}/{lvol_name}" + self.ssh_obj.mount_path(node=client_node, device=new_dev, mount_path=mnt) + self.lvol_mount_details[lvol_name]["Mount"] = mnt + + # clean old logs + self.ssh_obj.delete_files(client_node, [ + f"{mnt}/*fio*", + f"{self.log_path}/local-{lvol_name}_fio*", + f"{self.log_path}/{lvol_name}_fio_iolog*" + ]) + + def _seed_snapshots_and_clones(self): + """Create one snapshot and one clone per lvol (best effort). 
Mount clones on same client.""" + for lvol, det in list(self.lvol_mount_details.items()): + # Ensure ACTIVE + self._wait_cluster_active() + + snap_name = f"{self.snap_base}_{_rand_id(8, first_alpha=False)}" + out, err = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], det["ID"], snap_name) + if "(False," in str(out) or "(False," in str(err): + self.logger.warning(f"[LFNG] snapshot create failed for {lvol} → skip clone") + continue + + self.snapshot_names.append(snap_name) + node_id = self.sbcli_utils.get_lvol_details(lvol_id=det["ID"])[0]["node_id"] + self.snap_vs_node[snap_name] = node_id + det["snapshots"].append(snap_name) + + snap_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snap_name) + clone_name = f"{self.clone_base}_{_rand_id(8, first_alpha=False)}" + try: + self.ssh_obj.add_clone(self.mgmt_nodes[0], snap_id, clone_name) + except Exception as e: + self.logger.warning(f"[LFNG] clone create failed for {lvol} → {e}") + continue + + # connect clone + fs_type = det["FS"] + client = det["Client"] + + self.clone_mount_details[clone_name] = { + "ID": self.sbcli_utils.get_lvol_id(clone_name), + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{clone_name}.log", + "snapshot": snap_name, + "Client": client, + "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog", + } + + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name) + self.clone_mount_details[clone_name]["Command"] = connect_ls + + initial = self.ssh_obj.get_devices(node=client) + for c in connect_ls: + _, err = self.ssh_obj.exec_command(node=client, command=c) + if err: + nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"] + self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) + self.logger.info("[LFNG] connect clone error → cleanup") + self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(3) + del 
self.clone_mount_details[clone_name] + continue + + final = self.ssh_obj.get_devices(node=client) + new_dev = None + for d in final: + if d not in initial: + new_dev = f"/dev/{d.strip()}" + break + if not new_dev: + raise LvolNotConnectException("Clone did not connect") + + self.clone_mount_details[clone_name]["Device"] = new_dev + if fs_type == "xfs": + self.ssh_obj.clone_mount_gen_uuid(client, new_dev) + mnt = f"{self.mount_path}/{clone_name}" + self.ssh_obj.mount_path(node=client, device=new_dev, mount_path=mnt) + self.clone_mount_details[clone_name]["Mount"] = mnt + + # purge old logs + self.ssh_obj.delete_files(client, [ + f"{self.log_path}/local-{clone_name}_fio*", + f"{self.log_path}/{clone_name}_fio_iolog*", + f"{mnt}/*fio*" + ]) + + def _kick_fio_for_all(self, runtime=None): + """Start verified fio (PID-checked; auto-rerun) for all lvols + clones.""" + # small stagger to avoid SSH bursts + def _launch(name, det): + self.ssh_obj.run_fio_test( + det["Client"], None, det["Mount"], det["Log"], + size=self.fio_size, name=f"{name}_fio", rw="randrw", + bs=self._short_bs(), nrfiles=8, iodepth=1, numjobs=2, + time_based=True, runtime=runtime, log_avg_msec=1000, + iolog_file=det["iolog_base_path"], max_latency="30s", + verify="md5", verify_dump=1, verify_fatal=1, retries=6, + use_latency=False + ) + + for lvol, det in self.lvol_mount_details.items(): + self.ssh_obj.delete_files(det["Client"], [f"/mnt/{lvol}/*"]) + t = threading.Thread(target=_launch, args=(lvol, det)) + t.start() + self.fio_threads.append(t) + sleep_n_sec(0.2) + + for cname, det in self.clone_mount_details.items(): + self.ssh_obj.delete_files(det["Client"], [f"/mnt/{cname}/*"]) + t = threading.Thread(target=_launch, args=(cname, det)) + t.start() + self.fio_threads.append(t) + sleep_n_sec(0.2) + + # ---------- outage flow ---------- + + def _perform_outage(self): + random.shuffle(self.sn_nodes) + self.current_outage_node = self.sn_nodes[0] + outage_type = self._pick_outage() + + if 
self.first_outage_ts is None: + self.first_outage_ts = int(datetime.now().timestamp()) + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=self.current_outage_node, + logs_path=self.docker_logs_path + ) + + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=self.current_outage_node) + + self.outage_start_time = int(datetime.now().timestamp()) + self._log_outage_event(self.current_outage_node, outage_type, "Outage started") + self.logger.info(f"[LFNG] Outage={outage_type} node={self.current_outage_node}") + + node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + node_ip = node_details[0]["mgmt_ip"] + node_rpc_port = node_details[0]["rpc_port"] + + if outage_type == "graceful_shutdown": + # suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE + try: + self.logger.info(f"[LFNG] Suspending node via: sbcli-dev sn suspend {self.current_outage_node}") + self.sbcli_utils.suspend_node(node_uuid=self.current_outage_node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "suspended", timeout=600) + except Exception: + self.logger.warning("[LFNG] Suspend failed from API; ignoring if already suspended") + + try: + self.sbcli_utils.shutdown_node(node_uuid=self.current_outage_node, force=True, expected_error_code=[503]) + except Exception: + self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], node_id=self.current_outage_node, force=True) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "offline", timeout=900) + + for node in self.sn_nodes_with_sec: + if node != self.current_outage_node: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + 
storage_node_id=node, + logs_path=self.docker_logs_path + ) + # Keep node strictly offline for 5 minutes + sleep_n_sec(500) + + elif outage_type == "container_stop": + self.ssh_obj.stop_spdk_process(node_ip, node_rpc_port) + + elif outage_type == "interface_full_network_interrupt": + # Down all active data interfaces for ~300s (5 minutes) with ping verification + active = self.ssh_obj.get_active_interfaces(node_ip) + self.ssh_obj.disconnect_all_active_interfaces(node_ip, active, 300) + sleep_n_sec(280) + + return outage_type + + def restart_nodes_after_failover(self, outage_type): + + self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}") + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.sn_primary_secondary_map[self.current_outage_node]) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=self.sn_primary_secondary_map[self.current_outage_node], + logs_path=self.docker_logs_path + ) + + # Only wait for ONLINE (skip deep health) + if outage_type == 'graceful_shutdown': + try: + self.ssh_obj.restart_node(self.mgmt_nodes[0], node_id=self.current_outage_node, force=True) + except Exception: + pass + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + elif outage_type == 'container_stop': + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + elif "network_interrupt" in outage_type: + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + + self._log_outage_event(self.current_outage_node, outage_type, "Node online") + self.outage_end_time = int(datetime.now().timestamp()) + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + 
storage_node_id=self.current_outage_node, + logs_path=self.docker_logs_path + ) + + # keep container log streaming going + if not self.k8s_test: + for node in self.storage_nodes: + self.ssh_obj.restart_docker_logging( + node_ip=node, + containers=self.container_nodes[node], + log_dir=os.path.join(self.docker_logs_path, node), + test_name=self.test_name + ) + else: + self.runner_k8s_log.restart_logging() + + # small cool-down before next outage to reduce SSH churn + # sleep_n_sec(random.randint(1, 5)) + + # ---------- main ---------- + + def run(self): + self.logger.info("[LFNG] Starting RandomRapidFailoverNoGap") + self._bootstrap_cluster() + sleep_n_sec(5) + + iteration = 1 + while True: + outage_type = self._perform_outage() + self.restart_nodes_after_failover(outage_type) + + self._iter += 1 + if self._iter % self.validate_every == 0: + self.logger.info(f"[LFNG] {self._iter} outages → wait & validate all FIO") + # Join launch threads so we know all jobs issued + for t in self.fio_threads: + t.join(timeout=10) + self.fio_threads = [] + + # Wait for all fio jobs to end (they’re 30min jobs) + self.common_utils.manage_fio_threads(self.fio_node, [], timeout=self._fio_wait_timeout) + + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + + self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + storage_node_id=node) + + # Validate logs + for lvol, det in self.lvol_mount_details.items(): + self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"]) + for cname, det in self.clone_mount_details.items(): + self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"]) + + # Optional: wait for migration window after FIO completes + # (replace with your actual migration-check, if any) + self.logger.info("[LFNG] FIO validated; 
pausing briefly for migration window") + sleep_n_sec(10) + + # Re-kick next 30min wave + self._kick_fio_for_all(runtime=self._per_wave_fio_runtime) + self.logger.info("[LFNG] Next FIO wave started") + + self.logger.info(f"[LFNG] Iter {iteration} complete → starting next outage ASAP") + iteration += 1 \ No newline at end of file diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage.py b/e2e/stress_test/continuous_failover_ha_multi_outage.py index fb5f6d507..e96a0b547 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_outage.py +++ b/e2e/stress_test/continuous_failover_ha_multi_outage.py @@ -1,5 +1,6 @@ from utils.common_utils import sleep_n_sec from datetime import datetime +from collections import defaultdict from stress_test.continuous_failover_ha_multi_client import RandomMultiClientFailoverTest from exceptions.custom_exception import LvolNotConnectException import threading @@ -8,13 +9,20 @@ import os +generated_sequences = set() + def generate_random_sequence(length): letters = string.ascii_uppercase numbers = string.digits all_chars = letters + numbers - first_char = random.choice(letters) - remaining_chars = ''.join(random.choices(all_chars, k=length - 1)) - return first_char + remaining_chars + + while True: + first_char = random.choice(letters) + remaining_chars = ''.join(random.choices(all_chars, k=length-1)) + result = first_char + remaining_chars + if result not in generated_sequences: + generated_sequences.add(result) + return result class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest): @@ -25,7 +33,7 @@ class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest): def __init__(self, **kwargs): super().__init__(**kwargs) - self.total_lvols = 20 + self.total_lvols = 40 self.lvol_name = f"lvl{generate_random_sequence(15)}" self.clone_name = f"cln{generate_random_sequence(15)}" self.snapshot_name = f"snap{generate_random_sequence(15)}" @@ -48,9 +56,12 @@ def __init__(self, **kwargs): 
self.lvols_without_sec_connect = [] self.test_name = "n_plus_k_failover_multi_client_ha" self.outage_types = [ + "graceful_shutdown", + "interface_full_network_interrupt" + ] + self.outage_types2 = [ "container_stop", "graceful_shutdown", - "interface_partial_network_interrupt", "interface_full_network_interrupt" ] self.blocked_ports = None @@ -61,30 +72,101 @@ def _initialize_outage_log(self): with open(self.outage_log_file, 'w') as log: log.write("Timestamp,Node,Outage_Type,Event\n") - def log_outage_event(self, node, outage_type, event): - timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + def log_outage_event(self, node, outage_type, event, outage_time=0): + """Log an outage event to the outage log file. + + Args: + node (str): Node UUID or IP where the event occurred. + outage_type (str): Type of outage (e.g., port_network_interrupt, container_stop, graceful_shutdown). + event (str): Event description (e.g., 'Outage started', 'Node restarted'). + outage_time (int): Minutes to add to self.outage_start_time. If 0/None, use current time. 
+ """ + # Compute timestamp + if outage_time: + # Uses self.outage_start_time (epoch seconds) + outage_time (minutes) + base_epoch = getattr(self, "outage_start_time", None) + if isinstance(base_epoch, (int, float)) and base_epoch > 0: + ts_dt = datetime.fromtimestamp(int(base_epoch) + int(outage_time) * 60) + else: + # Fallback to now if outage_start_time is missing/invalid + ts_dt = datetime.now() + else: + ts_dt = datetime.now() + + timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S') + + # Write the log line with open(self.outage_log_file, 'a') as log: log.write(f"{timestamp},{node},{outage_type},{event}\n") + def _build_reverse_secondary_map(self): + rev = defaultdict(set) # secondary -> {primary,...} + for p, s in self.sn_primary_secondary_map.items(): + if s: + rev[s].add(p) + return rev + + def _pick_outage_nodes(self, primary_candidates, k): + rev = self._build_reverse_secondary_map() + order = primary_candidates[:] + + random.shuffle(order) + + chosen, blocked = [], set() + for node in order: + if node in blocked: + continue + + chosen.append(node) + blocked.add(node) # itself + sec = self.sn_primary_secondary_map.get(node) + if sec: + blocked.add(sec) # its secondary + blocked.update(rev.get(node, ())) # any primary whose secondary == node + + if len(chosen) == k: + break + + if len(chosen) < k: + raise Exception( + f"Cannot pick {k} nodes without primary/secondary conflicts; only {len(chosen)} possible with current topology." + ) + return chosen + def perform_n_plus_k_outages(self): """ - Perform K (self.npcs) parallel outages as part of N+K configuration. - Ensure only primary nodes are selected for outage. + Select K outage nodes such that no two are in a primary/secondary + relationship (in either direction). Candidates = keys of the map. 
""" - primary_nodes = [node for node in self.sn_nodes if not self.sbcli_utils.is_secondary_node(node)] + # Candidates are nodes that are primary *for someone* (map keys) + primary_candidates = list(self.sn_primary_secondary_map.keys()) + self.current_outage_nodes = [] - if len(primary_nodes) < self.npcs: - raise Exception(f"Not enough primary nodes to perform {self.npcs} outages. Found only {len(primary_nodes)}.") + if len(primary_candidates) < self.npcs: + raise Exception( + f"Need {self.npcs} outage nodes, but only {len(primary_candidates)} primary-role nodes exist." + ) - outage_nodes = random.sample(primary_nodes, k=self.npcs) + outage_nodes = self._pick_outage_nodes(primary_candidates, self.npcs) + self.logger.info(f"Selected outage nodes: {outage_nodes}") outage_combinations = [] - + outage_num = 0 for node in outage_nodes: - outage_type = random.choice(self.outage_types) + if outage_num == 0: + outage_type = random.choice(self.outage_types) + outage_num = 1 + else: + outage_type = random.choice(self.outage_types2) node_details = self.sbcli_utils.get_storage_node_details(node) node_ip = node_details[0]["mgmt_ip"] node_rpc_port = node_details[0]["rpc_port"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + self.logger.info(f"Performing {outage_type} on primary node {node}.") self.log_outage_event(node, outage_type, "Outage started") @@ -105,26 +187,74 @@ def perform_n_plus_k_outages(self): def _graceful_shutdown_node(self, node): try: - self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503]) - self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000) - self.sbcli_utils.shutdown_node(node_uuid=node, expected_error_code=[503]) - self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000) + sleep_n_sec(10) + max_retries = 10 + retry_delay = 10 # seconds + # Retry mechanism for suspending the node + for attempt in range(max_retries): + 
try: + if attempt == max_retries - 1: + self.logger.info("[CHECK] Suspending Node via CLI as via API Fails.") + self.ssh_obj.suspend_node(node=self.mgmt_nodes[0], + node_id=node) + else: + self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to suspend node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to suspend node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. Failed to suspend node.") + raise # Rethrow the last exception + + sleep_n_sec(10) # Wait before shutting down + + # Retry mechanism for shutting down the node + for attempt in range(max_retries): + try: + if attempt == max_retries - 1: + self.logger.info("[CHECK] Shutting down Node via CLI as via API Fails.") + self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], + node_id=node, + force=True) + else: + self.sbcli_utils.shutdown_node(node_uuid=node, force=True, + expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to shutdown node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to shutdown node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. 
Failed to shutdown node.") + raise # Rethrow the last exception except Exception as e: self.logger.error(f"Failed graceful shutdown for node {node}: {str(e)}") def _disconnect_partial_interface(self, node, node_ip): active_interfaces = [nic["if_name"] for nic in self.sbcli_utils.get_storage_node_details(node)[0]["data_nics"]] + active_interfaces = ['eth1'] self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600) + args=(node_ip, active_interfaces, 300) ) self.disconnect_thread.start() def _disconnect_full_interface(self, node, node_ip): + self.logger.info("Handling full interface based network interruption...") active_interfaces = self.ssh_obj.get_active_interfaces(node_ip) self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600) + args=(node_ip, active_interfaces, 300) ) self.disconnect_thread.start() @@ -134,50 +264,81 @@ def delete_random_lvols(self, count): lvol for node, lvols in self.node_vs_lvol.items() if node not in self.current_outage_nodes for lvol in lvols ] + + self.logger.info(f"Available Lvols: {available_lvols}") if len(available_lvols) < count: self.logger.warning("Not enough lvols available to delete the requested count.") count = len(available_lvols) for lvol in random.sample(available_lvols, count): - self.logger.info(f"Deleting lvol {lvol}") + self.logger.info(f"Deleting lvol {lvol}.") snapshots = self.lvol_mount_details[lvol]["snapshots"] to_delete = [] - - # Handle dependent clones for clone_name, clone_details in self.clone_mount_details.items(): if clone_details["snapshot"] in snapshots: - self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"]) + self.common_utils.validate_fio_test(clone_details["Client"], + log_file=clone_details["Log"]) self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False) fio_pids = 
self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True) + sleep_n_sec(10) for pid in fio_pids: self.ssh_obj.kill_processes(clone_details["Client"], pid=pid) + attempt = 1 + while len(fio_pids) > 2: + self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False) + fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True) + if attempt >= 30: + raise Exception("FIO not killed on clone") + attempt += 1 + sleep_n_sec(20) + + sleep_n_sec(10) self.ssh_obj.unmount_path(clone_details["Client"], f"/mnt/{clone_name}") self.ssh_obj.remove_dir(clone_details["Client"], dir_path=f"/mnt/{clone_name}") self.disconnect_lvol(clone_details['ID']) - self.sbcli_utils.delete_lvol(clone_name) + self.sbcli_utils.delete_lvol(clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(30) if clone_name in self.lvols_without_sec_connect: self.lvols_without_sec_connect.remove(clone_name) to_delete.append(clone_name) - + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone_name}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}_fio_iolog*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"/mnt/{clone_name}/*"]) + # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}*.log"]) for del_key in to_delete: del self.clone_mount_details[del_key] - - # Delete snapshots for snapshot in snapshots: snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot) + # snapshot_node = self.snap_vs_node[snapshot] + # if snapshot_node not in skip_nodes: self.ssh_obj.delete_snapshot(self.mgmt_nodes[0], snapshot_id=snapshot_id) self.snapshot_names.remove(snapshot) - # Stop FIO and cleanup lvol - self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], self.lvol_mount_details[lvol]["Log"]) + 
self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], + log_file=self.lvol_mount_details[lvol]["Log"]) self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False) + sleep_n_sec(10) fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True) for pid in fio_pids: self.ssh_obj.kill_processes(self.lvol_mount_details[lvol]["Client"], pid=pid) + attempt = 1 + while len(fio_pids) > 2: + self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False) + fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True) + if attempt >= 30: + raise Exception("FIO not killed on lvols") + attempt += 1 + sleep_n_sec(20) + + sleep_n_sec(10) self.ssh_obj.unmount_path(self.lvol_mount_details[lvol]["Client"], f"/mnt/{lvol}") self.ssh_obj.remove_dir(self.lvol_mount_details[lvol]["Client"], dir_path=f"/mnt/{lvol}") self.disconnect_lvol(self.lvol_mount_details[lvol]['ID']) - self.sbcli_utils.delete_lvol(lvol) + self.sbcli_utils.delete_lvol(lvol, max_attempt=20, skip_error=True) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"/mnt/{lvol}/*"]) + # self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}*.log"]) if lvol in self.lvols_without_sec_connect: self.lvols_without_sec_connect.remove(lvol) del self.lvol_mount_details[lvol] @@ -190,14 +351,19 @@ def delete_random_lvols(self, count): def create_snapshots_and_clones(self): """Create snapshots and clones during an outage, avoiding lvols on outage nodes.""" self.int_lvol_size += 1 + skip_nodes = [node for node in self.sn_primary_secondary_map if 
self.sn_primary_secondary_map[node] in self.current_outage_nodes] + self.logger.info(f"Skip Nodes: {skip_nodes}") + for node in self.current_outage_nodes: + skip_nodes.append(node) + self.logger.info(f"Skip Nodes: {skip_nodes}") available_lvols = [ lvol for node, lvols in self.node_vs_lvol.items() - if node not in self.current_outage_nodes for lvol in lvols + if node not in skip_nodes for lvol in lvols ] if not available_lvols: self.logger.warning("No available lvols to create snapshots and clones.") return - + self.logger.info(f"Available lvols: {available_lvols}") for _ in range(3): random.shuffle(available_lvols) lvol = available_lvols[0] @@ -205,69 +371,140 @@ def create_snapshots_and_clones(self): temp_name = generate_random_sequence(5) if snapshot_name in self.snapshot_names: snapshot_name = f"{snapshot_name}_{temp_name}" - try: output, error = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name) - if "(False," in output or "(False," in error: - raise Exception(output or error) + if "(False," in output: + raise Exception(output) + if "(False," in error: + raise Exception(error) except Exception as e: - self.logger.warning(f"Snapshot creation failed: {e}") - continue - + self.logger.warning(f"Snap creation fails with {str(e)}. 
Retrying with different name.") + try: + snapshot_name = f"snap_{lvol}" + temp_name = generate_random_sequence(5) + snapshot_name = f"{snapshot_name}_{temp_name}" + self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name) + except Exception as exp: + self.logger.warning(f"Retry Snap creation fails with {str(exp)}.") + continue + self.snapshot_names.append(snapshot_name) + lvol_node_id = self.sbcli_utils.get_lvol_details( + lvol_id=self.lvol_mount_details[lvol]["ID"])[0]["node_id"] + self.snap_vs_node[snapshot_name] = lvol_node_id self.lvol_mount_details[lvol]["snapshots"].append(snapshot_name) - clone_name = f"clone_{generate_random_sequence(15)}" + if clone_name in list(self.clone_mount_details): + clone_name = f"{clone_name}_{temp_name}" sleep_n_sec(30) snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot_name) try: self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name) except Exception as e: - self.logger.warning(f"Clone creation failed: {e}") - continue - + self.logger.warning(f"Clone creation fails with {str(e)}. 
Retrying with different name.") + try: + clone_name = f"clone_{generate_random_sequence(15)}" + temp_name = generate_random_sequence(5) + clone_name = f"{clone_name}_{temp_name}" + self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name) + except Exception as exp: + self.logger.warning(f"Retry Clone creation fails with {str(exp)}.") + continue fs_type = self.lvol_mount_details[lvol]["FS"] client = self.lvol_mount_details[lvol]["Client"] self.clone_mount_details[clone_name] = { - "ID": self.sbcli_utils.get_lvol_id(clone_name), - "Command": None, - "Mount": None, - "Device": None, - "MD5": None, - "FS": fs_type, - "Log": f"{self.log_path}/{clone_name}.log", - "snapshot": snapshot_name, - "Client": client + "ID": self.sbcli_utils.get_lvol_id(clone_name), + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{clone_name}.log", + "snapshot": snapshot_name, + "Client": client, + "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog" } + self.logger.info(f"Created clone {clone_name}.") + + sleep_n_sec(3) + + self.ssh_obj.exec_command(node=self.mgmt_nodes[0], + command=f"{self.base_cmd} lvol list") + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name) self.clone_mount_details[clone_name]["Command"] = connect_ls + + # if self.secondary_outage: + # connect_ls = [connect_ls[0]] + # self.lvols_without_sec_connect.append(clone_name) + initial_devices = self.ssh_obj.get_devices(node=client) for connect_str in connect_ls: _, error = self.ssh_obj.exec_command(node=client, command=connect_str) if error: - self.logger.warning(f"Clone connect failed: {error}") + lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"]) + nqn = lvol_details[0]["nqn"] + self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) + self.logger.info(f"Connecting clone {clone_name} has error: {error}. 
Disconnect all connections for that clone!!") + self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(30) + del self.clone_mount_details[clone_name] continue + sleep_n_sec(3) final_devices = self.ssh_obj.get_devices(node=client) - lvol_device = next((f"/dev/{d.strip()}" for d in final_devices if d not in initial_devices), None) + lvol_device = None + for device in final_devices: + if device not in initial_devices: + lvol_device = f"/dev/{device.strip()}" + break if not lvol_device: - raise LvolNotConnectException("Clone device not found") + raise LvolNotConnectException("LVOL did not connect") self.clone_mount_details[clone_name]["Device"] = lvol_device + # Mount and Run FIO if fs_type == "xfs": self.ssh_obj.clone_mount_gen_uuid(client, lvol_device) - mount_point = f"{self.mount_path}/{clone_name}" self.ssh_obj.mount_path(node=client, device=lvol_device, mount_path=mount_point) self.clone_mount_details[clone_name]["Mount"] = mount_point + # clone_node_id = self.sbcli_utils.get_lvol_details( + # lvol_id=self.lvol_mount_details[clone_name]["ID"])[0]["node_id"] + + # self.node_vs_lvol[clone_node_id].append(clone_name) + + sleep_n_sec(10) + self.ssh_obj.delete_files(client, [f"{mount_point}/*fio*"]) self.ssh_obj.delete_files(client, [f"{self.log_path}/local-{clone_name}_fio*"]) - + self.ssh_obj.delete_files(client, [f"{self.log_path}/{clone_name}_fio_iolog*"]) + + sleep_n_sec(5) + + # Start FIO + # fio_thread = threading.Thread( + # target=self.ssh_obj.run_fio_test, + # args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]), + # kwargs={ + # "size": self.fio_size, + # "name": f"{clone_name}_fio", + # "rw": "randrw", + # "bs": f"{2 ** random.randint(2, 7)}K", + # "nrfiles": 16, + # "iodepth": 1, + # "numjobs": 5, + # "time_based": True, + # "runtime": 2000, + # "log_avg_msec": 1000, + # "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], + # "debug": 
True, + # }, + # ) fio_thread = threading.Thread( target=self.ssh_obj.run_fio_test, - args=(client, None, mount_point, self.clone_mount_details[clone_name]["Log"]), + args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]), kwargs={ "size": self.fio_size, "name": f"{clone_name}_fio", @@ -278,15 +515,21 @@ def create_snapshots_and_clones(self): "numjobs": 5, "time_based": True, "runtime": 2000, + "log_avg_msec": 1000, + "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], }, ) fio_thread.start() self.fio_threads.append(fio_thread) + self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.") - self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}") - self.sbcli_utils.resize_lvol(self.lvol_mount_details[lvol]["ID"], f"{self.int_lvol_size}G") + if self.lvol_mount_details[lvol]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], + new_size=f"{self.int_lvol_size}G") sleep_n_sec(10) - self.sbcli_utils.resize_lvol(self.clone_mount_details[clone_name]["ID"], f"{self.int_lvol_size}G") + if self.clone_mount_details[clone_name]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], + new_size=f"{self.int_lvol_size}G") def run(self): @@ -301,6 +544,8 @@ def run(self): for result in storage_nodes['results']: self.sn_nodes.append(result["uuid"]) self.sn_nodes_with_sec.append(result["uuid"]) + self.sn_primary_secondary_map[result["uuid"]] = result["secondary_node_id"] + self.logger.info(f"Secondary node map: {self.sn_primary_secondary_map}") sleep_n_sec(30) @@ -320,11 +565,23 @@ def run(self): for node, outage_type in outage_events: self.current_outage_node = node - self.restart_nodes_after_failover(outage_type) + if outage_type == "container_stop" and self.npcs > 1: + self.restart_nodes_after_failover(outage_type, True) + else: + self.restart_nodes_after_failover(outage_type) self.logger.info("Waiting 
for fallback recovery.") sleep_n_sec(100) + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time @@ -343,12 +600,27 @@ def run(self): # for node, outage_type in outage_events: # if not self.sbcli_utils.is_secondary_node(node): self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok) + self.common_utils.manage_fio_threads(self.fio_node, self.fio_threads, timeout=20000) for clone, clone_details in self.clone_mount_details.items(): self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) for lvol, lvol_details in self.lvol_mount_details.items(): self.common_utils.validate_fio_test(lvol_details["Client"], lvol_details["Log"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) self.logger.info(f"N+K failover iteration {iteration} complete.") + + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) iteration += 1 + diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py index bd06f06f7..a50a61726 100644 --- a/e2e/utils/ssh_utils.py +++ b/e2e/utils/ssh_utils.py @@ -13,6 +13,10 @@ import string import re import 
subprocess +import shlex +import socket +from collections import defaultdict +from typing import Optional, List SSH_KEY_LOCATION = os.path.join(Path.home(), ".ssh", os.environ.get("KEY_NAME")) @@ -47,31 +51,227 @@ def __init__(self, bastion_server): self.log_monitor_threads = {} self.log_monitor_stop_flags = {} self.ssh_semaphore = threading.Semaphore(10) # Max 10 SSH calls in parallel (tune as needed) + self._bastion_client = None + self._reconnect_locks = defaultdict(threading.Lock) + self.ssh_pass = None + + def _candidate_usernames(self, explicit_user) -> List[str]: + if explicit_user: + if isinstance(explicit_user, (list, tuple)): + return list(explicit_user) + return [str(explicit_user)] + return ["ec2-user", "ubuntu", "rocky", "root"] + + def _load_private_keys(self) -> List[paramiko.PKey]: + """ + Try Ed25519 then RSA. If SSH_KEY_LOCATION/env points to a file, use it. + Else try ~/.ssh/id_ed25519 and ~/.ssh/id_rsa. If SSH_KEY_PATH is a dir, load all files from it. + """ + paths = [] + # explicit single file via KEY_NAME → SSH_KEY_LOCATION + if SSH_KEY_LOCATION and os.path.isfile(SSH_KEY_LOCATION): + paths.append(SSH_KEY_LOCATION) + # defaults + home = os.path.join(Path.home(), ".ssh") + paths.extend([os.path.join(home, "id_ed25519"), os.path.join(home, "id_rsa")]) + + keys = [] + seen = set() + for p in paths: + if not os.path.exists(p) or p in seen: + continue + seen.add(p) + try: + keys.append(paramiko.Ed25519Key.from_private_key_file(p)) + continue + except Exception: + pass + try: + keys.append(paramiko.RSAKey.from_private_key_file(p)) + except Exception: + pass + if not keys and not self.ssh_pass: + raise FileNotFoundError("No usable SSH private key found and SSH_PASS not set.") + return keys + + def _try_connect(self, host: str, username: str, pkey: Optional[paramiko.PKey], password: Optional[str], sock=None, timeout=30): + cli = paramiko.SSHClient() + cli.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + cli.connect( + hostname=host, + 
username=username, + pkey=pkey, + password=(password if pkey is None else None), + timeout=timeout, + banner_timeout=timeout, + auth_timeout=timeout, + allow_agent=False, + look_for_keys=False, + sock=sock + ) + return cli + + # def connect(self, address: str, port: int = 22, + # bastion_server_address: str = None, + # username: str = "ec2-user", + # is_bastion_server: bool = False): + # """Connect to cluster nodes""" + # # --- prep usernames list --- + # default_users = ["ec2-user", "ubuntu", "rocky", "root"] + # if getattr(self, "ssh_user", None): + # if isinstance(self.ssh_user, (list, tuple)): + # usernames = list(self.ssh_user) + # else: + # usernames = [str(self.ssh_user)] + # else: + # usernames = default_users + + # # Load key (Ed25519 -> RSA fallback) + # if not os.path.exists(SSH_KEY_LOCATION): + # raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}") + # try: + # private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION) + # except Exception: + # private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION) + + # # Helper to store/replace a connection + # def _store(host, client): + # if self.ssh_connections.get(host): + # try: + # self.ssh_connections[host].close() + # except Exception: + # pass + # self.ssh_connections[host] = client + + # # ---------- direct connection ---------- + # bastion_server_address = bastion_server_address or self.bastion_server + # if not bastion_server_address: + # self.logger.info(f"Connecting directly to {address} on port {port}...") + # last_err = None + # for user in usernames: + # ssh = paramiko.SSHClient() + # ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # try: + # ssh.connect( + # hostname=address, + # username=user, + # port=port, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected directly to {address} as '{user}'.") + # _store(address, ssh) + # return + 
# except Exception as e: + # last_err = e + # self.logger.info(f"Direct login failed for '{user}': {repr(e)}") + # try: + # ssh.close() + # except Exception: + # pass + # raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}") + + # # ---------- connect to bastion ---------- + # self.logger.info(f"Connecting to bastion server {bastion_server_address}...") + # bastion_ssh = paramiko.SSHClient() + # bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # last_err = None + # bastion_user_used = None + # for b_user in usernames: + # try: + # bastion_ssh.connect( + # hostname=bastion_server_address, + # username=b_user, + # port=port, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected to bastion as '{b_user}'.") + # _store(bastion_server_address, bastion_ssh) + # bastion_user_used = b_user + # break + # except Exception as e: + # last_err = e + # self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}") + # if bastion_user_used is None: + # raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}") + # if is_bastion_server: + # return # caller only needed bastion + + # # ---------- tunnel to target through bastion ---------- + # self.logger.info(f"Connecting to target server {address} through bastion server...") + # transport = bastion_ssh.get_transport() + # last_err = None + # for user in usernames: + # # IMPORTANT: open a NEW channel for each username attempt + # try: + # channel = transport.open_channel( + # "direct-tcpip", + # (address, port), + # ("localhost", 0), + # ) + # except paramiko.ssh_exception.ChannelException as ce: + # self.logger.error( + # f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion." 
+ # ) + # raise + # target_ssh = paramiko.SSHClient() + # target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # try: + # target_ssh.connect( + # address, + # username=user, + # port=port, + # sock=channel, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.") + # _store(address, target_ssh) + # return + # except Exception as e: + # last_err = e + # self.logger.info(f"Target login failed for '{user}': {repr(e)}") + # try: + # target_ssh.close() + # except Exception: + # pass + # try: + # channel.close() + # except Exception: + # pass + + # raise Exception( + # f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}" + # ) def connect(self, address: str, port: int = 22, bastion_server_address: str = None, username: str = "ec2-user", is_bastion_server: bool = False): - """Connect to cluster nodes""" - # --- prep usernames list --- - default_users = ["ec2-user", "ubuntu", "rocky", "root"] - if getattr(self, "ssh_user", None): - if isinstance(self.ssh_user, (list, tuple)): - usernames = list(self.ssh_user) - else: - usernames = [str(self.ssh_user)] - else: - usernames = default_users + """ + Connect to a host directly or via bastion, trying multiple usernames and keys, + with optional password fallback. 
+ """ + # Resolve bastion + bastion_server_address = bastion_server_address or self.bastion_server - # Load key (Ed25519 -> RSA fallback) - if not os.path.exists(SSH_KEY_LOCATION): - raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}") - try: - private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION) - except Exception: - private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION) + usernames = self._candidate_usernames(self.ssh_user or username) + keys = self._load_private_keys() + password = self.ssh_pass - # Helper to store/replace a connection def _store(host, client): if self.ssh_connections.get(host): try: @@ -80,230 +280,291 @@ def _store(host, client): pass self.ssh_connections[host] = client - # ---------- direct connection ---------- - bastion_server_address = bastion_server_address or self.bastion_server + # --- NO BASTION: direct connect --- if not bastion_server_address: - self.logger.info(f"Connecting directly to {address} on port {port}...") last_err = None + self.logger.info(f"Connecting directly to {address} on port {port}...") for user in usernames: - ssh = paramiko.SSHClient() - ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - try: - ssh.connect( - hostname=address, - username=user, - port=port, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected directly to {address} as '{user}'.") - _store(address, ssh) - return - except Exception as e: - last_err = e - self.logger.info(f"Direct login failed for '{user}': {repr(e)}") + # try keys + for key in keys: try: - ssh.close() - except Exception: - pass + cli = self._try_connect(address, user, key, None, timeout=30) + self.logger.info(f"Connected directly to {address} as '{user}'.") + _store(address, cli) + return + except Exception as e: + last_err = e + # then password + if password: + try: + cli = self._try_connect(address, user, None, password, 
timeout=30) + self.logger.info(f"Connected directly to {address} as '{user}' (password).") + _store(address, cli) + return + except Exception as e: + last_err = e raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}") - # ---------- connect to bastion ---------- - self.logger.info(f"Connecting to bastion server {bastion_server_address}...") - bastion_ssh = paramiko.SSHClient() - bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - last_err = None - bastion_user_used = None - for b_user in usernames: - try: - bastion_ssh.connect( - hostname=bastion_server_address, - username=b_user, - port=port, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected to bastion as '{b_user}'.") - _store(bastion_server_address, bastion_ssh) - bastion_user_used = b_user + # --- VIA BASTION --- + # ensure bastion client (reuse if alive) + if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()): + last_err = None + self.logger.info(f"Connecting to bastion server {bastion_server_address}...") + for b_user in self._candidate_usernames(self.ssh_user or username): + for key in keys: + try: + cli = self._try_connect(bastion_server_address, b_user, key, None, timeout=30) + self._bastion_client = cli + self.logger.info(f"Connected to bastion as '{b_user}'.") + break + except Exception as e: + last_err = e + else: + if password: + try: + cli = self._try_connect(bastion_server_address, b_user, None, password, timeout=30) + self._bastion_client = cli + self.logger.info(f"Connected to bastion as '{b_user}' (password).") + break + except Exception as e: + last_err = e + continue break - except Exception as e: - last_err = e - self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}") - if bastion_user_used is None: - raise Exception(f"All usernames failed for bastion 
{bastion_server_address}. Last error: {repr(last_err)}") + if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()): + raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}") + if is_bastion_server: - return # caller only needed bastion + # caller only wanted bastion connection open + _store(bastion_server_address, self._bastion_client) + return - # ---------- tunnel to target through bastion ---------- + # open a channel through bastion → target self.logger.info(f"Connecting to target server {address} through bastion server...") - transport = bastion_ssh.get_transport() + bastion_transport = self._bastion_client.get_transport() + last_err = None for user in usernames: - # IMPORTANT: open a NEW channel for each username attempt - try: - channel = transport.open_channel( - "direct-tcpip", - (address, port), - ("localhost", 0), - ) - except paramiko.ssh_exception.ChannelException as ce: - self.logger.error( - f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion." 
- ) - raise - target_ssh = paramiko.SSHClient() - target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - try: - target_ssh.connect( - address, - username=user, - port=port, - sock=channel, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.") - _store(address, target_ssh) - return - except Exception as e: - last_err = e - self.logger.info(f"Target login failed for '{user}': {repr(e)}") + # new channel for each attempt + chan = bastion_transport.open_channel("direct-tcpip", (address, port), ("127.0.0.1", 0)) + # try keys + for key in keys: try: - target_ssh.close() - except Exception: - pass + cli = self._try_connect(address, user, key, None, sock=chan, timeout=30) + self.logger.info(f"Connected to {address} as '{user}' via bastion.") + _store(address, cli) + return + except Exception as e: + last_err = e + # then password + if password: try: - channel.close() - except Exception: - pass - - raise Exception( - f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}" - ) - + cli = self._try_connect(address, user, None, password, sock=chan, timeout=30) + self.logger.info(f"Connected to {address} as '{user}' via bastion (password).") + _store(address, cli) + return + except Exception as e: + last_err = e + try: + chan.close() + except Exception: + pass + + raise Exception(f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}") + + + + # def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False): + # """Executes a command on a given machine with streaming output and retry mechanism. + + # Args: + # node (str): Machine to run command on. + # command (str): Command to run. + # timeout (int): Timeout in seconds. 
+ # max_retries (int): Number of retries in case of failures. + # stream_callback (callable, optional): A callback function for streaming output. Defaults to None. + + # Returns: + # tuple: Final output and error strings after command execution. + # """ + # retry_count = 0 + # while retry_count < max_retries: + # with self.ssh_semaphore: + # ssh_connection = self.ssh_connections.get(node) + # try: + # # Ensure the SSH connection is active, otherwise reconnect + # if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0: + # self.logger.info(f"Reconnecting SSH to node {node}") + # self.connect( + # address=node, + # is_bastion_server=True if node == self.bastion_server else False + # ) + # ssh_connection = self.ssh_connections[node] + + # if not supress_logs: + # self.logger.info(f"Executing command: {command}") + # stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout) + + # output = [] + # error = [] + + # # Read stdout and stderr dynamically if stream_callback is provided + # if stream_callback: + # while not stdout.channel.exit_status_ready(): + # # Process stdout + # if stdout.channel.recv_ready(): + # chunk = stdout.channel.recv(1024).decode() + # output.append(chunk) + # stream_callback(chunk, is_error=False) # Callback for stdout + + # # Process stderr + # if stderr.channel.recv_stderr_ready(): + # chunk = stderr.channel.recv_stderr(1024).decode() + # error.append(chunk) + # stream_callback(chunk, is_error=True) # Callback for stderr + + # time.sleep(0.1) + + # # Finalize any remaining output + # if stdout.channel.recv_ready(): + # chunk = stdout.channel.recv(1024).decode() + # output.append(chunk) + # stream_callback(chunk, is_error=False) + + # if stderr.channel.recv_stderr_ready(): + # chunk = stderr.channel.recv_stderr(1024).decode() + # error.append(chunk) + # stream_callback(chunk, is_error=True) + # else: + # # Default behavior: Read the entire output at once + # output = stdout.read().decode() 
+ # error = stderr.read().decode() + + # # Combine the output into strings + # output = "".join(output) if isinstance(output, list) else output + # error = "".join(error) if isinstance(error, list) else error + + # # Log the results + # if output: + # if not supress_logs: + # self.logger.info(f"Command output: {output}") + # if error: + # if not supress_logs: + # self.logger.error(f"Command error: {error}") + + # if not output and not error: + # if not supress_logs: + # self.logger.warning(f"Command '{command}' executed but returned no output or error.") + + # return output, error + + # except EOFError as e: + # self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except paramiko.SSHException as e: + # self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except paramiko.buffered_pipe.PipeTimeout as e: + # self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except Exception as e: + # self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # # If we exhaust retries, return failure + # self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.") + # return "", "Command failed after max retries" def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False): - """Executes a command on a given machine with streaming output and retry mechanism. - - Args: - node (str): Machine to run command on. - command (str): Command to run. - timeout (int): Timeout in seconds. 
- max_retries (int): Number of retries in case of failures. - stream_callback (callable, optional): A callback function for streaming output. Defaults to None. - - Returns: - tuple: Final output and error strings after command execution. """ - retry_count = 0 - while retry_count < max_retries: + Execute a command with auto-reconnect (serialized per node), optional streaming, + and proper exit-status capture to reduce “ran but no output” confusion. + """ + retry = 0 + while retry < max_retries: with self.ssh_semaphore: - ssh_connection = self.ssh_connections.get(node) + # serialize reconnect attempts per node + lock = self._reconnect_locks[node] + with lock: + ssh = self.ssh_connections.get(node) + if not ssh or not ssh.get_transport() or not ssh.get_transport().is_active() or retry > 0: + if not supress_logs: + self.logger.info(f"Reconnecting SSH to node {node}") + # if node is the bastion itself + self.connect(node, is_bastion_server=(node == self.bastion_server)) + ssh = self.ssh_connections[node] + try: - # Ensure the SSH connection is active, otherwise reconnect - if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0: - self.logger.info(f"Reconnecting SSH to node {node}") - self.connect( - address=node, - is_bastion_server=True if node == self.bastion_server else False - ) - ssh_connection = self.ssh_connections[node] - if not supress_logs: self.logger.info(f"Executing command: {command}") - stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout) + stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout) + output_chunks, error_chunks = [], [] - output = [] - error = [] - - # Read stdout and stderr dynamically if stream_callback is provided if stream_callback: while not stdout.channel.exit_status_ready(): - # Process stdout if stdout.channel.recv_ready(): - chunk = stdout.channel.recv(1024).decode() - output.append(chunk) - stream_callback(chunk, is_error=False) # Callback for stdout - - # 
Process stderr + chunk = stdout.channel.recv(8192).decode(errors="replace") + output_chunks.append(chunk) + stream_callback(chunk, is_error=False) if stderr.channel.recv_stderr_ready(): - chunk = stderr.channel.recv_stderr(1024).decode() - error.append(chunk) - stream_callback(chunk, is_error=True) # Callback for stderr - - time.sleep(0.1) - - # Finalize any remaining output - if stdout.channel.recv_ready(): - chunk = stdout.channel.recv(1024).decode() - output.append(chunk) + chunk = stderr.channel.recv_stderr(8192).decode(errors="replace") + error_chunks.append(chunk) + stream_callback(chunk, is_error=True) + time.sleep(0.05) + + # flush remaining + while stdout.channel.recv_ready(): + chunk = stdout.channel.recv(8192).decode(errors="replace") + output_chunks.append(chunk) stream_callback(chunk, is_error=False) - - if stderr.channel.recv_stderr_ready(): - chunk = stderr.channel.recv_stderr(1024).decode() - error.append(chunk) + while stderr.channel.recv_stderr_ready(): + chunk = stderr.channel.recv_stderr(8192).decode(errors="replace") + error_chunks.append(chunk) stream_callback(chunk, is_error=True) + + exit_status = stdout.channel.recv_exit_status() + out = "".join(output_chunks) + err = "".join(error_chunks) else: - # Default behavior: Read the entire output at once - output = stdout.read().decode() - error = stderr.read().decode() + out = stdout.read().decode(errors="replace") + err = stderr.read().decode(errors="replace") + exit_status = stdout.channel.recv_exit_status() - # Combine the output into strings - output = "".join(output) if isinstance(output, list) else output - error = "".join(error) if isinstance(error, list) else error + if (not supress_logs) and out: + self.logger.info(f"Command output: {out.strip()[:2000]}") + if (not supress_logs) and err: + self.logger.error(f"Command error: {err.strip()[:2000]}") - # Log the results - if output: - if not supress_logs: - self.logger.info(f"Command output: {output}") - if error: - if not supress_logs: - 
self.logger.error(f"Command error: {error}") + if exit_status != 0 and not err: + # some tools write nothing on stderr but non-zero exit + err = f"Non-zero exit status: {exit_status}" - if not output and not error: + if not out and not err: if not supress_logs: self.logger.warning(f"Command '{command}' executed but returned no output or error.") - return output, error - - except EOFError as e: - self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + return out, err - except paramiko.SSHException as e: - self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying - - except paramiko.buffered_pipe.PipeTimeout as e: - self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + except (EOFError, paramiko.SSHException, paramiko.buffered_pipe.PipeTimeout, socket.error) as e: + retry += 1 + self.logger.error(f"SSH command failed ({type(e).__name__}): {e}. Retrying ({retry}/{max_retries})...") + time.sleep(min(2 * retry, 5)) except Exception as e: - self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + retry += 1 + self.logger.error(f"SSH command failed (General): {e}. 
Retrying ({retry}/{max_retries})...") + time.sleep(min(2 * retry, 5)) - # If we exhaust retries, return failure self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.") return "", "Command failed after max retries" - + def format_disk(self, node, device, fs_type="ext4"): """Format disk on the given node @@ -362,14 +623,133 @@ def get_devices(self, node): return output.strip().split() - def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): - """Run FIO Tests with given params and proper logging for MD5 error timestamp tracing. + # def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): + # """ + # Run FIO with optional 'ensure_running' that verifies process presence and retries start up to N times. + + # kwargs: + # - ensure_running: bool (default False) + # - max_start_retries: int (default 3) + # """ + # location = "" + # if device: + # location = f"--filename={device}" + # if directory: + # location = f"--directory={directory}" + + # runtime = kwargs.get("runtime", 3600) + # name = kwargs.get("name", f"fio_{_rid(6)}") + # ioengine = kwargs.get("ioengine", "libaio") + # iodepth = kwargs.get("iodepth", 1) + # time_based = "--time_based" if kwargs.get("time_based", True) else "" + # rw = kwargs.get("rw", "randrw") + # bs = kwargs.get("bs", "4K") + # size = kwargs.get("size", "1G") + # rwmixread = kwargs.get("rwmixread", 70) + # numjobs = kwargs.get("numjobs", 2) + # nrfiles = kwargs.get("nrfiles", 8) + # log_avg_ms = kwargs.get("log_avg_msec", 1000) + # output_fmt = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' + # output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else '' + # iolog_base = kwargs.get("iolog_file") + + # iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" + # log_opt = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else "" + + # command = ( + # f"sudo fio --name={name} 
{location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " + # f"{time_based} --runtime={runtime} --rw={rw} --max_latency=20s --bs={bs} --size={size} --rwmixread={rwmixread} " + # f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " + # f"{log_opt} {iolog_opt} {output_fmt}{output_file}" + # ) + # if kwargs.get("debug"): + # command += " --debug=all" + # if log_file: + # command += f" > {log_file} 2>&1" + + # ensure_running = bool(kwargs.get("ensure_running", False)) + # max_start_retries = int(kwargs.get("max_start_retries", 3)) + + # launch_retries = 3 + # for attempt in range(1, launch_retries + 1): + + # try: + # self.logger.info(f"Starting FIO on {node}: {name} → {location} (attempt {attempt}/{launch_retries})") + # self.exec_command(node=node, command=f"sudo {command}", max_retries=2) + # break + # except Exception as e: + # self.logger.error(f"FIO start failed: {e}") + # if attempt == launch_retries: + # raise + # time.sleep(1.0 * attempt) + + # # Ensure process is up (pgrep name) + # start_retries = 6 + # for i in range(start_retries): + # out, err = self.exec_command( + # node=node, + # command=f"pgrep -fa 'fio.*{name}' || true", + # max_retries=1, + # ) + # if out.strip(): + # self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}") + # return + # # Not running yet → small backoff and try again + # time.sleep(2 + i) + # # If still not, try re-launch quickly + # if i >= 2: + # self.logger.warning(f"FIO still not running for {name}; re-issuing start (try {i-1}/{start_retries-3})") + # try: + # self.exec_command(node=node, command=f"sudo {command}", max_retries=1) + # except Exception as e: + # self.logger.warning(f"Re-start attempt raised: {e}") + + # # If we get here, fio didn’t stick + # raise RuntimeError(f"FIO failed to stay running for job {name} on {node}") + + # def _is_running(): + # # Use pgrep on job name (fio --name=) for a quick check + # # Fall back to ps+grep if pgrep not present. 
+ # try: + # out, _ = self.exec_command(node=node, command=f"pgrep -fl 'fio.*--name={name}'", max_retries=1) + # return bool(out.strip()) + # except Exception: + # out, _ = self.exec_command(node=node, command=f"ps ax | grep -E 'fio.*--name={name}' | grep -v grep || true", max_retries=1) + # return bool(out.strip()) + + # # Try to start; handle EOF/channel close by reconnect+retry + # attempts = 0 + # while True: + # attempts += 1 + # try: + # self.exec_command(node=node, command=command, max_retries=3) + # except Exception as e: + # # Channel/EOF during start is common in churn; retry a few times + # if attempts < max_start_retries: + # self.logger.error(f"FIO start error ({e}); retrying {attempts}/{max_start_retries} in 2s") + # time.sleep(2) + # continue + # else: + # raise + + # if not ensure_running: + # return + + # # Verify started; retry if not + # time.sleep(1.0) + # if _is_running(): + # return + + # if attempts >= max_start_retries: + # raise RuntimeError(f"FIO failed to start after {max_start_retries} attempts for job '{name}'") + + # self.logger.warning(f"FIO not detected running for '{name}'; retrying start {attempts}/{max_start_retries}") + # time.sleep(1.0) - Args: - node (str): Node to perform ssh operation on - device (str): Device path. Defaults to None. - directory (str, optional): Directory to run test on. Defaults to None. - log_file (str, optional): Log file to redirect output to. Defaults to None. + def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): + """ + Start FIO in a detached tmux session so it survives SSH channel drops during fast outages. + Verifies process presence and re-kicks a few times if missing. 
""" location = "" if device: @@ -377,72 +757,63 @@ def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwarg if directory: location = f"--directory={directory}" - runtime = kwargs.get("runtime", 3600) - rw = kwargs.get("rw", "randrw") - name = kwargs.get("name", "test") - ioengine = kwargs.get("ioengine", "libaio") - iodepth = kwargs.get("iodepth", 1) - bs = kwargs.get("bs", "4k") - rwmixread = kwargs.get("rwmixread", 70) - size = kwargs.get("size", "10MiB") - time_based = "--time_based" if kwargs.get("time_based", True) else "" - numjobs = kwargs.get("numjobs", 1) - nrfiles = kwargs.get("nrfiles", 1) - - output_format = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' + runtime = kwargs.get("runtime", 3600) + name = kwargs.get("name", f"fio_{_rid(6)}") + ioengine = kwargs.get("ioengine", "libaio") + iodepth = kwargs.get("iodepth", 1) + time_based = "--time_based" if kwargs.get("time_based", True) else "" + rw = kwargs.get("rw", "randrw") + bs = kwargs.get("bs", "4K") + size = kwargs.get("size", "1G") + rwmixread = kwargs.get("rwmixread", 70) + numjobs = kwargs.get("numjobs", 2) + nrfiles = kwargs.get("nrfiles", 8) + log_avg_ms = kwargs.get("log_avg_msec", 1000) + max_latency = kwargs.get("max_latency", "20s") + use_latency = kwargs.get("use_latency", True) + output_fmt = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else '' + iolog_base = kwargs.get("iolog_file") - log_avg_msec = kwargs.get("log_avg_msec", 1000) - log_avg_msec_opt = f"--log_avg_msec={log_avg_msec}" if log_avg_msec else "" - - iolog_base = kwargs.get("iolog_file", None) - iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" - verify_md5 = "--verify=md5" if iodepth == 1 else "" + iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" + log_opt = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else "" + 
latency = f" --max_latency={max_latency}" if use_latency else "" - command = ( - f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " - f"{time_based} --runtime={runtime} --rw={rw} --max_latency=30s --bs={bs} --size={size} --rwmixread={rwmixread} " - f"{verify_md5} --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " - f"{log_avg_msec_opt} {iolog_opt} " - f"{output_format}{output_file}" - ) - # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - # log_file = log_file or f"/tmp/{name}_{timestamp}.log" + # raw fio command + fio_cmd = ( + f"fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " + f"{time_based} --runtime={runtime} --rw={rw} {latency} --bs={bs} --size={size} --rwmixread={rwmixread} " + f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " + f"{log_opt} {iolog_opt} {output_fmt}{output_file}" + ).strip() if kwargs.get("debug"): - command += " --debug=all" + fio_cmd += " --debug=all" + # run fio under tmux so HUP/SSH channel drops don't kill it + session = f"fio_{name}" if log_file: - command += f" > {log_file} 2>&1" - - # else: - # command += " --debug=verify" - - # awk_ts = " | awk '{ print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; fflush(); }' | " - # command += awk_ts - # command += f"tee {log_file}" - - self.logger.info(f"Executing FIO command:\n{command}") + fio_cmd = f"{fio_cmd} > {log_file} 2>&1" + + start_cmd = f"sudo tmux new-session -d -s {session} \"{fio_cmd}\" || sudo tmux kill-session -t {session} 2>/dev/null || true; sudo tmux new-session -d -s {session} \"{fio_cmd}\"" + self.logger.info(f"Starting FIO on {node}: {name} in tmux session '{session}'") + self.exec_command(node=node, command=start_cmd, max_retries=2) + + # Ensure process is up: check tmux & pgrep + for i in range(8): + out, _ = self.exec_command(node=node, command=f"pgrep -fa 'fio.*{name}' || true", max_retries=1, supress_logs=True) + tmux_ok, _ = 
self.exec_command(node=node, command=f"sudo tmux has-session -t {session} 2>/dev/null || echo MISSING", max_retries=1, supress_logs=True) + if out.strip() and "MISSING" not in tmux_ok: + self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}") + return + if i >= 2: + self.logger.warning(f"FIO not detected yet for {name}; re-issuing start (try {i-1}/5)") + self.exec_command(node=node, command=start_cmd, max_retries=1, supress_logs=True) + time.sleep(2 + i) - start_time = time.time() - output, error = self.exec_command(node=node, command=command, timeout=runtime * 2) - end_time = time.time() - - total_time = end_time - start_time - self.fio_runtime[name] = start_time - self.logger.info(f"Total time taken to run the command: {total_time:.2f} seconds") - - # Return all generated iolog files (one per job) - iolog_files = [f"{iolog_base}.{i}" for i in range(numjobs)] - return { - "output": output, - "error": error, - "start_time": start_time, - "end_time": end_time, - "iolog_files": iolog_files, - } + raise RuntimeError(f"FIO failed to stay running for job {name} on {node}") - + def find_process_name(self, node, process_name, return_pid=False): if return_pid: command = "ps -ef | grep -i '%s' | awk '{print $2}'" % process_name @@ -700,15 +1071,35 @@ def get_lvol_id(self, node, lvol_name): return output.strip().split() def get_snapshot_id(self, node, snapshot_name): - cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name) - output, error = self.exec_command(node=node, command=cmd) + start = time.time() + deadline = start + 600 # 10 minutes + wait_interval = 10 # seconds between checks + snapshot_id = "" + + while time.time() < deadline: + cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name) + output, error = self.exec_command(node=node, command=cmd) + if output.strip(): + if hasattr(self, "logger"): + self.logger.info(f"Snapshot '{snapshot_name}' is visible with ID: 
{snapshot_id}") + break + time.sleep(wait_interval) + + if not output.strip(): + if hasattr(self, "logger"): + self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.") return output.strip() def add_snapshot(self, node, lvol_id, snapshot_name): cmd = f"{self.base_cmd} -d snapshot add {lvol_id} {snapshot_name}" output, error = self.exec_command(node=node, command=cmd) - return output, error + + snapshot_id = self.get_snapshot_id(node=node, snapshot_name=snapshot_name) + + if not snapshot_id: + if hasattr(self, "logger"): + self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.") def add_clone(self, node, snapshot_id, clone_name): cmd = f"{self.base_cmd} -d snapshot clone {snapshot_id} {clone_name}" @@ -971,30 +1362,81 @@ def get_active_interfaces(self, node_ip): return [] - def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300): - """ - Disconnect all active network interfaces on a node in a single SSH call. + # def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300): + # """ + # Disconnect all active network interfaces on a node in a single SSH call. + + # Args: + # node_ip (str): IP of the target node. + # interfaces (list): List of active network interfaces to disconnect. 
+ # """ + # if not interfaces: + # self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.") + # return + + # # Combine disconnect commands for all interfaces + # disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces]) + # reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces]) + + # cmd = ( + # f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &' + # ) + # self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") + # try: + # self.exec_command(node_ip, cmd) + # except Exception as e: + # self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}") + + def _ping_once(self, ip: str, count: int = 1, wait: int = 1) -> bool: + try: + # Use system ping; True means "ping success" + res = subprocess.run(["ping", "-c", str(count), "-W", str(wait), ip], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return res.returncode == 0 + except Exception: + return False - Args: - node_ip (str): IP of the target node. - interfaces (list): List of active network interfaces to disconnect. + def disconnect_all_active_interfaces( + self, + node_ip: str, + interfaces: list[str], + duration_secs: int = 300, + max_tries: int = 3, + ): + """ + Bring all given interfaces DOWN, verify outage by ping, keep for duration, then bring them UP. + Fire-and-forget style; robust against brief SSH flaps. 
""" if not interfaces: - self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.") + self.logger.info(f"No active interfaces provided for {node_ip}; skipping NIC down.") return - # Combine disconnect commands for all interfaces - disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces]) - reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces]) + down_cmd = " && ".join([f"nmcli connection down {i}" for i in interfaces]) + up_cmd = " && ".join([f"nmcli connection up {i}" for i in interfaces]) + cmd = f'nohup sh -c "{down_cmd} && sleep {duration_secs} && {up_cmd}" &' - cmd = ( - f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &' - ) - self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") try: - self.exec_command(node_ip, cmd) + self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") + out, err = self.exec_command(node=node_ip, command=cmd, max_retries=1, timeout=20) + if err: + raise Exception(err) except Exception as e: - self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}") + self.logger.info(f"Command: {cmd}, error: {e}! Checking pings!!") + + # Verify outage begins (best-effort). If ping still works, attempt to issue 'down' again. 
+ time.sleep(5) + tries = 0 + attempts = 10 + while self._ping_once(node_ip) and attempts > 0: + tries += 1 + if tries >= max_tries: + self.logger.warning(f"Ping to {node_ip} still responding after NIC down attempts; continuing anyway.") + break + self.logger.info(f"Ping to {node_ip} still alive; retrying NIC down...") + # re-run only the DOWN part (don’t append sleep again to avoid stacking) + self.exec_command(node=node_ip, command=f'nohup sh -c "{down_cmd}" &', max_retries=2) + time.sleep(3) + attempts -= 1 def check_tmux_installed(self, node_ip): """Check tmux installation @@ -1420,132 +1862,263 @@ def dump_lvstore(self, node_ip, storage_node_id): self.logger.error(f"Failed to dump lvstore on {node_ip}: {e}") return None - def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): - """ - Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON, - and copy logs from SPDK container. + # def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): + # """ + # Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON, + # and copy logs from SPDK container. 
+ + # Args: + # storage_node_ip (str): IP of the storage node + # storage_node_id (str): ID of the storage node + # """ + # self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}") + + # # Step 1: Find the SPDK container + # find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'" + # container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd) + # container_name = container_name_output.strip() + + # if not container_name: + # self.logger.warning(f"No SPDK container found on {storage_node_ip}") + # return + + # # Step 2: Get bdev_get_bdevs output + # # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" + # # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) + + # # if error: + # # self.logger.error(f"Error running bdev_get_bdevs: {error}") + # # return + + # # # Step 3: Save full output to local file + # # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") + # # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json" + # # with open(raw_output_path, "w") as f: + # # f.write(bdev_output) + # # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}") + + # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") + # base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/" + + # cmd = f"sudo mkdir -p '{base_path}'" + # self.exec_command(storage_node_ip, cmd) + + # remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json" + + # # 1. Run to capture output into a variable (for parsing) + # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs'" + # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) + + # if error: + # self.logger.error(f"Error running bdev_get_bdevs: {error}") + # return + + # # 2. 
Run again to save output on host machine (audit trail) + # bdev_save_cmd = ( + # f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs > {remote_output_path}\"") + + # self.exec_command(storage_node_ip, bdev_save_cmd) + # self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}") + + + # # Step 4: Extract unique distrib names + # try: + # bdevs = json.loads(bdev_output) + # distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')}) + # except json.JSONDecodeError as e: + # self.logger.error(f"JSON parsing failed: {e}") + # return + + # if not distribs: + # self.logger.warning("No distrib names found in bdev_get_bdevs output.") + # return + + # self.logger.info(f"Distributions found: {distribs}") + + # # Step 5: Process each distrib + # for distrib in distribs: + # self.logger.info(f"Processing distrib: {distrib}") + # rpc_json = { + # "subsystems": [ + # { + # "subsystem": "distr", + # "config": [ + # { + # "method": "distr_debug_placement_map_dump", + # "params": {"name": distrib} + # } + # ] + # } + # ] + # } + + # rpc_json_str = json.dumps(rpc_json) + # remote_json_path = "/tmp/stack.json" + + # # Save JSON file remotely + # create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}" + # self.exec_command(storage_node_ip, create_json_command) + + # # Copy into container + # copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}" + # self.exec_command(storage_node_ip, copy_json_command) + + # # Run RPC inside container + # rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path} /mnt/ramdisk/{container_name}/spdk.sock'" + # self.exec_command(storage_node_ip, rpc_command) + + # # Find and copy log + # find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}" + # log_file_name, _ = 
self.exec_command(storage_node_ip, find_log_command) + # log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "") + + # if not log_file_name: + # self.logger.error(f"No log file found for distrib {distrib}.") + # continue + + # log_file_path = f"/tmp/{log_file_name}" + # local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}" + # copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}" + # self.exec_command(storage_node_ip, copy_log_cmd) + + # self.logger.info(f"Fetched log for {distrib}: {local_log_path}") + + # # Clean up + # delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}" + # self.exec_command(storage_node_ip, delete_log_cmd) + + # self.logger.info("All distrib logs retrieved successfully.") - Args: - storage_node_ip (str): IP of the storage node - storage_node_id (str): ID of the storage node - """ + def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}") - # Step 1: Find the SPDK container - find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'" - container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd) - container_name = container_name_output.strip() - + # 0) Find SPDK container name + find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true" + container_name_out, _ = self.exec_command(storage_node_ip, find_container_cmd) + container_name = (container_name_out or "").strip() if not container_name: self.logger.warning(f"No SPDK container found on {storage_node_ip}") return - # Step 2: Get bdev_get_bdevs output - # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" - # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) - - # if error: - # self.logger.error(f"Error running bdev_get_bdevs: {error}") 
- # return - - # # Step 3: Save full output to local file - # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") - # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json" - # with open(raw_output_path, "w") as f: - # f.write(bdev_output) - # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}") - - timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") - base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/" - - cmd = f"sudo mkdir -p '{base_path}'" - self.exec_command(storage_node_ip, cmd) - - remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json" - - # 1. Run to capture output into a variable (for parsing) - bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" - bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) - - if error: - self.logger.error(f"Error running bdev_get_bdevs: {error}") + # 1) Get bdevs via correct sock + timestamp = datetime.now().strftime("%Y%m%d_%H-%M-%S") + base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs" + self.exec_command(storage_node_ip, f"sudo mkdir -p '{base_path}' && sudo chmod -R 777 '{base_path}'") + bdev_cmd = ( + f"sudo docker exec {container_name} bash -lc " + f"\"python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs\"" + ) + bdev_output, bdev_err = self.exec_command(storage_node_ip, bdev_cmd) + if (bdev_err and bdev_err.strip()) and not bdev_output: + self.logger.error(f"bdev_get_bdevs error on {storage_node_ip}: {bdev_err.strip()}") return - # 2. 
Run again to save output on host machine (audit trail) - bdev_save_cmd = ( - f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py bdev_get_bdevs > {remote_output_path}\"") - - self.exec_command(storage_node_ip, bdev_save_cmd) - self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}") - - - # Step 4: Extract unique distrib names + # Parse distrib names try: bdevs = json.loads(bdev_output) - distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')}) + distribs = sorted({ + b.get("name", "") + for b in bdevs + if isinstance(b, dict) and str(b.get("name","")).startswith("distrib_") + }) except json.JSONDecodeError as e: - self.logger.error(f"JSON parsing failed: {e}") + self.logger.error(f"JSON parsing failed on {storage_node_ip}: {e}") return - if not distribs: - self.logger.warning("No distrib names found in bdev_get_bdevs output.") + self.logger.warning(f"No distrib_* bdevs found on {storage_node_ip}.") + return + self.logger.info(f"[{storage_node_ip}] Distributions: {distribs}") + + # 2) Run multiple docker exec in parallel from ONE SSH exec + distrib_list_str = " ".join(shlex.quote(d) for d in distribs) + remote_tar = f"/tmp/distrib_logs_{timestamp}.tar.gz" + + # IMPORTANT: This script runs on the HOST and spawns many `docker exec ... &` in parallel. + # It throttles with MAXJ, waits, then tars outputs from /tmp inside the container into one tarball on the host. 
+ remote_script = f"""\ +set -euo pipefail +CN={shlex.quote(container_name)} +SOCK="/mnt/ramdisk/$CN/spdk.sock" +TS="{timestamp}" +MAXJ=8 +WORKDIR_HOST="{base_path}" +mkdir -p "$WORKDIR_HOST" + +# Make a temporary host folder to collect per-distrib files copied out of the container +HOST_STAGING="/tmp/distrib_host_collect_$TS" +mkdir -p "$HOST_STAGING" + +pids=() + +for d in {distrib_list_str}; do + ( + # Build JSON on host then copy into container (avoids many ssh execs) + JF="/tmp/stack_${{d}}.json" + cat > "$JF" <<'EOF_JSON' +{{ + "subsystems": [ + {{ + "subsystem": "distr", + "config": [ + {{ + "method": "distr_debug_placement_map_dump", + "params": {{"name": "__DIST__"}} + }} + ] + }} + ] +}} +EOF_JSON + # substitute distrib name + sed -i "s/__DIST__/$d/g" "$JF" + + # Copy JSON into container + sudo docker cp "$JF" "$CN:/tmp/stack_${{d}}.json" + + # Run rpc inside container (socket path respected) + sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py /tmp/stack_${{d}}.json {shlex.quote('/mnt/ramdisk/'+container_name+'/spdk.sock')} > /tmp/rpc_${{d}}.log 2>&1 || true" + + # Copy any files for this distrib out to host staging (rpc log + any matching /tmp/*d*) + sudo docker cp "$CN:/tmp/rpc_${{d}}.log" "$HOST_STAGING/rpc_${{d}}.log" 2>/dev/null || true + # try to pull any distrib-related artifacts + for f in $(sudo docker exec "$CN" bash -lc "ls /tmp/ 2>/dev/null | grep -F \"$d\" || true"); do + sudo docker cp "$CN:/tmp/$f" "$HOST_STAGING/$f" 2>/dev/null || true + done + + # cleanup container temp for this distrib + sudo docker exec "$CN" bash -lc "rm -f /tmp/stack_${{d}}.json /tmp/rpc_${{d}}.log" || true + rm -f "$JF" || true + ) & + + # throttle parallel jobs + while [ "$(jobs -rp | wc -l)" -ge "$MAXJ" ]; do sleep 0.2; done +done + +# Wait for all background jobs +wait + +# Tar once on host +tar -C "$HOST_STAGING" -czf {shlex.quote(remote_tar)} . 
2>/dev/null || true + +# Move artifacts to final location +mv -f {shlex.quote(remote_tar)} "$WORKDIR_HOST/" || true + +# Also copy loose files (for convenience) then clean staging +cp -rf "$HOST_STAGING"/. "$WORKDIR_HOST"/ 2>/dev/null || true +rm -rf "$HOST_STAGING" || true + +echo "$WORKDIR_HOST/{os.path.basename(remote_tar)}" +""" + + run_many_cmd = "bash -lc " + shlex.quote(remote_script) + tar_out, tar_err = self.exec_command(storage_node_ip, run_many_cmd) + if (tar_err and tar_err.strip()) and not tar_out: + self.logger.error(f"[{storage_node_ip}] Parallel docker-exec script error: {tar_err.strip()}") return - self.logger.info(f"Distributions found: {distribs}") - - # Step 5: Process each distrib - for distrib in distribs: - self.logger.info(f"Processing distrib: {distrib}") - rpc_json = { - "subsystems": [ - { - "subsystem": "distr", - "config": [ - { - "method": "distr_debug_placement_map_dump", - "params": {"name": distrib} - } - ] - } - ] - } - - rpc_json_str = json.dumps(rpc_json) - remote_json_path = "/tmp/stack.json" - - # Save JSON file remotely - create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}" - self.exec_command(storage_node_ip, create_json_command) - - # Copy into container - copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}" - self.exec_command(storage_node_ip, copy_json_command) - - # Run RPC inside container - rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path}'" - self.exec_command(storage_node_ip, rpc_command) - - # Find and copy log - find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}" - log_file_name, _ = self.exec_command(storage_node_ip, find_log_command) - log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "") - - if not log_file_name: - self.logger.error(f"No log file found for distrib {distrib}.") - continue - - log_file_path = f"/tmp/{log_file_name}" - local_log_path 
= f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}" - copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}" - self.exec_command(storage_node_ip, copy_log_cmd) - - self.logger.info(f"Fetched log for {distrib}: {local_log_path}") - - # Clean up - delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}" - self.exec_command(storage_node_ip, delete_log_cmd) + final_tar = (tar_out or "").strip().splitlines()[-1] if tar_out else f"{base_path}/{os.path.basename(remote_tar)}" + self.logger.info(f"[{storage_node_ip}] Distrib logs saved: {base_path} (tar: {final_tar})") - self.logger.info("All distrib logs retrieved successfully.") def clone_mount_gen_uuid(self, node, device): """Repair the XFS filesystem and generate a new UUID. @@ -1722,8 +2295,8 @@ def start_netstat_dmesg_logging(self, node_ip, log_dir): self.exec_command(node_ip, f"sudo tmux new-session -d -s netstat_log 'bash -c \"while true; do netstat -s | grep \\\"segments dropped\\\" >> {netstat_log}; sleep 5; done\"'") self.exec_command(node_ip, f"sudo tmux new-session -d -s dmesg_log 'bash -c \"while true; do sudo dmesg | grep -i \\\"tcp\\\" >> {dmesg_log}; sleep 5; done\"'") - self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'") - + self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k --no-tail | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'") + def reset_iptables_in_spdk(self, node_ip): """ Resets iptables rules inside the SPDK container on a given node. 
@@ -1915,6 +2488,7 @@ def start_resource_monitors(self, node_ip, log_dir): root_log = f"{log_dir}/root_partition_usage_{node_ip}_{timestamp}.txt" docker_mem_log = f"{log_dir}/docker_mem_usage_{node_ip}_{timestamp}.txt" system_mem_log = f"{log_dir}/system_memory_usage_{node_ip}_{timestamp}.txt" + docker_stats_logs = f"{log_dir}/docker_stats_usage_{node_ip}_{timestamp}.txt" # Ensure log directory exists and is writable self.exec_command(node_ip, f"sudo mkdir -p {log_dir} && sudo chmod 777 {log_dir}") @@ -1939,14 +2513,29 @@ def start_resource_monitors(self, node_ip, log_dir): 'bash -c "while true; do date >> {system_mem_log}; free -h >> {system_mem_log}; echo >> {system_mem_log}; sleep 10; done"' """ + docker_stats_cmd = f""" + sudo tmux new-session -d -s docker_stats_all \ + 'bash -c "while true; do date >> {docker_stats_logs}; docker stats --no-stream >> {docker_stats_logs}; echo >> {docker_stats_logs}; sleep 10; done"' + """ + self.exec_command(node_ip, df_cmd) self.exec_command(node_ip, docker_cmd) self.exec_command(node_ip, system_cmd) + self.exec_command(node_ip, docker_stats_cmd) - self.logger.info(f"Started root partition, container memory, and system memory logging on {node_ip}") + self.logger.info(f"Started root partition, container memory, docker stats and system memory logging on {node_ip}") + + def cluster_list(self, node_ip, cluster_id): + """Lists clusters known to the management node + Args: + node_ip (str): Mgmt Node IP to run command on + cluster_id (str): Cluster id (unused; kept for call-signature parity) + """ + cmd = f"{self.base_cmd} cluster list" + output, _ = self.exec_command(node_ip, cmd) + return output.strip() - def suspend_cluster(self, node_ip, cluster_id): """Sets cluster in suspended state @@ -1995,7 +2584,7 @@ def ensure_nfs_mounted(self, node, nfs_server, nfs_path, mount_point, is_local = """ check_cmd = f"mount | grep -w '{mount_point}'" mount_cmd = f"sudo mkdir -p {mount_point} && sudo mount -t nfs {nfs_server}:{nfs_path} {mount_point}" - install_check_cmd 
= "dnf list installed nfs-util" + install_check_cmd = "dnf list installed nfs-utils" install_cmd = "sudo dnf install -y nfs-utils" try: @@ -2300,3 +2889,10 @@ def stop_log_monitor(self): self._monitor_stop_flag.set() self._monitor_thread.join(timeout=10) print("K8s log monitor thread stopped.") + +def _rid(n=6): + import string + import random + letters = string.ascii_uppercase + digits = string.digits + return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1)) diff --git a/requirements.txt b/requirements.txt index 030cca8e0..9ee458f00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ flask-openapi3 jsonschema fastapi uvicorn +prometheus_api_client \ No newline at end of file diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 59357ce89..d5d889a55 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -47,7 +47,7 @@ commands: _150 TiB / 3 * 2 = 100TiB_ would be a safe choice. dest: max_prov type: str - required: true + required: false - name: "--nodes-per-socket" help: "number of each node to be added per each socket." dest: nodes_per_socket @@ -60,6 +60,16 @@ commands: dest: sockets_to_use type: str default: "0" + - name: "--cores-percentage" + help: "The percentage of cores to be used for spdk (0-99)" + description: > + The percentage of cores to be used for spdk (0-99) + dest: cores_percentage + type: + range: + min: 0 + max: 99 + default: 0 - name: "--pci-allowed" help: "Comma separated list of PCI addresses of Nvme devices to use for storage devices." 
description: > @@ -76,6 +86,35 @@ commands: required: false type: str default: "" + - name: "--device-model" + help: "NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together" + description: > + NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together + dest: device_model + required: false + type: str + default: "" + - name: "--size-range" + help: "NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together" + description: > + NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together + dest: size_range + required: false + type: str + default: "" + - name: "--nvme-names" + help: "Comma separated list of nvme namespace names like nvme0n1,nvme1n1..." + description: > + Comma separated list of nvme namespace names like nvme0n1,nvme1n1... + dest: nvme_names + required: false + type: str + default: "" + - name: "--force" + help: "Force format detected or passed nvme pci address to 4K and clean partitions" + dest: force + type: bool + action: store_true - name: configure-upgrade help: "Upgrade the automated configuration file with new changes of cpu mask or storage devices" description: > @@ -86,6 +125,19 @@ commands: description: > Run locally on storage nodes and control plane hosts. Remove a previous deployment to support a fresh scratch-deployment of cluster software. + + - name: clean-devices + help: "clean devices stored in /etc/simplyblock/sn_config_file (local run)" + description: > + Run locally on storage nodes to clean nvme devices and free them. 
+ arguments: + - name: "--config-path" + help: "Config path to read stored nvme devices from" + dest: config_path + required: false + type: str + default: "/etc/simplyblock/sn_config_file" + - name: add-node help: "Adds a storage node by its IP address" arguments: @@ -106,6 +158,11 @@ commands: dest: partitions type: int default: 1 + - name: "--format-4k" + help: "Force format nvme devices with 4K" + dest: format_4k + type: bool + action: store_true - name: "--jm-percent" help: "Number in percent to use for JM from each device" dest: jm_percent @@ -437,16 +494,6 @@ commands: help: "Device id" dest: device_id type: str - - name: reset-device - help: "Resets a storage device" - usage: > - Hardware device reset. Resetting the device can return the device from an unavailable into online state, if - successful. - arguments: - - name: "device_id" - help: "Device id" - dest: device_id - type: str - name: restart-device help: "Restarts a storage device" usage: > @@ -458,6 +505,11 @@ commands: help: "Device id" dest: device_id type: str + - name: "--force" + help: "Force remove" + dest: force + type: bool + action: store_true - name: add-device help: "Adds a new storage device" usage: > @@ -603,6 +655,11 @@ commands: dest: force type: bool action: store_true + - name: "--format" + help: "Format the Alceml device used for JM device" + dest: format + type: bool + action: store_true - name: send-cluster-map help: "Sends a new cluster map" private: true @@ -657,6 +714,16 @@ commands: help: "attr_value" dest: attr_value type: str + - name: new-device-from-failed + help: "Adds a new device to from failed device information" + usage: > + A previously failed and migrated device may be added back into the cluster as a new device. The new device + would have the same info as the failed device but would be empty and not contain any data. 
+ arguments: + - name: "device_id" + help: "Device id" + dest: device_id + type: str - name: "cluster" help: "Cluster commands" weight: 200 @@ -1272,6 +1339,28 @@ commands: help: "Name" dest: name type: str + - name: add-replication + help: Assigns the snapshot replication target cluster + arguments: + - name: "cluster_id" + help: "Cluster id" + dest: cluster_id + type: str + completer: _completer_get_cluster_list + - name: "target_cluster_id" + help: "Target Cluster id" + dest: target_cluster_id + type: str + completer: _completer_get_cluster_list + - name: "--timeout" + help: "Snapshot replication network timeout" + dest: timeout + type: int + default: "3600" + - name: "--target-pool" + help: "Target cluster pool ID or name" + dest: target_pool + type: str - name: "volume" help: "Logical volume commands" aliases: @@ -1402,6 +1491,11 @@ commands: dest: npcs type: int default: 0 + - name: "--replicate" + help: "Replicate LVol snapshot" + dest: replicate + type: bool + action: store_true - name: qos-set help: "Changes QoS settings for an active logical volume" arguments: @@ -1603,6 +1697,52 @@ commands: help: "Logical volume id" dest: volume_id type: str + - name: replication-start + help: "Start snapshot replication taken from lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: "--replication-cluster-id" + help: "Cluster ID of the replication target cluster" + dest: replication_cluster_id + type: str + - name: replication-stop + help: "Stop snapshot replication taken from lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: replication-status + help: "Lists replication status" + arguments: + - name: "cluster_id" + help: "Cluster UUID" + dest: cluster_id + type: str + - name: replication-trigger + help: "Start replication for lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: suspend + help: "Suspend lvol 
subsystems" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: resume + help: "Resume lvol subsystems" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str - name: "control-plane" help: "Control plane commands" aliases: @@ -1827,6 +1967,16 @@ commands: dest: all type: bool action: store_true + - name: "--cluster-id" + help: "Filter snapshots by cluster UUID" + dest: cluster_id + type: str + required: false + - name: "--with-details" + help: "List snapshots with replicate and chaining details" + dest: with_details + type: bool + action: store_true - name: delete help: "Deletes a snapshot" arguments: @@ -1839,6 +1989,13 @@ commands: dest: force type: bool action: store_true + - name: check + help: "Check a snapshot health" + arguments: + - name: "snapshot_id" + help: "Snapshot id" + dest: snapshot_id + type: str - name: clone help: "Provisions a new logical volume from an existing snapshot" arguments: @@ -1855,6 +2012,43 @@ commands: dest: resize type: size default: "0" + - name: replication-status + help: "Lists snapshots replication status" + arguments: + - name: "cluster_id" + help: "Cluster UUID" + dest: cluster_id + type: str + - name: delete-replication-only + help: "Delete replicated version of a snapshot" + arguments: + - name: "snapshot_id" + help: "Snapshot UUID" + dest: snapshot_id + type: str + - name: get + help: "Gets a snapshot information" + arguments: + - name: "snapshot_id" + help: "Snapshot UUID" + dest: snapshot_id + type: str + - name: set + help: "set snapshot db value" + private: true + arguments: + - name: "snapshot_id" + help: "snapshot id" + dest: snapshot_id + type: str + - name: "attr_name" + help: "attr_name" + dest: attr_name + type: str + - name: "attr_value" + help: "attr_value" + dest: attr_value + type: str - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index e70f72339..1c5ed552c 100644 --- 
a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -5,7 +5,7 @@ import sys import traceback -from simplyblock_cli.clibase import CLIWrapperBase, range_type, regex_type, size_type +from simplyblock_cli.clibase import CLIWrapperBase, range_type, size_type from simplyblock_core import utils class CLIWrapper(CLIWrapperBase): @@ -36,6 +36,7 @@ def init_storage_node(self): self.init_storage_node__configure(subparser) self.init_storage_node__configure_upgrade(subparser) self.init_storage_node__deploy_cleaner(subparser) + self.init_storage_node__clean_devices(subparser) self.init_storage_node__add_node(subparser) self.init_storage_node__delete(subparser) self.init_storage_node__remove(subparser) @@ -51,7 +52,6 @@ def init_storage_node(self): if self.developer_mode: self.init_storage_node__device_testing_mode(subparser) self.init_storage_node__get_device(subparser) - self.init_storage_node__reset_device(subparser) self.init_storage_node__restart_device(subparser) self.init_storage_node__add_device(subparser) self.init_storage_node__remove_device(subparser) @@ -77,6 +77,7 @@ def init_storage_node(self): self.init_storage_node__dump_lvstore(subparser) if self.developer_mode: self.init_storage_node__set(subparser) + self.init_storage_node__new_device_from_failed(subparser) def init_storage_node__deploy(self, subparser): @@ -87,11 +88,16 @@ def init_storage_node__deploy(self, subparser): def init_storage_node__configure(self, subparser): subcommand = self.add_sub_command(subparser, 'configure', 'Prepare a configuration file to be used when adding the storage node') argument = subcommand.add_argument('--max-lvol', help='Max logical volume per storage node', type=int, dest='max_lvol', required=True) - argument = subcommand.add_argument('--max-size', help='Maximum amount of GB to be utilized on this storage node', type=str, dest='max_prov', required=True) + argument = subcommand.add_argument('--max-size', help='Maximum amount of GB to be utilized on this storage node', 
type=str, dest='max_prov', required=False) argument = subcommand.add_argument('--nodes-per-socket', help='number of each node to be added per each socket.', type=int, default=1, dest='nodes_per_socket') argument = subcommand.add_argument('--sockets-to-use', help='The system socket to use when adding the storage nodes', type=str, default='0', dest='sockets_to_use') + argument = subcommand.add_argument('--cores-percentage', help='The percentage of cores to be used for spdk (0-99)', type=range_type(0, 99), default=0, dest='cores_percentage') argument = subcommand.add_argument('--pci-allowed', help='Comma separated list of PCI addresses of Nvme devices to use for storage devices.', type=str, default='', dest='pci_allowed', required=False) argument = subcommand.add_argument('--pci-blocked', help='Comma separated list of PCI addresses of Nvme devices to not use for storage devices', type=str, default='', dest='pci_blocked', required=False) + argument = subcommand.add_argument('--device-model', help='NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together', type=str, default='', dest='device_model', required=False) + argument = subcommand.add_argument('--size-range', help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together', type=str, default='', dest='size_range', required=False) + argument = subcommand.add_argument('--nvme-names', help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1...', type=str, default='', dest='nvme_names', required=False) + argument = subcommand.add_argument('--force', help='Force format detected or passed nvme pci address to 4K and clean partitions', dest='force', action='store_true') def init_storage_node__configure_upgrade(self, subparser): subcommand = self.add_sub_command(subparser, 'configure-upgrade', 'Upgrade the automated 
configuration file with new changes of cpu mask or storage devices') @@ -99,12 +105,17 @@ def init_storage_node__configure_upgrade(self, subparser): def init_storage_node__deploy_cleaner(self, subparser): subcommand = self.add_sub_command(subparser, 'deploy-cleaner', 'Cleans a previous simplyblock deploy (local run)') + def init_storage_node__clean_devices(self, subparser): + subcommand = self.add_sub_command(subparser, 'clean-devices', 'clean devices stored in /etc/simplyblock/sn_config_file (local run)') + argument = subcommand.add_argument('--config-path', help='Config path to read stored nvme devices from', type=str, default='/etc/simplyblock/sn_config_file', dest='config_path', required=False) + def init_storage_node__add_node(self, subparser): subcommand = self.add_sub_command(subparser, 'add-node', 'Adds a storage node by its IP address') subcommand.add_argument('cluster_id', help='Cluster id', type=str) subcommand.add_argument('node_addr', help='Address of storage node api to add, like :5000', type=str) subcommand.add_argument('ifname', help='Management interface name', type=str) argument = subcommand.add_argument('--journal-partition', help='1: auto-create small partitions for journal on nvme devices. 0: use a separate (the smallest) nvme device of the node for journal. The journal needs a maximum of 3 percent of total available raw disk space.', type=int, default=1, dest='partitions') + argument = subcommand.add_argument('--format-4k', help='Force format nvme devices with 4K', dest='format_4k', action='store_true') if self.developer_mode: argument = subcommand.add_argument('--jm-percent', help='Number in percent to use for JM from each device', type=int, default=3, dest='jm_percent') argument = subcommand.add_argument('--data-nics', help='Storage network interface names. 
currently one interface is supported.', type=str, dest='data_nics', nargs='+') @@ -210,13 +221,10 @@ def init_storage_node__get_device(self, subparser): subcommand = self.add_sub_command(subparser, 'get-device', 'Gets storage device by its id') subcommand.add_argument('device_id', help='Device id', type=str) - def init_storage_node__reset_device(self, subparser): - subcommand = self.add_sub_command(subparser, 'reset-device', 'Resets a storage device') - subcommand.add_argument('device_id', help='Device id', type=str) - def init_storage_node__restart_device(self, subparser): subcommand = self.add_sub_command(subparser, 'restart-device', 'Restarts a storage device') subcommand.add_argument('device_id', help='Device id', type=str) + argument = subcommand.add_argument('--force', help='Force remove', dest='force', action='store_true') def init_storage_node__add_device(self, subparser): subcommand = self.add_sub_command(subparser, 'add-device', 'Adds a new storage device') @@ -276,6 +284,7 @@ def init_storage_node__restart_jm_device(self, subparser): subcommand = self.add_sub_command(subparser, 'restart-jm-device', 'Restarts a journaling device') subcommand.add_argument('jm_device_id', help='Journaling device id', type=str) argument = subcommand.add_argument('--force', help='Force device remove', dest='force', action='store_true') + argument = subcommand.add_argument('--format', help='Format the Alceml device used for JM device', dest='format', action='store_true') def init_storage_node__send_cluster_map(self, subparser): subcommand = self.add_sub_command(subparser, 'send-cluster-map', 'Sends a new cluster map') @@ -299,6 +308,10 @@ def init_storage_node__set(self, subparser): subcommand.add_argument('attr_name', help='attr_name', type=str) subcommand.add_argument('attr_value', help='attr_value', type=str) + def init_storage_node__new_device_from_failed(self, subparser): + subcommand = self.add_sub_command(subparser, 'new-device-from-failed', 'Adds a new device to from 
failed device information') + subcommand.add_argument('device_id', help='Device id', type=str) + def init_cluster(self): subparser = self.add_command('cluster', 'Cluster commands') @@ -331,6 +344,7 @@ def init_cluster(self): if self.developer_mode: self.init_cluster__set(subparser) self.init_cluster__change_name(subparser) + self.init_cluster__add_replication(subparser) def init_cluster__create(self, subparser): @@ -513,6 +527,13 @@ def init_cluster__change_name(self, subparser): subcommand.add_argument('cluster_id', help='Cluster id', type=str).completer = self._completer_get_cluster_list subcommand.add_argument('name', help='Name', type=str) + def init_cluster__add_replication(self, subparser): + subcommand = self.add_sub_command(subparser, 'add-replication', 'Assigns the snapshot replication target cluster') + subcommand.add_argument('cluster_id', help='Cluster id', type=str).completer = self._completer_get_cluster_list + subcommand.add_argument('target_cluster_id', help='Target Cluster id', type=str).completer = self._completer_get_cluster_list + argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', type=int, default=3600, dest='timeout') + argument = subcommand.add_argument('--target-pool', help='Target cluster pool ID or name', type=str, dest='target_pool') + def init_volume(self): subparser = self.add_command('volume', 'Logical volume commands', aliases=['lvol',]) @@ -533,6 +554,12 @@ def init_volume(self): self.init_volume__get_io_stats(subparser) self.init_volume__check(subparser) self.init_volume__inflate(subparser) + self.init_volume__replication_start(subparser) + self.init_volume__replication_stop(subparser) + self.init_volume__replication_status(subparser) + self.init_volume__replication_trigger(subparser) + self.init_volume__suspend(subparser) + self.init_volume__resume(subparser) def init_volume__add(self, subparser): @@ -562,6 +589,7 @@ def init_volume__add(self, subparser): argument = 
subcommand.add_argument('--pvc-name', '--pvc_name', help='Set logical volume PVC name for k8s clients', type=str, dest='pvc_name') argument = subcommand.add_argument('--data-chunks-per-stripe', help='Erasure coding schema parameter k (distributed raid), default: 1', type=int, default=0, dest='ndcs') argument = subcommand.add_argument('--parity-chunks-per-stripe', help='Erasure coding schema parameter n (distributed raid), default: 1', type=int, default=0, dest='npcs') + argument = subcommand.add_argument('--replicate', help='Replicate LVol snapshot', dest='replicate', action='store_true') def init_volume__qos_set(self, subparser): subcommand = self.add_sub_command(subparser, 'qos-set', 'Changes QoS settings for an active logical volume') @@ -639,6 +667,31 @@ def init_volume__inflate(self, subparser): subcommand = self.add_sub_command(subparser, 'inflate', 'Inflate a logical volume') subcommand.add_argument('volume_id', help='Logical volume id', type=str) + def init_volume__replication_start(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-start', 'Start snapshot replication taken from lvol') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + argument = subcommand.add_argument('--replication-cluster-id', help='Cluster ID of the replication target cluster', type=str, dest='replication_cluster_id') + + def init_volume__replication_stop(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-stop', 'Stop snapshot replication taken from lvol') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__replication_status(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists replication status') + subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + + def init_volume__replication_trigger(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-trigger', 'Start replication for lvol') + 
subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__suspend(self, subparser): + subcommand = self.add_sub_command(subparser, 'suspend', 'Suspend lvol subsystems') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__resume(self, subparser): + subcommand = self.add_sub_command(subparser, 'resume', 'Resume lvol subsystems') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + def init_control_plane(self): subparser = self.add_command('control-plane', 'Control plane commands', aliases=['cp','mgmt',]) @@ -738,7 +791,13 @@ def init_snapshot(self): self.init_snapshot__add(subparser) self.init_snapshot__list(subparser) self.init_snapshot__delete(subparser) + self.init_snapshot__check(subparser) self.init_snapshot__clone(subparser) + self.init_snapshot__replication_status(subparser) + self.init_snapshot__delete_replication_only(subparser) + self.init_snapshot__get(subparser) + if self.developer_mode: + self.init_snapshot__set(subparser) def init_snapshot__add(self, subparser): @@ -749,18 +808,42 @@ def init_snapshot__add(self, subparser): def init_snapshot__list(self, subparser): subcommand = self.add_sub_command(subparser, 'list', 'Lists all snapshots') argument = subcommand.add_argument('--all', help='List soft deleted snapshots', dest='all', action='store_true') + argument = subcommand.add_argument('--cluster-id', help='Filter snapshots by cluster UUID', type=str, dest='cluster_id', required=False) + argument = subcommand.add_argument('--with-details', help='List snapshots with replicate and chaining details', dest='with_details', action='store_true') def init_snapshot__delete(self, subparser): subcommand = self.add_sub_command(subparser, 'delete', 'Deletes a snapshot') subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) argument = subcommand.add_argument('--force', help='Force remove', dest='force', action='store_true') + def 
init_snapshot__check(self, subparser): + subcommand = self.add_sub_command(subparser, 'check', 'Check a snapshot health') + subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) + def init_snapshot__clone(self, subparser): subcommand = self.add_sub_command(subparser, 'clone', 'Provisions a new logical volume from an existing snapshot') subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) subcommand.add_argument('lvol_name', help='Logical volume name', type=str) argument = subcommand.add_argument('--resize', help='New logical volume size: 10M, 10G, 10(bytes). Can only increase.', type=size_type(), default='0', dest='resize') + def init_snapshot__replication_status(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists snapshots replication status') + subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + + def init_snapshot__delete_replication_only(self, subparser): + subcommand = self.add_sub_command(subparser, 'delete-replication-only', 'Delete replicated version of a snapshot') + subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + + def init_snapshot__get(self, subparser): + subcommand = self.add_sub_command(subparser, 'get', 'Gets a snapshot information') + subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + + def init_snapshot__set(self, subparser): + subcommand = self.add_sub_command(subparser, 'set', 'set snapshot db value') + subcommand.add_argument('snapshot_id', help='snapshot id', type=str) + subcommand.add_argument('attr_name', help='attr_name', type=str) + subcommand.add_argument('attr_value', help='attr_value', type=str) + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -809,6 +892,8 @@ def run(self): ret = self.storage_node__configure_upgrade(sub_command, args) elif sub_command in ['deploy-cleaner']: ret = self.storage_node__deploy_cleaner(sub_command, args) + elif sub_command in ['clean-devices']: 
+ ret = self.storage_node__clean_devices(sub_command, args) elif sub_command in ['add-node']: if not self.developer_mode: args.jm_percent = 3 @@ -860,8 +945,6 @@ def run(self): ret = self.storage_node__device_testing_mode(sub_command, args) elif sub_command in ['get-device']: ret = self.storage_node__get_device(sub_command, args) - elif sub_command in ['reset-device']: - ret = self.storage_node__reset_device(sub_command, args) elif sub_command in ['restart-device']: ret = self.storage_node__restart_device(sub_command, args) elif sub_command in ['add-device']: @@ -924,6 +1007,8 @@ def run(self): ret = False else: ret = self.storage_node__set(sub_command, args) + elif sub_command in ['new-device-from-failed']: + ret = self.storage_node__new_device_from_failed(sub_command, args) else: self.parser.print_help() @@ -1009,6 +1094,8 @@ def run(self): ret = self.cluster__set(sub_command, args) elif sub_command in ['change-name']: ret = self.cluster__change_name(sub_command, args) + elif sub_command in ['add-replication']: + ret = self.cluster__add_replication(sub_command, args) else: self.parser.print_help() @@ -1055,6 +1142,18 @@ def run(self): ret = self.volume__check(sub_command, args) elif sub_command in ['inflate']: ret = self.volume__inflate(sub_command, args) + elif sub_command in ['replication-start']: + ret = self.volume__replication_start(sub_command, args) + elif sub_command in ['replication-stop']: + ret = self.volume__replication_stop(sub_command, args) + elif sub_command in ['replication-status']: + ret = self.volume__replication_status(sub_command, args) + elif sub_command in ['replication-trigger']: + ret = self.volume__replication_trigger(sub_command, args) + elif sub_command in ['suspend']: + ret = self.volume__suspend(sub_command, args) + elif sub_command in ['resume']: + ret = self.volume__resume(sub_command, args) else: self.parser.print_help() @@ -1100,8 +1199,22 @@ def run(self): ret = self.snapshot__list(sub_command, args) elif sub_command in 
['delete']: ret = self.snapshot__delete(sub_command, args) + elif sub_command in ['check']: + ret = self.snapshot__check(sub_command, args) elif sub_command in ['clone']: ret = self.snapshot__clone(sub_command, args) + elif sub_command in ['replication-status']: + ret = self.snapshot__replication_status(sub_command, args) + elif sub_command in ['delete-replication-only']: + ret = self.snapshot__delete_replication_only(sub_command, args) + elif sub_command in ['get']: + ret = self.snapshot__get(sub_command, args) + elif sub_command in ['set']: + if not self.developer_mode: + print("This command is private.") + ret = False + else: + ret = self.snapshot__set(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 834dd2bab..2603bc574 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -87,8 +87,7 @@ def storage_node__configure_upgrade(self, sub_command, args): def storage_node__configure(self, sub_command, args): if not args.max_lvol: self.parser.error(f"Mandatory argument '--max-lvol' not provided for {sub_command}") - if not args.max_prov: - self.parser.error(f"Mandatory argument '--max-size' not provided for {sub_command}") + max_size = getattr(args, "max_prov") or 0 sockets_to_use = [0] if args.sockets_to_use: try: @@ -101,21 +100,41 @@ def storage_node__configure(self, sub_command, args): self.parser.error(f"nodes_per_socket {args.nodes_per_socket}must be either 1 or 2") if args.pci_allowed and args.pci_blocked: self.parser.error("pci-allowed and pci-blocked cannot be both specified") - max_prov = utils.parse_size(args.max_prov, assume_unit='G') + max_prov = utils.parse_size(max_size, assume_unit='G') pci_allowed = [] pci_blocked = [] + nvme_names = [] if args.pci_allowed: pci_allowed = [str(x) for x in args.pci_allowed.split(',')] if args.pci_blocked: pci_blocked = [str(x) for x in args.pci_blocked.split(',')] - - return 
storage_ops.generate_automated_deployment_config(args.max_lvol, max_prov, sockets_to_use, - args.nodes_per_socket, pci_allowed, pci_blocked) + if (args.device_model and not args.size_range) or (not args.device_model and args.size_range): + self.parser.error("device_model and size_range must be set together") + if args.nvme_names: + nvme_names = [str(x) for x in args.nvme_names.split(',')] + use_pci_allowed = bool(args.pci_allowed) + use_pci_blocked = bool(args.pci_blocked) + use_model_range = bool(args.device_model and args.size_range) + if sum([use_pci_allowed, use_pci_blocked, use_model_range]) > 1: + self.parser.error( + "Options --pci-allowed, --pci-blocked, and " + "(--device-model with --size-range) are mutually exclusive; choose only one." + ) + cores_percentage = int(args.cores_percentage) + + return storage_ops.generate_automated_deployment_config( + args.max_lvol, max_prov, sockets_to_use,args.nodes_per_socket, + pci_allowed, pci_blocked, force=args.force, device_model=args.device_model, + size_range=args.size_range, cores_percentage=cores_percentage, nvme_names=nvme_names) def storage_node__deploy_cleaner(self, sub_command, args): storage_ops.deploy_cleaner() return True # remove once CLI changed to exceptions + def storage_node__clean_devices(self, sub_command, args): + storage_ops.clean_devices(args.config_path) + return True # remove once CLI changed to exceptions + def storage_node__add_node(self, sub_command, args): cluster_id = args.cluster_id node_addr = args.node_addr @@ -134,26 +153,31 @@ def storage_node__add_node(self, sub_command, args): enable_ha_jm = args.enable_ha_jm namespace = args.namespace ha_jm_count = args.ha_jm_count - - out = storage_ops.add_node( - cluster_id=cluster_id, - node_addr=node_addr, - iface_name=ifname, - data_nics_list=data_nics, - max_snap=max_snap, - spdk_image=spdk_image, - spdk_debug=spdk_debug, - small_bufsize=small_bufsize, - large_bufsize=large_bufsize, - num_partitions_per_dev=num_partitions_per_dev, - 
jm_percent=jm_percent, - enable_test_device=enable_test_device, - namespace=namespace, - enable_ha_jm=enable_ha_jm, - id_device_by_nqn=args.id_device_by_nqn, - partition_size=args.partition_size, - ha_jm_count=ha_jm_count, - ) + format_4k = args.format_4k + try: + out = storage_ops.add_node( + cluster_id=cluster_id, + node_addr=node_addr, + iface_name=ifname, + data_nics_list=data_nics, + max_snap=max_snap, + spdk_image=spdk_image, + spdk_debug=spdk_debug, + small_bufsize=small_bufsize, + large_bufsize=large_bufsize, + num_partitions_per_dev=num_partitions_per_dev, + jm_percent=jm_percent, + enable_test_device=enable_test_device, + namespace=namespace, + enable_ha_jm=enable_ha_jm, + id_device_by_nqn=args.id_device_by_nqn, + partition_size=args.partition_size, + ha_jm_count=ha_jm_count, + format_4k=format_4k + ) + except Exception as e: + print(e) + return False return out @@ -184,11 +208,15 @@ def storage_node__restart(self, sub_command, args): large_bufsize = args.large_bufsize ssd_pcie = args.ssd_pcie - return storage_ops.restart_storage_node( - node_id, max_lvol, max_snap, max_prov, - spdk_image, spdk_debug, - small_bufsize, large_bufsize, node_ip=args.node_ip, reattach_volume=reattach_volume, force=args.force, - new_ssd_pcie=ssd_pcie, force_lvol_recreate=args.force_lvol_recreate) + try: + return storage_ops.restart_storage_node( + node_id, max_lvol, max_snap, max_prov, + spdk_image, spdk_debug, + small_bufsize, large_bufsize, node_ip=args.node_ip, reattach_volume=reattach_volume, force=args.force, + new_ssd_pcie=ssd_pcie, force_lvol_recreate=args.force_lvol_recreate) + except Exception as e: + print(e) + return False def storage_node__shutdown(self, sub_command, args): return storage_ops.shutdown_storage_node(args.node_id, args.force) @@ -233,7 +261,7 @@ def storage_node__reset_device(self, sub_command, args): return device_controller.reset_storage_device(args.device_id) def storage_node__restart_device(self, sub_command, args): - return 
device_controller.restart_device(args.device_id) + return device_controller.restart_device(args.device_id, args.force) def storage_node__add_device(self, sub_command, args): return device_controller.add_device(args.device_id) @@ -292,7 +320,7 @@ def storage_node__remove_jm_device(self, sub_command, args): return device_controller.remove_jm_device(args.jm_device_id, args.force) def storage_node__restart_jm_device(self, sub_command, args): - return device_controller.restart_jm_device(args.jm_device_id, args.force) + return device_controller.restart_jm_device(args.jm_device_id, args.force, args.format) def storage_node__send_cluster_map(self, sub_command, args): node_id = args.node_id @@ -310,6 +338,9 @@ def storage_node__dump_lvstore(self, sub_command, args): node_id = args.node_id return storage_ops.dump_lvstore(node_id) + def storage_node__new_device_from_failed(self, sub_command, args): + return device_controller.new_device_from_failed(args.device_id) + def storage_node__set(self, sub_command, args): return storage_ops.set_value(args.node_id, args.attr_name, args.attr_value) @@ -447,6 +478,9 @@ def cluster__complete_expand(self, sub_command, args): cluster_ops.cluster_expand(args.cluster_id) return True + def cluster__add_replication(self, sub_command, args): + return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout, args.target_pool) + def volume__add(self, sub_command, args): name = args.name size = args.size @@ -474,7 +508,8 @@ def volume__add(self, sub_command, args): crypto_key2=args.crypto_key2, lvol_priority_class=lvol_priority_class, uid=args.uid, pvc_name=args.pvc_name, namespace=args.namespace, - max_namespace_per_subsys=args.max_namespace_per_subsys, ndcs=ndcs, npcs=npcs, fabric=args.fabric) + max_namespace_per_subsys=args.max_namespace_per_subsys, ndcs=ndcs, npcs=npcs, fabric=args.fabric, + do_replicate=args.replicate) if results: return results else: @@ -555,6 +590,24 @@ def volume__check(self, sub_command, args): def 
volume__inflate(self, sub_command, args): return lvol_controller.inflate_lvol(args.volume_id) + def volume__replication_start(self, sub_command, args): + return lvol_controller.replication_start(args.lvol_id, args.replication_cluster_id) + + def volume__replication_stop(self, sub_command, args): + return lvol_controller.replication_stop(args.lvol_id) + + def volume__replication_status(self, sub_command, args): + return snapshot_controller.list_replication_tasks(args.cluster_id) + + def volume__replication_trigger(self, sub_command, args): + return lvol_controller.replication_trigger(args.lvol_id) + + def volume__suspend(self, sub_command, args): + return lvol_controller.suspend_lvol(args.lvol_id) + + def volume__resume(self, sub_command, args): + return lvol_controller.resume_lvol(args.lvol_id) + def control_plane__add(self, sub_command, args): cluster_id = args.cluster_id cluster_ip = args.cluster_ip @@ -623,16 +676,31 @@ def snapshot__add(self, sub_command, args): return snapshot_id if not error else error def snapshot__list(self, sub_command, args): - return snapshot_controller.list(args.all) + return snapshot_controller.list(args.all, args.cluster_id, args.with_details) def snapshot__delete(self, sub_command, args): return snapshot_controller.delete(args.snapshot_id, args.force) + def snapshot__check(self, sub_command, args): + return health_controller.check_snap(args.snapshot_id) + def snapshot__clone(self, sub_command, args): new_size = args.resize - success, details = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) - return details + clone_id, error = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) + return clone_id if not error else error + + def snapshot__replication_status(self, sub_command, args): + return snapshot_controller.list_replication_tasks(args.cluster_id) + + def snapshot__delete_replication_only(self, sub_command, args): + return snapshot_controller.delete_replicated(args.snapshot_id) + + def 
snapshot__get(self, sub_command, args): + return snapshot_controller.get(args.snapshot_id) + + def snapshot__set(self, sub_command, args): + return snapshot_controller.set(args.snapshot_id, args.attr_name, args.attr_value) def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_cli/scripts/cli-wrapper.jinja2 b/simplyblock_cli/scripts/cli-wrapper.jinja2 index 423b11992..80932e582 100644 --- a/simplyblock_cli/scripts/cli-wrapper.jinja2 +++ b/simplyblock_cli/scripts/cli-wrapper.jinja2 @@ -5,7 +5,7 @@ import logging import sys import traceback -from simplyblock_cli.clibase import CLIWrapperBase, range_type, regex_type, size_type +from simplyblock_cli.clibase import CLIWrapperBase, range_type, size_type from simplyblock_core import utils class CLIWrapper(CLIWrapperBase): diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 103123934..84c2c3442 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -15,7 +15,7 @@ from docker.errors import DockerException from simplyblock_core import utils, scripts, constants, mgmt_node_ops, storage_node_ops -from simplyblock_core.controllers import cluster_events, device_controller, qos_controller +from simplyblock_core.controllers import cluster_events, device_controller, qos_controller, tasks_controller from simplyblock_core.db_controller import DBController from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule @@ -25,6 +25,7 @@ from simplyblock_core.models.stats import LVolStatObject, ClusterStatObject, NodeStatObject, DeviceStatObject from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.utils import pull_docker_image_with_retry logger = utils.get_logger(__name__) @@ -79,7 +80,7 @@ def 
_create_update_user(cluster_id, grafana_url, grafana_secret, user_secret, up def _add_graylog_input(cluster_ip, password): - base_url = f"http://{cluster_ip}/graylog/api" + base_url = f"{cluster_ip}/api" input_url = f"{base_url}/system/inputs" retries = 30 @@ -160,7 +161,7 @@ def _add_graylog_input(cluster_ip, password): def _set_max_result_window(cluster_ip, max_window=100000): - url_existing_indices = f"http://{cluster_ip}/opensearch/_all/_settings" + url_existing_indices = f"{cluster_ip}/_all/_settings" retries = 30 reachable=False @@ -187,7 +188,7 @@ def _set_max_result_window(cluster_ip, max_window=100000): logger.error(f"Failed to update settings for existing indices: {response.text}") return False - url_template = f"http://{cluster_ip}/opensearch/_template/all_indices_template" + url_template = f"{cluster_ip}/_template/all_indices_template" payload_template = json.dumps({ "index_patterns": ["*"], "settings": { @@ -281,9 +282,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, if not dev_ip: raise ValueError("Error getting ip: For Kubernetes-based deployments, please supply --mgmt-ip.") - current_node = utils.get_node_name_by_ip(dev_ip) - utils.label_node_as_mgmt_plane(current_node) - if not cli_pass: cli_pass = utils.generate_string(10) @@ -315,12 +313,17 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] cluster.is_single_node = is_single_node - if grafana_endpoint: - cluster.grafana_endpoint = grafana_endpoint - elif ingress_host_source == "hostip": - cluster.grafana_endpoint = f"http://{dev_ip}/grafana" + + if ingress_host_source == "hostip": + base = dev_ip else: - cluster.grafana_endpoint = f"http://{dns_name}/grafana" + base = dns_name + + graylog_endpoint = f"http://{base}/graylog" + os_endpoint = f"http://{base}/opensearch" + default_grafana = f"http://{base}/grafana" + + cluster.grafana_endpoint = grafana_endpoint or default_grafana 
cluster.enable_node_affinity = enable_node_affinity cluster.qpair_count = qpair_count or constants.QPAIR_COUNT cluster.client_qpair_count = client_qpair_count or constants.CLIENT_QPAIR_COUNT @@ -331,6 +334,7 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.contact_point = contact_point cluster.disable_monitoring = disable_monitoring cluster.mode = mode + cluster.full_page_unmap = False if mode == "docker": if not disable_monitoring: @@ -350,20 +354,21 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, elif mode == "kubernetes": logger.info("Retrieving foundationdb connection string...") fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE) - db_connection = fdb_cluster_string + + logger.info("Patching prometheus configmap...") + utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) if not disable_monitoring: if ingress_host_source == "hostip": dns_name = dev_ip - _set_max_result_window(dns_name) - _add_graylog_input(dns_name, monitoring_secret) + _set_max_result_window(os_endpoint) + + _add_graylog_input(graylog_endpoint, monitoring_secret) _create_update_user(cluster.uuid, cluster.grafana_endpoint, monitoring_secret, cluster.secret) - if mode == "kubernetes": - utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) cluster.db_connection = db_connection cluster.status = Cluster.STATUS_UNREADY @@ -371,8 +376,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.write_to_db(db_controller.kv_store) - qos_controller.add_class("Default", 100, cluster.get_id()) - cluster_events.cluster_create(cluster) mgmt_node_ops.add_mgmt_node(dev_ip, mode, cluster.uuid) @@ -437,18 +440,24 @@ def _run_fio(mount_point) -> None: def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, - max_queue_size, inflight_io_threshold, 
strict_node_anti_affinity, is_single_node, name, fabric="tcp") -> str: + max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, + cr_namespace=None, cr_plural=None, fabric="tcp", cluster_ip=None, grafana_secret=None) -> str: + + default_cluster = None + monitoring_secret = os.environ.get("MONITORING_SECRET", "") + enable_monitoring = os.environ.get("ENABLE_MONITORING", "") clusters = db_controller.get_clusters() - if not clusters: - raise ValueError("No previous clusters found!") + if clusters: + default_cluster = clusters[0] + else: + logger.info("No previous clusters found") if distr_ndcs == 0 and distr_npcs == 0: raise ValueError("both distr_ndcs and distr_npcs cannot be 0") - monitoring_secret = os.environ.get("MONITORING_SECRET", "") - logger.info("Adding new cluster") + cluster = Cluster() cluster.uuid = str(uuid.uuid4()) cluster.cluster_name = name @@ -457,13 +466,40 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.nqn = f"{constants.CLUSTER_NQN}:{cluster.uuid}" cluster.secret = utils.generate_string(20) cluster.strict_node_anti_affinity = strict_node_anti_affinity + if default_cluster: + cluster.mode = default_cluster.mode + cluster.db_connection = default_cluster.db_connection + cluster.grafana_secret = grafana_secret if grafana_secret else default_cluster.grafana_secret + cluster.grafana_endpoint = default_cluster.grafana_endpoint + else: + # creating first cluster on k8s + cluster.mode = "kubernetes" + logger.info("Retrieving foundationdb connection string...") + fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE) + cluster.db_connection = fdb_cluster_string + if monitoring_secret: + cluster.grafana_secret = monitoring_secret + elif enable_monitoring != "true": + cluster.grafana_secret = "" + else: + raise Exception("monitoring_secret is required") + cluster.grafana_endpoint = constants.GRAFANA_K8S_ENDPOINT + if not 
cluster_ip: + cluster_ip = "0.0.0.0" + + # add mgmt node object + mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid) + if enable_monitoring == "true": + graylog_endpoint = constants.GRAYLOG_K8S_ENDPOINT + os_endpoint = constants.OS_K8S_ENDPOINT + _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) + + _set_max_result_window(os_endpoint) - default_cluster = clusters[0] - cluster.db_connection = default_cluster.db_connection - cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret - cluster.grafana_endpoint = default_cluster.grafana_endpoint + _add_graylog_input(graylog_endpoint, monitoring_secret) - _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) + if cluster.mode == "kubernetes": + utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) cluster.distr_ndcs = distr_ndcs cluster.distr_npcs = distr_npcs @@ -475,6 +511,10 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.qpair_count = qpair_count or constants.QPAIR_COUNT cluster.max_queue_size = max_queue_size cluster.inflight_io_threshold = inflight_io_threshold + cluster.cr_name = cr_name + cluster.cr_namespace = cr_namespace + cluster.cr_plural = cr_plural + if cap_warn and cap_warn > 0: cluster.cap_warn = cap_warn if cap_crit and cap_crit > 0: @@ -486,12 +526,12 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn protocols = parse_protocols(fabric) cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] + cluster.full_page_unmap = False cluster.status = Cluster.STATUS_UNREADY cluster.create_dt = str(datetime.datetime.now()) cluster.write_to_db(db_controller.kv_store) cluster_events.cluster_create(cluster) - qos_controller.add_class("Default", 100, cluster.get_id()) return cluster.get_id() @@ -597,9 +637,8 @@ def cluster_activate(cl_id, 
force=False, force_lvstore_create=False) -> None: snode.lvstore_status = "failed" snode.write_to_db() logger.error(f"Failed to restore lvstore on node {snode.get_id()}") - if not force: - set_cluster_status(cl_id, ols_status) - raise ValueError("Failed to activate cluster") + set_cluster_status(cl_id, ols_status) + raise ValueError("Failed to activate cluster") snodes = db_controller.get_storage_nodes_by_cluster_id(cl_id) for snode in snodes: @@ -621,10 +660,8 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: snode.lvstore_status = "failed" snode.write_to_db() logger.error(f"Failed to restore lvstore on node {snode.get_id()}") - if not force: - logger.error("Failed to activate cluster") - set_cluster_status(cl_id, ols_status) - raise ValueError("Failed to activate cluster") + set_cluster_status(cl_id, ols_status) + raise ValueError("Failed to activate cluster") # reorder qos classes ids qos_classes = db_controller.get_qos(cl_id) @@ -645,6 +682,15 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: if not ret: logger.error(f"Failed to set Alcemls QOS on node: {node.get_id()}") + # Start JC compression on each node + if ols_status == Cluster.STATUS_UNREADY: + for node in db_controller.get_storage_nodes_by_cluster_id(cl_id): + if node.status == StorageNode.STATUS_ONLINE: + ret, err = node.rpc_client().jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if not ret: + logger.info("Failed to resume JC compression adding task...") + tasks_controller.add_jc_comp_resume_task(node.cluster_id, node.get_id(), jm_vuid=node.jm_vuid) + if not cluster.cluster_max_size: cluster = db_controller.get_cluster_by_id(cl_id) cluster.cluster_max_size = max_size @@ -792,6 +838,7 @@ def list() -> t.List[dict]: "#storage": len(st), "Mod": f"{cl.distr_ndcs}x{cl.distr_npcs}", "Status": status.upper(), + "Replicate": cl.snapshot_replication_target_cluster, }) return data @@ -1002,16 +1049,11 @@ def list_all_info(cluster_id) -> 
str: def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]: - cluster = db_controller.get_cluster_by_id(cluster_id) - - if history: - records_number = utils.parse_history_param(history) - if not records_number: - raise ValueError(f"Error parsing history string: {history}") - else: - records_number = 20 - - records = db_controller.get_cluster_capacity(cluster, records_number) + try: + _ = db_controller.get_cluster_by_id(cluster_id) + except KeyError: + logger.error(f"Cluster not found: {cluster_id}") + return [] cap_stats_keys = [ "date", @@ -1022,20 +1064,17 @@ def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]: "size_util", "size_prov_util", ] + prom_client = PromClient(cluster_id) + records = prom_client.get_cluster_metrics(cluster_id, cap_stats_keys, history) return utils.process_records(records, records_count, keys=cap_stats_keys) def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]: - cluster = db_controller.get_cluster_by_id(cluster_id) - - if history_string: - records_number = utils.parse_history_param(history_string) - if not records_number: - raise ValueError(f"Error parsing history string: {history_string}") - else: - records_number = 20 - - records = db_controller.get_cluster_stats(cluster, records_number) + try: + _ = db_controller.get_cluster_by_id(cluster_id) + except KeyError: + logger.error(f"Cluster not found: {cluster_id}") + return [] io_stats_keys = [ "date", @@ -1073,6 +1112,9 @@ def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes "write_latency_ticks", ] ) + + prom_client = PromClient(cluster_id) + records = prom_client.get_cluster_metrics(cluster_id, io_stats_keys, history_string) # combine records return utils.process_records(records, records_count, keys=io_stats_keys) @@ -1137,6 +1179,7 @@ def get_logs(cluster_id, limit=50, **kwargs) -> t.List[dict]: if record.event in ["device_status", "node_status"]: msg = msg+f" 
({record.count})" + logger.debug(record) out.append({ "Date": record.get_date_string(), "NodeId": record.node_id, @@ -1159,10 +1202,6 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, logger.info("Updating mgmt cluster") if cluster.mode == "docker": - sbcli=constants.SIMPLY_BLOCK_CLI_NAME - subprocess.check_call(f"pip install {sbcli} --upgrade".split(' ')) - logger.info(f"{sbcli} upgraded") - cluster_docker = utils.get_docker_client(cluster_id) logger.info(f"Pulling image {constants.SIMPLY_BLOCK_DOCKER_IMAGE}") pull_docker_image_with_retry(cluster_docker, constants.SIMPLY_BLOCK_DOCKER_IMAGE) @@ -1176,37 +1215,52 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, for service in cluster_docker.services.list(): if image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \ "simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']: - logger.info(f"Updating service {service.name}") - service.update(image=service_image, force_update=True) - service_names.append(service.attrs['Spec']['Name']) + if service.name in ["app_CachingNodeMonitor", "app_CachedLVolStatsCollector"]: + logger.info(f"Removing service {service.name}") + service.remove() + else: + logger.info(f"Updating service {service.name}") + service.update(image=service_image, force_update=True) + service_names.append(service.attrs['Spec']['Name']) if "app_SnapshotMonitor" not in service_names: - logger.info("Creating snapshot monitor service") - cluster_docker.services.create( - image=service_image, - command="python simplyblock_core/services/snapshot_monitor.py", - name="app_SnapshotMonitor", - mounts=["/etc/foundationdb:/etc/foundationdb"], - env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], - networks=["host"], - constraints=["node.role == manager"] - ) + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_SnapshotMonitor", + service_file="python simplyblock_core/services/snapshot_monitor.py", + 
service_image=service_image) + + if "app_TasksRunnerLVolSyncDelete" not in service_names: + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_TasksRunnerLVolSyncDelete", + service_file="python simplyblock_core/services/tasks_runner_sync_lvol_del.py", + service_image=service_image) + + if "app_TasksRunnerJCCompResume" not in service_names: + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_TasksRunnerJCCompResume", + service_file="python simplyblock_core/services/tasks_runner_jc_comp.py", + service_image=service_image) + logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": utils.load_kube_config_with_fallback() apps_v1 = k8s_client.AppsV1Api() - + namespace = constants.K8S_NAMESPACE image_without_tag = constants.SIMPLY_BLOCK_DOCKER_IMAGE.split(":")[0] image_parts = "/".join(image_without_tag.split("/")[-2:]) service_image = mgmt_image or constants.SIMPLY_BLOCK_DOCKER_IMAGE - + deployment_names = [] # Update Deployments - deployments = apps_v1.list_namespaced_deployment(namespace=constants.K8S_NAMESPACE) + deployments = apps_v1.list_namespaced_deployment(namespace=namespace) for deploy in deployments.items: if deploy.metadata.name == constants.ADMIN_DEPLOY_NAME: logger.info(f"Skipping deployment {deploy.metadata.name}") continue + deployment_names.append(deploy.metadata.name) for c in deploy.spec.template.spec.containers: if image_parts in c.image: logger.info(f"Updating deployment {deploy.metadata.name} image to {service_image}") @@ -1216,12 +1270,28 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, deploy.spec.template.metadata.annotations = annotations apps_v1.patch_namespaced_deployment( name=deploy.metadata.name, - namespace=constants.K8S_NAMESPACE, + namespace=namespace, body={"spec": {"template": deploy.spec.template}} ) + if "simplyblock-tasks-runner-sync-lvol-del" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, 
+ deployment_name="simplyblock-tasks-runner-sync-lvol-del", + container_name="tasks-runner-sync-lvol-del", + service_file="simplyblock_core/services/tasks_runner_sync_lvol_del.py", + container_image=service_image) + + if "simplyblock-snapshot-monitor" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, + deployment_name="simplyblock-snapshot-monitor", + container_name="snapshot-monitor", + service_file="simplyblock_core/services/snapshot_monitor.py", + container_image=service_image) + # Update DaemonSets - daemonsets = apps_v1.list_namespaced_daemon_set(namespace=constants.K8S_NAMESPACE) + daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace) for ds in daemonsets.items: for c in ds.spec.template.spec.containers: if image_parts in c.image: @@ -1232,7 +1302,7 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, ds.spec.template.metadata.annotations = annotations apps_v1.patch_namespaced_daemon_set( name=ds.metadata.name, - namespace=constants.K8S_NAMESPACE, + namespace=namespace, body={"spec": {"template": ds.spec.template}} ) @@ -1270,7 +1340,12 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, logger.info(f"Restarting node: {node.get_id()} with SPDK image: {spdk_image}") else: logger.info(f"Restarting node: {node.get_id()}") - storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image) + try: + storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image) + except Exception as e: + logger.debug(e) + logger.error(f"Failed to restart node: {node.get_id()}") + return logger.info("Done") @@ -1329,3 +1404,30 @@ def set(cl_id, attr, value) -> None: logger.info(f"Setting {attr} to {value}") setattr(cluster, attr, value) cluster.write_to_db() + + +def add_replication(source_cl_id, target_cl_id, timeout=0, target_pool=None) -> bool: + db_controller = DBController() + cluster = 
db_controller.get_cluster_by_id(source_cl_id) + if not cluster: + raise ValueError(f"Cluster not found: {source_cl_id}") + + target_cluster = db_controller.get_cluster_by_id(target_cl_id) + if not target_cluster: + raise ValueError(f"Target cluster not found: {target_cl_id}") + + logger.info("Updating Cluster replication target") + cluster.snapshot_replication_target_cluster = target_cl_id + if target_pool: + pool = db_controller.get_pool_by_id(target_pool) + if not pool: + raise ValueError(f"Pool not found: {target_pool}") + if pool.status != Pool.STATUS_ACTIVE: + raise ValueError(f"Pool not active: {target_pool}") + cluster.snapshot_replication_target_pool = target_pool + + if timeout and timeout > 0: + cluster.snapshot_replication_timeout = timeout + cluster.write_to_db() + logger.info("Done") + return True diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index 41824c73a..23cb100d8 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -27,7 +27,6 @@ def get_config_var(name, default=None): KVD_DB_FILE_PATH = os.getenv('FDB_CLUSTER_FILE', '/etc/foundationdb/fdb.cluster') KVD_DB_TIMEOUT_MS = 10000 SPK_DIR = '/home/ec2-user/spdk' -RPC_HTTP_PROXY_PORT = 8080 LOG_LEVEL = logging.INFO LOG_WEB_LEVEL = logging.DEBUG LOG_WEB_DEBUG = True if LOG_WEB_LEVEL == logging.DEBUG else False @@ -93,7 +92,7 @@ def get_config_var(name, default=None): MIN_SYS_MEMORY_FOR_LVOL = 524288000 EXTRA_SMALL_POOL_COUNT = 4096 EXTRA_LARGE_POOL_COUNT = 10240 -EXTRA_HUGE_PAGE_MEMORY = 1147483648 +EXTRA_HUGE_PAGE_MEMORY = 3221225472 EXTRA_SYS_MEMORY = 0.10 INSTANCE_STORAGE_DATA = { @@ -133,12 +132,10 @@ def get_config_var(name, default=None): LVOL_NVME_CONNECT_NR_IO_QUEUES=3 LVOL_NVME_KEEP_ALIVE_TO=10 LVOL_NVME_KEEP_ALIVE_TO_TCP=7 -LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100)) QPAIR_COUNT=32 CLIENT_QPAIR_COUNT=3 NVME_TIMEOUT_US=8000000 NVMF_MAX_SUBSYSTEMS=50000 -HA_JM_COUNT=3 KATO=10000 ACK_TO=11 BDEV_RETRY=0 @@ -157,15 
+154,22 @@ def get_config_var(name, default=None): LINUX_DRV_MASS_STORAGE_ID = 1 LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID = 8 -NODE_NVMF_PORT_START=9060 -NODE_HUBLVOL_PORT_START=9030 NODES_CONFIG_FILE = "/etc/simplyblock/sn_config_file" SYSTEM_INFO_FILE = "/etc/simplyblock/system_info" LVO_MAX_NAMESPACES_PER_SUBSYS=32 +CR_GROUP = "simplyblock.simplyblock.io" +CR_VERSION = "v1alpha1" + +GRAFANA_K8S_ENDPOINT = "http://simplyblock-grafana:3000" +GRAYLOG_K8S_ENDPOINT = "http://simplyblock-graylog:9000" +OS_K8S_ENDPOINT = "http://opensearch-cluster-master:9200" + +WEBAPI_K8S_ENDPOINT = "http://simplyblock-webappapi:5000/api/v2" + K8S_NAMESPACE = os.getenv('K8S_NAMESPACE', 'simplyblock') OS_STATEFULSET_NAME = "simplyblock-opensearch" MONGODB_STATEFULSET_NAME = "simplyblock-mongo" @@ -224,4 +228,14 @@ def get_config_var(name, default=None): qos_class_meta_and_migration_weight_percent = 25 -MIG_PARALLEL_JOBS = 16 \ No newline at end of file +MIG_PARALLEL_JOBS = 64 +MIG_JOB_SIZE = 64 + +# ports ranges +RPC_PORT_RANGE_START = 8080 +NODE_NVMF_PORT_START=9060 +NODE_HUBLVOL_PORT_START=9030 +FW_PORT_START = 50001 +# todo(hamdy): make it configurable: sfam-2586 +LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "") +LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100 \ No newline at end of file diff --git a/simplyblock_core/controllers/cluster_events.py b/simplyblock_core/controllers/cluster_events.py index e8e6c406e..e201c53a9 100644 --- a/simplyblock_core/controllers/cluster_events.py +++ b/simplyblock_core/controllers/cluster_events.py @@ -4,6 +4,7 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController from simplyblock_core.models.events import EventObj +from simplyblock_core import utils, constants logger = logging.getLogger() db_controller = DBController() @@ -39,6 +40,15 @@ def cluster_status_change(cluster, new_state, old_status): caused_by=ec.CAUSED_BY_CLI, message=f"Cluster 
status changed from {old_status} to {new_state}") + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=cluster.cr_plural, + namespace=cluster.cr_namespace, + name=cluster.cr_name, + status_patch={"status": new_state}) + def _cluster_cap_event(cluster, msg, event_level): return ec.log_event_cluster( @@ -80,3 +90,21 @@ def cluster_delete(cluster): db_object=cluster, caused_by=ec.CAUSED_BY_CLI, message=f"Cluster deleted {cluster.get_id()}") + + +def cluster_rebalancing_change(cluster, new_state, old_status): + ec.log_event_cluster( + cluster_id=cluster.get_id(), + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=cluster, + caused_by=ec.CAUSED_BY_CLI, + message=f"Cluster rebalancing changed from {old_status} to {new_state}") + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=cluster.cr_plural, + namespace=cluster.cr_namespace, + name=cluster.cr_name, + status_patch={"rebalancing": new_state}) diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py index 8e684c942..b51801302 100644 --- a/simplyblock_core/controllers/device_controller.py +++ b/simplyblock_core/controllers/device_controller.py @@ -1,13 +1,15 @@ import time import logging +import uuid from simplyblock_core import distr_controller, utils, storage_node_ops from simplyblock_core.controllers import device_events, tasks_controller from simplyblock_core.db_controller import DBController from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient - +from simplyblock_core.snode_client import SNodeClient logger = logging.getLogger() @@ -68,7 +70,9 @@ def device_set_state(device_id, state): for node in snodes: if 
node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: continue - node.remote_devices = storage_node_ops._connect_to_remote_devs(node) + remote_devices = storage_node_ops._connect_to_remote_devs(node) + node = db_controller.get_storage_node_by_id(node.get_id()) + node.remote_devices = remote_devices node.write_to_db() distr_controller.send_dev_status_event(device, device.status) @@ -121,7 +125,7 @@ def get_alceml_name(alceml_id): return f"alceml_{alceml_id}" -def _def_create_device_stack(device_obj, snode, force=False): +def _def_create_device_stack(device_obj, snode, force=False, clear_data=False): db_controller = DBController() rpc_client = RPCClient( @@ -155,7 +159,7 @@ def _def_create_device_stack(device_obj, snode, force=False): if alceml_name not in bdev_names: ret = snode.create_alceml( alceml_name, nvme_bdev, alceml_id, - pba_init_mode=2, + pba_init_mode=3 if clear_data else 2, write_protection=cluster.distr_ndcs > 1, pba_page_size=cluster.page_size_in_blocks, full_page_unmap=cluster.full_page_unmap @@ -240,6 +244,10 @@ def restart_device(device_id, force=False): device_obj = dev break + if not device_obj: + logger.error("device not found") + return False + task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: logger.error(f"Restart task found: {task_id}, can not restart device") @@ -250,6 +258,17 @@ def restart_device(device_id, force=False): device_set_retries_exhausted(device_id, True) device_set_unavailable(device_id) + if not snode.rpc_client().bdev_nvme_controller_list(device_obj.nvme_controller): + try: + ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_spdk(device_obj.pcie_address) + logger.debug(ret) + snode.rpc_client().bdev_nvme_controller_attach(device_obj.nvme_controller, device_obj.pcie_address) + snode.rpc_client().bdev_examine(f"{device_obj.nvme_controller}n1") + snode.rpc_client().bdev_wait_for_examine() + except Exception as e: + logger.error(e) + 
return False + ret = _def_create_device_stack(device_obj, snode, force=force) if not ret: @@ -263,22 +282,33 @@ def restart_device(device_id, force=False): device_set_online(device_id) device_events.device_restarted(device_obj) - # add to jm raid - if snode.jm_device and snode.jm_device.raid_bdev and snode.jm_device.status != JMDevice.STATUS_REMOVED: - # looking for jm partition - rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - jm_dev_part = f"{dev.nvme_bdev[:-1]}1" - ret = rpc_client.get_bdevs(jm_dev_part) - if ret: - logger.info(f"JM part found: {jm_dev_part}") + if snode.jm_device and snode.jm_device.status != JMDevice.STATUS_REMOVED: + if not snode.jm_device.raid_bdev: if snode.jm_device.status == JMDevice.STATUS_UNAVAILABLE: - restart_jm_device(snode.jm_device.get_id(), force=True) - - if snode.jm_device.status == JMDevice.STATUS_ONLINE and \ - jm_dev_part not in snode.jm_device.jm_nvme_bdev_list: - remove_jm_device(snode.jm_device.get_id(), force=True) - time.sleep(3) - restart_jm_device(snode.jm_device.get_id(), force=True) + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) + else: + # looking for jm partition + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) + jm_dev_part = f"{dev.nvme_bdev[:-1]}1" + ret = rpc_client.get_bdevs(jm_dev_part) + if ret: + logger.info(f"JM part found: {jm_dev_part}") + if snode.jm_device.status == JMDevice.STATUS_UNAVAILABLE: + if snode.rpc_client().get_bdevs(snode.jm_device.raid_bdev): + logger.info("Raid found, setting jm device online") + ret = snode.rpc_client().bdev_raid_get_bdevs() + has_bdev = any( + bdev["name"] == jm_dev_part + for raid in ret + for bdev in raid.get("base_bdevs_list", []) + ) + if not has_bdev: + logger.info(f"Adding to raid: {jm_dev_part}") + snode.rpc_client().bdev_raid_add_base_bdev(snode.jm_device.raid_bdev, jm_dev_part) + set_jm_device_state(snode.jm_device.get_id(), 
JMDevice.STATUS_ONLINE) + else: + logger.info("Raid not found, restarting jm device") + restart_jm_device(snode.jm_device.get_id(), force=True) return "Done" @@ -337,15 +367,25 @@ def device_remove(device_id, force=True): logger.error(e) return False + device = None for dev in snode.nvme_devices: if dev.get_id() == device_id: device = dev break - if device.status in [NVMeDevice.STATUS_REMOVED, NVMeDevice.STATUS_FAILED]: - logger.error(f"Unsupported device status: {device.status}") + if not device: + logger.error("device not found") return False + if device.status == NVMeDevice.STATUS_REMOVED: + return True + + if device.status in [NVMeDevice.STATUS_FAILED, NVMeDevice.STATUS_FAILED_AND_MIGRATED, + NVMeDevice.STATUS_NEW]: + logger.error(f"Unsupported device status: {device.status}") + if force is False: + return False + task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: logger.error(f"Restart task found: {task_id}, can not remove device") @@ -359,33 +399,46 @@ def device_remove(device_id, force=True): distr_controller.disconnect_device(device) logger.info("Removing device fabric") - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password) + rpc_client = snode.rpc_client() + node_bdev = {} + ret = rpc_client.get_bdevs() + if ret: + for b in ret: + node_bdev[b['name']] = b + for al in b['aliases']: + node_bdev[al] = b + + if rpc_client.subsystem_list(device.nvmf_nqn): + logger.info("Removing device subsystem") + ret = rpc_client.subsystem_delete(device.nvmf_nqn) + if not ret: + logger.error(f"Failed to remove subsystem: {device.nvmf_nqn}") + if not force: + return False - ret = rpc_client.subsystem_delete(device.nvmf_nqn) - if not ret: - logger.error(f"Failed to remove subsystem: {device.nvmf_nqn}") - if not force: - return False + if f"{device.alceml_bdev}_PT" in node_bdev or force: + logger.info("Removing device PT") + ret = 
rpc_client.bdev_PT_NoExcl_delete(f"{device.alceml_bdev}_PT") + if not ret: + logger.error(f"Failed to remove bdev: {device.alceml_bdev}_PT") + if not force: + return False - logger.info("Removing device bdevs") - ret = rpc_client.bdev_PT_NoExcl_delete(f"{device.alceml_bdev}_PT") - if not ret: - logger.error(f"Failed to remove bdev: {device.alceml_bdev}_PT") - if not force: - return False - ret = rpc_client.bdev_alceml_delete(device.alceml_bdev) - if not ret: - logger.error(f"Failed to remove bdev: {device.alceml_bdev}") - if not force: - return False - ret = rpc_client.qos_vbdev_delete(device.qos_bdev) - if not ret: - logger.error(f"Failed to remove bdev: {device.qos_bdev}") - if not force: - return False - if snode.enable_test_device: + if device.alceml_bdev in node_bdev or force: + ret = rpc_client.bdev_alceml_delete(device.alceml_bdev) + if not ret: + logger.error(f"Failed to remove bdev: {device.alceml_bdev}") + if not force: + return False + + if device.qos_bdev in node_bdev or force: + ret = rpc_client.qos_vbdev_delete(device.qos_bdev) + if not ret: + logger.error(f"Failed to remove bdev: {device.qos_bdev}") + if not force: + return False + + if snode.enable_test_device and device.testing_bdev in node_bdev or force: ret = rpc_client.bdev_passtest_delete(device.testing_bdev) if not ret: logger.error(f"Failed to remove bdev: {device.testing_bdev}") @@ -394,8 +447,9 @@ def device_remove(device_id, force=True): device_set_state(device_id, NVMeDevice.STATUS_REMOVED) - # remove device from jm raid - if snode.jm_device.raid_bdev: + if not snode.jm_device.raid_bdev: + remove_jm_device(snode.jm_device.get_id()) + else: nvme_controller = device.nvme_controller dev_to_remove = None for part in snode.jm_device.jm_nvme_bdev_list: @@ -404,11 +458,49 @@ def device_remove(device_id, force=True): break if dev_to_remove: - if snode.jm_device.status == NVMeDevice.STATUS_ONLINE: - remove_jm_device(snode.jm_device.get_id(), force=True) - time.sleep(3) + raid_found = False + for 
raid_info in rpc_client.bdev_raid_get_bdevs(): + if raid_info["name"] == snode.jm_device.raid_bdev: + raid_found = True + base_bdevs = raid_info.get("base_bdevs_list", []) + if any(bdev["name"] == dev_to_remove for bdev in base_bdevs): + remove_from_jm_device(snode.jm_device.get_id(), dev_to_remove) + if not raid_found: + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) + + return True - restart_jm_device(snode.jm_device.get_id(), force=True) + +def remove_from_jm_device(device_id, jm_bdev): + db_controller = DBController() + + try: + snode = get_storage_node_by_jm_device(db_controller, device_id) + except KeyError as e: + logger.error(e) + return False + + if snode.status == StorageNode.STATUS_ONLINE: + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) + + if snode.jm_device.raid_bdev: + logger.info("device part of raid1: only remove from raid") + try: + has_any = False + for raid_info in rpc_client.bdev_raid_get_bdevs(): + if raid_info["name"] == snode.jm_device.raid_bdev: + base_bdevs = raid_info.get("base_bdevs_list", []) + if any(bdev["name"] and bdev["name"] != jm_bdev for bdev in base_bdevs): + has_any = True + if has_any: + rpc_client.bdev_raid_remove_base_bdev(jm_bdev) + return True + else: + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) + + except KeyError as e: + logger.error(e) + return False return True @@ -438,9 +530,9 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True): if not records_number: return False else: - records_number = 20 + records_number = records_count - records = db_controller.get_device_capacity(device, records_number) + # records = db_controller.get_device_capacity(device, records_number) cap_stats_keys = [ "date", "size_total", @@ -448,6 +540,8 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True): "size_free", "size_util", ] + prom_client = PromClient(device.cluster_id) + 
records = prom_client.get_device_metrics(device_id, cap_stats_keys, history) records_list = utils.process_records(records, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -474,15 +568,6 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True): logger.error("device not found") return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_device_stats(device, records_number) io_stats_keys = [ "date", "read_bytes", @@ -496,8 +581,10 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True): "write_io_ps", "write_latency_ps", ] + prom_client = PromClient(device.cluster_id) + records = prom_client.get_device_metrics(device_id, io_stats_keys, history) # combine records - new_records = utils.process_records(records_list, records_count, keys=io_stats_keys) + new_records = utils.process_records(records, records_count, keys=io_stats_keys) if not parse_sizes: return new_records @@ -591,14 +678,15 @@ def device_set_failed(device_id): logger.error(e) return False + if dev.status != NVMeDevice.STATUS_REMOVED: + logger.error(f"Device must be in removed status, current status: {dev.status}") + return False + task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: logger.error(f"Restart task found: {task_id}, can not fail device") return False - if dev.status == NVMeDevice.STATUS_FAILED: - return True - ret = device_set_state(device_id, NVMeDevice.STATUS_FAILED) if not ret: logger.warning("Failed to set device state to failed") @@ -608,6 +696,7 @@ def device_set_failed(device_id): rpc_client.distr_replace_id_in_map_prob(dev.cluster_device_order, -1) tasks_controller.add_device_failed_mig_task(device_id) + return True def add_device(device_id, add_migration_task=True): @@ -623,14 +712,18 @@ def 
add_device(device_id, add_migration_task=True): logger.error("Device must be in new state") return False + device_obj = None for dev in snode.nvme_devices: if dev.get_id() == device_id: device_obj = dev break + if not device_obj: + logger.error("device not found") + return False + logger.info(f"Adding device {device_id}") - # if snode.num_partitions_per_dev == 0 or device_obj.is_partition: - ret = _def_create_device_stack(device_obj, snode, force=True) + ret = _def_create_device_stack(device_obj, snode, force=True, clear_data=True) if not ret: logger.error("Failed to create device stack") return False @@ -657,81 +750,6 @@ def add_device(device_id, add_migration_task=True): tasks_controller.add_new_device_mig_task(device_id) return device_id - # - # # create partitions - # partitions = snode.num_partitions_per_dev - # rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - # # look for partitions - # partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj) - # logger.debug("partitioned_devices") - # logger.debug(partitioned_devices) - # if len(partitioned_devices) == partitions+1: - # logger.info("Partitioned devices found") - # else: - # logger.info(f"Creating partitions for {device_obj.nvme_bdev}") - # storage_node_ops._create_device_partitions(rpc_client, device_obj, snode, partitions, snode.jm_percent) - # partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj) - # if len(partitioned_devices) == partitions+1: - # logger.info("Device partitions created") - # else: - # logger.error("Failed to create partitions") - # return False - # - # jm_part = partitioned_devices.pop(0) - # new_devices = [] - # dev_order = storage_node_ops.get_next_cluster_device_order(db_controller, snode.cluster_id) - # for dev in partitioned_devices: - # new_device = storage_node_ops._create_storage_device_stack(rpc_client, dev, snode, after_restart=False) - # if not new_device: - # 
logger.error("failed to create dev stack") - # continue - # - # new_device.cluster_device_order = dev_order - # dev_order += 1 - # device_events.device_create(new_device) - # new_devices.append(new_device) - # - # if new_devices: - # snode.nvme_devices.remove(device_obj) - # snode.nvme_devices.extend(new_devices) - # snode.write_to_db(db_controller.kv_store) - # else: - # logger.error("failed to create devices") - # return False - # - # for dev in new_devices: - # distr_controller.send_cluster_map_add_device(dev, snode) - # - # logger.info("Make other nodes connect to the node devices") - # snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id) - # for node in snodes: - # if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: - # continue - # node.remote_devices = storage_node_ops._connect_to_remote_devs(node) - # node.write_to_db() - # for dev in new_devices: - # distr_controller.send_cluster_map_add_device(dev, node) - # - # for dev in new_devices: - # tasks_controller.add_new_device_mig_task(dev.get_id()) - # - # # add to jm raid - # if snode.jm_device and snode.jm_device.raid_bdev and jm_part: - # # looking for jm partition - # jm_dev_part = jm_part.nvme_bdev - # ret = rpc_client.get_bdevs(jm_dev_part) - # if ret: - # logger.info(f"JM part found: {jm_dev_part}") - # if snode.jm_device.status in [JMDevice.STATUS_UNAVAILABLE, JMDevice.STATUS_REMOVED]: - # restart_jm_device(snode.jm_device.get_id(), force=True, format_alceml=True) - # - # if snode.jm_device.status == JMDevice.STATUS_ONLINE and \ - # jm_dev_part not in snode.jm_device.jm_nvme_bdev_list: - # remove_jm_device(snode.jm_device.get_id(), force=True) - # restart_jm_device(snode.jm_device.get_id(), force=True) - # - # return "Done" - def device_set_failed_and_migrated(device_id): db_controller = DBController() @@ -924,3 +942,58 @@ def restart_jm_device(device_id, force=False, format_alceml=False): set_jm_device_state(snode.jm_device.get_id(), 
JMDevice.STATUS_ONLINE) return True + + +def new_device_from_failed(device_id): + db_controller = DBController() + device = None + device_node = None + for node in db_controller.get_storage_nodes(): + for dev in node.nvme_devices: + if dev.get_id() == device_id: + device = dev + device_node = node + break + + if not device: + logger.info(f"Device not found: {device_id}") + return False + + if not device_node: + logger.info("node not found") + return False + + if device.status != NVMeDevice.STATUS_FAILED_AND_MIGRATED: + logger.error(f"Device status: {device.status} but expected status is {NVMeDevice.STATUS_FAILED_AND_MIGRATED}") + return False + + if device.serial_number.endswith("_failed"): + logger.error("Device is already added back from failed") + return False + + if not device_node.rpc_client().bdev_nvme_controller_list(device.nvme_controller): + try: + ret = SNodeClient(device_node.api_endpoint, timeout=30, retry=1).bind_device_to_spdk(device.pcie_address) + logger.debug(ret) + device_node.rpc_client().bdev_nvme_controller_attach(device.nvme_controller, device.pcie_address) + except Exception as e: + logger.error(e) + return False + + if not device_node.rpc_client().bdev_nvme_controller_list(device.nvme_controller): + logger.error(f"Failed to find device nvme controller {device.nvme_controller}") + return False + + new_device = NVMeDevice(device.to_dict()) + new_device.uuid = str(uuid.uuid4()) + new_device.status = NVMeDevice.STATUS_NEW + new_device.cluster_device_order = -1 + new_device.deleted = False + new_device.io_error = False + new_device.retries_exhausted = False + device_node.nvme_devices.append(new_device) + + device.serial_number = f"{device.serial_number}_failed" + device_node.write_to_db(db_controller.kv_store) + logger.info(f"New device created from failed device: {device_id}, new device id: {new_device.get_id()}") + return new_device.get_id() \ No newline at end of file diff --git a/simplyblock_core/controllers/device_events.py 
b/simplyblock_core/controllers/device_events.py index f2e1e959d..1f5ee881a 100644 --- a/simplyblock_core/controllers/device_events.py +++ b/simplyblock_core/controllers/device_events.py @@ -3,6 +3,8 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController +from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -20,6 +22,24 @@ def _device_event(device, message, caused_by, event): node_id=device.get_id(), storage_id=device.cluster_device_order) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) + if cluster.mode == "kubernetes": + total_devices = len(snode.nvme_devices) + online_devices = 0 + for dev in snode.nvme_devices: + if dev.status == NVMeDevice.STATUS_ONLINE: + online_devices += 1 + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=snode.cr_plural, + namespace=snode.cr_namespace, + name=snode.cr_name, + node_uuid=snode.get_id(), + node_mgmt_ip=snode.mgmt_ip, + updates={"devices": f"{total_devices}/{online_devices}"}, + ) + def device_create(device, caused_by=ec.CAUSED_BY_CLI): _device_event(device, f"Device created: {device.get_id()}", caused_by, ec.EVENT_OBJ_CREATED) diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py index c013e2d58..fb0444348 100644 --- a/simplyblock_core/controllers/health_controller.py +++ b/simplyblock_core/controllers/health_controller.py @@ -9,7 +9,7 @@ from simplyblock_core.db_controller import DBController from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.models.cluster import Cluster -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.rpc_client import 
RPCClient from simplyblock_core.snode_client import SNodeClient @@ -18,7 +18,7 @@ logger = utils.get_logger(__name__) -def check_bdev(name, *, rpc_client=None, bdev_names=None): +def check_bdev(name, *, rpc_client=None, bdev_names=None) -> bool: present = ( ((bdev_names is not None) and (name in bdev_names)) or (rpc_client is not None and (rpc_client.get_bdevs(name) is not None)) @@ -27,7 +27,7 @@ def check_bdev(name, *, rpc_client=None, bdev_names=None): return present -def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None): +def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None) -> bool: if rpc_client: subsystem = subsystems[0] if (subsystems := rpc_client.subsystem_list(nqn)) is not None else None elif nqns: @@ -59,7 +59,7 @@ def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None): for listener in listeners: logger.info(f"Checking listener {listener['traddr']}:{listener['trsvcid']} ... ok") - return bool(listeners) and namespaces + return bool(listeners) and bool(namespaces) def check_cluster(cluster_id): @@ -109,15 +109,17 @@ def _check_node_rpc(rpc_ip, rpc_port, rpc_username, rpc_password, timeout=5, ret ret = rpc_client.get_version() if ret: logger.debug(f"SPDK version: {ret['version']}") - return True + return True, True + else: + return True, False except Exception as e: logger.debug(e) - return False + return False, False def _check_node_api(ip): try: - snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2) + snode_api = SNodeClient(f"{ip}:5000", timeout=90, retry=2) logger.debug(f"Node API={ip}:5000") ret, _ = snode_api.is_live() logger.debug(f"snode is alive: {ret}") @@ -128,43 +130,35 @@ def _check_node_api(ip): return False -def _check_spdk_process_up(ip, rpc_port): - try: - snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2) - logger.debug(f"Node API={ip}:5000") - is_up, _ = snode_api.spdk_process_is_up(rpc_port) - logger.debug(f"SPDK is {is_up}") - return is_up - except Exception as e: - 
logger.debug(e) - return False - - -def _check_port_on_node(snode, port_id): - try: - fw_api = FirewallClient(snode, timeout=5, retry=2) - iptables_command_output, _ = fw_api.get_firewall(snode.rpc_port) - if type(iptables_command_output) is str: - iptables_command_output = [iptables_command_output] - for rules in iptables_command_output: - result = jc.parse('iptables', rules) - for chain in result: - if chain['chain'] in ["INPUT", "OUTPUT"]: # type: ignore - for rule in chain['rules']: # type: ignore - if str(port_id) in rule['options']: # type: ignore - action = rule['target'] # type: ignore - if action in ["DROP"]: - return False - - # check RDMA port block - if snode.active_rdma: - rdma_fw_port_list = snode.rpc_client().nvmf_get_blocked_ports_rdma() - if port_id in rdma_fw_port_list: - return False +def _check_spdk_process_up(ip, rpc_port, cluster_id): + snode_api = SNodeClient(f"{ip}:5000", timeout=90, retry=2) + logger.debug(f"Node API={ip}:5000") + is_up, _ = snode_api.spdk_process_is_up(rpc_port, cluster_id) + logger.debug(f"SPDK is {is_up}") + return is_up + + +def check_port_on_node(snode, port_id): + fw_api = FirewallClient(snode, timeout=5, retry=2) + iptables_command_output, _ = fw_api.get_firewall(snode.rpc_port) + if type(iptables_command_output) is str: + iptables_command_output = [iptables_command_output] + for rules in iptables_command_output: + result = jc.parse('iptables', rules) + for chain in result: + if chain['chain'] in ["INPUT", "OUTPUT"]: # type: ignore + for rule in chain['rules']: # type: ignore + if str(port_id) in rule['options']: # type: ignore + action = rule['target'] # type: ignore + if action in ["DROP"]: + return False + + # check RDMA port block + if snode.active_rdma: + rdma_fw_port_list = snode.rpc_client().nvmf_get_blocked_ports_rdma() + if port_id in rdma_fw_port_list: + return False - return True - except Exception as e: - logger.error(e) return True @@ -175,7 +169,7 @@ def _check_node_ping(ip): else: return False -def 
_check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns=None): +def _check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns=None) -> bool: if not node.hublvol: logger.error(f"Node {node.get_id()} does not have a hublvol") return False @@ -235,15 +229,17 @@ def _check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns passed = False else: lvs_info_dict.append({"Key": k, "Value": v, "expected": " "}) - for line in utils.print_table(lvs_info_dict).splitlines(): - logger.info(line) + if not passed: + for line in utils.print_table(lvs_info_dict).splitlines(): + logger.info(line) except Exception as e: logger.exception(e) + return False return passed -def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=None, auto_fix=False): +def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=None, auto_fix=False) -> bool: db_controller = DBController() try: primary_node = db_controller.get_storage_node_by_id(node.lvstore_stack_secondary_1) @@ -294,6 +290,16 @@ def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=N passed = bool(ret) logger.info(f"Checking controller: {primary_node.hublvol.bdev_name} ... 
{passed}") + node_bdev = {} + ret = rpc_client.get_bdevs() + if ret: + for b in ret: + node_bdev[b['name']] = b + for al in b['aliases']: + node_bdev[al]= b + else: + node_bdev = [] + passed &= check_bdev(primary_node.hublvol.get_remote_bdev_name(), bdev_names=node_bdev) if not passed: return False @@ -331,20 +337,20 @@ def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=N else: lvs_info_dict.append({"Key": k, "Value": v, "expected": " "}) - for line in utils.print_table(lvs_info_dict).splitlines(): - logger.info(line) + if not passed: + for line in utils.print_table(lvs_info_dict).splitlines(): + logger.info(line) except Exception as e: logger.exception(e) + return False return passed def _check_node_lvstore( - lvstore_stack, node, auto_fix=False, node_bdev_names=None, stack_src_node=None): + lvstore_stack, node, auto_fix=False, node_bdev_names=None, stack_src_node=None) -> bool: db_controller = DBController() - lvstore_check = True logger.info(f"Checking distr stack on node : {node.get_id()}") - rpc_client = RPCClient( - node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=1) + cluster = db_controller.get_cluster_by_id(node.cluster_id) if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: auto_fix = False @@ -367,12 +373,24 @@ def _check_node_lvstore( node_distribs_list = bdev["distribs_list"] if not node_bdev_names: - ret = rpc_client.get_bdevs() + try: + ret = node.rpc_client().get_bdevs() + except Exception as e: + logger.info(e) + return False + if ret: node_bdev_names = [b['name'] for b in ret] else: node_bdev_names = [] + nodes = {} + devices = {} + for n in db_controller.get_storage_nodes(): + nodes[n.get_id()] = n + for dev in n.nvme_devices: + devices[dev.get_id()] = dev + for distr in distribs_list: if distr in node_bdev_names: logger.info(f"Checking distr bdev : {distr} ... 
ok") @@ -386,22 +404,34 @@ def _check_node_lvstore( for jm in jm_names: logger.info(jm) logger.info("Checking Distr map ...") - ret = rpc_client.distr_get_cluster_map(distr) + try: + ret = node.rpc_client().distr_get_cluster_map(distr) + except Exception as e: + logger.info(f"Failed to get cluster map: {e}") + return False if not ret: logger.error("Failed to get cluster map") - lvstore_check = False + return False else: - results, is_passed = distr_controller.parse_distr_cluster_map(ret) + results, is_passed = distr_controller.parse_distr_cluster_map(ret, nodes, devices) if results: - logger.info(utils.print_table(results)) logger.info(f"Checking Distr map ... {is_passed}") - if not is_passed and auto_fix: + if is_passed: + continue + + elif not auto_fix: + return False + + else: # is_passed is False and auto_fix is True + logger.info(utils.print_table(results)) for result in results: if result['Results'] == 'failed': if result['Kind'] == "Device": if result['Found Status']: dev = db_controller.get_storage_device_by_id(result['UUID']) - if dev.status == NVMeDevice.STATUS_ONLINE: + dev_node = db_controller.get_storage_node_by_id(dev.node_id) + if dev.status == NVMeDevice.STATUS_ONLINE and dev_node.status in [ + StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: try: remote_bdev = storage_node_ops.connect_device( f"remote_{dev.alceml_bdev}", dev, node, @@ -413,44 +443,67 @@ def _check_node_lvstore( if dev.get_id() == rem_dev.get_id(): continue new_remote_devices.append(rem_dev) - dev.remote_bdev = remote_bdev - new_remote_devices.append(dev) + + remote_device = RemoteDevice() + remote_device.uuid = dev.uuid + remote_device.alceml_name = dev.alceml_name + remote_device.node_id = dev.node_id + remote_device.size = dev.size + remote_device.status = NVMeDevice.STATUS_ONLINE + remote_device.nvmf_multipath = dev.nvmf_multipath + remote_device.remote_bdev = remote_bdev + new_remote_devices.append(remote_device) n.remote_devices = new_remote_devices n.write_to_db() 
distr_controller.send_dev_status_event(dev, dev.status, node) except Exception as e: logger.error(f"Failed to connect to {dev.get_id()}: {e}") + else: + if dev_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + distr_controller.send_dev_status_event(dev, dev.status, node) + if result['Kind'] == "Node": n = db_controller.get_storage_node_by_id(result['UUID']) distr_controller.send_node_status_event(n, n.status, node) - ret = rpc_client.distr_get_cluster_map(distr) + + try: + ret = node.rpc_client().distr_get_cluster_map(distr) + except Exception as e: + logger.error(e) + return False if not ret: logger.error("Failed to get cluster map") - lvstore_check = False + return False else: - results, is_passed = distr_controller.parse_distr_cluster_map(ret) + results, is_passed = distr_controller.parse_distr_cluster_map(ret, nodes, devices) logger.info(f"Checking Distr map ... {is_passed}") + if not is_passed: + return False else: logger.error("Failed to parse distr cluster map") - lvstore_check &= is_passed + return False else: logger.info(f"Checking distr bdev : {distr} ... not found") - lvstore_check = False + return False if raid: if raid in node_bdev_names: logger.info(f"Checking raid bdev: {raid} ... ok") else: logger.info(f"Checking raid bdev: {raid} ... not found") - lvstore_check = False + return False if bdev_lvstore: - ret = rpc_client.bdev_lvol_get_lvstores(bdev_lvstore) + try: + ret = node.rpc_client().bdev_lvol_get_lvstores(bdev_lvstore) + except Exception as e: + logger.error(e) + return False if ret: logger.info(f"Checking lvstore: {bdev_lvstore} ... ok") else: logger.info(f"Checking lvstore: {bdev_lvstore} ... not found") - lvstore_check = False - return lvstore_check + return False + return True def check_node(node_id, with_devices=True): db_controller = DBController() @@ -479,7 +532,7 @@ def check_node(node_id, with_devices=True): logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... 
{node_api_check}") # 3- check node RPC - node_rpc_check = _check_node_rpc( + node_rpc_check, _ = _check_node_rpc( snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") @@ -493,13 +546,19 @@ def check_node(node_id, with_devices=True): if snode.lvstore_stack_secondary_1: try: n = db_controller.get_storage_node_by_id(snode.lvstore_stack_secondary_1) - lvol_port_check = _check_port_on_node(snode, n.lvol_subsys_port) + lvol_port_check = check_port_on_node(snode, n.lvol_subsys_port) logger.info(f"Check: node {snode.mgmt_ip}, port: {n.lvol_subsys_port} ... {lvol_port_check}") except KeyError: - pass + logger.error("node not found") + except Exception: + logger.error("Check node port failed, connection error") + if not snode.is_secondary_node: - lvol_port_check = _check_port_on_node(snode, snode.lvol_subsys_port) - logger.info(f"Check: node {snode.mgmt_ip}, port: {snode.lvol_subsys_port} ... {lvol_port_check}") + try: + lvol_port_check = check_port_on_node(snode, snode.lvol_subsys_port) + logger.info(f"Check: node {snode.mgmt_ip}, port: {snode.lvol_subsys_port} ... 
{lvol_port_check}") + except Exception: + logger.error("Check node port failed, connection error") is_node_online = ping_check and node_api_check and node_rpc_check @@ -722,17 +781,23 @@ def check_lvol_on_node(lvol_id, node_id, node_bdev_names=None, node_lvols_nqns=N if not node_bdev_names: node_bdev_names = {} - ret = rpc_client.get_bdevs() - if ret: - for bdev in ret: - node_bdev_names[bdev['name']] = bdev + try: + ret = rpc_client.get_bdevs() + if ret: + for bdev in ret: + node_bdev_names[bdev['name']] = bdev + except Exception as e: + logger.error(f"Failed to connect to node's SPDK: {e}") if not node_lvols_nqns: node_lvols_nqns = {} - ret = rpc_client.subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub + try: + ret = rpc_client.subsystem_list() + if ret: + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + except Exception as e: + logger.error(f"Failed to connect to node's SPDK: {e}") passed = True try: @@ -785,12 +850,14 @@ def check_snap(snap_id): return False snode = db_controller.get_storage_node_by_id(snap.lvol.node_id) - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=5, retry=1) - - ret = rpc_client.get_bdevs(snap.snap_bdev) - return ret + check_primary = snode.rpc_client().get_bdevs(snap.snap_bdev) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snap.lvol.node_id} is {bool(check_primary)}") + if snode.secondary_node_id: + secondary_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + check_secondary = secondary_node.rpc_client().get_bdevs(snap.snap_bdev) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snode.secondary_node_id} is {bool(check_secondary)}") + return check_primary and check_secondary + return check_primary def check_jm_device(device_id): diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 4d7a5aad3..f25f8cec7 100644 --- 
a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1,4 +1,5 @@ # coding=utf-8 +import copy import logging as lg import json import math @@ -10,11 +11,15 @@ from typing import List, Tuple from simplyblock_core import utils, constants -from simplyblock_core.controllers import snapshot_controller, pool_controller, lvol_events +from simplyblock_core.controllers import snapshot_controller, pool_controller, lvol_events, tasks_controller, \ + snapshot_events from simplyblock_core.db_controller import DBController +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.lvol_model import LVol from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient logger = lg.getLogger() @@ -139,36 +144,17 @@ def _get_next_3_nodes(cluster_id, lvol_size=0): for node in snodes: if node.is_secondary_node: # pass continue - if node.status == node.STATUS_ONLINE: - lvol_count = len(db_controller.get_lvols_by_node_id(node.get_id())) if lvol_count >= node.max_lvol: continue - - # Validate Eligible nodes for adding lvol - # snode_api = SNodeClient(node.api_endpoint) - # result, _ = snode_api.info() - # memory_free = result["memory_details"]["free"] - # huge_free = result["memory_details"]["huge_free"] - # total_node_capacity = db_controller.get_snode_size(node.get_id()) - # error = utils.validate_add_lvol_or_snap_on_node(memory_free, huge_free, node.max_lvol, lvol_size, total_node_capacity, len(node.lvols)) - # if error: - # logger.warning(error) - # continue - # + if node.lvol_sync_del(): + logger.warning(f"LVol sync delete task found on node: {node.get_id()}, skipping") + continue online_nodes.append(node) - # node_stat_list = db_controller.get_node_stats(node, limit=1000) - # combined_record = 
utils.sum_records(node_stat_list) node_st = { - "lvol": lvol_count+1, - # "cpu": 1 + (node.cpu * node.cpu_hz), - # "r_io": combined_record.read_io_ps, - # "w_io": combined_record.write_io_ps, - # "r_b": combined_record.read_bytes_ps, - # "w_b": combined_record.write_bytes_ps + "lvol": lvol_count+1 } - node_stats[node.get_id()] = node_st if len(online_nodes) <= 1: @@ -263,10 +249,11 @@ def validate_aes_xts_keys(key1: str, key2: str) -> Tuple[bool, str]: return True, "" -def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, use_crypto, - distr_vuid, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, +def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp=False, use_crypto=False, + distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, - uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0): + uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, + do_replicate=False, replication_cluster_id=None): db_controller = DBController() logger.info(f"Adding LVol: {name}") @@ -280,6 +267,9 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, host_node = nodes[0] else: return False, f"Can not find storage node: {host_id_or_name}" + if host_node.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {host_node.get_id()}") + return False, f"LVol sync deletion found on node: {host_node.get_id()}" if namespace: try: @@ -455,14 +445,12 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid lvol.max_namespace_per_subsys = max_namespace_per_subsys - nodes = [] - if host_node: - nodes.insert(0, host_node) - else: + if not host_node: nodes = _get_next_3_nodes(cl.get_id(), lvol.size) if not nodes: return False, "No 
nodes found with enough resources to create the LVol" host_node = nodes[0] + s_node = db_controller.get_storage_node_by_id(host_node.secondary_node_id) attr_name = f"active_{fabric}" is_active_primary = getattr(host_node, attr_name) @@ -484,6 +472,16 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, else: lvol.npcs = cl.distr_npcs lvol.ndcs = cl.distr_ndcs + lvol.do_replicate = bool(do_replicate) + if lvol.do_replicate: + if replication_cluster_id: + replication_cluster = db_controller.get_cluster_by_id(replication_cluster_id) + if not replication_cluster: + return False, f"Replication cluster not found: {replication_cluster_id}" + else: + replication_cluster_id = cl.snapshot_replication_target_cluster + random_nodes = _get_next_3_nodes(replication_cluster_id, lvol.size) + lvol.replication_node_id = random_nodes[0].get_id() lvol_count = len(db_controller.get_lvols_by_node_id(host_node.get_id())) if lvol_count > host_node.max_lvol: @@ -731,7 +729,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id, f"{lvol.vuid:016X}") if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) @@ -775,7 +773,7 @@ def recreate_lvol_on_node(lvol, snode, ha_inode_self=0, ana_state=None): # if namespace_found is False: logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id) # if not ret: # return False, "Failed to add bdev to subsystem" @@ -1225,7 +1223,8 @@ def list_lvols(is_json, cluster_id, pool_id_or_name, all=False): "IO Err": lvol.io_error, "Health": lvol.health_check, "NS 
ID": lvol.ns_id, - "Mode": mode + "Mode": mode, + "Replicated On": lvol.replication_node_id, } data.append(lvol_data) @@ -1265,6 +1264,62 @@ def list_lvols_mem(is_json, is_csv): return utils.print_table(data) +def get_replication_info(lvol_id_or_name): + db_controller = DBController() + lvol = None + for lv in db_controller.get_lvols(): # pass + if lv.get_id() == lvol_id_or_name or lv.lvol_name == lvol_id_or_name: + lvol = lv + break + + if not lvol: + logger.error(f"LVol id or name not found: {lvol_id_or_name}") + return False + + tasks = [] + snaps = [] + out = { + "last_snapshot_id": None, + "last_replication_time": None, + "last_replication_duration": None, + "replicated_count": None, + "snaps": None, + "tasks": None, + } + node = db_controller.get_storage_node_by_id(lvol.node_id) + for task in db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol.get_id(): + continue + snaps.append(snap) + tasks.append(task) + + if tasks: + tasks = sorted(tasks, key=lambda x: x.date) + snaps = sorted(snaps, key=lambda x: x.created_at) + out["snaps"] = [s.to_dict() for s in snaps] + out["tasks"] = [t.to_dict() for t in tasks] + out["replicated_count"] = len(snaps) + last_task = tasks[-1] + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + out["last_snapshot_id"] = last_snap.get_id() + out["last_replication_time"] = last_task.updated_at + if "end_time" in last_task.function_params: + duration = utils.strfdelta_seconds( + last_task.function_params["end_time"] - last_task.function_params["start_time"]) + else: + duration = utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) + out["last_replication_duration"] = duration + + return out + + def get_lvol(lvol_id_or_name, is_json): 
db_controller = DBController() lvol = None @@ -1281,6 +1336,7 @@ def get_lvol(lvol_id_or_name, is_json): del data['nvme_dev'] + if is_json: return json.dumps(data, indent=2) else: @@ -1296,6 +1352,16 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO): logger.error(e) return False + node = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(node.cluster_id) + if cluster.status == Cluster.STATUS_SUSPENDED and cluster.snapshot_replication_target_cluster: + logger.error("Cluster is suspended, looking for replicated lvol") + for lv in db_controller.get_lvols(cluster.snapshot_replication_target_cluster): + if lv.nqn == lvol.nqn: + logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + lvol = lv + break + out = [] nodes_ids = [] if lvol.ha_type == 'single': @@ -1380,6 +1446,10 @@ def resize_lvol(id, new_size): snode = db_controller.get_storage_node_by_id(lvol.node_id) + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {snode.get_id()}") + return False, f"LVol sync deletion found on node: {snode.get_id()}" + logger.info(f"Resizing LVol: {lvol.get_id()}") logger.info(f"Current size: {utils.humanbytes(lvol.size)}, new size: {utils.humanbytes(new_size)}") @@ -1521,19 +1591,11 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True): db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_uuid) + pool = db_controller.get_pool_by_id(lvol.pool_uuid) except KeyError as e: logger.error(e) return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_lvol_stats(lvol, limit=records_number) cap_stats_keys = [ "date", "size_total", @@ -1543,6 +1605,8 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True): 
"size_prov", "size_prov_util" ] + prom_client = PromClient(pool.cluster_id) + records_list = prom_client.get_lvol_metrics(lvol_uuid, cap_stats_keys, history) new_records = utils.process_records(records_list, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -1564,19 +1628,11 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_uuid) + pool = db_controller.get_pool_by_id(lvol.pool_uuid) except KeyError as e: logger.error(e) return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_lvol_stats(lvol, limit=records_number) io_stats_keys = [ "date", "read_bytes", @@ -1587,7 +1643,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "write_bytes_ps", "write_io_ps", "write_latency_ps", - "connected_clients", ] if with_sizes: io_stats_keys.extend( @@ -1612,6 +1667,8 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "write_latency_ticks", ] ) + prom_client = PromClient(pool.cluster_id) + records_list = prom_client.get_lvol_metrics(lvol_uuid, io_stats_keys, history) # combine records new_records = utils.process_records(records_list, records_count, keys=io_stats_keys) @@ -1630,7 +1687,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "Write speed": utils.humanbytes(record['write_bytes_ps']), "Write IOPS": record['write_io_ps'], "Write lat": record['write_latency_ps'], - "Con": record['connected_clients'], }) return out @@ -1765,3 +1821,447 @@ def inflate_lvol(lvol_id): else: logger.error(f"Failed to inflate LVol: {lvol_id}") return ret + +def replication_trigger(lvol_id): + # create snapshot and replicate it + db_controller = DBController() + lvol = 
db_controller.get_lvol_by_id(lvol_id) + node = db_controller.get_storage_node_by_id(lvol.node_id) + snapshot_controller.add(lvol_id, f"replication_{uuid.uuid4()}") + + tasks = [] + snaps = [] + out = { + "lvol": lvol, + "last_snapshot_id": None, + "last_replication_time": None, + "last_replication_duration": None, + "replicated_count": None, + "snaps": None, + "tasks": None, + } + for task in db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + snaps.append(snap) + tasks.append(task) + + if tasks: + tasks = sorted(tasks, key=lambda x: x.date) + snaps = sorted(snaps, key=lambda x: x.created_at) + out["snaps"] = snaps + out["tasks"] = tasks + out["replicated_count"] = len(snaps) + last_task = tasks[-1] + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + out["last_snapshot_id"] = last_snap.get_id() + out["last_replication_time"] = last_task.updated_at + duration = 0 + if "start_time" in last_task.function_params: + if "end_time" in last_task.function_params: + duration = utils.strfdelta_seconds( + last_task.function_params["end_time"] - last_task.function_params["start_time"]) + else: + duration = utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) + out["last_replication_duration"] = duration + + return out + +def replication_start(lvol_id, replication_cluster_id=None): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + lvol.do_replicate = True + if not lvol.replication_node_id: + excluded_nodes = [] + if lvol.cloned_from_snap: + lvol_snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap) + if lvol_snap.source_replicated_snap_uuid: + org_snap = 
db_controller.get_snapshot_by_id(lvol_snap.source_replicated_snap_uuid) + excluded_nodes.append(org_snap.lvol.node_id) + snode = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) + if not replication_cluster_id: + replication_cluster_id = cluster.snapshot_replication_target_cluster + if not replication_cluster_id: + logger.error(f"Cluster: {snode.cluster_id} not replicated") + return False + random_nodes = _get_next_3_nodes(replication_cluster_id, lvol.size) + for r_node in random_nodes: + if r_node.get_id() not in excluded_nodes: + logger.info(f"Replicating on node: {r_node.get_id()}") + lvol.replication_node_id = r_node.get_id() + lvol.write_to_db() + break + if not lvol.replication_node_id: + logger.error(f"Replication node not found for lvol: {lvol.get_id()}") + return False + logger.info("Setting LVol do_replicate: True") + + for snap in db_controller.get_snapshots(): + if snap.lvol.uuid == lvol.uuid: + if not snap.target_replicated_snap_uuid: + task = tasks_controller.add_snapshot_replication_task(snap.cluster_id, snap.lvol.node_id, snap.get_id()) + if task: + snapshot_events.replication_task_created(snap) + return True + + +def replication_stop(lvol_id, delete=False): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info("Setting LVol do_replicate: False") + lvol.do_replicate = False + lvol.write_to_db() + + snode = db_controller.get_storage_node_by_id(lvol.node_id) + tasks = db_controller.get_job_tasks(snode.cluster_id) + + + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.status != JobSchedule.STATUS_DONE: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + if snap.lvol.uuid == lvol.uuid: + tasks_controller.cancel_task(task.uuid) + + return True + + +def replicate_lvol_on_target_cluster(lvol_id): + db_controller = 
DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + if not lvol.replication_node_id: + logger.error(f"LVol: {lvol_id} replication node id not found") + return False + + target_node = db_controller.get_storage_node_by_id(lvol.replication_node_id) + if not target_node: + logger.error(f"Node not found: {lvol.replication_node_id}") + return False + + if target_node.status != StorageNode.STATUS_ONLINE: + logger.error(f"Node is not online!: {target_node}, status: {target_node.status}") + return False + + source_node = db_controller.get_storage_node_by_id(lvol.node_id) + source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + + for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): + if lv.nqn == lvol.nqn: + logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + return lv.get_id() + + snaps = [] + snapshot = None + for task in db_controller.get_job_tasks(source_node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + snaps.append(snap) + + if snaps: + snaps = sorted(snaps, key=lambda x: x.created_at) + last_snapshot = snaps[-1] + rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) + snapshot = rep_snap + + if not snapshot: + logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") + return False + + # create lvol on target node + new_lvol = copy.deepcopy(lvol) + new_lvol.uuid = str(uuid.uuid4()) + new_lvol.create_dt = str(datetime.now()) + new_lvol.node_id = target_node.get_id() + new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] + new_lvol.replication_node_id = "" + new_lvol.do_replicate = False + new_lvol.cloned_from_snap = 
snapshot.get_id() + new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool + new_lvol.lvs_name = target_node.lvstore + new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" + new_lvol.snapshot_name = snapshot.snap_bdev + new_lvol.status = LVol.STATUS_IN_CREATION + + new_lvol.bdev_stack = [ + { + "type": "bdev_lvol_clone", + "name": new_lvol.top_bdev, + "params": { + "snapshot_name": snapshot.snap_bdev, + "clone_name": new_lvol.lvol_bdev + } + } + ] + + if new_lvol.crypto_bdev: + new_lvol.bdev_stack.append({ + "type": "crypto", + "name": new_lvol.crypto_bdev, + "params": { + "name": new_lvol.crypto_bdev, + "base_name": new_lvol.top_bdev, + "key1": new_lvol.crypto_key1, + "key2": new_lvol.crypto_key2, + } + }) + + new_lvol.write_to_db(db_controller.kv_store) + + lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) + if error: + logger.error(error) + new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.lvol_uuid = lvol_bdev['uuid'] + new_lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid'] + + secondary_node = db_controller.get_storage_node_by_id(target_node.secondary_node_id) + if secondary_node.status == StorageNode.STATUS_ONLINE: + lvol_bdev, error = add_lvol_on_node(new_lvol, secondary_node, is_primary=False) + if error: + logger.error(error) + # remove lvol from primary + ret = delete_lvol_from_node(new_lvol, target_node) + if not ret: + logger.error("") + new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.status = LVol.STATUS_ONLINE + new_lvol.write_to_db(db_controller.kv_store) + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.from_source = False + lvol.write_to_db() + lvol_events.lvol_replicated(lvol, new_lvol) + + return new_lvol.lvol_uuid + + +def list_replication_tasks(lvol_id): + db_controller = DBController() + lvol = db_controller.get_lvol_by_id(lvol_id) + node = db_controller.get_storage_node_by_id(lvol.node_id) + tasks = [] + for task in 
db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + if snap.lvol.get_id() != lvol_id: + continue + tasks.append(task) + + return tasks + + +def suspend_lvol(lvol_id): + + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info(f"suspending LVol subsystem: {lvol.get_id()}") + snode = db_controller.get_storage_node_by_id(lvol.node_id) + for iface in snode.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana="inaccessible") + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + if snode.secondary_node_id: + sec_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + if sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: + for iface in sec_node.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana="inaccessible") + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + return True + + +def resume_lvol(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info(f"suspending LVol subsystem: {lvol.get_id()}") + snode = 
db_controller.get_storage_node_by_id(lvol.node_id) + for iface in snode.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state( + lvol.nqn, iface.ip4_address, lvol.subsys_port, is_optimized=True) + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + if snode.secondary_node_id: + sec_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + if sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: + for iface in sec_node.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state( + lvol.nqn, iface.ip4_address, lvol.subsys_port, is_optimized=False) + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + return True + + +def replicate_lvol_on_source_cluster(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + source_node = db_controller.get_storage_node_by_id(lvol.node_id) + source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + + if not source_node: + logger.error(f"Node not found: {lvol.node_id}") + return False + + if source_node.status != StorageNode.STATUS_ONLINE: + logger.error(f"Node is not online!: {source_node.get_id()}, status: {source_node.status}") + return False + + # for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): + # if lv.nqn == lvol.nqn: + # logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + # return lv.get_id() + + snaps = [] 
+ snapshot = None + for task in db_controller.get_job_tasks(source_node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + snaps.append(snap) + + if snaps: + snaps = sorted(snaps, key=lambda x: x.created_at) + snapshot = snaps[-1] + + if not snapshot: + logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") + return False + + # create lvol on target node + new_lvol = copy.deepcopy(lvol) + new_lvol.cloned_from_snap = snapshot.get_id() + new_lvol.snapshot_name = snapshot.snap_bdev + new_lvol.from_source = True + new_lvol.status = LVol.STATUS_IN_CREATION + + new_lvol.bdev_stack = [ + { + "type": "bdev_lvol_clone", + "name": new_lvol.top_bdev, + "params": { + "snapshot_name": snapshot.snap_bdev, + "clone_name": new_lvol.lvol_bdev + } + } + ] + + if new_lvol.crypto_bdev: + new_lvol.bdev_stack.append({ + "type": "crypto", + "name": new_lvol.crypto_bdev, + "params": { + "name": new_lvol.crypto_bdev, + "base_name": new_lvol.top_bdev, + "key1": new_lvol.crypto_key1, + "key2": new_lvol.crypto_key2, + } + }) + + new_lvol.write_to_db(db_controller.kv_store) + + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.uuid = str(uuid.uuid4()) + lvol.from_source = True + lvol.write_to_db() + delete_lvol(lvol.uuid) + + time.sleep(3) + + lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) + if error: + logger.error(error) + new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.lvol_uuid = lvol_bdev['uuid'] + new_lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid'] + + secondary_node = db_controller.get_storage_node_by_id(source_node.secondary_node_id) + if secondary_node.status == StorageNode.STATUS_ONLINE: + lvol_bdev, error = add_lvol_on_node(new_lvol, secondary_node, is_primary=False) + if error: + logger.error(error) + 
# remove lvol from primary + ret = delete_lvol_from_node(new_lvol, source_node) + if not ret: + logger.error("") + new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.status = LVol.STATUS_ONLINE + new_lvol.write_to_db(db_controller.kv_store) + lvol_events.lvol_replicated(lvol, new_lvol) + + return new_lvol.lvol_uuid + diff --git a/simplyblock_core/controllers/lvol_events.py b/simplyblock_core/controllers/lvol_events.py index 636c444b3..c4f2abde8 100644 --- a/simplyblock_core/controllers/lvol_events.py +++ b/simplyblock_core/controllers/lvol_events.py @@ -3,6 +3,7 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -10,6 +11,7 @@ def _lvol_event(lvol, message, caused_by, event): db_controller = DBController() snode = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) ec.log_event_cluster( cluster_id=snode.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -18,7 +20,79 @@ def _lvol_event(lvol, message, caused_by, event): caused_by=caused_by, message=message, node_id=lvol.get_id()) - + if cluster.mode == "kubernetes": + pool = db_controller.get_pool_by_id(lvol.pool_uuid) + + if event == ec.EVENT_OBJ_CREATED: + crypto_key=( + (lvol.crypto_key1, lvol.crypto_key2) + if lvol.crypto_key1 and lvol.crypto_key2 + else None + ) + + node_urls = [ + f"{constants.WEBAPI_K8S_ENDPOINT}/clusters/{snode.cluster_id}/storage-nodes/{node_id}/" + for node_id in lvol.nodes + ] + + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + name=pool.lvols_cr_name, + add={ + "uuid": lvol.get_id(), + "lvolName": lvol.lvol_name, + "status": lvol.status, + "nodeUUID": node_urls, + "size": utils.humanbytes(lvol.size), + "health": lvol.health_check, + "isCrypto": crypto_key is 
not None, + "nqn": lvol.nqn, + "subsysPort": lvol.subsys_port, + "hostname": lvol.hostname, + "fabric": lvol.fabric, + "ha": lvol.ha_type == 'ha', + "poolUUID": lvol.pool_uuid, + "poolName": lvol.pool_name, + "PvcName": lvol.pvc_name, + "snapName": lvol.snapshot_name, + "clonedFromSnap": lvol.cloned_from_snap, + "stripeWdata": lvol.ndcs, + "stripeWparity": lvol.npcs, + "blobID": lvol.blobid, + "namespaceID": lvol.ns_id, + "qosClass": lvol.lvol_priority_class, + "maxNamespacesPerSubsystem": lvol.max_namespace_per_subsys, + "qosIOPS": lvol.rw_ios_per_sec, + "qosRWTP": lvol.rw_mbytes_per_sec, + "qosRTP": lvol.r_mbytes_per_sec, + "qosWTP": lvol.w_mbytes_per_sec, + }, + ) + + elif event == ec.EVENT_STATUS_CHANGE: + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + name=pool.lvols_cr_name, + lvol_uuid=lvol.get_id(), + updates={"status": lvol.status, "health": lvol.health_check}, + ) + elif event == ec.EVENT_OBJ_DELETED: + logger.info("Deleting lvol CR object") + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + name=pool.lvols_cr_name, + lvol_uuid=lvol.get_id(), + remove=True, + ) def lvol_create(lvol, caused_by=ec.CAUSED_BY_CLI): _lvol_event(lvol, "LVol created", caused_by, ec.EVENT_OBJ_CREATED) @@ -43,3 +117,7 @@ def lvol_health_check_change(lvol, new_state, old_status, caused_by=ec.CAUSED_BY def lvol_io_error_change(lvol, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): _lvol_event(lvol, f"LVol IO Error changed from: {old_status} to: {new_state}", caused_by, ec.EVENT_STATUS_CHANGE) + +def lvol_replicated(lvol, new_lvol, caused_by=ec.CAUSED_BY_CLI): + _lvol_event(lvol, f"LVol Replicated, {lvol.get_id()}, new lvol: {new_lvol.get_id()}", caused_by, ec.EVENT_STATUS_CHANGE) + diff --git a/simplyblock_core/controllers/pool_controller.py 
b/simplyblock_core/controllers/pool_controller.py index db7016d7d..0d2738e67 100644 --- a/simplyblock_core/controllers/pool_controller.py +++ b/simplyblock_core/controllers/pool_controller.py @@ -12,6 +12,7 @@ from simplyblock_core.controllers import pool_events, lvol_controller from simplyblock_core.db_controller import DBController from simplyblock_core.models.pool import Pool +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient logger = lg.getLogger() @@ -22,7 +23,8 @@ def _generate_string(length): string.ascii_letters + string.digits) for _ in range(length)) -def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, cluster_id, qos_host=None): +def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, cluster_id, + cr_name=None, cr_namespace=None, cr_plural=None, qos_host=None): db_controller = DBController() if not name: logger.error("Pool name is empty!") @@ -70,6 +72,9 @@ def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, pool.max_rw_mbytes_per_sec = max_rw_mbytes pool.max_r_mbytes_per_sec = max_r_mbytes pool.max_w_mbytes_per_sec = max_w_mbytes + pool.cr_name = cr_name + pool.cr_namespace = cr_namespace + pool.cr_plural = cr_plural if pool.has_qos() and not qos_host: next_nodes = lvol_controller._get_next_3_nodes(cluster_id) if next_nodes: @@ -120,7 +125,8 @@ def qos_exists_on_child_lvol(db_controller: DBController, pool_uuid): return False def set_pool(uuid, pool_max=0, lvol_max=0, max_rw_iops=0, - max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, name=""): + max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, name="", + lvols_cr_name="", lvols_cr_namespace="", lvols_cr_plural=""): db_controller = DBController() try: pool = db_controller.get_pool_by_id(uuid) @@ -142,6 +148,17 @@ def set_pool(uuid, pool_max=0, lvol_max=0, max_rw_iops=0, return False, msg pool.pool_name = name + if lvols_cr_name and 
lvols_cr_name != pool.lvols_cr_name: + for p in db_controller.get_pools(): + if p.lvols_cr_name == lvols_cr_name: + msg = f"Pool found with the same lvol cr name: {name}" + logger.error(msg) + return False, msg + pool.lvols_cr_name = lvols_cr_name + pool.lvols_cr_namespace = lvols_cr_namespace + pool.lvols_cr_plural = lvols_cr_plural + + # Normalize inputs max_rw_iops = max_rw_iops or 0 max_rw_mbytes = max_rw_mbytes or 0 @@ -264,8 +281,10 @@ def set_status(pool_id, status): except KeyError: logger.error(f"Pool not found {pool_id}") return False + old_status = pool.status pool.status = status pool.write_to_db(db_controller.kv_store) + pool_events.pool_status_change(pool, pool.status, old_status) logger.info("Done") @@ -321,15 +340,18 @@ def get_io_stats(pool_id, history, records_count=20): logger.error(f"Pool not found {pool_id}") return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 + io_stats_keys = [ + "date", + "read_bytes_ps", + "read_io_ps", + "read_latency_ps", + "write_bytes_ps", + "write_io_ps", + "write_latency_ps", + ] - out = db_controller.get_pool_stats(pool, records_number) + prom_client = PromClient(pool.cluster_id) + out = prom_client.get_pool_metrics(pool_id, io_stats_keys, history) new_records = utils.process_records(out, records_count) return utils.print_table([ diff --git a/simplyblock_core/controllers/pool_events.py b/simplyblock_core/controllers/pool_events.py index 2581d59b1..8c4f0ea08 100644 --- a/simplyblock_core/controllers/pool_events.py +++ b/simplyblock_core/controllers/pool_events.py @@ -2,7 +2,8 @@ import logging from simplyblock_core.controllers import events_controller as ec - +from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -29,3 +30,24 @@ def pool_remove(pool): def pool_updated(pool): 
_add(pool, f"Pool updated {pool.pool_name}", event=ec.EVENT_STATUS_CHANGE) + +def pool_status_change(pool, new_state, old_status): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(pool.cluster_id) + ec.log_event_cluster( + cluster_id=pool.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=pool, + caused_by=ec.CAUSED_BY_CLI, + message=f"Pool status changed from {old_status} to {new_state}", + node_id=pool.cluster_id) + + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.cr_plural, + namespace=pool.cr_namespace, + name=pool.cr_name, + status_patch={"status": new_state}) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index d3eca0e00..99c84d080 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -1,12 +1,14 @@ # coding=utf-8 +import json import logging as lg import time import uuid -from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller, tasks_controller from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController +from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.lvol_model import LVol @@ -49,9 +51,14 @@ def add(lvol_id, snapshot_name): if sn.snap_name == snapshot_name: return False, f"Snapshot name must be unique: {snapshot_name}" - logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}") snode = db_controller.get_storage_node_by_id(lvol.node_id) + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {snode.get_id()}") + return False, 
f"LVol sync deletion found on node: {snode.get_id()}" + + logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}") + rec = db_controller.get_lvol_stats(lvol, 1) if rec: size = rec[0].size_used @@ -217,19 +224,42 @@ def add(lvol_id, snapshot_name): snap.snap_ref_id = original_snap.get_id() snap.write_to_db(db_controller.kv_store) - logger.info("Done") + for sn in db_controller.get_snapshots(cluster.get_id()): + if sn.get_id() == snap.get_id(): + continue + if sn.lvol.get_id() == lvol_id: + if not sn.next_snap_uuid: + sn.next_snap_uuid = snap.get_id() + snap.prev_snap_uuid = sn.get_id() + sn.write_to_db() + snap.write_to_db() + break + snapshot_events.snapshot_create(snap) + if lvol.do_replicate: + task = tasks_controller.add_snapshot_replication_task(snap.cluster_id, snap.lvol.node_id, snap.get_id()) + if task: + snapshot_events.replication_task_created(snap) + if lvol.cloned_from_snap: + lvol_snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap) + if lvol_snap.source_replicated_snap_uuid: + org_snap = db_controller.get_snapshot_by_id(lvol_snap.source_replicated_snap_uuid) + if org_snap and org_snap.status == SnapShot.STATUS_ONLINE: + task = tasks_controller.add_snapshot_replication_task( + snap.cluster_id, org_snap.lvol.node_id, snap.get_id(), replicate_to_source=True) + if task: + logger.info("Created snapshot replication task on original node") return snap.uuid, False -def list(all=False): - snaps = db_controller.get_snapshots() +def list(all=False, cluster_id=None, with_details=False): + snaps = db_controller.get_snapshots(cluster_id) data = [] for snap in snaps: logger.debug(snap) if snap.deleted is True and all is False: continue - data.append({ + d = { "UUID": snap.uuid, "Name": snap.snap_name, "Size": utils.humanbytes(snap.used_size), @@ -239,7 +269,13 @@ def list(all=False): "Created At": time.strftime("%H:%M:%S, %d/%m/%Y", time.gmtime(snap.created_at)), "Health": snap.health_check, "Status": snap.status, - }) + } + if 
with_details: + d["Replication target snap"] = snap.target_replicated_snap_uuid + d["Replication source snap"] = snap.source_replicated_snap_uuid + d["Rrev snap"] = snap.prev_snap_uuid + d["Next snap"] = snap.next_snap_uuid + data.append(d) return utils.print_table(data) @@ -250,6 +286,10 @@ def delete(snapshot_uuid, force_delete=False): logger.error(f"Snapshot not found {snapshot_uuid}") return False + if snap.status == SnapShot.STATUS_IN_REPLICATION: + logger.error("Snapshot is in replication") + return False + try: snode = db_controller.get_storage_node_by_id(snap.lvol.node_id) except KeyError: @@ -351,6 +391,9 @@ def delete(snapshot_uuid, force_delete=False): except KeyError: pass + if snap.target_replicated_snap_uuid: + delete_replicated(snap.uuid) + logger.info("Done") return True @@ -381,6 +424,10 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None logger.exception(msg) return False, msg + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {snode.get_id()}") + return False, f"LVol sync deletion found on node: {snode.get_id()}" + cluster = db_controller.get_cluster_by_id(pool.cluster_id) if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]: return False, f"Cluster is not active, status: {cluster.status}" @@ -587,3 +634,98 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None if new_size: lvol_controller.resize_lvol(lvol.get_id(), new_size) return lvol.uuid, False + + +def list_replication_tasks(cluster_id): + tasks = db_controller.get_job_tasks(cluster_id) + + data = [] + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + duration = "" + try: + if task.status == JobSchedule.STATUS_RUNNING: + duration = utils.strfdelta_seconds(int(time.time()) - task.function_params["start_time"]) + 
elif "end_time" in task.function_params: + duration = utils.strfdelta_seconds( + task.function_params["end_time"] - task.function_params["start_time"]) + except Exception as e: + logger.error(e) + status = task.status + if task.canceled: + status = "cancelled" + replicate_to = "target" + if "replicate_to_source" in task.function_params: + if task.function_params["replicate_to_source"] is True: + replicate_to = "source" + offset = 0 + if "offset" in task.function_params: + offset = task.function_params["offset"] + data.append({ + "Task ID": task.uuid, + "Snapshot ID": snap.uuid, + "Size": utils.humanbytes(snap.used_size), + "Duration": duration, + "Offset": offset, + "Status": status, + "Replicate to": replicate_to, + "Result": task.function_result, + "Cluster ID": task.cluster_id, + }) + return utils.print_table(data) + + +def delete_replicated(snapshot_id): + try: + snap = db_controller.get_snapshot_by_id(snapshot_id) + except KeyError: + logger.error(f"Snapshot not found {snapshot_id}") + return False + + try: + target_replicated_snap = db_controller.get_snapshot_by_id(snap.target_replicated_snap_uuid) + logger.info("Deleting replicated snapshot %s", target_replicated_snap.uuid) + ret = delete(target_replicated_snap.uuid) + if not ret: + logger.error("Failed to delete snapshot %s", target_replicated_snap.uuid) + return False + + except KeyError: + logger.error(f"Snapshot not found {snap.target_replicated_snap_uuid}") + return False + + return True + + +def get(snapshot_uuid): + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + return json.dumps(snap.get_clean_dict(), indent=2) + + +def set(snapshot_uuid, attr, value) -> bool: + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + if attr not in snap.get_attrs_map(): + raise KeyError('Attribute not found') + + 
value = snap.get_attrs_map()[attr]['type'](value) + logger.info(f"Setting {attr} to {value}") + setattr(snap, attr, value) + snap.write_to_db() + return True + diff --git a/simplyblock_core/controllers/snapshot_events.py b/simplyblock_core/controllers/snapshot_events.py index 4cb107dcd..9b29f8b6f 100644 --- a/simplyblock_core/controllers/snapshot_events.py +++ b/simplyblock_core/controllers/snapshot_events.py @@ -31,3 +31,10 @@ def snapshot_delete(snapshot, caused_by=ec.CAUSED_BY_CLI): def snapshot_clone(snapshot, lvol_clone, caused_by=ec.CAUSED_BY_CLI): _snapshot_event(snapshot, f"Snapshot cloned: {snapshot.get_id()} clone id: {lvol_clone.get_id()}", caused_by, ec.EVENT_STATUS_CHANGE) + +def replication_task_created(snapshot, caused_by=ec.CAUSED_BY_CLI): + _snapshot_event(snapshot, "Snapshot replication task created", caused_by, ec.EVENT_OBJ_CREATED) + + +def replication_task_finished(snapshot, caused_by=ec.CAUSED_BY_CLI): + _snapshot_event(snapshot, "Snapshot replication task finished", caused_by, ec.EVENT_OBJ_CREATED) diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py index b73890cd8..486daa8ee 100644 --- a/simplyblock_core/controllers/storage_events.py +++ b/simplyblock_core/controllers/storage_events.py @@ -3,6 +3,8 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.models.events import EventObj +from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -19,6 +21,8 @@ def snode_add(node): def snode_delete(node): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -27,9 +31,21 @@ def snode_delete(node): caused_by=ec.CAUSED_BY_CLI, message=f"Storage node deleted {node.get_id()}", node_id=node.get_id()) - + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + 
group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + remove=True, + ) def snode_status_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -38,9 +54,22 @@ def snode_status_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI) caused_by=caused_by, message=f"Storage node status changed from: {old_status} to: {new_state}", node_id=node.get_id()) + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"status": new_state}, + ) def snode_health_check_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -49,7 +78,17 @@ def snode_health_check_change(node, new_state, old_status, caused_by=ec.CAUSED_B caused_by=caused_by, message=f"Storage node health check changed from: {old_status} to: {new_state}", node_id=node.get_id()) - + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"health": new_state}, + ) def snode_restart_failed(node): ec.log_event_cluster( @@ -72,3 +111,40 @@ def snode_rpc_timeout(node, timeout_seconds, caused_by=ec.CAUSED_BY_MONITOR): event_level=EventObj.LEVEL_WARN, message=f"Storage node RPC timeout detected after 
{timeout_seconds} seconds", node_id=node.get_id()) + + +def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR): + ec.log_event_cluster( + cluster_id=node.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=node, + caused_by=caused_by, + event_level=EventObj.LEVEL_WARN, + message=f"JM replication task found for jm {jm_vuid}", + node_id=node.get_id()) + + +def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) + ec.log_event_cluster( + cluster_id=node.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=node, + caused_by=caused_by, + event_level=EventObj.LEVEL_WARN, + message=f"Storage node ports set, LVol:{node.lvol_subsys_port} RPC:{node.rpc_port} Internal:{node.nvmf_port}", + node_id=node.get_id()) + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"nvmf_port": node.nvmf_port, "rpc_port": node.rpc_port, "lvol_port": node.lvol_subsys_port}, + ) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 689027d08..eb3184068 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -70,6 +70,18 @@ def _add_task(function_name, cluster_id, node_id, device_id, if task_id: logger.info(f"Task found, skip adding new task: {task_id}") return False + elif function_name == JobSchedule.FN_LVOL_SYNC_DEL: + task_id = get_lvol_sync_del_task(cluster_id, node_id, function_params['lvol_bdev_name']) + if task_id: + logger.info(f"Task found, skip adding new task: {task_id}") + return False + + elif function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + task_id = 
get_snapshot_replication_task( + cluster_id, function_params['snapshot_id'], function_params['replicate_to_source']) + if task_id: + logger.info(f"Task found, skip adding new task: {task_id}") + return False task_obj = JobSchedule() task_obj.uuid = str(uuid.uuid4()) @@ -95,11 +107,13 @@ def add_device_mig_task(device_id_list, cluster_id): device = db.get_storage_device_by_id(device_id_list[0]) tasks = db.get_job_tasks(cluster_id) + master_task = None for task in tasks: if task.function_name == JobSchedule.FN_BALANCING_AFTER_NODE_RESTART : if task.status != JobSchedule.STATUS_DONE and task.canceled is False: - logger.info(f"Task found, skip adding new task: {task.get_id()}") - return False + logger.info("Master task found, skip adding new master task") + master_task = task + break for node in db.get_storage_nodes_by_cluster_id(cluster_id): if node.status == StorageNode.STATUS_REMOVED: @@ -112,16 +126,19 @@ def add_device_mig_task(device_id_list, cluster_id): if task_id: sub_tasks.append(task_id) if sub_tasks: - task_obj = JobSchedule() - task_obj.uuid = str(uuid.uuid4()) - task_obj.cluster_id = cluster_id - task_obj.date = int(time.time()) - task_obj.function_name = JobSchedule.FN_BALANCING_AFTER_NODE_RESTART - task_obj.sub_tasks = sub_tasks - task_obj.status = JobSchedule.STATUS_NEW - task_obj.write_to_db(db.kv_store) - tasks_events.task_create(task_obj) - + if master_task: + master_task.sub_tasks.extend(sub_tasks) + master_task.write_to_db() + else: + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster_id + task_obj.date = int(time.time()) + task_obj.function_name = JobSchedule.FN_BALANCING_AFTER_NODE_RESTART + task_obj.sub_tasks = sub_tasks + task_obj.status = JobSchedule.STATUS_NEW + task_obj.write_to_db(db.kv_store) + tasks_events.task_create(task_obj) return True @@ -135,10 +152,13 @@ def add_node_to_auto_restart(node): Cluster.STATUS_READONLY, Cluster.STATUS_UNREADY]: logger.warning(f"Cluster is not active, skip node 
auto restart, status: {cluster.status}") return False + offline_nodes = 0 for sn in db.get_storage_nodes_by_cluster_id(node.cluster_id): if node.get_id() != sn.get_id() and sn.status != StorageNode.STATUS_ONLINE and node.mgmt_ip != sn.mgmt_ip: - logger.info("Node found that is not online, skip node auto restart") - return False + offline_nodes += 1 + if offline_nodes > cluster.distr_npcs : + logger.info("Node found that is not online, skip node auto restart") + return False return _add_task(JobSchedule.FN_NODE_RESTART, node.cluster_id, node.get_id(), "", max_retry=11) @@ -150,23 +170,26 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): return False data = [] - tasks = db.get_job_tasks(cluster_id, reverse=True, limit=limit) + tasks = db.get_job_tasks(cluster_id, reverse=True) tasks.reverse() if is_json is True: for t in tasks: if t.function_name == JobSchedule.FN_DEV_MIG: continue data.append(t.get_clean_dict()) + if len(data)+1 > limit > 0: + return json.dumps(data, indent=2) return json.dumps(data, indent=2) for task in tasks: if task.function_name == JobSchedule.FN_DEV_MIG: continue + logger.debug(task) if task.max_retry > 0: retry = f"{task.retry}/{task.max_retry}" else: retry = f"{task.retry}" - + logger.debug(task) upd = task.updated_at if upd: try: @@ -192,6 +215,8 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): "Result": task.function_result, "Updated At": upd or "", }) + if len(data)+1 > limit > 0: + return utils.print_table(data) return utils.print_table(data) @@ -234,6 +259,7 @@ def get_subtasks(master_task_id): except Exception as e: logger.error(e) + logger.debug(sub_task) data.append({ "Task ID": sub_task.uuid, "Node ID / Device ID": f"{sub_task.node_id}\n{sub_task.device_id}", @@ -303,7 +329,8 @@ def add_new_device_mig_task(device_id): def add_node_add_task(cluster_id, function_params): - return _add_task(JobSchedule.FN_NODE_ADD, cluster_id, "", "", function_params=function_params) + return 
_add_task(JobSchedule.FN_NODE_ADD, cluster_id, "", "", + function_params=function_params, max_retry=16) def get_active_node_tasks(cluster_id, node_id): @@ -334,7 +361,7 @@ def get_new_device_mig_task(cluster_id, node_id, distr_name, dev_id=None): def get_device_mig_task(cluster_id, node_id, device_id, distr_name): tasks = db.get_job_tasks(cluster_id) for task in tasks: - if task.function_name == JobSchedule.FN_DEV_MIG and task.node_id == node_id and task.device_id == device_id: + if task.function_name == JobSchedule.FN_DEV_MIG and task.node_id == node_id: if task.status != JobSchedule.STATUS_DONE and task.canceled is False \ and "distr_name" in task.function_params and task.function_params["distr_name"] == distr_name: return task.uuid @@ -386,3 +413,58 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0): if jm_vuid and "jm_vuid" in task.function_params and task.function_params["jm_vuid"] == jm_vuid: return task.uuid return False + + +def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name, primary_node): + return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "", + function_params={"lvol_bdev_name": lvol_bdev_name, "primary_node": primary_node}, max_retry=10) + +def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id : + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + if lvol_bdev_name: + if "lvol_bdev_name" in task.function_params and task.function_params["lvol_bdev_name"] == lvol_bdev_name: + return task.uuid + else: + return task.uuid + return False + +def get_snapshot_replication_task(cluster_id, snapshot_id, replicate_to_source): + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.function_params["snapshot_id"] == snapshot_id: + if task.status != JobSchedule.STATUS_DONE and 
task.canceled is False: + if task.function_params["replicate_to_source"] == replicate_to_source: + return task.uuid + return False + + +def _check_snap_instance_on_node(snapshot_id: str , node_id: str): + snapshot = db.get_snapshot_by_id(snapshot_id) + for sn_inst in snapshot.instances: + if sn_inst.lvol.node_id == node_id: + logger.info("Snapshot instance found on node, skip adding replication task") + return + + if snapshot.snap_ref_id: + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + _check_snap_instance_on_node(prev_snap.get_id(), node_id) + + _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, snapshot.cluster_id, node_id, "", + function_params={"snapshot_id": snapshot.get_id(), "replicate_to_source": False, + "replicate_as_snap_instance": True}, + send_to_cluster_log=False) + + +def add_snapshot_replication_task(cluster_id, node_id, snapshot_id, replicate_to_source=False): + if not replicate_to_source: + snapshot = db.get_snapshot_by_id(snapshot_id) + if snapshot.snap_ref_id: + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + _check_snap_instance_on_node(prev_snap.get_id(), node_id) + + return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, cluster_id, node_id, "", + function_params={"snapshot_id": snapshot_id, "replicate_to_source": replicate_to_source}, + send_to_cluster_log=False) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 277d1b68a..ddcd4272b 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -2,7 +2,7 @@ import os.path import fdb -from typing import List +from typing import List, Optional from simplyblock_core import constants from simplyblock_core.models.cluster import Cluster @@ -17,8 +17,7 @@ from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject, LVolStatObject, \ PoolStatObject, CachedLVolStatObject -from simplyblock_core.models.storage_node import StorageNode 
- +from simplyblock_core.models.storage_node import StorageNode, NodeLVolDelLock class Singleton(type): @@ -159,9 +158,11 @@ def get_hostnames_by_pool_id(self, pool_id) -> List[str]: hostnames.append(lv.hostname) return hostnames - def get_snapshots(self) -> List[SnapShot]: - ret = SnapShot().read_from_db(self.kv_store) - return ret + def get_snapshots(self, cluster_id=None) -> List[SnapShot]: + snaps = SnapShot().read_from_db(self.kv_store) + if cluster_id: + snaps = [n for n in snaps if n.cluster_id == cluster_id] + return sorted(snaps, key=lambda x: x.created_at) def get_snapshot_by_id(self, id) -> SnapShot: ret = SnapShot().read_from_db(self.kv_store, id) @@ -258,7 +259,9 @@ def get_events(self, event_id=" ", limit=0, reverse=False) -> List[EventObj]: return EventObj().read_from_db(self.kv_store, id=event_id, limit=limit, reverse=reverse) def get_job_tasks(self, cluster_id, reverse=True, limit=0) -> List[JobSchedule]: - return JobSchedule().read_from_db(self.kv_store, id=cluster_id, reverse=reverse, limit=limit) + ret = JobSchedule().read_from_db(self.kv_store, id=cluster_id, reverse=reverse, limit=limit) + return sorted(ret, key=lambda x: x.date) + def get_task_by_id(self, task_id) -> JobSchedule: for task in self.get_job_tasks(" "): @@ -272,7 +275,7 @@ def get_snapshots_by_node_id(self, node_id) -> List[SnapShot]: for snap in snaps: if snap.lvol.node_id == node_id: ret.append(snap) - return ret + return sorted(ret, key=lambda x: x.create_dt) def get_snode_size(self, node_id) -> int: snode = self.get_storage_node_by_id(node_id) @@ -309,3 +312,10 @@ def get_qos(self, cluster_id=None) -> List[QOSClass]: else: classes = QOSClass().read_from_db(self.kv_store) return sorted(classes, key=lambda x: x.class_id) + + def get_lvol_del_lock(self, node_id) -> Optional[NodeLVolDelLock]: + ret = NodeLVolDelLock().read_from_db(self.kv_store, id=node_id) + if ret: + return ret[0] + else: + return None diff --git a/simplyblock_core/distr_controller.py 
b/simplyblock_core/distr_controller.py index e50115f62..420b9e3fe 100644 --- a/simplyblock_core/distr_controller.py +++ b/simplyblock_core/distr_controller.py @@ -2,6 +2,7 @@ import datetime import logging import re +import threading from simplyblock_core import utils from simplyblock_core.models.nvme_device import NVMeDevice @@ -26,6 +27,7 @@ def send_node_status_event(node, node_status, target_node=None): events = {"events": [node_status_event]} logger.debug(node_status_event) skipped_nodes = [] + connect_threads = [] if target_node: snodes = [target_node] else: @@ -45,10 +47,14 @@ def send_node_status_event(node, node_status, target_node=None): if node_found_same_host: continue logger.info(f"Sending to: {node.get_id()}") - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1) - ret = rpc_client.distr_status_events_update(events) - if not ret: - logger.warning("Failed to send event update") + t = threading.Thread( + target=_send_event_to_node, + args=(node, events,)) + connect_threads.append(t) + t.start() + + for t in connect_threads: + t.join() def send_dev_status_event(device, status, target_node=None): @@ -57,7 +63,7 @@ def send_dev_status_event(device, status, target_node=None): db_controller = DBController() storage_ID = device.cluster_device_order skipped_nodes = [] - + connect_threads = [] if target_node: snodes = [db_controller.get_storage_node_by_id(target_node.get_id())] else: @@ -67,7 +73,8 @@ def send_dev_status_event(device, status, target_node=None): skipped_nodes.append(node) for node in snodes: - if node.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if node.status in [StorageNode.STATUS_OFFLINE, StorageNode.STATUS_REMOVED]: + logger.info(f"skipping node: {node.get_id()} with status: {node.status}") continue node_found_same_host = False for n in skipped_nodes: @@ -95,10 +102,14 @@ def send_dev_status_event(device, status, 
target_node=None): "storage_ID": storage_ID, "status": dev_status}]} logger.debug(f"Sending event updates, device: {storage_ID}, status: {dev_status}, node: {node.get_id()}") - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1) - ret = rpc_client.distr_status_events_update(events) - if not ret: - logger.warning("Failed to send event update") + t = threading.Thread( + target=_send_event_to_node, + args=(node,events,)) + connect_threads.append(t) + t.start() + + for t in connect_threads: + t.join() def disconnect_device(device): @@ -192,12 +203,20 @@ def get_distr_cluster_map(snodes, target_node, distr_name=""): return cl_map -def parse_distr_cluster_map(map_string): +def parse_distr_cluster_map(map_string, nodes=None, devices=None): db_controller = DBController() node_pattern = re.compile(r".*uuid_node=(.*) status=(.*)$", re.IGNORECASE) device_pattern = re.compile( r".*storage_ID=(.*) status=(.*) uuid_device=(.*) storage_bdev_name=(.*)$", re.IGNORECASE) + if not nodes or not devices: + nodes = {} + devices = {} + for n in db_controller.get_storage_nodes(): + nodes[n.get_id()] = n + for dev in n.nvme_devices: + devices[dev.get_id()] = dev + results = [] passed = True for line in map_string.split('\n'): @@ -213,8 +232,7 @@ def parse_distr_cluster_map(map_string): "Results": "", } try: - nd = db_controller.get_storage_node_by_id(node_id) - node_status = nd.status + node_status = nodes[node_id].status if node_status == StorageNode.STATUS_SCHEDULABLE: node_status = StorageNode.STATUS_UNREACHABLE data["Desired Status"] = node_status @@ -238,7 +256,7 @@ def parse_distr_cluster_map(map_string): "Results": "", } try: - sd = db_controller.get_storage_device_by_id(device_id) + sd = devices[device_id] data["Desired Status"] = sd.status if sd.status == status: data["Results"] = "ok" @@ -252,38 +270,26 @@ def parse_distr_cluster_map(map_string): return results, passed -def send_cluster_map_to_node(node): +def 
send_cluster_map_to_node(node: StorageNode): db_controller = DBController() snodes = db_controller.get_storage_nodes_by_cluster_id(node.cluster_id) - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=10) - - # if node.lvstore_stack_secondary_1: - # for snode in db_controller.get_primary_storage_nodes_by_secondary_node_id(node.get_id()): - # for bdev in snode.lvstore_stack: - # if bdev['type'] == "bdev_distr": - # cluster_map_data = get_distr_cluster_map(snodes, node, bdev["name"]) - # ret = rpc_client.distr_send_cluster_map(cluster_map_data) - # if not ret: - # logger.error("Failed to send cluster map") - # return False - # return True - # else: cluster_map_data = get_distr_cluster_map(snodes, node) - ret = rpc_client.distr_send_cluster_map(cluster_map_data) - if not ret: + try: + node.rpc_client(timeout=10).distr_send_cluster_map(cluster_map_data) + except Exception: logger.error("Failed to send cluster map") logger.info(cluster_map_data) return False return True -def send_cluster_map_to_distr(node, distr_name): +def send_cluster_map_to_distr(node: StorageNode, distr_name: str): db_controller = DBController() snodes = db_controller.get_storage_nodes_by_cluster_id(node.cluster_id) - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=10) cluster_map_data = get_distr_cluster_map(snodes, node, distr_name) - ret = rpc_client.distr_send_cluster_map(cluster_map_data) - if not ret: + try: + node.rpc_client(timeout=10).distr_send_cluster_map(cluster_map_data) + except Exception: logger.error("Failed to send cluster map") logger.info(cluster_map_data) return False @@ -294,14 +300,13 @@ def send_cluster_map_add_node(snode, target_node): if target_node.status != StorageNode.STATUS_ONLINE: return False logger.info(f"Sending to: {target_node.get_id()}") - rpc_client = RPCClient(target_node.mgmt_ip, target_node.rpc_port, target_node.rpc_username, target_node.rpc_password, timeout=5) - 
cluster_map_data = get_distr_cluster_map([snode], target_node) cl_map = { "map_cluster": cluster_map_data['map_cluster'], "map_prob": cluster_map_data['map_prob']} - ret = rpc_client.distr_add_nodes(cl_map) - if not ret: + try: + target_node.rpc_client(timeout=10).distr_add_nodes(cl_map) + except Exception: logger.error("Failed to send cluster map") return False return True @@ -353,10 +358,20 @@ def send_cluster_map_add_device(device: NVMeDevice, target_node: StorageNode): "bdev_name": name, "status": device.status, "weight": dev_w_gib, + "physical_label": device.physical_label if device.physical_label > 0 else -1, }} } - ret = rpc_client.distr_add_devices(cl_map) - if not ret: + try: + rpc_client.distr_add_devices(cl_map) + except Exception: logger.error("Failed to send cluster map") return False return True + + +def _send_event_to_node(node, events): + try: + node.rpc_client(timeout=1, retry=0).distr_status_events_update(events) + except Exception as e: + logger.warning("Failed to send event update") + logger.error(e) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index f3e377ee4..4c8c24d06 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.23 +SIMPLY_BLOCK_VERSION=19.2.24 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-sfam-2359 +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:transfer-feature-latest diff --git a/simplyblock_core/fw_api_client.py b/simplyblock_core/fw_api_client.py index d17255c80..8f089ce5c 100644 --- a/simplyblock_core/fw_api_client.py +++ b/simplyblock_core/fw_api_client.py @@ -6,6 +6,7 @@ from requests.adapters import HTTPAdapter from urllib3 import Retry + logger = logging.getLogger() @@ -18,7 +19,7 @@ class FirewallClient: def __init__(self, 
node, timeout=300, retry=5): self.node = node - self.ip_address = f"{node.mgmt_ip}:5001" + self.ip_address = f"{node.mgmt_ip}:{node.firewall_port}" self.url = 'http://%s/' % self.ip_address self.timeout = timeout self.session = requests.session() @@ -41,7 +42,7 @@ def _request(self, method, path, payload=None): response = self.session.request(method, self.url+path, data=data, timeout=self.timeout, params=params) except Exception as e: - raise e + raise FirewallClientException(str(e)) logger.debug("Response: status_code: %s, content: %s", response.status_code, response.content) diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py index 84375d819..6d752a86c 100644 --- a/simplyblock_core/mgmt_node_ops.py +++ b/simplyblock_core/mgmt_node_ops.py @@ -106,18 +106,13 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo logger.info(f"Node IP: {dev_ip}") - hostname = utils.get_node_name_by_ip(dev_ip) - utils.label_node_as_mgmt_plane(hostname) db_connection = cluster_data['db_connection'] db_controller = DBController() nodes = db_controller.get_mgmt_nodes() if not nodes: logger.error("No mgmt nodes was found in the cluster!") return False - for node in nodes: - if node.hostname == hostname: - logger.error("Node already exists in the cluster") - return False + logger.info("Adding management node object") node_id = add_mgmt_node(dev_ip, mode, cluster_id) @@ -225,10 +220,9 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo def add_mgmt_node(mgmt_ip, mode, cluster_id=None): db_controller = DBController() + hostname = "" if mode == "docker": hostname = utils.get_hostname() - elif mode == "kubernetes": - hostname = utils.get_node_name_by_ip(mgmt_ip) try: node = db_controller.get_mgmt_node_by_hostname(hostname) if node: diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index fd4802771..bcf111f1b 100644 --- a/simplyblock_core/models/cluster.py +++ 
b/simplyblock_core/models/cluster.py @@ -45,7 +45,7 @@ class Cluster(BaseModel): distr_npcs: int = 0 enable_node_affinity: bool = False grafana_endpoint: str = "" - mode: str = "" + mode: str = "docker" grafana_secret: str = "" contact_point: str = "" ha_type: str = "single" @@ -63,12 +63,18 @@ class Cluster(BaseModel): fabric_rdma: bool = False client_qpair_count: int = 3 secret: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" disable_monitoring: bool = False strict_node_anti_affinity: bool = False tls: bool = False is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + snapshot_replication_target_cluster: str = "" + snapshot_replication_target_pool: str = "" + snapshot_replication_timeout: int = 60*10 def get_status_code(self): if self.status in self.STATUS_CODE_MAP: diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index 3d87a9aca..3a20b3499 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -22,6 +22,8 @@ class JobSchedule(BaseModel): FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem" FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" FN_JC_COMP_RESUME = "jc_comp_resume" + FN_SNAPSHOT_REPLICATION = "snapshot_replication" + FN_LVOL_SYNC_DEL = "lvol_sync_del" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py index f84091473..a67032c53 100644 --- a/simplyblock_core/models/lvol_model.py +++ b/simplyblock_core/models/lvol_model.py @@ -66,6 +66,9 @@ class LVol(BaseModel): fabric: str = "tcp" ndcs: int = 0 npcs: int = 0 + do_replicate: bool = False + replication_node_id: str = "" + from_source: bool = True def has_qos(self): return (self.rw_ios_per_sec > 0 or self.rw_mbytes_per_sec > 0 or self.r_mbytes_per_sec > 0 or self.w_mbytes_per_sec > 0) diff --git a/simplyblock_core/models/nvme_device.py 
b/simplyblock_core/models/nvme_device.py index b86e25c44..82749e30a 100644 --- a/simplyblock_core/models/nvme_device.py +++ b/simplyblock_core/models/nvme_device.py @@ -47,25 +47,39 @@ class NVMeDevice(BaseModel): nvmf_nqn: str = "" nvmf_port: int = 0 nvmf_multipath: bool = False - overload_percentage: int = 0 # Unused - partition_jm_bdev: str = "" # Unused - partition_jm_size: int = 0 # Unused - partition_main_bdev: str = "" # Unused - partition_main_size: int = 0 # Unused - partitions_count: int = 0 # Unused pcie_address: str = "" physical_label: int = 0 pt_bdev: str = "" qos_bdev: str = "" remote_bdev: str = "" retries_exhausted: bool = False - sequential_number: int = 0 # Unused serial_number: str = "" size: int = -1 testing_bdev: str = "" connecting_from_node: str = "" previous_status: str = "" + def __change_dev_connection_to(self, connecting_from_node): + from simplyblock_core.db_controller import DBController + db = DBController() + for n in db.get_storage_nodes(): + if n.nvme_devices: + for d in n.nvme_devices: + if d.get_id() == self.get_id(): + d.connecting_from_node = connecting_from_node + n.write_to_db() + break + + def lock_device_connection(self, node_id): + self.__change_dev_connection_to(node_id) + + def release_device_connection(self): + self.__change_dev_connection_to("") + + def is_connection_in_progress_to_node(self, node_id): + if self.connecting_from_node and self.connecting_from_node == node_id: + return True + class JMDevice(NVMeDevice): @@ -73,3 +87,18 @@ class JMDevice(NVMeDevice): jm_bdev: str = "" jm_nvme_bdev_list: List[str] = [] raid_bdev: str = "" + + +class RemoteDevice(BaseModel): + + remote_bdev: str = "" + alceml_name: str = "" + node_id: str = "" + size: int = -1 + nvmf_multipath: bool = False + + +class RemoteJMDevice(RemoteDevice): + + jm_bdev: str = "" + diff --git a/simplyblock_core/models/pool.py b/simplyblock_core/models/pool.py index 27b2a23e5..683eafe1e 100644 --- a/simplyblock_core/models/pool.py +++ 
b/simplyblock_core/models/pool.py @@ -29,6 +29,12 @@ class Pool(BaseModel): secret: str = "" # unused users: List[str] = [] qos_host: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" + lvols_cr_name: str = "" + lvols_cr_namespace: str = "" + lvols_cr_plural: str = "" def has_qos(self): diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index 1da571ec8..ab91a0087 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -9,6 +9,7 @@ class SnapShot(BaseModel): STATUS_ONLINE = 'online' STATUS_OFFLINE = 'offline' STATUS_IN_DELETION = 'in_deletion' + STATUS_IN_REPLICATION = 'in_replication' base_bdev: str = "" blobid: int = 0 @@ -29,3 +30,8 @@ class SnapShot(BaseModel): deletion_status: str = "" status: str = "" fabric: str = "tcp" + target_replicated_snap_uuid: str = "" + source_replicated_snap_uuid: str = "" + next_snap_uuid: str = "" + prev_snap_uuid: str = "" + instances: list = [] \ No newline at end of file diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 8c76d3649..4dd24b9e6 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -1,13 +1,14 @@ # coding=utf-8 - +import time from typing import List from uuid import uuid4 from simplyblock_core import utils -from simplyblock_core.models.base_model import BaseNodeObject +from simplyblock_core.models.base_model import BaseNodeObject, BaseModel from simplyblock_core.models.hublvol import HubLVol from simplyblock_core.models.iface import IFace -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice, RemoteJMDevice from simplyblock_core.rpc_client import RPCClient, RPCException logger = utils.get_logger(__name__) @@ -79,8 +80,8 @@ class StorageNode(BaseNodeObject): 
pollers_mask: str = "" primary_ip: str = "" raid: str = "" - remote_devices: List[NVMeDevice] = [] - remote_jm_devices: List[JMDevice] = [] + remote_devices: List[RemoteDevice] = [] + remote_jm_devices: List[RemoteJMDevice] = [] rpc_password: str = "" rpc_port: int = -1 rpc_username: str = "" @@ -97,12 +98,17 @@ class StorageNode(BaseNodeObject): subsystem: str = "" system_uuid: str = "" lvstore_status: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" nvmf_port: int = 4420 physical_label: int = 0 hublvol: HubLVol = None # type: ignore[assignment] active_tcp: bool = True active_rdma: bool = False - lvol_sync_del_queue: List[str] = [] + socket: int = 0 + firewall_port: int = 5001 + lvol_poller_mask: str = "" def rpc_client(self, **kwargs): """Return rpc client to this node @@ -303,3 +309,70 @@ def create_alceml(self, name, nvme_bdev, uuid, **kwargs): alceml_worker_cpu_mask=alceml_worker_cpu_mask, **kwargs, ) + + def wait_for_jm_rep_tasks_to_finish(self, jm_vuid): + if not self.rpc_client().bdev_lvol_get_lvstores(self.lvstore): + return True # no lvstore means no need to wait + retry = 10 + while retry > 0: + try: + jm_replication_tasks = False + ret = self.rpc_client().jc_get_jm_status(jm_vuid) + for jm in ret: + if ret[jm] is False: # jm is not ready (has active replication task) + jm_replication_tasks = True + break + if jm_replication_tasks: + logger.warning(f"Replication task found on node: {self.get_id()}, jm_vuid: {jm_vuid}, retry...") + retry -= 1 + time.sleep(20) + else: + return True + except Exception: + logger.warning("Failed to get replication task!") + return False + + def lvol_sync_del(self) -> bool: + from simplyblock_core.db_controller import DBController + db_controller = DBController() + lock = db_controller.get_lvol_del_lock(self.get_id()) + if lock: + return True + return False + + def lvol_del_sync_lock(self) -> bool: + from simplyblock_core.db_controller import DBController + db_controller = DBController() + lock = 
db_controller.get_lvol_del_lock(self.get_id()) + if not lock: + lock = NodeLVolDelLock({"uuid": self.uuid}) + lock.write_to_db() + logger.info(f"Created lvol_del_sync_lock on node: {self.get_id()}") + return True + + def lvol_del_sync_lock_reset(self) -> bool: + from simplyblock_core.db_controller import DBController + db_controller = DBController() + task_found = False + tasks = db_controller.get_job_tasks(self.cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == self.secondary_node_id: + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + task_found = True + break + + lock = db_controller.get_lvol_del_lock(self.get_id()) + if task_found: + if not lock: + lock = NodeLVolDelLock({"uuid": self.uuid}) + lock.write_to_db() + logger.info(f"Created lvol_del_sync_lock on node: {self.get_id()}") + else: + if lock: + lock.remove(db_controller.kv_store) + logger.info(f"remove lvol_del_sync_lock from node: {self.get_id()}") + return True + + +class NodeLVolDelLock(BaseModel): + pass \ No newline at end of file diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py new file mode 100644 index 000000000..833d42b36 --- /dev/null +++ b/simplyblock_core/prom_client.py @@ -0,0 +1,130 @@ +import logging +import re +from datetime import datetime, timedelta + +from simplyblock_core import constants +from simplyblock_core.db_controller import DBController +from simplyblock_core.models.mgmt_node import MgmtNode + +from prometheus_api_client import PrometheusConnect + +logger = logging.getLogger() + + +class PromClientException(Exception): + def __init__(self, message): + self.message = message + + +class PromClient: + + def __init__(self, cluster_id): + db_controller = DBController() + cluster_ip = None + cluster = db_controller.get_cluster_by_id(cluster_id) + if cluster.mode == "docker": + for node in db_controller.get_mgmt_nodes(): + if node.cluster_id == cluster_id and node.status == 
MgmtNode.STATUS_ONLINE: + cluster_ip = node.mgmt_ip + break + if cluster_ip is None: + raise PromClientException("Cluster has no online mgmt nodes") + else: + cluster_ip = constants.PROMETHEUS_STATEFULSET_NAME + self.ip_address = f"{cluster_ip}:9090" + self.url = 'http://%s/' % self.ip_address + self.client = PrometheusConnect(url=self.url, disable_ssl=True) + + def parse_history_param(self, history_string): + if not history_string: + logger.error("Invalid history value") + return False + + # process history + results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', history_string.lower()) + if not results: + logger.error(f"Error parsing history string: {history_string}") + logger.info("History format: xxdyyh , e.g: 1d12h, 1d, 2h, 1m") + return False + + history_in_days = 0 + history_in_hours = 0 + history_in_minutes = 0 + for s in results.groups(): + if not s: + continue + ind = s[-1] + v = int(s[:-1]) + if ind == 'd': + history_in_days = v + if ind == 'h': + history_in_hours = v + if ind == 'm': + history_in_minutes = v + + history_in_hours += int(history_in_minutes/60) + history_in_minutes = history_in_minutes % 60 + history_in_days += int(history_in_hours/24) + history_in_hours = history_in_hours % 24 + return history_in_days, history_in_hours, history_in_minutes + + def get_metrics(self, key_prefix, metrics_lst, params, history=None): + start_time = datetime.now() - timedelta(minutes=10) + if history: + try: + days,hours,minutes = self.parse_history_param(history) + start_time = datetime.now() - timedelta(days=days, hours=hours, minutes=minutes) + except Exception: + raise PromClientException(f"Error parsing history string: {history}") + end_time = datetime.now() + data_out: list[dict] = [] + for key in metrics_lst: + metrics = self.client.get_metric_range_data( + f"{key_prefix}_{key}", label_config=params, start_time=start_time, end_time=end_time) + for m in metrics: + mt_name = key + mt_values = m["values"] + for i, v in enumerate(mt_values): + value = v[1] + try: + 
value = int(value) + except Exception: + pass + if len(data_out) <= i: + data_out.append({mt_name: value}) + else: + d = data_out[i] + if mt_name not in d: + d[mt_name] = value + + return data_out + + def get_cluster_metrics(self, cluster_uuid, metrics_lst, history=None): + params = { + "cluster": cluster_uuid + } + return self.get_metrics("cluster", metrics_lst, params, history) + + def get_node_metrics(self, snode_uuid, metrics_lst, history=None): + params = { + "snode": snode_uuid + } + return self.get_metrics("snode", metrics_lst, params, history) + + def get_device_metrics(self, device_uuid, metrics_lst, history=None): + params = { + "device": device_uuid + } + return self.get_metrics("device", metrics_lst, params, history) + + def get_lvol_metrics(self, lvol_uuid, metrics_lst, history=None): + params = { + "lvol": lvol_uuid + } + return self.get_metrics("lvol", metrics_lst, params, history) + + def get_pool_metrics(self, pool_uuid, metrics_lst, history=None): + params = { + "pool": pool_uuid + } + return self.get_metrics("pool", metrics_lst, params, history) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 62f37b1e9..abfd5a216 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -109,11 +109,10 @@ def _request2(self, method, params=None): if params: payload['params'] = params try: - logger.debug("Requesting method: %s, params: %s", method, params) + logger.debug("From: %s, Requesting method: %s, params: %s", self.ip_address, method, params) response = self.session.post(self.url, data=json.dumps(payload), timeout=self.timeout) - except Exception as e: - logger.error(e) - return False, str(e) + except Exception: + raise RPCException("connection error") ret_code = response.status_code ret_content = response.content @@ -306,7 +305,7 @@ def ultra21_alloc_ns_init(self, pci_addr): } return self._request2("ultra21_alloc_ns_init", params) - def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, 
nguid=None, nsid=None): + def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None, eui64=None): params = { "nqn": nqn, "namespace": { @@ -323,6 +322,11 @@ def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None) if nsid: params['namespace']['nsid'] = nsid + if eui64: + params['namespace']['eui64'] = eui64 + params['namespace']['ptpl_file'] = "/mnt/ns_resv"+eui64+".json" + + return self._request("nvmf_subsystem_add_ns", params) def nvmf_subsystem_remove_ns(self, nqn, nsid): @@ -379,11 +383,11 @@ def create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0 "clear_method": "unmap", "lvol_priority_class": lvol_priority_class, } - # if ndcs or npcs: - # params.update({ - # 'ndcs' : ndcs, - # 'npcs' : npcs, - # }) + if ndcs or npcs: + params.update({ + 'ndcs' : ndcs, + 'npcs' : npcs, + }) return self._request("bdev_lvol_create", params) def delete_lvol(self, name, del_async=False): @@ -581,7 +585,7 @@ def get_lvol_stats(self, uuid=""): params["uuid"] = uuid return self._request("bdev_get_iostat", params) - def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4): + def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4, superblock=False): try: ret = self.get_bdevs(name) if ret: @@ -593,7 +597,8 @@ def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4): "raid_level": raid_level, "strip_size_kb": strip_size_kb, "base_bdevs": bdevs_list, - "io_unmap_limit": 100 + "io_unmap_limit": 100, + "superblock": superblock } if raid_level == "1": params["strip_size_kb"] = 0 @@ -886,6 +891,12 @@ def nbd_stop_disk(self, nbd_device): } return self._request("nbd_stop_disk", params) + def nbd_get_disks(self, nbd_device): + params = { + "nbd_device": nbd_device + } + return self._request("nbd_get_disks", params) + def bdev_jm_unmap_vuid(self, name, vuid): params = {"name": name, "vuid": vuid} return self._request("bdev_jm_unmap_vuid", params) @@ -922,7 +933,7 
@@ def distr_migration_status(self, name): params = {"name": name} return self._request("distr_migration_status", params) - def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS): params = { "name": name, "storage_ID": storage_ID, @@ -935,7 +946,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals params["jobs"] = jobs return self._request("distr_migration_failure_start", params) - def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS): params = { "name": name, } @@ -954,10 +965,9 @@ def bdev_raid_add_base_bdev(self, raid_bdev, base_bdev): } return self._request("bdev_raid_add_base_bdev", params) - def bdev_raid_remove_base_bdev(self, raid_bdev, base_bdev): + def bdev_raid_remove_base_bdev(self, base_bdev): params = { - "raid_bdev": raid_bdev, - "base_bdev": base_bdev, + "name": base_bdev, } return self._request("bdev_raid_remove_base_bdev", params) @@ -1142,7 +1152,7 @@ def jc_suspend_compression(self, jm_vuid, suspend=False): "jm_vuid": jm_vuid, "suspend": suspend, } - return self._request("jc_suspend_compression", params) + return self._request2("jc_suspend_compression", params) def nvmf_subsystem_add_listener(self, nqn, trtype, traddr, trsvcid, ana_state=None): params = { @@ -1182,6 +1192,51 @@ def bdev_distrib_check_inflight_io(self, jm_vuid): } return self._request("bdev_distrib_check_inflight_io", params) + def bdev_lvol_create_poller_group(self, cpu_mask): + params = { + "cpu_mask": cpu_mask, + } + return self._request("bdev_lvol_create_poller_group", params) + + def bdev_lvol_transfer(self, lvol_name, offset, cluster_batch, gateway, 
operation): + # --operation {migrate,replicate} + params = { + "lvol_name": lvol_name, + "offset": offset, + "cluster_batch": cluster_batch, + "gateway": gateway, + "operation": operation, + } + return self._request("bdev_lvol_transfer", params) + + def bdev_lvol_transfer_stat(self, lvol_name): + """ + example: + ./rpc.py bdev_lvol_transfer_stat lvs_raid0_lvol/snapshot_1 + { + "transfer_state": "No process", + "offset": 0 + } + transfer_state values: + - No process + - In progress + - Failed + - Done + """ + params = { + "lvol_name": lvol_name, + } + return self._request("bdev_lvol_transfer_stat", params) + + def bdev_lvol_convert(self, lvol_name): + """ + convert lvol to snapshot + """ + params = { + "lvol_name": lvol_name, + } + return self._request("bdev_lvol_convert", params) + def bdev_lvol_remove_from_group(self, group_id, lvol_name_list): params = { "bdev_group_id": group_id , @@ -1229,3 +1284,16 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") + + def bdev_lvol_add_clone(self, lvol_name, child_name): + params = { + "lvol_name": lvol_name, + "child_name": child_name, + } + return self._request("bdev_lvol_add_clone", params) + + def bdev_raid_get_bdevs(self): + params = { + "category": "online" + } + return self._request("bdev_raid_get_bdevs", params) diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 9d1b62643..671f39cfa 100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -17,20 +17,14 @@ dependencies: version: 1.4.0 repository: https://mongodb.github.io/helm-charts alias: mongodb - condition: monitoring.enabled + condition: observability.enabled - name: opensearch version: 2.9.0 repository: https://opensearch-project.github.io/helm-charts - condition: monitoring.enabled + condition: observability.enabled - name: prometheus version: "25.18.0" repository: 
"https://prometheus-community.github.io/helm-charts" - condition: monitoring.enabled - - name: openebs - version: 3.9.0 - repository: https://openebs.github.io/charts - alias: openebs - condition: openebs.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml new file mode 100644 index 000000000..272030736 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml @@ -0,0 +1,135 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockdevices.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockDevice + listKind: SimplyBlockDeviceList + plural: simplyblockdevices + singular: simplyblockdevice + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockDevice is the Schema for the simplyblockdevices API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockDevice + properties: + action: + enum: + - remove + - restart + type: string + clusterName: + type: string + deviceID: + type: string + nodeUUID: + type: string + required: + - clusterName + type: object + status: + description: status defines the observed state of SimplyBlockDevice + properties: + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + triggered: + type: boolean + updatedAt: + format: date-time + type: string + type: object + nodes: + items: + properties: + devices: + items: + properties: + health: + type: string + model: + type: string + size: + type: string + stats: + items: + properties: + capacityUtil: + format: int64 + type: integer + riops: + format: int64 + type: integer + rtp: + format: int64 + type: integer + wiops: + format: int64 + type: integer + wtp: + format: int64 + type: integer + type: object + type: array + status: + type: string + utilization: + format: int64 + type: integer + uuid: + type: string + type: object + type: array + nodeUUID: + type: string + type: object + type: array + type: object + required: + - spec + type: object + x-kubernetes-validations: + - message: nodeUUID and deviceID are required when action is specified + rule: '!(has(self.spec.action) && self.spec.action != "" && ((!has(self.spec.nodeUUID) + || self.spec.nodeUUID == "") || (!has(self.spec.deviceID) || self.spec.deviceID + == "")))' + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml new file mode 100644 index 
000000000..8e44a687d --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml @@ -0,0 +1,144 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblocklvols.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockLvol + listKind: SimplyBlockLvolList + plural: simplyblocklvols + singular: simplyblocklvol + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.lvols.length() + name: LVOLs + type: integer + name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockLvol is the Schema for the simplyblocklvols API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockLvol + properties: + clusterName: + type: string + poolName: + type: string + required: + - clusterName + - poolName + type: object + status: + description: status defines the observed state of SimplyBlockLvol + properties: + configured: + type: boolean + lvols: + items: + properties: + blobID: + format: int64 + type: integer + clonedFromSnap: + type: string + createDt: + format: date-time + type: string + fabric: + type: string + ha: + type: boolean + health: + type: boolean + hostname: + type: string + isCrypto: + type: boolean + lvolName: + type: string + maxNamespacesPerSubsystem: + format: int64 + type: integer + namespaceID: + format: int64 + type: integer + nodeUUID: + items: + type: string + type: array + nqn: + type: string + poolName: + type: string + poolUUID: + type: string + pvcName: + type: string + qosClass: + format: int64 + type: integer + qosIOPS: + format: int64 + type: integer + qosRTP: + format: int64 + type: integer + qosRWTP: + format: int64 + type: integer + qosWTP: + format: int64 + type: integer + size: + type: string + snapName: + type: string + status: + type: string + stripeWdata: + format: int64 + type: integer + stripeWparity: + format: int64 + type: integer + subsysPort: + format: int64 + type: integer + updateDt: + format: date-time + type: string + uuid: + type: string + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml new file mode 100644 index 000000000..693322dc3 --- /dev/null +++ 
b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml @@ -0,0 +1,96 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockpools.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockPool + listKind: SimplyBlockPoolList + plural: simplyblockpools + singular: simplyblockpool + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockPool is the Schema for the pools API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of Pool + properties: + action: + type: string + capacityLimit: + type: string + clusterName: + type: string + name: + type: string + qosIOPSLimit: + format: int32 + type: integer + rLimit: + format: int32 + type: integer + rwLimit: + format: int32 + type: integer + status: + type: string + wLimit: + format: int32 + type: integer + required: + - clusterName + - name + type: object + status: + description: status defines the observed state of Pool + properties: + qosHost: + type: string + qosIOPSLimit: + format: int32 + type: integer + rLimit: + format: int32 + type: integer + rwLimit: + format: int32 + type: integer + status: + type: string + uuid: + type: string + wLimit: + format: int32 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml new file mode 100644 index 000000000..8eebd8370 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml @@ -0,0 +1,154 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblocksnapshotreplications.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockSnapshotReplication + listKind: SimplyBlockSnapshotReplicationList + plural: simplyblocksnapshotreplications + singular: simplyblocksnapshotreplication + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockSnapshotReplication 
is the Schema for the simplyblocksnapshotreplications + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockSnapshotReplication + properties: + action: + enum: + - failback + type: string + excludeVolumeIDs: + description: 'Optional: volumes to exclude from failback.' + items: + type: string + type: array + includeVolumeIDs: + description: |- + Optional: only these volumes are included in failback. + If empty, all volumes are candidates unless excluded below. + items: + type: string + type: array + interval: + description: 'snapshot replication interval in seconds (default: 300sec)' + format: int32 + type: integer + sourceCluster: + description: Source cluster for the snapshots + type: string + targetCluster: + description: Target cluster for replication + type: string + targetPool: + description: Target cluster pool for replication + type: string + timeout: + description: snapshot replication timeout + format: int32 + type: integer + volumeIDs: + description: 'Optional: list of volumes to replicate. 
Empty means + all volumes' + items: + type: string + type: array + required: + - sourceCluster + - targetCluster + - targetPool + type: object + status: + description: status defines the observed state of SimplyBlockSnapshotReplication + properties: + configured: + type: boolean + observedFailbackGeneration: + description: The metadata.generation value for which failback was + last processed. + format: int64 + type: integer + volumes: + description: Per-volume replication status + items: + description: VolumeReplicationStatus tracks the replication state + of an individual volume + properties: + errors: + description: 'Optional: list of errors encountered for this + volume' + items: + description: ReplicationError stores timestamped error messages + properties: + message: + type: string + timestamp: + format: date-time + type: string + required: + - message + - timestamp + type: object + type: array + lastReplicationTime: + description: Timestamp of the last successful replication for + this volume + format: date-time + type: string + lastSnapshotID: + description: Last snapshot ID replicated for this volume + type: string + phase: + description: Current phase for this volume + enum: + - Pending + - Running + - Completed + - Failed + - Paused + type: string + replicatedCount: + description: Number of snapshots successfully replicated + format: int32 + type: integer + volumeID: + description: Volume ID + type: string + required: + - volumeID + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml new file mode 100644 index 000000000..cfd99fdee --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml @@ -0,0 +1,173 @@ +--- 
+apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockstorageclusters.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockStorageCluster + listKind: SimplyBlockStorageClusterList + plural: simplyblockstorageclusters + singular: simplyblockstoragecluster + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockStorageCluster is the Schema for the simplyblockstorageclusters + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockStorageCluster + properties: + action: + enum: + - activate + - expand + type: string + blkSize: + format: int32 + type: integer + capCrit: + format: int32 + type: integer + capWarn: + format: int32 + type: integer + clientQpairCount: + format: int32 + type: integer + clusterName: + type: string + distrBs: + format: int32 + type: integer + distrChunkBs: + format: int32 + type: integer + enableNodeAffinity: + type: boolean + eventLogEntries: + format: int32 + type: integer + fabric: + type: string + haType: + type: string + includeEventLog: + type: boolean + inflightIOThreshold: + format: int32 + type: integer + isSingleNode: + type: boolean + maxQueueSize: + format: int32 + type: integer + mgmtIfc: + description: Create-only + type: string + pageSizeInBlocks: + format: int32 + type: integer + provCapCrit: + format: int32 + type: integer + provCapWarn: + format: int32 + type: integer + qosClasses: + description: Updatable + type: string + qpairCount: + format: int32 + type: integer + strictNodeAntiAffinity: + type: boolean + stripeWdata: + format: int32 + type: integer + stripeWparity: + format: int32 + type: integer + required: + - clusterName + type: object + status: + description: status defines the observed state of SimplyBlockStorageCluster + properties: + MOD: + type: string + NQN: + type: string + UUID: + type: string + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + triggered: + type: boolean + updatedAt: + format: date-time + type: string + type: object + clusterName: + type: string + configured: + type: boolean + created: + format: date-time + type: string + lastUpdated: + format: date-time + type: string 
+ mgmtNodes: + format: int32 + type: integer + rebalancing: + type: boolean + secretName: + type: string + status: + type: string + storageNodes: + format: int32 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml new file mode 100644 index 000000000..1e6af7724 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml @@ -0,0 +1,204 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockstoragenodes.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockStorageNode + listKind: SimplyBlockStorageNodeList + plural: simplyblockstoragenodes + singular: simplyblockstoragenode + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockStorageNode is the Schema for the storagenodes API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of StorageNode + properties: + action: + enum: + - shutdown + - restart + - suspend + - resume + - remove + type: string + addPcieToAllowList: + description: restart params + items: + type: string + type: array + clusterImage: + type: string + clusterName: + type: string + coreIsolation: + type: boolean + coreMask: + type: string + corePercentage: + format: int32 + type: integer + dataNIC: + items: + type: string + type: array + driveSizeRange: + type: string + force: + type: boolean + haJM: + type: boolean + haJmCount: + format: int32 + type: integer + idDeviceByNQN: + type: boolean + jmPercent: + format: int32 + type: integer + maxLVol: + format: int32 + type: integer + maxSize: + type: string + mgmtIfc: + type: string + nodeAddr: + type: string + nodeUUID: + description: NodeUUID is required when action is specified + type: string + nodesPerSocket: + format: int32 + type: integer + openShiftCluster: + type: boolean + partitions: + format: int32 + type: integer + pcieAllowList: + items: + type: string + type: array + pcieDenyList: + items: + type: string + type: array + pcieModel: + type: string + socketsToUse: + format: int32 + type: integer + spdkDebug: + type: boolean + spdkImage: + type: string + useSeparateJournalDevice: + type: boolean + workerNode: + type: string + workerNodes: + items: + type: string + type: array + required: + - clusterName + type: object + status: + description: status defines the observed state of StorageNode + properties: + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + updatedAt: + format: date-time + type: string + type: object + nodes: + items: + properties: + cpu: + format: int32 + type: 
integer + devices: + type: string + health: + type: boolean + hostname: + type: string + lvol_port: + format: int32 + type: integer + memory: + type: string + mgmtIp: + type: string + nvmf_port: + format: int32 + type: integer + rpc_port: + format: int32 + type: integer + status: + type: string + uptime: + type: string + uuid: + type: string + volumes: + format: int32 + type: integer + type: object + type: array + type: object + required: + - spec + type: object + x-kubernetes-validations: + - message: nodeUUID is required when action is specified + rule: '!(has(self.spec.action) && self.spec.action != "" && (!has(self.spec.nodeUUID) + || self.spec.nodeUUID == ""))' + - message: clusterImage, maxLVol, and workerNodes are required when action + is not specified + rule: (has(self.spec.action) && self.spec.action != "") || (has(self.spec.clusterImage) + && self.spec.clusterImage != "" && has(self.spec.maxLVol) && has(self.spec.workerNodes) + && size(self.spec.workerNodes) > 0) + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml new file mode 100644 index 000000000..2d25e21e1 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblocktasks.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockTask + listKind: SimplyBlockTaskList + plural: simplyblocktasks + singular: simplyblocktask + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockTask is the Schema for the simplyblocktasks API + properties: + apiVersion: + description: |- + APIVersion defines the 
versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockTask + properties: + clusterName: + type: string + subtasks: + type: boolean + taskID: + type: string + required: + - clusterName + type: object + status: + description: status defines the observed state of SimplyBlockTask + properties: + tasks: + items: + properties: + canceled: + type: boolean + parentTask: + type: string + retried: + format: int32 + type: integer + startedAt: + format: date-time + type: string + taskResult: + type: string + taskStatus: + type: string + taskType: + type: string + uuid: + type: string + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/templates/_helpers.tpl b/simplyblock_core/scripts/charts/templates/_helpers.tpl new file mode 100644 index 000000000..710260fdc --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/_helpers.tpl @@ -0,0 +1,21 @@ +{{- define "simplyblock.commonContainer" }} +env: + - name: SIMPLYBLOCK_LOG_LEVEL + valueFrom: + configMapKeyRef: + name: simplyblock-config + key: LOG_LEVEL + +volumeMounts: + - name: fdb-cluster-file + mountPath: /etc/foundationdb/fdb.cluster + subPath: fdb.cluster + +resources: + requests: + cpu: "50m" + memory: "100Mi" + limits: + cpu: 
"300m" + memory: "1Gi" +{{- end }} diff --git a/simplyblock_core/scripts/charts/templates/app_configmap.yaml b/simplyblock_core/scripts/charts/templates/app_configmap.yaml index de0a4da08..a4d1d57dd 100644 --- a/simplyblock_core/scripts/charts/templates/app_configmap.yaml +++ b/simplyblock_core/scripts/charts/templates/app_configmap.yaml @@ -6,8 +6,8 @@ metadata: namespace: {{ .Release.Namespace }} data: - LOG_LEVEL: {{ .Values.log.level }} - LOG_DELETION_INTERVAL: {{ .Values.log.deletionInterval }} + LOG_LEVEL: {{ .Values.observability.level }} + LOG_DELETION_INTERVAL: {{ .Values.observability.deletionInterval }} --- @@ -29,6 +29,7 @@ data: Path /var/log/containers/*.log Parser docker Tag kube.* + Exclude_Path /var/log/containers/*fluent-bit*.log Refresh_Interval 5 Mem_Buf_Limit 5MB Skip_Long_Lines On @@ -69,9 +70,11 @@ data: filter.lua: | function filter_tagged_pods(tag, timestamp, record) - annotations = record["kubernetes"]["annotations"] - if annotations ~= nil and annotations["log-collector/enabled"] == "true" then - return 1, record + if record["kubernetes"] ~= nil then + local annotations = record["kubernetes"]["annotations"] + if annotations ~= nil and annotations["log-collector/enabled"] == "true" then + return 1, record + end end return -1, record end diff --git a/simplyblock_core/scripts/charts/templates/app_ingress.yaml b/simplyblock_core/scripts/charts/templates/app_ingress.yaml index 67e7b0912..b49b0c396 100644 --- a/simplyblock_core/scripts/charts/templates/app_ingress.yaml +++ b/simplyblock_core/scripts/charts/templates/app_ingress.yaml @@ -1,4 +1,5 @@ -{{- if (not .Values.ingress.useDNS) }} +{{- if .Values.ingress.enabled }} + {{- if not .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -17,7 +18,7 @@ spec: port: number: 5000 --- -{{- else if .Values.ingress.useDNS }} + {{- else if .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -45,4 +46,5 @@ spec: name: 
simplyblock-webappapi port: number: 5000 + {{- end }} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index ec2e5b378..82f1d4f2c 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -5,7 +5,7 @@ metadata: name: simplyblock-admin-control namespace: {{ .Release.Namespace }} spec: - replicas: 1 + replicas: 2 selector: matchLabels: app: simplyblock-admin-control @@ -18,7 +18,16 @@ spec: labels: app: simplyblock-admin-control spec: - serviceAccountName: simplyblock-control-sa + serviceAccountName: simplyblock-sa + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: simplyblock-admin-control + topologyKey: kubernetes.io/hostname containers: - name: simplyblock-control image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -31,11 +40,13 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +{{- if .Values.observability.enabled }} - name: MONITORING_SECRET valueFrom: secretKeyRef: name: simplyblock-grafana-secrets key: MONITORING_SECRET +{{- end }} - name: SIMPLYBLOCK_LOG_LEVEL valueFrom: configMapKeyRef: @@ -61,11 +72,12 @@ spec: path: fdb.cluster --- apiVersion: apps/v1 -kind: DaemonSet +kind: Deployment metadata: name: simplyblock-webappapi namespace: {{ .Release.Namespace }} spec: + replicas: 2 selector: matchLabels: app: simplyblock-webappapi @@ -77,7 +89,15 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-webappapi - spec: + spec: + serviceAccountName: simplyblock-sa + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: simplyblock-admin-control + topologyKey: kubernetes.io/hostname containers: - name: webappapi image: "{{ 
.Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -91,6 +111,21 @@ spec: configMapKeyRef: name: simplyblock-config key: LOG_LEVEL + - name: LVOL_NVMF_PORT_START + value: "{{ .Values.ports.lvolNvmfPortStart }}" + - name: ENABLE_MONITORING + value: "{{ .Values.observability.enabled }}" + - name: K8S_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace +{{- if .Values.observability.enabled }} + - name: MONITORING_SECRET + valueFrom: + secretKeyRef: + name: simplyblock-grafana-secrets + key: MONITORING_SECRET +{{- end }} - name: FLASK_DEBUG value: "False" - name: FLASK_ENV @@ -106,55 +141,20 @@ spec: limits: cpu: "500m" memory: "2Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-storage-node-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-storage-node-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-storage-node-monitor - spec: - containers: - - name: storage-node-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/storage_node_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ resources: requests: - cpu: "200m" - memory: "256Mi" + cpu: "100m" + memory: 
"200Mi" limits: - cpu: "400m" - memory: "1Gi" + cpu: "200m" + memory: "400Mi" volumes: - name: fdb-cluster-file configMap: @@ -162,18 +162,23 @@ spec: items: - key: cluster-file path: fdb.cluster - + - name: varlog + hostPath: + path: /var/log + - name: config + configMap: + name: simplyblock-fluent-bit-config --- apiVersion: apps/v1 kind: Deployment metadata: - name: simplyblock-mgmt-node-monitor + name: simplyblock-monitoring namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-mgmt-node-monitor + app: simplyblock-monitoring template: metadata: annotations: @@ -181,201 +186,182 @@ spec: reloader.stakater.com/auto: "true" reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: - app: simplyblock-mgmt-node-monitor + app: simplyblock-monitoring spec: + serviceAccountName: simplyblock-sa + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - - name: mgmt-node-monitor + - name: storage-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/storage_node_monitor.py"] imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: mgmt-node-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/mgmt_node_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" env: - - name: BACKEND_TYPE - value: "k8s" - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL + - name: BACKEND_TYPE + value: "k8s" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-lvol-stats-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-lvol-stats-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-lvol-stats-collector - spec: - containers: - name: lvol-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/lvol_stat_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-main-distr-event-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-main-distr-event-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-main-distr-event-collector - spec: - containers: - name: main-distr-event-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/main_distr_event_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: capacity-and-stats-collector + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/capacity_and_stats_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: capacity-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/cap_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: health-check + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/health_check_service.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: device-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/device_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: lvol-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/lvol_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: snapshot-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/snapshot_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 + volumeMounts: + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ resources: requests: - cpu: "200m" - memory: "256Mi" + cpu: "100m" + memory: "200Mi" limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster + cpu: "200m" + memory: "400Mi" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-capacity-and-stats-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-capacity-and-stats-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - 
labels: - app: simplyblock-capacity-and-stats-collector - spec: - containers: - - name: capacity-and-stats-collector - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/capacity_and_stats_collector.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster + - name: varlog + hostPath: + path: /var/log + - name: config + configMap: + name: simplyblock-fluent-bit-config --- apiVersion: apps/v1 kind: Deployment metadata: - name: simplyblock-capacity-monitor + name: simplyblock-tasks namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-capacity-monitor + app: simplyblock-tasks template: metadata: annotations: @@ -383,730 +369,168 @@ spec: reloader.stakater.com/auto: "true" reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: - app: simplyblock-capacity-monitor + app: simplyblock-tasks spec: - + serviceAccountName: simplyblock-sa + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - - name: capacity-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/cap_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: 
simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-health-check - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-health-check - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-health-check - spec: - containers: - - name: health-check - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/health_check_service.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-device-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-device-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-device-monitor - spec: - containers: 
- - name: device-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/device_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-lvol-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-lvol-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-lvol-monitor - spec: - containers: - - name: lvol-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/lvol_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-snapshot-monitor - namespace: {{ .Release.Namespace }} 
-spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-snapshot-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-snapshot-monitor - spec: - containers: - - name: snapshot-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/snapshot_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-cleanupfdb - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-cleanupfdb - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-cleanupfdb - spec: - containers: - - name: cleanupfdb + - name: tasks-node-add-runner image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/tasks_runner_node_add.py"] imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/workers/cleanup_foundationdb.py"] env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - - name: LOG_DELETION_INTERVAL - value: 
"${LOG_DELETION_INTERVAL}" + - name: LVOL_NVMF_PORT_START + value: "{{ .Values.ports.lvolNvmfPortStart }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-restart - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-restart - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-restart - spec: - containers: - name: tasks-runner-restart image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_restart.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-migration - spec: - containers: - name: tasks-runner-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-failed-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-failed-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-failed-migration - spec: - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-failed-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_failed_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-cluster-status - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-cluster-status - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-cluster-status - spec: - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-cluster-status image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_cluster_status.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-new-device-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-new-device-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-new-device-migration - spec: - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-new-device-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_new_dev_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-node-add-runner - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-node-add-runner - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-node-add-runner - spec: - containers: - - name: tasks-node-addrunner +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: tasks-runner-port-allow image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/tasks_runner_port_allow.py"] imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/tasks_runner_node_add.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: LVOL_NVMF_PORT_START - value: "{{ .Values.ports.lvolNvmfPortStart }}" - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-port-allow - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-port-allow - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-port-allow - spec: - containers: - - name: tasks-runner-port-allow + - name: tasks-runner-jc-comp-resume image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/tasks_runner_jc_comp.py"] imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/tasks_runner_port_allow.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-jc-comp-resume - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-jc-comp-resume - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-jc-comp-resume - spec: - containers: - - name: tasks-runner-jc-comp-resume +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: tasks-runner-sync-lvol-del image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"] imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/tasks_runner_jc_comp.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- +{{ toYaml .resources | nindent 12 }} +{{- end }} -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: simplyblock-fluent-bit - namespace: {{ .Release.Namespace }} - labels: - app: simplyblock-fluent-bit -spec: - selector: - matchLabels: - app: simplyblock-fluent-bit - template: - metadata: - labels: - app: simplyblock-fluent-bit - spec: - containers: + - name: tasks-runner-snapshot-replication + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/snapshot_replication.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} - name: fluent-bit image: fluent/fluent-bit:1.8.11 - securityContext: - privileged: true volumeMounts: - name: varlog mountPath: /var/log - - name: varlibdockercontainers - mountPath: /var/lib/docker/containers - readOnly: true - name: config mountPath: /fluent-bit/etc/ resources: requests: + cpu: "100m" + memory: "200Mi" + limits: cpu: "200m" memory: "400Mi" - limits: - cpu: "400m" - memory: "1Gi" + volumes: + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster - name: varlog hostPath: path: /var/log - - name: varlibdockercontainers - hostPath: - path: /var/lib/docker/containers - name: config configMap: name: simplyblock-fluent-bit-config diff --git a/simplyblock_core/scripts/charts/templates/app_sa.yaml b/simplyblock_core/scripts/charts/templates/app_sa.yaml index a5dee735b..f04fc14b3 100644 --- a/simplyblock_core/scripts/charts/templates/app_sa.yaml +++ b/simplyblock_core/scripts/charts/templates/app_sa.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: simplyblock-control-sa + name: simplyblock-sa namespace: {{ .Release.Namespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: simplyblock-control-role + name: simplyblock-role rules: - apiGroups: [""] resources: ["configmaps"] @@ -21,16 +21,23 @@ rules: - apiGroups: ["mongodbcommunity.mongodb.com"] resources: ["mongodbcommunity"] verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: ["simplyblock.simplyblock.io"] + resources: ["simplyblockpools/status", "simplyblocklvols/status", "simplyblockstorageclusters/status", "simplyblockstoragenodes/status", "simplyblockdevices/status", "simplyblocktasks/status"] + verbs: ["get", "patch", "update"] + - apiGroups: ["simplyblock.simplyblock.io"] + 
resources: ["simplyblockpools", "simplyblocklvols", "simplyblockstorageclusters", "simplyblockstoragenodes", "simplyblockdevices", "simplyblocktasks"] + verbs: ["get","list" ,"patch", "update", "watch"] + --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: simplyblock-control-binding + name: simplyblock-binding subjects: - kind: ServiceAccount - name: simplyblock-control-sa + name: simplyblock-sa namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: simplyblock-control-role + name: simplyblock-role apiGroup: rbac.authorization.k8s.io diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml new file mode 100644 index 000000000..2a9d7d044 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: CSIDriver +metadata: + name: hostpath.csi.k8s.io + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: hostpath.csi.k8s.io + app.kubernetes.io/component: csi-driver +spec: + # Supports persistent and ephemeral inline volumes. + volumeLifecycleModes: + - Persistent + - Ephemeral + # To determine at runtime which mode a volume uses, pod info and its + # "csi.storage.k8s.io/ephemeral" entry are needed. + podInfoOnMount: true + # No attacher needed. 
+ attachRequired: false + storageCapacity: false + # Kubernetes may use fsGroup to change permissions and ownership + # of the volume to match user requested fsGroup in the pod's SecurityPolicy + fsGroupPolicy: File + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml new file mode 100644 index 000000000..aa645bff4 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml @@ -0,0 +1,232 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpathplugin +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["get", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csistoragecapacities"] + verbs: ["get", "list", "watch", "create", "update", "delete"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch", "update", "get", "list", "watch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpathplugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + 
name: csi-hostpathplugin +subjects: + - kind: ServiceAccount + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} + +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + serviceAccountName: csi-hostpathplugin-sa + containers: + - name: csi-provisioner + image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 + args: + - -v=5 + - --csi-address=/csi/csi.sock + - --feature-gates=Topology=true + - --node-deployment=true + - --strict-topology=true + - --immediate-topology=false + - --worker-threads=5 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. 
+ privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + - name: csi-resizer + image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 + args: + - -v=5 + - -csi-address=/csi/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + - name: node-driver-registrar + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 + args: + - --v=5 + - --csi-address=/csi/csi.sock + - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + env: + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /registration + name: registration-dir + - mountPath: /csi-data-dir + name: csi-data-dir + + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 + args: + - --drivername=hostpath.csi.k8s.io + - --v=5 + - --endpoint=$(CSI_ENDPOINT) + - --nodeid=$(KUBE_NODE_NAME) + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: 
Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 + args: + - --csi-address=/csi/csi.sock + - --health-port=9898 + + volumes: + - hostPath: + path: /var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. + # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/dashboards.yaml b/simplyblock_core/scripts/charts/templates/dashboards.yaml index 981e961d0..165bad130 100644 --- a/simplyblock_core/scripts/charts/templates/dashboards.yaml +++ b/simplyblock_core/scripts/charts/templates/dashboards.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -12512,14796 +12512,4 @@ data: "weekStart": "" } ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: simplyblock-grafana-dashboard-node-exporter - namespace: {{ .Release.Namespace }} - labels: - grafana_dashboard: "1" -data: - node-exporter.json: | - { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { 
- "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "gnetId": 1860, - "graphTooltip": 1, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 261, - "panels": [], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Quick CPU / Mem / Disk", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Resource pressure via PSI", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "links": [], - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "dark-yellow", - "value": 70 - }, - { - "color": "dark-red", - "value": 90 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 0, - "y": 1 - }, - "id": 323, - "links": [], - "options": { - "displayMode": "basic", - "minVizHeight": 10, - "minVizWidth": 0, - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "legendFormat": "CPU", - "range": false, - "refId": "CPU some", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "Mem", - "range": false, - "refId": "Memory some", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "I/O", - "range": false, - "refId": "I/O some", - "step": 240 - } - ], - "title": "Pressure", - "type": "bargauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Busy state of all CPU cores together", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 85 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 3, - "y": 1 - }, - "id": 20, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "100 * (1 - 
avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "", - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "CPU Busy", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "System load over all CPU cores together", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 85 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 6, - "y": 1 - }, - "id": 155, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Sys Load", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Non available RAM memory", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [], - "max": 100, - 
"min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 80 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 9, - "y": 1 - }, - "hideTimeOverride": false, - "id": 16, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "((node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\", job=\"$job\"}) / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"}) * 100", - "format": "time_series", - "hide": true, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "B", - "step": 240 - } - ], - "title": "RAM Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Used Swap", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - 
"steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 10 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 25 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 12, - "y": 1 - }, - "id": 21, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "SWAP Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Used Root FS", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 80 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 15, - "y": 1 - }, - "id": 154, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true 
- }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Root FS Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total number of CPU cores", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 1 - }, - "id": 14, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "CPU Cores", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "System uptime", - "fieldConfig": { - "defaults": { - "color": { - 
"mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 20, - "y": 1 - }, - "hideTimeOverride": true, - "id": 15, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Uptime", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total RootFS", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 70 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 3 - }, - "id": 23, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": 
"horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "RootFS Total", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total RAM", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 3 - }, - "id": 75, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "RAM Total", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total SWAP", - "fieldConfig": { - "defaults": 
{ - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 3 - }, - "id": 18, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "SWAP Total", - "type": "stat" - }, - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 263, - "panels": [], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Basic CPU / Mem / Net / Disk", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic CPU info", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - 
"lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "percent" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Busy Iowait" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy Iowait" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy System" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy User" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy Other" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 6 - }, - "id": 77, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true, - "width": 250 - }, - "tooltip": { - "mode": 
"multi", - "sort": "desc" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "instant": false, - "intervalFactor": 1, - "legendFormat": "Busy System", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Busy User", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy Iowait", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy IRQs", - "range": true, - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": 
"code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy Other", - "range": true, - "refId": "E", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Idle", - "range": true, - "refId": "F", - "step": 240 - } - ], - "title": "CPU Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic memory usage", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": 
"SWAP Used" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap Used" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Cache + Buffer" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Free" - }, - "properties": [ - { - "id": 
"color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Available" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#DEDAF7", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 6 - }, - "id": 78, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "RAM Total", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "RAM Used", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "RAM Cache + Buffer", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "RAM Free", - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SWAP Used", - "refId": "E", - "step": 240 - } - ], - "title": "Memory Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic network info per interface", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Recv_bytes_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - 
"options": "Recv_drop_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_errs_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CCA300", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_bytes_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_drop_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_errs_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CCA300", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] 
- }, - { - "matcher": { - "id": "byName", - "options": "recv_drop_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#967302", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_errs_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_bytes_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_drop_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#967302", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_errs_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": 
"negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 74, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "recv {{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "trans {{ "{{" }}device{{ "}}" }} ", - "refId": "B", - "step": 240 - } - ], - "title": "Network Traffic Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Disk space used of all filesystems mounted", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - 
"value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 152, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Disk Space Used Basic", - "type": "timeseries" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 265, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "percentage", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 70, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "percent" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - 
"matcher": { - "id": "byName", - "options": "Idle - Waiting for something to happen" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Iowait - Waiting for I/O to complete" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Irq - Servicing interrupts" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Nice - Niced processes executing in user mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Softirq - Servicing softirqs" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Steal - Time spent in other operating systems when running in a virtualized environment" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCE2DE", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "System - Processes executing in kernel mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "User - Normal processes executing in user mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#5195CE", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 21 - }, - "id": 3, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 250 - }, 
- "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "System - Processes executing in kernel mode", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "User - Normal processes executing in user mode", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Nice - Niced processes executing in user mode", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Iowait - Waiting for I/O to complete", - "range": true, - "refId": "E", - "step": 
240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Irq - Servicing interrupts", - "range": true, - "refId": "F", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"softirq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Softirq - Servicing softirqs", - "range": true, - "refId": "G", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"steal\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", - "range": true, - "refId": "H", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Idle - Waiting for something to happen", - "range": true, - "refId": "J", - "step": 240 - } - ], - "title": "CPU", - "type": "timeseries" - }, - { - 
"datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - 
} - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap - Swap memory usage" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - 
{ - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused - Free memory unassigned" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Hardware Corrupted - *./" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 21 - }, - "id": 24, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Apps - Memory used by user-space applications", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "PageTables - Memory used to map 
between virtual and physical memory addresses", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Cache - Parked file data (file content) cache", - "refId": "E", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Buffers - Block device (e.g. 
harddisk) cache", - "refId": "F", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Unused - Free memory unassigned", - "refId": "G", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Swap - Swap space used", - "refId": "H", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", - "refId": "I", - "step": 240 - } - ], - "title": "Memory Stack", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bits out (-) / in (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - 
}, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "receive_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "receive_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 33 - }, - "id": 84, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Receive", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Transmit", - "refId": "B", - 
"step": 240 - } - ], - "title": "Network Traffic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 33 - }, - "id": 156, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Disk Space Used", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - 
}, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IO read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 45 - }, - "id": 229, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Reads completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Writes completed", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - 
"defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "io time" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*read*./" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde.*/" - }, - "properties": [ - { - 
"id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byType", - "options": "time" - }, - "properties": [ - { - "id": "custom.axisPlacement", - "value": "hidden" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 45 - }, - "id": 42, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Successfully read bytes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Successfully written bytes", - "refId": "B", - "step": 240 - } - ], - "title": "I/O Usage Read / Write", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "%util", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - 
}, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "io time" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byType", - "options": "time" - }, - "properties": [ - { - "id": "custom.axisPlacement", - "value": "hidden" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 57 - }, - "id": 127, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "I/O Utilization", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "percentage", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 70, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - 
"pointSize": 3, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "max": 1, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/^Guest - /" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#5195ce", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/^GuestNice - /" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#c15c17", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 57 - }, - "id": 319, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", - "hide": false, - "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", - "hide": false, - "legendFormat": "GuestNice - Time 
spent running a niced guest (virtual CPU for guest operating system)", - "range": true, - "refId": "B" - } - ], - "title": "CPU spent seconds in guests (VMs)", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "CPU / Memory / Net / Disk", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 266, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": 
"color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 54 - }, - "id": 136, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Inactive - Memory which has been less recently used. 
It is more eligible to be reclaimed for other purposes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Active / Inactive", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - 
"fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*CommitLimit - *./" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 54 - }, - "id": 135, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Committed", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": 
"Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - 
} - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 64 - }, - "id": 191, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Active_file - File-backed memory on active LRU list", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", - "refId": "D", - "step": 240 - } - ], - "title": "Memory Active / Inactive Detail", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 64 - }, - "id": 130, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Writeback - Memory which is actively being written back to disk", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", - "refId": "C", - "step": 240 - } - ], - "title": "Memory Writeback and Dirty", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - 
"hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { 
- "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ShmemHugePages - Memory used by shared 
memory (shmem) and tmpfs allocated with huge pages" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 74 - }, - "id": 138, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Mapped - Used memory in mapped pages files which have been mapped, such as libraries", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages", - "refId": "D", - "step": 240 - } - ], - "title": "Memory Shared and Mapped", - "type": "timeseries" - }, - { - 
"datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { 
- "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { 
- "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 74 - }, - "id": 131, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Slab", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": 
"never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - 
"options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 84 - }, - "id": 70, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocChunk - Largest contiguous block of vmalloc area which is free", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocTotal - Total size of vmalloc memory area", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", - "refId": "C", - "step": 240 - } - ], - "title": "Memory Vmalloc", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": 
"Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - 
}, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 84 - }, - "id": 159, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Bounce - Memory used for block device bounce buffers", - "refId": "A", - "step": 240 - } - ], - "title": "Memory Bounce", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - 
"axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, 
- { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byRegexp", - "options": "/.*Inactive *./" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 94 - }, - "id": 129, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "AnonHugePages - Memory in anonymous huge pages", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "AnonPages - Memory in user pages not backed by files", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Anonymous", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ 
- { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 94 - }, - "id": 160, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "KernelStack - Kernel memory stack. 
This is not reclaimable", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Kernel / CPU", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "pages", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - 
"id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 104 - }, - "id": 140, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", - 
"refId": "C", - "step": 240 - } - ], - "title": "Memory HugePages Counter", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": 
[ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 104 - }, - "id": 71, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages - Total size of the pool of huge pages", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Hugepagesize - Huge Page size", - "refId": "B", - "step": 240 - } - ], - "title": "Memory HugePages Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, 
- "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - 
}, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 114 - }, - "id": 128, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - 
"legendFormat": "DirectMap1G - Amount of pages mapped as this size", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "DirectMap2M - Amount of pages mapped as this size", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "DirectMap4K - Amount of pages mapped as this size", - "refId": "C", - "step": 240 - } - ], - "title": "Memory DirectMap", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 114 - }, - "id": 137, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Unevictable and MLocked", - "type": 
"timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - 
} - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 124 - }, - "id": 132, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet committed to the storage", - "refId": "A", - "step": 240 - } - ], - "title": "Memory NFS", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Memory Meminfo", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 270, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number (after merges) of I/O requests completed per second for the device", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IO read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - 
"legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": 
"byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - 
{ - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 47 - }, - "id": 9, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Reads completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Writes completed", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps Completed", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of bytes read from or written to the device per second", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - 
"pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 47 - }, - "id": 33, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read bytes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Written bytes", - "refId": "B", - "step": 240 - } - ], - "title": "Disk R/W Data", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "time. 
read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": 
"/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - 
"id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 57 - }, - "id": 37, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "hide": false, - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read wait time avg", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Write wait time avg", - "refId": "B", - "step": 240 - } - ], - "title": "Disk Average Wait Time", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The average queue length of the requests that were issued to the device", - 
"fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "aqu-sz", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 57 - }, - "id": 35, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Average Queue Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of read and write requests merged per second that were queued to the device", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "I/Os", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - 
"stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - 
}, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 133, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read merged", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Write merged", - "refId": "B", - "step": 240 - } - ], - "title": "Disk R/W Merged", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially. 
But for devices serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "%util", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 36, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - IO", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - discard", - "refId": "B", - "step": 240 - } - ], - "title": "Time Spent Doing I/Os", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of outstanding requests at the instant the sample was taken. 
Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Outstanding req.", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 77 - }, - "id": 34, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - IO now", - "refId": "A", - "step": 240 - } - ], - "title": "Instantaneous Queue Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IOs", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": 
{ - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": 
"/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - 
"h": 10, - "w": 12, - "x": 12, - "y": 77 - }, - "id": 301, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Discards completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Discards merged", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps Discards completed / merged", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Storage Disk", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 - }, - "id": 271, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": 
"linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 62 - }, - "id": 43, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Available", - "metric": "", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Free", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Size", - "refId": "C", - "step": 240 - } - ], - "title": "Filesystem space available", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { 
- "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "file nodes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 62 - }, - "id": 41, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Free file nodes", - "refId": "A", - "step": 240 - } - ], - "title": "File Nodes Free", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "files", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": 
"linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 72 - }, - "id": 28, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 4, - "legendFormat": "Max open files", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Open files", - "refId": "B", - "step": 240 - } - ], - "title": "File Descriptor", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "file Nodes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - 
"showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 72 - }, - "id": 219, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - File nodes total", - "refId": "A", - "step": 240 - } - ], - "title": "File Nodes Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": 
"red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "/ ReadOnly" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 82 - }, - "id": 44, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - ReadOnly", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Device error", - "refId": "B", - "step": 240 - } - ], - "title": "Filesystem in ReadOnly / Error", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Storage Filesystem", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 279, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "seconds", - 
"axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 66 - }, - "id": 40, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape duration", - "refId": "A", - "step": 240 - } - ], - "title": "Node Exporter Scrape Time", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, - "lineWidth": 1, - "pointSize": 
5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*error.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F2495C", - "mode": "fixed" - } - }, - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 66 - }, - "id": 157, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape success", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape textfile error (1 = true)", - "refId": "B", - "step": 240 - } - ], - "title": "Node Exporter Scrape", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Node Exporter", - "type": "row" - } - ], - "refresh": "1m", - "revision": 1, - "schemaVersion": 38, - 
"style": "dark", - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, - "hide": 0, - "includeAll": false, - "label": "Datasource", - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "queryValue": "", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "", - "hide": 0, - "includeAll": false, - "label": "Job", - "multi": false, - "name": "job", - "options": [], - "query": { - "query": "label_values(node_uname_info, job)", - "refId": "Prometheus-job-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "label_values(node_uname_info{job=\"$job\"}, instance)", - "hide": 0, - "includeAll": false, - "label": "Host", - "multi": false, - "name": "node", - "options": [], - "query": { - "query": "label_values(node_uname_info{job=\"$job\"}, instance)", - "refId": "Prometheus-node-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "selected": false, - "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" - }, - "hide": 2, - "includeAll": false, - "multi": false, - "name": "diskdevices", - "options": [ - { - "selected": true, - "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" - } - ], - "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "skipUrlSync": false, - "type": "custom" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, 
- "timezone": "", - "title": "NodeExporter", - "uid": "d56e0ae7-48d5-481d-a2ea-3192da4d9e42", - "version": 5, - "weekStart": "" - } {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index 1a3134e58..96d1c1979 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -2,20 +2,20 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager labels: - control-plane: controller-manager - app: controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: selector: matchLabels: - app: controller-manager + app: simplyblock-fdb-controller-manager replicas: 1 template: metadata: labels: - control-plane: controller-manager - app: controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: securityContext: runAsUser: 4059 @@ -28,7 +28,7 @@ spec: emptyDir: {} - name: fdb-binaries emptyDir: {} - serviceAccountName: controller-manager + serviceAccountName: simplyblock-fdb-controller-manager initContainers: - name: foundationdb-kubernetes-init-7-3 image: foundationdb/fdb-kubernetes-monitor:7.3.63 @@ -51,7 +51,9 @@ spec: containers: - command: - /manager - image: foundationdb/fdb-kubernetes-operator:v2.13.0 + args: + - "--health-probe-bind-address=:9443" + image: foundationdb/fdb-kubernetes-operator:v2.18.0 name: manager env: - name: WATCH_NAMESPACE @@ -86,13 +88,13 @@ spec: apiVersion: v1 kind: ServiceAccount metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: manager-role + name: simplyblock-fdb-manager-role rules: - apiGroups: - "" @@ -164,7 +166,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: 
creationTimestamp: null - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole rules: - apiGroups: - "" @@ -179,27 +181,27 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: creationTimestamp: null - name: manager-rolebinding + name: simplyblock-fdb-manager-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-role + name: simplyblock-fdb-manager-role subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: creationTimestamp: null - name: manager-clusterrolebinding + name: simplyblock-fdb-manager-clusterrolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager namespace: metadata.namespace ##### cluster file ################# @@ -213,7 +215,11 @@ spec: replacements: enabled: true faultDomain: + {{- if .Values.foundationdb.multiAZ }} + key: topology.kubernetes.io/zone + {{- else }} key: foundationdb.org/none + {{- end }} imageType: split labels: filterOnOwnerReference: false @@ -224,16 +230,24 @@ spec: processGroupIDLabels: - foundationdb.org/fdb-process-group-id minimumUptimeSecondsForBounce: 60 + databaseConfiguration: + redundancy_mode: triple processCounts: + {{- if .Values.foundationdb.multiAZ }} + cluster_controller: 1 + log: 4 + storage: 4 + stateless: -1 + {{- else }} cluster_controller: 1 log: 3 storage: 3 stateless: -1 + {{- end }} processes: general: customParameters: - knob_disable_posix_kernel_aio=1 - - listen_address=0.0.0.0:4501 podTemplate: spec: containers: @@ -270,7 +284,7 @@ spec: runAsUser: 0 volumeClaimTemplate: spec: - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath accessModes: - ReadWriteOnce resources: @@ -285,10 +299,10 @@ 
spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi securityContext: runAsUser: 0 affinity: @@ -308,10 +322,10 @@ spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi securityContext: runAsUser: 0 affinity: diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml index 740dd7642..6c004f314 100644 --- a/simplyblock_core/scripts/charts/templates/mongodb.yaml +++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml @@ -1,3 +1,4 @@ +{{- if .Values.observability.enabled }} apiVersion: mongodbcommunity.mongodb.com/v1 kind: MongoDBCommunity metadata: @@ -14,7 +15,7 @@ spec: name: data-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi @@ -22,7 +23,7 @@ spec: name: logs-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi @@ -51,4 +52,5 @@ metadata: name: admin-password type: Opaque stringData: - password: {{ .Values.monitoring.secret }} + password: {{ .Values.observability.secret }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml index cb4243493..7f621fbb1 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml @@ -1,13 +1,17 @@ -{{- if .Values.monitoring.enabled }} - +{{- $name := printf "%s-simplyblock-prometheus-config" .Release.Name -}} +{{- $existing := (lookup "v1" "ConfigMap" .Release.Namespace $name) -}} apiVersion: v1 kind: ConfigMap metadata: - name: {{ .Release.Name }}-simplyblock-prometheus-config + name: {{ $name }} labels: app: simplyblock-prometheus 
namespace: {{ .Release.Namespace }} data: + {{- if $existing }} + prometheus.yml: | +{{ index $existing.data "prometheus.yml" | indent 4 }} + {{- else }} prometheus.yml: | global: scrape_interval: 30s @@ -15,7 +19,6 @@ data: monitor: 'codelab-monitor' scrape_configs: - - job_name: 'cluster_metrics' static_configs: - targets: ['simplyblock-webappapi:5000'] @@ -24,14 +27,7 @@ data: basic_auth: username: password: - - - job_name: 'node' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - action: keep - regex: 'simplyblock-node-exporter' + {{- end }} --- apiVersion: v1 @@ -46,6 +42,7 @@ data: type: FILESYSTEM config: directory: /mnt/thanos +{{- if .Values.observability.enabled }} --- apiVersion: v1 kind: ConfigMap @@ -60,7 +57,7 @@ data: datasources: - name: Thanos type: prometheus - url: http://simplyblock-thanos-query:9091 + url: http://simplyblock-thanos:9091 isDefault: true access: proxy uid: PBFA97CFB590B2093 @@ -829,7 +826,7 @@ data: type: slack settings: username: grafana_bot - url: '{{ .Values.grafana.contactPoint }}' + url: '{{ .Values.observability.grafana.contactPoint }}' title: | '{{ "{{" }} template "slack.title" . 
{{ "}}" }}' text: | diff --git a/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml b/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml index ec0e1ab80..bcccf4a35 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml @@ -1,4 +1,5 @@ -{{- if (not .Values.ingress.useDNS) }} +{{- if .Values.ingress.enabled }} + {{- if not .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -32,9 +33,8 @@ spec: name: simplyblock-graylog port: number: 9000 - --- -{{- else if .Values.ingress.useDNS }} + {{- else if .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -77,4 +77,5 @@ spec: name: simplyblock-graylog port: number: 9000 + {{- end }} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml index 9c0f46e1f..f54a9c2f5 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} --- apiVersion: apps/v1 kind: Deployment @@ -46,7 +46,7 @@ spec: - name: GRAYLOG_ELASTICSEARCH_HOSTS value: "http://opensearch-cluster-master:9200" - name: GRAYLOG_MONGODB_URI - value: "mongodb://admin:{{ .Values.monitoring.secret }}@simplyblock-mongo-svc:27017/graylog" + value: "mongodb://admin:{{ .Values.observability.secret }}@simplyblock-mongo-svc:27017/graylog" - name: GRAYLOG_SKIP_PREFLIGHT_CHECKS value: "true" - name: GRAYLOG_ROTATION_STRATEGY @@ -68,6 +68,8 @@ spec: value: "false" - name: GRAYLOG_ELASTICSEARCH_REPLICAS value: "1" + - name: GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE + value: "10gb" ports: - containerPort: 5044 - containerPort: 5140 @@ -103,30 +105,37 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: 
simplyblock-thanos-store + name: simplyblock-thanos namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-thanos-store + app: simplyblock-thanos template: metadata: labels: - app: simplyblock-thanos-store - spec: + app: simplyblock-thanos + spec: containers: - name: thanos-store image: thanosio/thanos:v0.31.0 args: - store + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 - --objstore.config-file=/etc/thanos/objstore.yml - --index-cache-size=500MB - --chunk-pool-size=500MB + ports: + - name: grpc + containerPort: 10901 + - name: http + containerPort: 10902 volumeMounts: - name: objstore-config mountPath: /etc/thanos - - name: thanos-data + - name: data mountPath: /data resources: requests: @@ -135,37 +144,20 @@ spec: limits: cpu: "250m" memory: "1Gi" - volumes: - - name: objstore-config - configMap: - name: simplyblock-objstore-config - - name: thanos-data - emptyDir: {} - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-thanos-query - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-thanos-query - template: - metadata: - labels: - app: simplyblock-thanos-query - spec: - containers: + - name: thanos-query image: thanosio/thanos:v0.31.0 args: - query + - --grpc-address=0.0.0.0:10911 - --http-address=0.0.0.0:9091 - - --store=simplyblock-thanos-store:10901 + - --store=simplyblock-thanos:10901 - --store=simplyblock-prometheus:10901 + ports: + - containerPort: 9091 + name: http + - containerPort: 10911 + name: grpc resources: requests: cpu: "100m" @@ -174,28 +166,11 @@ spec: cpu: "250m" memory: "1Gi" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-thanos-compactor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-thanos-compactor - template: - metadata: - labels: - app: simplyblock-thanos-compactor - spec: - - containers: - name: thanos-compactor image: 
thanosio/thanos:v0.31.0 args: - compact + - --http-address=0.0.0.0:10922 - --data-dir=/data - --objstore.config-file=/etc/thanos/objstore.yml - --retention.resolution-raw=30d @@ -203,10 +178,13 @@ spec: - --retention.resolution-1h=90d - --compact.concurrency=1 - --wait + ports: + - containerPort: 10922 + name: http volumeMounts: - name: objstore-config mountPath: /etc/thanos - - name: compactor-data + - name: data mountPath: /data resources: requests: @@ -215,72 +193,14 @@ spec: limits: cpu: "250m" memory: "1Gi" + volumes: - name: objstore-config configMap: name: simplyblock-objstore-config - - name: compactor-data - emptyDir: {} ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: simplyblock-node-exporter - namespace: {{ .Release.Namespace }} -spec: - selector: - matchLabels: - app: simplyblock-node-exporter - template: - metadata: - labels: - app: simplyblock-node-exporter - spec: - containers: - - name: node-exporter - image: prom/node-exporter:v1.7.0 - args: - - '--path.procfs=/host/proc' - - '--path.sysfs=/host/sys' - - '--path.rootfs=/host/root' - - '--collector.filesystem.ignored-mount-points=^(/rootfs|/host|)/(sys|proc|dev|host|etc|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)' - - '--collector.filesystem.ignored-fs-types=^(sys|proc|auto|cgroup|devpts|ns|au|fuse.lxc|mqueue)(fs|)$' - - '--no-collector.ipvs' - - '--web.listen-address=:9200' - ports: - - containerPort: 9200 - protocol: TCP - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - mountPropagation: HostToContainer - - name: sys - mountPath: /host/sys - mountPropagation: HostToContainer - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - resources: - requests: - cpu: "100m" - memory: "256Mi" - limits: - cpu: "250m" - memory: "1Gi" - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / + - name: data + emptyDir: {} --- apiVersion: apps/v1 
@@ -343,9 +263,6 @@ spec: - name: dashboard-pools mountPath: /var/lib/grafana/dashboards/pools.json subPath: pools.json - - name: dashboard-node-exporter - mountPath: /var/lib/grafana/dashboards/node-exporter.json - subPath: node-exporter.json - name: grafana-data mountPath: /var/lib/grafana volumes: @@ -373,9 +290,6 @@ spec: - name: dashboard-pools configMap: name: simplyblock-grafana-dashboard-pools - - name: dashboard-node-exporter - configMap: - name: simplyblock-grafana-dashboard-node-exporter - name: grafana-data emptyDir: {} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml b/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml index c39735159..df741f026 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} apiVersion: v1 kind: Secret metadata: @@ -6,8 +6,8 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque stringData: - MONITORING_SECRET: "{{ .Values.monitoring.secret }}" - GRAFANA_ENDPOINT: "{{ .Values.grafana.endpoint }}" + MONITORING_SECRET: "{{ .Values.observability.secret }}" + GRAFANA_ENDPOINT: "{{ .Values.observability.grafana.endpoint }}" --- apiVersion: v1 @@ -17,7 +17,7 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque stringData: - GRAYLOG_PASSWORD_SECRET: "{{ .Values.graylog.passwordSecret }}" - GRAYLOG_ROOT_PASSWORD_SHA2: "{{ .Values.graylog.rootPasswordSha2 }}" - MAX_NUMBER_OF_INDICES: "{{ .Values.log.maxNumberIndex }}" + GRAYLOG_PASSWORD_SECRET: "{{ .Values.observability.graylog.passwordSecret }}" + GRAYLOG_ROOT_PASSWORD_SHA2: "{{ .Values.observability.graylog.rootPasswordSha2 }}" + MAX_NUMBER_OF_INDICES: "{{ .Values.observability.graylog.maxNumberIndex }}" {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml 
b/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml index 55b15dccc..5a0936434 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} --- apiVersion: v1 kind: Service @@ -25,44 +25,19 @@ spec: apiVersion: v1 kind: Service metadata: - name: simplyblock-thanos-store + name: simplyblock-thanos namespace: {{ .Release.Namespace }} spec: selector: - app: simplyblock-thanos-store + app: simplyblock-thanos ports: - - name: thanos-store + - name: store port: 10901 targetPort: 10901 ---- -apiVersion: v1 -kind: Service -metadata: - name: simplyblock-thanos-query - namespace: {{ .Release.Namespace }} -spec: - selector: - app: simplyblock-thanos-query - ports: - - name: thanos-query + - name: query port: 9091 targetPort: 9091 ---- -apiVersion: v1 -kind: Service -metadata: - name: simplyblock-node-exporter - namespace: {{ .Release.Namespace }} -spec: - selector: - app: simplyblock-node-exporter - ports: - - name: simplyblock-node-exporter - protocol: TCP - port: 9200 - targetPort: 9200 - --- apiVersion: v1 kind: Service diff --git a/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml new file mode 100644 index 000000000..cca5e522d --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml @@ -0,0 +1,199 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simplyblock-manager + labels: + control-plane: simplyblock-manager + app: simplyblock-manager +spec: + selector: + matchLabels: + app: simplyblock-manager + replicas: 1 + template: + metadata: + labels: + control-plane: simplyblock-manager + app: simplyblock-manager + spec: + securityContext: + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + serviceAccountName: simplyblock-manager + containers: + - image: 
simplyblock/simplyblock-manager:snapshot_replication + imagePullPolicy: Always + name: manager + env: + - name: WATCH_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 500m + memory: 256Mi + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + privileged: false + terminationGracePeriodSeconds: 10 + +################# ROLE AND ROLE BINDING ############################## +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: simplyblock-manager + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: simplyblock-manager-clusterrole +rules: +- apiGroups: + - "" + resources: + - configmaps + - events + - persistentvolumeclaims + - pods + - pods/exec + - namespaces + - secrets + - services + - serviceaccounts + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - "rbac.authorization.k8s.io" + resources: + - roles + - clusterroles + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - "rbac.authorization.k8s.io" + resources: + - rolebindings + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - simplyblock.simplyblock.io + resources: + - simplyblockpools + - simplyblocklvols + - simplyblockstorageclusters + - simplyblockstoragenodes + - simplyblockdevices + - simplyblocktasks + - simplyblocksnapshotreplications + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - 
simplyblock.simplyblock.io + resources: + - simplyblockpools/finalizers + - simplyblocklvols/finalizers + - simplyblockstorageclusters/finalizers + - simplyblockstoragenodes/finalizers + - simplyblockdevices/finalizers + - simplyblocktasks/finalizers + - simplyblocksnapshotreplications/finalizers + verbs: + - update + - delete +- apiGroups: + - simplyblock.simplyblock.io + resources: + - simplyblockpools/status + - simplyblocklvols/status + - simplyblockstorageclusters/status + - simplyblockstoragenodes/status + - simplyblockdevices/status + - simplyblocktasks/status + - simplyblocksnapshotreplications/status + verbs: + - get + - patch + - update + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + creationTimestamp: null + name: simplyblock-manager-clusterrolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: simplyblock-manager-clusterrole +subjects: +- kind: ServiceAccount + name: simplyblock-manager + namespace: {{ .Release.Namespace }} + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml b/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml new file mode 100644 index 000000000..eb360b60a --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml @@ -0,0 +1,145 @@ +{{- if .Values.simplyblock.cluster }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockStorageCluster +metadata: + name: {{ .Values.simplyblock.cluster.clusterName }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + + {{- if .Values.simplyblock.cluster.mgmtIfc }} + mgmtIfc: {{ .Values.simplyblock.cluster.mgmtIfc }} + {{- end }} + + {{- if .Values.simplyblock.cluster.fabric }} + fabric: {{ .Values.simplyblock.cluster.fabric }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster "isSingleNode" }} + isSingleNode: {{ 
.Values.simplyblock.cluster.isSingleNode }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster "enableNodeAffinity" }} + enableNodeAffinity: {{ .Values.simplyblock.cluster.enableNodeAffinity }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster "strictNodeAntiAffinity" }} + strictNodeAntiAffinity: {{ .Values.simplyblock.cluster.strictNodeAntiAffinity }} + {{- end }} + + {{- if .Values.simplyblock.cluster.capWarn }} + capWarn: {{ .Values.simplyblock.cluster.capWarn }} + {{- end }} + + {{- if .Values.simplyblock.cluster.capCrit }} + capCrit: {{ .Values.simplyblock.cluster.capCrit }} + {{- end }} + + {{- if .Values.simplyblock.cluster.provCapWarn }} + provCapWarn: {{ .Values.simplyblock.cluster.provCapWarn }} + {{- end }} + + {{- if .Values.simplyblock.cluster.provCapCrit }} + provCapCrit: {{ .Values.simplyblock.cluster.provCapCrit }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.pool }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockPool +metadata: + name: {{ .Values.simplyblock.pool.name }} + namespace: {{ .Release.Namespace }} +spec: + name: {{ .Values.simplyblock.pool.name }} + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + + {{- if .Values.simplyblock.pool.capacityLimit }} + capacityLimit: {{ .Values.simplyblock.pool.capacityLimit | quote }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.lvol }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockLvol +metadata: + name: {{ .Values.simplyblock.lvol.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + poolName: {{ .Values.simplyblock.pool.name }} +{{- end }} + +--- +{{- if .Values.simplyblock.storageNodes }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockStorageNode +metadata: + name: {{ .Values.simplyblock.storageNodes.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + + {{- if 
.Values.simplyblock.storageNodes.clusterImage }} + clusterImage: {{ .Values.simplyblock.storageNodes.clusterImage }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.mgmtIfc }} + mgmtIfc: {{ .Values.simplyblock.storageNodes.mgmtIfc }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.maxLVol }} + maxLVol: {{ .Values.simplyblock.storageNodes.maxLVol }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.maxSize }} + maxSize: {{ .Values.simplyblock.storageNodes.maxSize | quote }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "partitions" }} + partitions: {{ .Values.simplyblock.storageNodes.partitions }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.corePercentage }} + corePercentage: {{ .Values.simplyblock.storageNodes.corePercentage }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "spdkDebug" }} + spdkDebug: {{ .Values.simplyblock.storageNodes.spdkDebug }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "coreIsolation" }} + coreIsolation: {{ .Values.simplyblock.storageNodes.coreIsolation }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.workerNodes }} + workerNodes: + {{- range .Values.simplyblock.storageNodes.workerNodes }} + - {{ . 
}} + {{- end }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.devices }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockDevice +metadata: + name: {{ .Values.simplyblock.devices.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} +{{- end }} + +--- +{{- if .Values.simplyblock.tasks }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockTask +metadata: + name: {{ .Values.simplyblock.tasks.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml index 64e5e6280..b23cb4a07 100644 --- a/simplyblock_core/scripts/charts/templates/storage_class.yaml +++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml @@ -2,9 +2,22 @@ apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: - name: openebs-local-hostpath -provisioner: openebs.io/local + name: local-hostpath + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpath-fast + app.kubernetes.io/component: storageclass +provisioner: hostpath.csi.k8s.io allowVolumeExpansion: true reclaimPolicy: Retain volumeBindingMode: WaitForFirstConsumer - +{{- if .Values.storageclass.allowedTopologyZones }} +allowedTopologies: +- matchLabelExpressions: + - key: topology.kubernetes.io/zone + values: +{{- range .Values.storageclass.allowedTopologyZones }} + - {{ . 
}} +{{- end }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/values-template.yaml b/simplyblock_core/scripts/charts/values-template.yaml deleted file mode 100644 index 79693e7cd..000000000 --- a/simplyblock_core/scripts/charts/values-template.yaml +++ /dev/null @@ -1,194 +0,0 @@ -graylog: - rootPasswordSha2: "${GRAYLOG_ROOT_PASSWORD_SHA2}" - passwordSecret: "${GRAYLOG_PASSWORD_SECRET}" - -cluster: - secret: "${CLUSTER_SECRET}" - id: "${CLUSTER_ID}" - ip: "${CLUSTER_IP}" - -monitoring: - enabled: ${ENABLE_MONITORING} - -log: - deletionInterval: "${LOG_DELETION_INTERVAL}" - retentionPeriod: "${RETENTION_PERIOD}" - level: "${LOG_LEVEL}" - maxNumberIndex: "${MAX_NUMBER_OF_INDICES}" - -grafana: - endpoint: "${GRAFANA_ENDPOINT}" - contactPoint: "${CONTACT_POINT}" - -image: - simplyblock: - repository: "${SIMPLYBLOCK_REPOSITORY}" - tag: "${SIMPLYBLOCK_TAG}" - pullPolicy: "Always" - -openebs: - enabled: true - -mongodb: - name: "simplyblock-mongodb" - deployment_name: "simplyblock-mongodb" - resources: - requests: - cpu: 100m - memory: 300Mi - limits: - cpu: 250m - memory: 1Gi - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - mongodb - topologyKey: "kubernetes.io/hostname" - -opensearch: - fullnameOverride: "simplyblock-opensearch" - singleNode: true - replicas: 1 - - antiAffinity: "hard" - persistence: - enabled: true - storageClass: openebs-local-hostpath - size: 10Gi - - resources: - requests: - cpu: "100m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "3Gi" - - extraEnvs: - - name: OPENSEARCH_JAVA_OPTS - value: "-Xms1g -Xmx1g" - - name: bootstrap.memory_lock - value: "true" - - name: action.auto_create_index - value: "false" - - name: plugins.security.ssl.http.enabled - value: "false" - - name: plugins.security.disabled - value: "true" - - securityConfig: - enabled: false - -prometheus: - server: - fullnameOverride: 
simplyblock-prometheus - enabled: true - statefulSet: - enabled: true - name: simplyblock-prometheus - replicaCount: 1 - podLabels: - app: simplyblock-prometheus - podAnnotations: {} - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - simplyblock-prometheus - topologyKey: "kubernetes.io/hostname" - service: - servicePort: 9090 - type: ClusterIP - gRPC: - enabled: true - servicePort: 10901 - additionalPorts: - - name: http-thanos - port: 10902 - targetPort: 10902 - protocol: TCP - securityContext: - fsGroup: 65534 - persistentVolume: - enabled: true - size: 5Gi - storageClass: openebs-local-hostpath - extraArgs: - storage.tsdb.min-block-duration: 2h - storage.tsdb.max-block-duration: 2h - sidecarContainers: - thanos-sidecar: - image: thanosio/thanos:v0.31.0 - args: - - sidecar - - --tsdb.path=/prometheus - - --prometheus.url=http://localhost:9090 - - --objstore.config-file=/etc/thanos/objstore.yml - ports: - - name: grpc - containerPort: 10901 - - name: http - containerPort: 10902 - volumeMounts: - - name: storage-volume - mountPath: /prometheus - - name: objstore-config - mountPath: /etc/thanos - resources: - requests: - cpu: "100m" - memory: "256Mi" - limits: - cpu: "250m" - memory: "1Gi" - resources: - requests: - cpu: "100m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "1Gi" - configMapOverrideName: simplyblock-prometheus-config - extraVolumes: - - name: objstore-config - configMap: - name: simplyblock-objstore-config - alertmanager: - enabled: false - - prometheus-pushgateway: - enabled: false - - prometheus-node-exporter: - enabled: false - - kube-state-metrics: - enabled: false - -ingress: - enabled: true - ingressClassName: nginx - useDNS: ${USE_DNS} - host: "${DNS_NAME}" - tlsSecret: ${TLS_SECRET} - controller: - hostNetwork: ${USE_HOST} - dnsPolicy: ClusterFirstWithHostNet - service: - type: ${SERVICE_TYPE} - 
nodePorts: - tcp: - 4501: 32451 - extraArgs: - tcp-services-configmap: "${K8S_NAMESPACE}/simplyblock-tcp-services" - nodeSelector: - simplyblock.io/role: mgmt-plane diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 467734176..3c17f041e 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -1,32 +1,32 @@ -graylog: - rootPasswordSha2: "b87c15a8ae4736d771ca60a7cc2014baaeab19b11c31f5fedef9421958a403c9" - passwordSecret: "is6SP2EdWg0NdmVGv6CEp5hRHNL7BKVMFem4t9pouMqDQnHwXMSomas1qcbKSt5yISr8eBHv4Y7Dbswhyz84Ut0TW6kqsiPs" -monitoring: - enabled: true +observability: + enabled: false secret: "sWbpOgba1bKnCfcPkVQi" - -log: deletionInterval: "3d" - retentionPeriod: "7d" level: "DEBUG" - maxNumberIndex: "3" - -grafana: - endpoint: "" - contactPoint: "https://hooks.slack.com/services/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" + graylog: + rootPasswordSha2: "b87c15a8ae4736d771ca60a7cc2014baaeab19b11c31f5fedef9421958a403c9" + passwordSecret: "is6SP2EdWg0NdmVGv6CEp5hRHNL7BKVMFem4t9pouMqDQnHwXMSomas1qcbKSt5yISr8eBHv4Y7Dbswhyz84Ut0TW6kqsiPs" + maxNumberIndex: "3" + retentionPeriod: "7d" + grafana: + endpoint: "" + contactPoint: "https://hooks.slack.com/services/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" image: simplyblock: repository: "public.ecr.aws/simply-block/simplyblock" - tag: "main" + tag: "main-sfam-2359" pullPolicy: "Always" ports: - lvolNvmfPortStart: - -openebs: - enabled: true + lvolNvmfPortStart: 9100 + +storageclass: + allowedTopologyZones: [] + +foundationdb: + multiAZ: false mongodb: name: "simplyblock-mongodb" @@ -57,8 +57,8 @@ opensearch: antiAffinity: "hard" persistence: enabled: true - storageClass: openebs-local-hostpath - size: 10Gi + storageClass: local-hostpath + size: 20Gi resources: requests: @@ -123,7 +123,7 @@ prometheus: persistentVolume: enabled: true size: 5Gi - storageClass: openebs-local-hostpath + storageClass: 
local-hostpath extraArgs: storage.tsdb.min-block-duration: 2h storage.tsdb.max-block-duration: 2h @@ -177,7 +177,7 @@ prometheus: enabled: false ingress: - enabled: true + enabled: false ingressClassName: nginx useDNS: false host: "" @@ -185,8 +185,61 @@ ingress: controller: hostNetwork: true dnsPolicy: ClusterFirstWithHostNet + replicaCount: 2 service: type: ClusterIP extraArgs: tcp-services-configmap: "simplyblock/simplyblock-tcp-services" + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - ingress + topologyKey: "kubernetes.io/hostname" nodeSelector: {} + + +simplyblock: + cluster: + clusterName: simplyblock-cluster + mgmtIfc: eth0 + fabric: tcp + isSingleNode: false + enableNodeAffinity: false + strictNodeAntiAffinity: false + capWarn: 80 + capCrit: 90 + provCapWarn: 120 + provCapCrit: 150 + + pool: + name: simplyblock-pool + capacityLimit: 100Gi + + lvol: + name: simplyblock-lvol + + storageNodes: + name: simplyblock-node + clusterImage: public.ecr.aws/simply-block/simplyblock:main-sfam-2359 + mgmtIfc: eth0 + maxLVol: 10 + maxSize: 0 + partitions: 0 + corePercentage: 65 + spdkDebug: false + coreIsolation: false + workerNodes: + - israel-storage-node-1 + - israel-storage-node-2 + - israel-storage-node-3 + + devices: + name: simplyblock-devices + + tasks: + name: simplyblock-task diff --git a/simplyblock_core/scripts/config_docker.sh b/simplyblock_core/scripts/config_docker.sh index 9f75cdde3..590664ca7 100644 --- a/simplyblock_core/scripts/config_docker.sh +++ b/simplyblock_core/scripts/config_docker.sh @@ -38,7 +38,7 @@ create_override ${DEV_IP} sudo systemctl daemon-reload sudo systemctl restart docker -activate-global-python-argcomplete --user +activate-global-python-argcomplete --user -y if [ ! 
-s "$HOME/.bashrc" ] || [ -z "$(grep "source $HOME/.bash_completion" $HOME/.bashrc)" ] then echo -e "\nsource $HOME/.bash_completion\n" >> $HOME/.bashrc diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index ba0f8b61d..e407d89d7 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -130,6 +130,7 @@ services: - 80:80 - 12202:12202 - 9200:9200 + - 9090:9090 networks: - localnet - monitoring-net @@ -349,6 +350,34 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + TasksRunnerLVolSyncDelete: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/tasks_runner_sync_lvol_del.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + + SnapshotReplication: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/snapshot_replication.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/scripts/haproxy.cfg b/simplyblock_core/scripts/haproxy.cfg index d95d3ebec..667989baf 100644 --- a/simplyblock_core/scripts/haproxy.cfg +++ b/simplyblock_core/scripts/haproxy.cfg @@ -65,6 +65,11 @@ backend graylog_input_services balance roundrobin server-template graylog_input- 1 graylog:12201 check resolvers docker init-addr libc,none +backend prometheus_input_services + mode tcp + balance roundrobin + server-template prometheus_input- 1 prometheus:9090 check resolvers docker init-addr libc,none + backend opensearch_services balance roundrobin http-request set-path %[path,regsub(^/opensearch/?,/)] @@ -85,3 +90,8 @@ 
frontend graylog_input_front bind *:12202 mode tcp default_backend graylog_input_services + +frontend prometheus_input_front + bind *:9090 + mode tcp + default_backend prometheus_input_services diff --git a/simplyblock_core/scripts/install_deps.sh b/simplyblock_core/scripts/install_deps.sh index 256a55500..56d0bf96e 100644 --- a/simplyblock_core/scripts/install_deps.sh +++ b/simplyblock_core/scripts/install_deps.sh @@ -2,15 +2,15 @@ if [[ "$1" == "docker" ]]; then sudo yum install -y yum-utils - sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.24.0-1.el9.noarch.rpm + sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.26.0-1.el9.noarch.rpm sudo yum install -y yum-utils xorg-x11-xauth nvme-cli fio tuned sudo yum install hostname pkg-config git wget python3-pip yum-utils \ iptables pciutils -y sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo - sudo yum install docker-ce docker-ce-cli \ - containerd.io docker-buildx-plugin docker-compose-plugin -y + sudo yum install docker-ce-29.1.3-1.el9 docker-ce-cli-29.1.3-1.el9 \ + containerd.io-2.2.0-2.el9 docker-buildx-plugin-0.30.1-1.el9 docker-compose-plugin-5.0.1-1.el9 -y sudo systemctl enable docker sudo systemctl start docker diff --git a/simplyblock_core/services/capacity_and_stats_collector.py b/simplyblock_core/services/capacity_and_stats_collector.py index 6f702d051..07a850edd 100644 --- a/simplyblock_core/services/capacity_and_stats_collector.py +++ b/simplyblock_core/services/capacity_and_stats_collector.py @@ -4,7 +4,6 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject logger = 
utils.get_logger(__name__) @@ -62,17 +61,17 @@ def add_device_stats(cl, device, capacity_dict, stats_dict): if last_record: time_diff = (now - last_record.date) if time_diff > 0: - data['read_bytes_ps'] = int((data['read_bytes'] - last_record['read_bytes']) / time_diff) - data['read_io_ps'] = int((data['read_io'] - last_record['read_io']) / time_diff) - data['read_latency_ps'] = int((data['read_latency_ticks'] - last_record['read_latency_ticks']) / time_diff) + data['read_bytes_ps'] = abs(int((data['read_bytes'] - last_record['read_bytes']) / time_diff)) + data['read_io_ps'] = abs(int((data['read_io'] - last_record['read_io']) / time_diff)) + data['read_latency_ps'] = abs(int((data['read_latency_ticks'] - last_record['read_latency_ticks']) / time_diff)) - data['write_bytes_ps'] = int((data['write_bytes'] - last_record['write_bytes']) / time_diff) - data['write_io_ps'] = int((data['write_io'] - last_record['write_io']) / time_diff) - data['write_latency_ps'] = int((data['write_latency_ticks'] - last_record['write_latency_ticks']) / time_diff) + data['write_bytes_ps'] = abs(int((data['write_bytes'] - last_record['write_bytes']) / time_diff)) + data['write_io_ps'] = abs(int((data['write_io'] - last_record['write_io']) / time_diff)) + data['write_latency_ps'] = abs(int((data['write_latency_ticks'] - last_record['write_latency_ticks']) / time_diff)) - data['unmap_bytes_ps'] = int((data['unmap_bytes'] - last_record['unmap_bytes']) / time_diff) - data['unmap_io_ps'] = int((data['unmap_io'] - last_record['unmap_io']) / time_diff) - data['unmap_latency_ps'] = int((data['unmap_latency_ticks'] - last_record['unmap_latency_ticks']) / time_diff) + data['unmap_bytes_ps'] = abs(int((data['unmap_bytes'] - last_record['unmap_bytes']) / time_diff)) + data['unmap_io_ps'] = abs(int((data['unmap_io'] - last_record['unmap_io']) / time_diff)) + data['unmap_latency_ps'] = abs(int((data['unmap_latency_ticks'] - last_record['unmap_latency_ticks']) / time_diff)) else: logger.warning("last 
record not found") @@ -83,6 +82,11 @@ def add_device_stats(cl, device, capacity_dict, stats_dict): stat_obj.write_to_db(db.kv_store) last_object_record[device.get_id()] = stat_obj + all_stats = db.get_device_stats(device, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -117,6 +121,11 @@ def add_node_stats(node, records): stat_obj = NodeStatObject(data=data) stat_obj.write_to_db(db.kv_store) + all_stats = db.get_node_stats(node, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -146,6 +155,11 @@ def add_cluster_stats(cl, records): stat_obj = ClusterStatObject(data=data) stat_obj.write_to_db(db.kv_store) + all_stats = db.get_cluster_stats(cl, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -173,15 +187,15 @@ def add_cluster_stats(cl, records): logger.error("No devices found in node: %s", node.get_id()) continue - rpc_client = RPCClient( - node.mgmt_ip, node.rpc_port, - node.rpc_username, node.rpc_password, - timeout=5, retry=2) - + rpc_client = node.rpc_client(timeout=5, retry=2) node_devs_stats = {} - ret = rpc_client.get_lvol_stats() - if ret: - node_devs_stats = {b['name']: b for b in ret['bdevs']} + try: + ret = rpc_client.get_lvol_stats() + if ret: + node_devs_stats = {b['name']: b for b in ret['bdevs']} + except Exception as e: + logger.error(e) + continue devices_records = [] for device in node.nvme_devices: @@ -189,7 +203,11 @@ def add_cluster_stats(cl, records): if device.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: logger.info(f"Device is skipped: {device.get_id()} status: {device.status}") continue - capacity_dict = rpc_client.alceml_get_capacity(device.alceml_name) + try: + capacity_dict = rpc_client.alceml_get_capacity(device.alceml_name) + except Exception as e: + logger.error(e) + continue if device.nvme_bdev in 
node_devs_stats: stats_dict = node_devs_stats[device.nvme_bdev] record = add_device_stats(cl, device, capacity_dict, stats_dict) diff --git a/simplyblock_core/services/health_check_service.py b/simplyblock_core/services/health_check_service.py index bb48e9620..8fc5f0489 100644 --- a/simplyblock_core/services/health_check_service.py +++ b/simplyblock_core/services/health_check_service.py @@ -1,4 +1,5 @@ # coding=utf-8 +import threading import time from datetime import datetime @@ -10,10 +11,10 @@ from simplyblock_core.rpc_client import RPCClient from simplyblock_core import constants, db_controller, distr_controller, storage_node_ops -logger = utils.get_logger(__name__) - utils.init_sentry_sdk() +logger = utils.get_logger(__name__) + def set_node_health_check(snode, health_check_status): snode = db.get_storage_node_by_id(snode.get_id()) @@ -42,223 +43,242 @@ def set_device_health_check(cluster_id, device, health_check_status): return -# get DB controller -db = db_controller.DBController() +def check_node(snode): -logger.info("Starting health check service") -while True: - clusters = db.get_clusters() - for cluster in clusters: - cluster_id = cluster.get_id() - snodes = db.get_storage_nodes_by_cluster_id(cluster_id) - if not snodes: - logger.warning("storage nodes list is empty") - - for snode in snodes: - logger.info("Node: %s, status %s", snode.get_id(), snode.status) - - if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, - StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - logger.info(f"Node status is: {snode.status}, skipping") - set_node_health_check(snode, False) - for device in snode.nvme_devices: - set_device_health_check(cluster_id, device, False) - continue - - # 1- check node ping - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... 
{ping_check}") - - # 2- check node API - node_api_check = health_controller._check_node_api(snode.mgmt_ip) - logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") - - # 3- check node RPC - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") - - is_node_online = ping_check and node_api_check and node_rpc_check - - health_check_status = is_node_online - if node_rpc_check: - logger.info(f"Node device count: {len(snode.nvme_devices)}") - node_devices_check = True - node_remote_devices_check = True - - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, - timeout=3, retry=2) - connected_devices = [] - - node_bdevs = rpc_client.get_bdevs() - if node_bdevs: - # node_bdev_names = [b['name'] for b in node_bdevs] - node_bdev_names = {} - for b in node_bdevs: - node_bdev_names[b['name']] = b - for al in b['aliases']: - node_bdev_names[al] = b - else: - node_bdev_names = {} - - subsystem_list = rpc_client.subsystem_list() or [] - subsystems = { - subsystem['nqn']: subsystem - for subsystem - in subsystem_list - } - - for device in snode.nvme_devices: - passed = True - - if device.io_error: - logger.info(f"Device io_error {device.get_id()}") - passed = False - - if device.status != NVMeDevice.STATUS_ONLINE: - logger.info(f"Device status {device.status}") - passed = False - - if snode.enable_test_device: - bdevs_stack = [device.nvme_bdev, device.testing_bdev, device.alceml_bdev, device.pt_bdev] - else: - bdevs_stack = [device.nvme_bdev, device.alceml_bdev, device.pt_bdev] - - logger.info(f"Checking Device: {device.get_id()}, status:{device.status}") - problems = 0 - for bdev in bdevs_stack: - if not bdev: - continue - - if not health_controller.check_bdev(bdev, bdev_names=node_bdev_names): - problems += 1 - passed = False - - logger.info(f"Checking 
Device's BDevs ... ({(len(bdevs_stack) - problems)}/{len(bdevs_stack)})") - - passed &= health_controller.check_subsystem(device.nvmf_nqn, nqns=subsystems) - - set_device_health_check(cluster_id, device, passed) - if device.status == NVMeDevice.STATUS_ONLINE: - node_devices_check &= passed - - logger.info(f"Node remote device: {len(snode.remote_devices)}") - - for remote_device in snode.remote_devices: - org_dev = db.get_storage_device_by_id(remote_device.get_id()) - org_node = db.get_storage_node_by_id(remote_device.node_id) - if org_dev.status == NVMeDevice.STATUS_ONLINE and org_node.status == StorageNode.STATUS_ONLINE: - if health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names): - connected_devices.append(remote_device.get_id()) - continue - - if not org_dev.alceml_bdev: - logger.error(f"device alceml bdev not found!, {org_dev.get_id()}") - continue - - try: - storage_node_ops.connect_device( - f"remote_{org_dev.alceml_bdev}", org_dev, snode, - bdev_names=list(node_bdev_names), reattach=False, - ) - connected_devices.append(org_dev.get_id()) - sn = db.get_storage_node_by_id(snode.get_id()) - for d in sn.remote_devices: - if d.get_id() == remote_device.get_id(): - d.status = NVMeDevice.STATUS_ONLINE - sn.write_to_db() - break - distr_controller.send_dev_status_event(org_dev, NVMeDevice.STATUS_ONLINE, snode) - except RuntimeError: - logger.error(f"Failed to connect to device: {org_dev.get_id()}") - node_remote_devices_check = False - - connected_jms = [] - if snode.jm_device and snode.jm_device.get_id(): - jm_device = snode.jm_device - logger.info(f"Node JM: {jm_device.get_id()}") - if jm_device.jm_bdev in node_bdev_names: - logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... 
ok") - connected_jms.append(jm_device.get_id()) + snode = db.get_storage_node_by_id(snode.get_id()) + logger.info("Node: %s, status %s", snode.get_id(), snode.status) + + if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, + StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + logger.info(f"Node status is: {snode.status}, skipping") + set_node_health_check(snode, False) + for device in snode.nvme_devices: + set_device_health_check(snode.cluster_id, device, False) + return + + # 1- check node ping + ping_check = health_controller._check_node_ping(snode.mgmt_ip) + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") + + # 2- check node API + node_api_check = health_controller._check_node_api(snode.mgmt_ip) + logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") + + # 3- check node RPC + node_rpc_check = health_controller._check_node_rpc( + snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... 
{node_rpc_check}") + + is_node_online = ping_check and node_api_check and node_rpc_check + + health_check_status = is_node_online + if node_rpc_check: + logger.info(f"Node device count: {len(snode.nvme_devices)}") + node_devices_check = True + node_remote_devices_check = True + + rpc_client = RPCClient( + snode.mgmt_ip, snode.rpc_port, + snode.rpc_username, snode.rpc_password, + timeout=3, retry=2) + connected_devices = [] + + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + # node_bdev_names = [b['name'] for b in node_bdevs] + node_bdev_names = {} + for b in node_bdevs: + node_bdev_names[b['name']] = b + for al in b['aliases']: + node_bdev_names[al] = b + else: + node_bdev_names = {} + + subsystem_list = rpc_client.subsystem_list() or [] + subsystems = { + subsystem['nqn']: subsystem + for subsystem + in subsystem_list + } + + for device in snode.nvme_devices: + passed = True + + if device.io_error: + logger.info(f"Device io_error {device.get_id()}") + passed = False + + if device.status != NVMeDevice.STATUS_ONLINE: + logger.info(f"Device status {device.status}") + passed = False + + if snode.enable_test_device: + bdevs_stack = [device.nvme_bdev, device.testing_bdev, device.alceml_bdev, device.pt_bdev] + else: + bdevs_stack = [device.nvme_bdev, device.alceml_bdev, device.pt_bdev] + + logger.info(f"Checking Device: {device.get_id()}, status:{device.status}") + problems = 0 + for bdev in bdevs_stack: + if not bdev: + continue + + if not health_controller.check_bdev(bdev, bdev_names=node_bdev_names): + problems += 1 + passed = False + + logger.info(f"Checking Device's BDevs ... 
({(len(bdevs_stack) - problems)}/{len(bdevs_stack)})") + + passed &= health_controller.check_subsystem(device.nvmf_nqn, nqns=subsystems) + + set_device_health_check(snode.cluster_id, device, passed) + if device.status == NVMeDevice.STATUS_ONLINE: + node_devices_check &= passed + + logger.info(f"Node remote device: {len(snode.remote_devices)}") + + for remote_device in snode.remote_devices: + org_dev = db.get_storage_device_by_id(remote_device.get_id()) + org_node = db.get_storage_node_by_id(remote_device.node_id) + if org_dev.status == NVMeDevice.STATUS_ONLINE and org_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + if health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names): + connected_devices.append(remote_device.get_id()) + continue + + if not org_dev.alceml_bdev: + logger.error(f"device alceml bdev not found!, {org_dev.get_id()}") + continue + + try: + storage_node_ops.connect_device( + f"remote_{org_dev.alceml_bdev}", org_dev, snode, + bdev_names=list(node_bdev_names), reattach=False, + ) + connected_devices.append(org_dev.get_id()) + sn = db.get_storage_node_by_id(snode.get_id()) + for d in sn.remote_devices: + if d.get_id() == remote_device.get_id(): + d.status = NVMeDevice.STATUS_ONLINE + sn.write_to_db() + break + distr_controller.send_dev_status_event(org_dev, NVMeDevice.STATUS_ONLINE, snode) + except RuntimeError: + logger.error(f"Failed to connect to device: {org_dev.get_id()}") + node_remote_devices_check = False + + connected_jms = [] + if snode.jm_device and snode.jm_device.get_id(): + jm_device = snode.jm_device + logger.info(f"Node JM: {jm_device.get_id()}") + if jm_device.jm_bdev in node_bdev_names: + logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... ok") + connected_jms.append(jm_device.get_id()) + else: + logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... 
not found") + + if snode.enable_ha_jm: + logger.info(f"Node remote JMs: {len(snode.remote_jm_devices)}") + for remote_device in snode.remote_jm_devices: + if remote_device.remote_bdev: + check = health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names) + if check: + connected_jms.append(remote_device.get_id()) else: - logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... not found") - - if snode.enable_ha_jm: - logger.info(f"Node remote JMs: {len(snode.remote_jm_devices)}") - for remote_device in snode.remote_jm_devices: - if remote_device.remote_bdev: - check = health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names) - if check: - connected_jms.append(remote_device.get_id()) - else: + node_remote_devices_check = False + + for jm_id in snode.jm_ids: + if jm_id and jm_id not in connected_jms: + for nd in db.get_storage_nodes(): + if nd.jm_device and nd.jm_device.get_id() == jm_id: + if nd.status == StorageNode.STATUS_ONLINE: node_remote_devices_check = False + break - for jm_id in snode.jm_ids: - if jm_id and jm_id not in connected_jms: - for nd in db.get_storage_nodes(): - if nd.jm_device and nd.jm_device.get_id() == jm_id: - if nd.status == StorageNode.STATUS_ONLINE: - node_remote_devices_check = False - break - - if not node_remote_devices_check and cluster.status in [ - Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - snode = db.get_storage_node_by_id(snode.get_id()) - snode.remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(snode) - snode.write_to_db() - - lvstore_check = True + if not node_remote_devices_check and cluster.status in [ + Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(snode) snode = db.get_storage_node_by_id(snode.get_id()) - if snode.lvstore_status == "ready" or snode.status == StorageNode.STATUS_ONLINE or \ - snode.lvstore_status == "failed" : + 
snode.remote_jm_devices = remote_jm_devices + snode.write_to_db() - lvstore_stack = snode.lvstore_stack + lvstore_check = True + snode = db.get_storage_node_by_id(snode.get_id()) + if snode.lvstore_status == "ready" or snode.status == StorageNode.STATUS_ONLINE or \ + snode.lvstore_status == "failed": + + lvstore_stack = snode.lvstore_stack + lvstore_check &= health_controller._check_node_lvstore( + lvstore_stack, snode, auto_fix=True, node_bdev_names=node_bdev_names) + + if snode.secondary_node_id: + + lvstore_check &= health_controller._check_node_hublvol( + snode, node_bdev_names=node_bdev_names, node_lvols_nqns=subsystems) + + second_node_1 = db.get_storage_node_by_id(snode.secondary_node_id) + if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: lvstore_check &= health_controller._check_node_lvstore( - lvstore_stack, snode, auto_fix=True, node_bdev_names=node_bdev_names) - - if snode.secondary_node_id: - - lvstore_check &= health_controller._check_node_hublvol( - snode, node_bdev_names=node_bdev_names, node_lvols_nqns=subsystems) - - second_node_1 = db.get_storage_node_by_id(snode.secondary_node_id) - if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: - lvstore_check &= health_controller._check_node_lvstore( - lvstore_stack, second_node_1, auto_fix=True, stack_src_node=snode) - sec_node_check = health_controller._check_sec_node_hublvol(second_node_1) - if not sec_node_check: - if snode.status == StorageNode.STATUS_ONLINE: - ret = second_node_1.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if ret: - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - # is_sec_node_leader = True - # check jc_compression status - jc_compression_is_active = second_node_1.rpc_client().jc_compression_get_status(snode.jm_vuid) - if not jc_compression_is_active: - lvstore_check &= health_controller._check_sec_node_hublvol(second_node_1, auto_fix=True) - - - lvol_port_check = False - # if node_api_check: - 
ports = [snode.lvol_subsys_port] - - if snode.lvstore_stack_secondary_1: - second_node_1 = db.get_storage_node_by_id(snode.lvstore_stack_secondary_1) - if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: - ports.append(second_node_1.lvol_subsys_port) - - for port in ports: - lvol_port_check = health_controller._check_port_on_node(snode, port) - logger.info( - f"Check: node {snode.mgmt_ip}, port: {port} ... {lvol_port_check}") - if not lvol_port_check and snode.status != StorageNode.STATUS_SUSPENDED: - tasks_controller.add_port_allow_task(snode.cluster_id, snode.get_id(), port) - - health_check_status = is_node_online and node_devices_check and node_remote_devices_check and lvstore_check - set_node_health_check(snode, bool(health_check_status)) + lvstore_stack, second_node_1, auto_fix=True, stack_src_node=snode) + sec_node_check = health_controller._check_sec_node_hublvol(second_node_1) + if not sec_node_check: + if snode.status == StorageNode.STATUS_ONLINE: + ret = second_node_1.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if ret: + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + # is_sec_node_leader = True + # check jc_compression status + jc_compression_is_active = second_node_1.rpc_client().jc_compression_get_status( + snode.jm_vuid) + if not jc_compression_is_active: + lvstore_check &= health_controller._check_sec_node_hublvol(second_node_1, + auto_fix=True) + + lvol_port_check = False + # if node_api_check: + ports = [snode.lvol_subsys_port] + + if snode.lvstore_stack_secondary_1: + second_node_1 = db.get_storage_node_by_id(snode.lvstore_stack_secondary_1) + if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: + ports.append(second_node_1.lvol_subsys_port) + + for port in ports: + try: + lvol_port_check = health_controller.check_port_on_node(snode, port) + logger.info( + f"Check: node {snode.mgmt_ip}, port: {port} ... 
{lvol_port_check}") + if not lvol_port_check and snode.status != StorageNode.STATUS_SUSPENDED: + tasks_controller.add_port_allow_task(snode.cluster_id, snode.get_id(), port) + except Exception: + logger.error("Check node port failed, connection error") + + health_check_status = is_node_online and node_devices_check and node_remote_devices_check and lvstore_check + set_node_health_check(snode, bool(health_check_status)) + time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) + + +def loop_for_node(snode): + while True: + try: + check_node(snode) + except Exception as e: + logger.error(e) + time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) + + +logger.info("Starting health check service") +db = db_controller.DBController() +threads_maps: dict[str, threading.Thread] = {} +while True: + clusters = db.get_clusters() + for cluster in clusters: + for node in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + node_id = node.get_id() + if node_id not in threads_maps or threads_maps[node_id].is_alive() is False: + t = threading.Thread(target=loop_for_node, args=(node,)) + t.start() + threads_maps[node_id] = t time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py index 884b67396..79c492a40 100644 --- a/simplyblock_core/services/lvol_monitor.py +++ b/simplyblock_core/services/lvol_monitor.py @@ -60,8 +60,8 @@ def resume_comp(lvol): return rpc_client = RPCClient( node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=2) - ret, err = rpc_client.jc_compression_start(jm_vuid=node.jm_vuid) - if err and "code" in err and err["code"] != -2: + ret, err = rpc_client.jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if err: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task(node.cluster_id, node.get_id(), node.jm_vuid) @@ -118,22 +118,24 @@ def process_lvol_delete_finish(lvol): 
lvol_controller.delete_lvol_from_node(lvol.get_id(), leader_node.get_id()) return + if snode.get_id() == leader_node.get_id(): + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + else: + sec_node = db.get_storage_node_by_id(snode.get_id()) + # 3-1 async delete lvol bdev from primary primary_node = db.get_storage_node_by_id(leader_node.get_id()) if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if sec_node and sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + primary_node.lvol_del_sync_lock() ret = lvol_controller.delete_lvol_from_node(lvol.get_id(), primary_node.get_id(), del_async=True) if not ret: logger.error(f"Failed to delete lvol from primary_node node: {primary_node.get_id()}") # 3-2 async delete lvol bdev from secondary - if snode.get_id() == leader_node.get_id(): - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - else: - sec_node = db.get_storage_node_by_id(snode.get_id()) - - if sec_node: - sec_node.lvol_sync_del_queue.append(f"{lvol.lvs_name}/{lvol.lvol_bdev}") - sec_node.write_to_db() + if sec_node and sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}", primary_node.get_id()) lvol_events.lvol_delete(lvol) lvol.remove(db.kv_store) @@ -160,6 +162,193 @@ def process_lvol_delete_try_again(lvol): lvol.write_to_db() +def check_node(snode): + node_bdev_names = [] + node_lvols_nqns = {} + sec_node_bdev_names = {} + sec_node_lvols_nqns = {} + sec_node = None + + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + node_bdevs = snode.rpc_client().get_bdevs() + if node_bdevs: + node_bdev_names = [b['name'] for b in node_bdevs] + for bdev 
in node_bdevs: + if "aliases" in bdev and bdev["aliases"]: + node_bdev_names.extend(bdev['aliases']) + ret = snode.rpc_client().subsystem_list() + if ret: + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + + if snode.secondary_node_id: + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + sec_rpc_client = RPCClient( + sec_node.mgmt_ip, sec_node.rpc_port, + sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) + ret = sec_rpc_client.get_bdevs() + if ret: + for bdev in ret: + sec_node_bdev_names[bdev['name']] = bdev + + ret = sec_rpc_client.subsystem_list() + if ret: + for sub in ret: + sec_node_lvols_nqns[sub['nqn']] = sub + + for lvol in db.get_lvols_by_node_id(snode.get_id()): + + if lvol.status == LVol.STATUS_IN_CREATION: + continue + + if lvol.status == lvol.STATUS_IN_DELETION: + # check leadership + leader_node = None + snode = db.get_storage_node_by_id(snode.get_id()) + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = snode + + if not leader_node and sec_node: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = sec_node + + if not leader_node: + raise Exception("Failed to get leader node") + + if lvol.deletion_status == "" or lvol.deletion_status != leader_node.get_id(): + lvol_controller.delete_lvol_from_node(lvol.get_id(), leader_node.get_id()) + time.sleep(3) + + try: + ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status( + f"{lvol.lvs_name}/{lvol.lvol_bdev}") + except Exception as e: + logger.error(e) + # 
timeout detected, check other node + break + + if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed + process_lvol_delete_finish(lvol) + + elif ret == 1: # Async lvol deletion is in progress or queued + logger.info(f"LVol deletion in progress, id: {lvol.get_id()}") + pre_lvol_delete_rebalance() + + elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Async deletion is done, but leadership has changed (sync deletion is now blocked)") + + elif ret == 4: # No async delete request exists for this lvol + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("No async delete request exists for this lvol") + lvol = db.get_lvol_by_id(lvol.get_id()) + lvol.io_error = True + lvol.write_to_db() + set_lvol_status(lvol, LVol.STATUS_OFFLINE) + + elif ret == -1: # Operation not permitted + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Operation not permitted") + lvol = db.get_lvol_by_id(lvol.get_id()) + lvol.io_error = True + lvol.write_to_db() + set_lvol_status(lvol, LVol.STATUS_OFFLINE) + + elif ret == -2: # No such file or directory + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("No such file or directory") + process_lvol_delete_finish(lvol) + + elif ret == -5: # I/O error + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("I/O error") + process_lvol_delete_try_again(lvol) + + elif ret == -11: # Try again + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Try again") + process_lvol_delete_try_again(lvol) + + elif ret == -12: # Out of memory + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Out of memory") + process_lvol_delete_try_again(lvol) + + 
elif ret == -16: # Device or resource busy + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Device or resource busy") + process_lvol_delete_try_again(lvol) + + elif ret == -19: # No such device + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Finishing lvol delete") + process_lvol_delete_finish(lvol) + + elif ret == -35: # Leadership changed + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Leadership changed") + process_lvol_delete_try_again(lvol) + + elif ret == -36: # Failed to update lvol for deletion + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Failed to update lvol for deletion") + process_lvol_delete_try_again(lvol) + + else: # Failed to update lvol for deletion + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Failed to update lvol for deletion") + + continue + + passed = True + try: + ret = health_controller.check_lvol_on_node( + lvol.get_id(), lvol.node_id, node_bdev_names, node_lvols_nqns) + if not ret: + passed = False + except Exception as e: + logger.error(f"Failed to check lvol:{lvol.get_id()} on node: {lvol.node_id}") + logger.error(e) + + if lvol.ha_type == "ha": + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + try: + ret = health_controller.check_lvol_on_node( + lvol.get_id(), snode.secondary_node_id, sec_node_bdev_names, sec_node_lvols_nqns) + if not ret: + passed = False + else: + passed = True + except Exception as e: + logger.error(f"Failed to check lvol: {lvol.get_id()} on node: {snode.secondary_node_id}") + logger.error(e) + + if snode.lvstore_status == "ready": + + logger.info(f"LVol: {lvol.get_id()}, is healthy: {passed}") + set_lvol_health_check(lvol, passed) + if passed: + set_lvol_status(lvol, LVol.STATUS_ONLINE) + + if 
snode.lvstore_status == "ready": + + for snap in db.get_snapshots_by_node_id(snode.get_id()): + present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + set_snapshot_health_check(snap, present) + + + # get DB controller db = db_controller.DBController() @@ -173,195 +362,9 @@ def process_lvol_delete_try_again(lvol): continue for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): - node_bdev_names = [] - node_lvols_nqns = {} - sec_node_bdev_names = {} - sec_node_lvols_nqns = {} - sec_node = None - - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - node_bdevs = snode.rpc_client().get_bdevs() - if node_bdevs: - node_bdev_names = [b['name'] for b in node_bdevs] - for bdev in node_bdevs: - if "aliases" in bdev and bdev["aliases"]: - node_bdev_names.extend(bdev['aliases']) - ret = snode.rpc_client().subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - - if snode.secondary_node_id: - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - ret = sec_rpc_client.get_bdevs() - if ret: - for bdev in ret: - sec_node_bdev_names[bdev['name']] = bdev - - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - - for lvol in db.get_lvols_by_node_id(snode.get_id()): - - if lvol.status == LVol.STATUS_IN_CREATION: - continue - - if lvol.status == lvol.STATUS_IN_DELETION: - # check leadership - leader_node = None - snode = db.get_storage_node_by_id(snode.get_id()) - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol info") - lvs_info = ret[0] - 
if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = snode - - if not leader_node and sec_node: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = sec_node - - if not leader_node: - raise Exception("Failed to get leader node") - - if lvol.deletion_status == "" or lvol.deletion_status != leader_node.get_id(): - lvol_controller.delete_lvol_from_node(lvol.get_id(), leader_node.get_id()) - time.sleep(3) - - try: - ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status( - f"{lvol.lvs_name}/{lvol.lvol_bdev}") - except Exception as e: - logger.error(e) - # timeout detected, check other node - break - - if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed - process_lvol_delete_finish(lvol) - - elif ret == 1: # Async lvol deletion is in progress or queued - logger.info(f"LVol deletion in progress, id: {lvol.get_id()}") - pre_lvol_delete_rebalance() - - elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Async deletion is done, but leadership has changed (sync deletion is now blocked)") - - elif ret == 4: # No async delete request exists for this lvol - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("No async delete request exists for this lvol") - lvol = db.get_lvol_by_id(lvol.get_id()) - lvol.io_error = True - lvol.write_to_db() - set_lvol_status(lvol, LVol.STATUS_OFFLINE) - - elif ret == -1: # Operation not permitted - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Operation not permitted") - lvol = db.get_lvol_by_id(lvol.get_id()) - lvol.io_error = True - lvol.write_to_db() - 
set_lvol_status(lvol, LVol.STATUS_OFFLINE) - - elif ret == -2: # No such file or directory - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("No such file or directory") - process_lvol_delete_finish(lvol) - - elif ret == -5: # I/O error - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("I/O error") - process_lvol_delete_try_again(lvol) - - elif ret == -11: # Try again - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Try again") - process_lvol_delete_try_again(lvol) - - elif ret == -12: # Out of memory - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Out of memory") - process_lvol_delete_try_again(lvol) - - elif ret == -16: # Device or resource busy - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Device or resource busy") - process_lvol_delete_try_again(lvol) - - elif ret == -19: # No such device - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Finishing lvol delete") - process_lvol_delete_finish(lvol) - - elif ret == -35: # Leadership changed - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Leadership changed") - process_lvol_delete_try_again(lvol) - - elif ret == -36: # Failed to update lvol for deletion - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Failed to update lvol for deletion") - process_lvol_delete_try_again(lvol) - - else: # Failed to update lvol for deletion - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Failed to update lvol for deletion") - - continue - - passed = True - ret = health_controller.check_lvol_on_node( - lvol.get_id(), lvol.node_id, node_bdev_names, node_lvols_nqns) - if not ret: - passed = False - - if lvol.ha_type == "ha": - sec_node = 
db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - ret = health_controller.check_lvol_on_node( - lvol.get_id(), snode.secondary_node_id, sec_node_bdev_names, sec_node_lvols_nqns) - if not ret: - passed = False - else: - passed = True - - if snode.lvstore_status == "ready": - - logger.info(f"LVol: {lvol.get_id()}, is healthy: {passed}") - set_lvol_health_check(lvol, passed) - if passed: - set_lvol_status(lvol, LVol.STATUS_ONLINE) - - if snode.lvstore_status == "ready": - - for snap in db.get_snapshots_by_node_id(snode.get_id()): - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) - set_snapshot_health_check(snap, present) - - snode = db.get_storage_node_by_id(snode.get_id()) - if snode.status == StorageNode.STATUS_ONLINE: - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error(f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() + try: + check_node(snode) + except Exception as e: + logger.error(e) time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/lvol_stat_collector.py b/simplyblock_core/services/lvol_stat_collector.py index 09aa7d571..18f09d4ce 100644 --- a/simplyblock_core/services/lvol_stat_collector.py +++ b/simplyblock_core/services/lvol_stat_collector.py @@ -7,7 +7,6 @@ from simplyblock_core.models.lvol_model import LVol from simplyblock_core.models.stats import LVolStatObject, PoolStatObject from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient logger = 
utils.get_logger(__name__) @@ -154,6 +153,11 @@ def add_lvol_stats(cluster, lvol, stats_list, capacity_dict=None): stat_obj.write_to_db(db.kv_store) last_object_record[lvol.get_id()] = stat_obj + all_stats = db.get_lvol_stats(lvol, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -173,6 +177,12 @@ def add_pool_stats(pool, records): stat_obj = PoolStatObject(data=data) stat_obj.write_to_db(db.kv_store) + + all_stats = db.get_pool_stats(pool, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -201,68 +211,66 @@ def add_pool_stats(pool, records): continue if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + try: + rpc_client = snode.rpc_client(timeout=3, retry=2) + if snode.get_id() in all_node_bdev_names and all_node_bdev_names[snode.get_id()]: + node_bdev_names = all_node_bdev_names[snode.get_id()] + else: + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + node_bdev_names = {b['name']: b for b in node_bdevs} + all_node_bdev_names[snode.get_id()] = node_bdev_names - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=3, retry=2) - - if snode.get_id() in all_node_bdev_names and all_node_bdev_names[snode.get_id()]: - node_bdev_names = all_node_bdev_names[snode.get_id()] - else: - node_bdevs = rpc_client.get_bdevs() - if node_bdevs: - node_bdev_names = {b['name']: b for b in node_bdevs} - all_node_bdev_names[snode.get_id()] = node_bdev_names - - if snode.get_id() in all_node_lvols_nqns and all_node_lvols_nqns[snode.get_id()]: - node_lvols_nqns = all_node_lvols_nqns[snode.get_id()] - else: - ret = rpc_client.subsystem_list() - if ret: - node_lvols_nqns = {} - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - all_node_lvols_nqns[snode.get_id()] = node_lvols_nqns - - if snode.get_id() in all_node_lvols_stats and 
all_node_lvols_stats[snode.get_id()]: - node_lvols_stats = all_node_lvols_stats[snode.get_id()] - else: - ret = rpc_client.get_lvol_stats() - if ret: - node_lvols_stats = {} - for st in ret['bdevs']: - node_lvols_stats[st['name']] = st - all_node_lvols_stats[snode.get_id()] = node_lvols_stats - - if snode.secondary_node_id: - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - - if sec_node.get_id() not in all_node_bdev_names or not all_node_bdev_names[sec_node.get_id()]: - ret = sec_rpc_client.get_bdevs() - if ret: - # node_bdev_names = {} - node_bdev_names = {b['name']: b for b in ret} - all_node_bdev_names[sec_node.get_id()] = node_bdev_names - - if sec_node.get_id() not in all_node_lvols_nqns or not all_node_lvols_nqns[sec_node.get_id()]: - ret = sec_rpc_client.subsystem_list() + if snode.get_id() in all_node_lvols_nqns and all_node_lvols_nqns[snode.get_id()]: + node_lvols_nqns = all_node_lvols_nqns[snode.get_id()] + else: + ret = rpc_client.subsystem_list() if ret: node_lvols_nqns = {} for sub in ret: node_lvols_nqns[sub['nqn']] = sub - all_node_lvols_nqns[sec_node.get_id()] = node_lvols_nqns + all_node_lvols_nqns[snode.get_id()] = node_lvols_nqns - if sec_node.get_id() not in all_node_lvols_stats or not all_node_lvols_stats[sec_node.get_id()]: - ret = sec_rpc_client.get_lvol_stats() + if snode.get_id() in all_node_lvols_stats and all_node_lvols_stats[snode.get_id()]: + node_lvols_stats = all_node_lvols_stats[snode.get_id()] + else: + ret = rpc_client.get_lvol_stats() if ret: - sec_node_lvols_stats = {} + node_lvols_stats = {} for st in ret['bdevs']: - sec_node_lvols_stats[st['name']] = st - all_node_lvols_stats[sec_node.get_id()] = sec_node_lvols_stats + node_lvols_stats[st['name']] = st + all_node_lvols_stats[snode.get_id()] = node_lvols_stats + 
except Exception as e: + logger.error(e) + + if snode.secondary_node_id: + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: + try: + sec_rpc_client = sec_node.rpc_client(timeout=3, retry=2) + if sec_node.get_id() not in all_node_bdev_names or not all_node_bdev_names[sec_node.get_id()]: + ret = sec_rpc_client.get_bdevs() + if ret: + # node_bdev_names = {} + node_bdev_names = {b['name']: b for b in ret} + all_node_bdev_names[sec_node.get_id()] = node_bdev_names + if sec_node.get_id() not in all_node_lvols_nqns or not all_node_lvols_nqns[sec_node.get_id()]: + ret = sec_rpc_client.subsystem_list() + if ret: + node_lvols_nqns = {} + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + all_node_lvols_nqns[sec_node.get_id()] = node_lvols_nqns + + if sec_node.get_id() not in all_node_lvols_stats or not all_node_lvols_stats[sec_node.get_id()]: + ret = sec_rpc_client.get_lvol_stats() + if ret: + sec_node_lvols_stats = {} + for st in ret['bdevs']: + sec_node_lvols_stats[st['name']] = st + all_node_lvols_stats[sec_node.get_id()] = sec_node_lvols_stats + except Exception as e: + logger.error(e) for lvol in lvol_list: if lvol.status in [LVol.STATUS_IN_CREATION, LVol.STATUS_IN_DELETION]: diff --git a/simplyblock_core/services/main_distr_event_collector.py b/simplyblock_core/services/main_distr_event_collector.py index 31dffeda0..93e0ae4df 100644 --- a/simplyblock_core/services/main_distr_event_collector.py +++ b/simplyblock_core/services/main_distr_event_collector.py @@ -1,7 +1,7 @@ # coding=utf-8 import threading import time - +from datetime import datetime from simplyblock_core import constants, db_controller, utils, rpc_client, distr_controller from simplyblock_core.controllers import events_controller, device_controller @@ -9,9 +9,8 @@ from simplyblock_core.models.storage_node import StorageNode -logger = utils.get_logger(__name__) - utils.init_sentry_sdk() +logger = utils.get_logger(__name__) # get 
DB controller db = db_controller.DBController() @@ -19,7 +18,17 @@ EVENTS_LIST = ['SPDK_BDEV_EVENT_REMOVE', "error_open", 'error_read', "error_write", "error_unmap", "error_write_cannot_allocate"] -def process_device_event(event): + +def remove_remote_device_from_node(node_id, device_id): + node = db.get_storage_node_by_id(node_id) + for remote_dev in node.remote_devices: + if remote_dev.get_id() == device_id: + node.remote_devices.remove(remote_dev) + node.write_to_db() + break + + +def process_device_event(event, logger): if event.message in EVENTS_LIST: node_id = event.node_id storage_id = event.storage_id @@ -39,15 +48,31 @@ def process_device_event(event): event.status = 'device_not_found' return - if device_obj.connecting_from_node == event_node_obj.get_id(): + if "timestamp" in event.object_dict: + ev_time = event.object_dict['timestamp'] + time_delta = datetime.now() - datetime.strptime(ev_time, '%Y-%m-%dT%H:%M:%S.%fZ') + if time_delta.total_seconds() > 8: + if snode.rpc_client().bdev_nvme_controller_list(device_obj.nvme_controller): + logger.info(f"event was fired {time_delta.total_seconds()} seconds ago, controller ok, skipping") + event.status = f'skipping_late_by_{int(time_delta.total_seconds())}s_but_controller_ok' + return + + logger.info(f"event was fired {time_delta.total_seconds()} seconds ago, checking controller filed") + event.status = f'late_by_{int(time_delta.total_seconds())}s' + + if device_obj.is_connection_in_progress_to_node(event_node_obj.get_id()): logger.warning("Connection attempt was found from node to device, sleeping 5 seconds") time.sleep(5) + device_obj.lock_device_connection(event_node_obj.get_id()) + if device_obj.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: logger.info(f"The device is not online, skipping. 
status: {device_obj.status}") event.status = f'skipped:dev_{device_obj.status}' distr_controller.send_dev_status_event(device_obj, device_obj.status, event_node_obj) + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return @@ -55,12 +80,16 @@ def process_device_event(event): distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) logger.info(f"Node is not online, skipping. status: {event_node_obj.status}") event.status = 'skipped:node_offline' + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return if device_node_obj.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) logger.info(f"Node is not online, skipping. status: {device_node_obj.status}") event.status = f'skipped:device_node_{device_node_obj.status}' + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return @@ -83,63 +112,37 @@ def process_device_event(event): device_controller.device_set_io_error(device_obj.get_id(), True) else: distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) - event_node_obj = db.get_storage_node_by_id(event_node_obj.get_id()) - for dev in event_node_obj.remote_devices: - if dev.get_id() == device_obj.get_id(): - event_node_obj.remote_devices.remove(dev) - event_node_obj.write_to_db() - break + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) event.status = 'processed' + device_obj.release_device_connection() -def process_lvol_event(event): +def process_lvol_event(event, logger): if event.message in ["error_open", 'error_read', "error_write", "error_unmap"]: vuid = event.object_dict['vuid'] - # node_id = event.node_id - # 
storage_node_ops.set_node_status(node_id, StorageNode.STATUS_SUSPENDED) - # event_node_obj = db.get_storage_node_by_id(node_id) - # tasks_controller.add_node_to_auto_restart(event_node_obj) - - # lvols = [] - # for lv in db.get_lvols(): # pass - # if lv.node_id == node_id: - # lvols.append(lv) - # - # if not lvols: - # logger.error(f"LVols on node {node_id} not found") - # event.status = 'lvols_not_found' - # else: - # for lvol in lvols: - # if lvol.status == LVol.STATUS_ONLINE: - # logger.info("Setting LVol to offline") - # lvol.io_error = True - # old_status = lvol.status - # lvol.status = LVol.STATUS_OFFLINE - # lvol.write_to_db(db.kv_store) - # lvol_events.lvol_status_change(lvol, lvol.status, old_status, caused_by="monitor") - # lvol_events.lvol_io_error_change(lvol, True, False, caused_by="monitor") event.status = f'distr error {vuid}' else: logger.error(f"Unknown event message: {event.message}") event.status = "event_unknown" -def process_event(event): +def process_event(event, logger): if event.event == "device_status": if event.storage_id >= 0: - process_device_event(event) + process_device_event(event, logger) if event.vuid >= 0: - process_lvol_event(event) + process_lvol_event(event, logger) event.write_to_db(db.kv_store) def start_event_collector_on_node(node_id): + snode = db.get_storage_node_by_id(node_id) + logger.info(f"Starting Distr event collector on node: {node_id}") - snode = db.get_storage_node_by_id(node_id) client = rpc_client.RPCClient( snode.mgmt_ip, snode.rpc_port, @@ -151,6 +154,7 @@ def start_event_collector_on_node(node_id): while True: page = 1 events_groups = {} + events_list = [] while True: try: events = client.distr_status_events_discard_then_get( @@ -181,14 +185,17 @@ def start_event_collector_on_node(node_id): events_groups[sid][et][msg]: 1 # type: ignore else: events_groups[sid][et][msg].count += 1 # type: ignore - events_groups[sid][et][msg].write_to_db() # type: ignore - logger.info(f"Event {msg} already processed") continue 
event = events_controller.log_distr_event(snode.cluster_id, snode.get_id(), event_dict) logger.info(f"Processing event: {event.get_id()}") - process_event(event) + process_event(event, logger) events_groups[sid][et][msg] = event + events_list.append(event) + + for ev in events_list: + if ev.count > 1 : + ev.write_to_db(db.kv_store) logger.info(f"Discarding events: {len(events)}") client.distr_status_events_discard_then_get(len(events), 0) @@ -197,8 +204,7 @@ def start_event_collector_on_node(node_id): logger.info("no events found, sleeping") break except Exception as e: - logger.error("Failed to process distr events") - logger.exception(e) + logger.error(f"Failed to process distr events: {e}") break time.sleep(constants.DISTR_EVENT_COLLECTOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py index c82476e7b..2910df3d6 100644 --- a/simplyblock_core/services/snapshot_monitor.py +++ b/simplyblock_core/services/snapshot_monitor.py @@ -5,10 +5,9 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.cluster import Cluster -from simplyblock_core.controllers import health_controller, snapshot_events +from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient logger = utils.get_logger(__name__) @@ -64,21 +63,22 @@ def process_snap_delete_finish(snap, leader_node): # 3-1 async delete lvol bdev from primary primary_node = db.get_storage_node_by_id(leader_node.get_id()) + non_leader_id = snode.secondary_node_id + if snode.get_id() != leader_node.get_id(): + non_leader_id = snode.get_id() + non_leader = db.get_storage_node_by_id(non_leader_id) if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if non_leader and 
non_leader.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + primary_node.lvol_del_sync_lock() ret, _ = primary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) if not ret: logger.error(f"Failed to delete snap from node: {snode.get_id()}") # 3-2 async delete lvol bdev from secondary - non_leader_id = snode.secondary_node_id - if snode.get_id() != leader_node.get_id(): - non_leader_id = snode.get_id() - - non_leader = db.get_storage_node_by_id(non_leader_id) - if non_leader: - non_leader.lvol_sync_del_queue.append(snap.snap_bdev) - non_leader.write_to_db() - + if non_leader and non_leader.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev, primary_node.get_id()) snapshot_events.snapshot_delete(snap) snap.remove(db.kv_store) @@ -96,6 +96,115 @@ def set_snap_offline(snap): sn.write_to_db() +def process_snap_delete(snap, snode): + # check leadership + leader_node = None + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN]: + ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = snode + + if not leader_node and sec_node: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = sec_node + + if not leader_node: + raise Exception("Failed to get leader node") + + if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): + + ret, _ = leader_node.rpc_client().delete_lvol(snap.snap_bdev) 
+ if not ret: + logger.error(f"Failed to delete snap from node: {snode.get_id()}") + return False + snap = db.get_snapshot_by_id(snap.get_id()) + snap.deletion_status = leader_node.get_id() + snap.write_to_db() + + time.sleep(3) + + try: + ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) + except Exception as e: + logger.error(e) + # timeout detected, check other node + return False + + if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed + process_snap_delete_finish(snap, leader_node) + + elif ret == 1: # Async lvol deletion is in progress or queued + logger.info(f"Snap deletion in progress, id: {snap.get_id()}") + + elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error( + "Async deletion is done, but leadership has changed (sync deletion is now blocked)") + + elif ret == 4: # No async delete request exists for this Snap + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No async delete request exists for this snap") + set_snap_offline(snap) + + elif ret == -1: # Operation not permitted + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Operation not permitted") + process_snap_delete_try_again(snap) + + elif ret == -2: # No such file or directory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such file or directory") + process_snap_delete_finish(snap, leader_node) + + elif ret == -5: # I/O error + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("I/O error") + process_snap_delete_try_again(snap) + + elif ret == -11: # Try again + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Try again") + process_snap_delete_try_again(snap) + + elif ret 
== -12: # Out of memory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Out of memory") + process_snap_delete_try_again(snap) + + elif ret == -16: # Device or resource busy + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Device or resource busy") + process_snap_delete_try_again(snap) + + elif ret == -19: # No such device + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such device") + set_snap_offline(snap) + + elif ret == -35: # Leadership changed + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Leadership changed") + process_snap_delete_try_again(snap) + + elif ret == -36: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + process_snap_delete_try_again(snap) + + else: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + + + # get DB controller db = db_controller.DBController() @@ -110,159 +219,46 @@ def set_snap_offline(snap): for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): node_bdev_names = [] - node_lvols_nqns = {} sec_node_bdev_names = {} - sec_node_lvols_nqns = {} sec_node = None if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=3, retry=2) - node_bdevs = rpc_client.get_bdevs() + rpc_client = snode.rpc_client(timeout=3, retry=2) + try: + node_bdevs = rpc_client.get_bdevs() + except Exception as e: + logger.error(e) + continue if node_bdevs: node_bdev_names = [b['name'] for b in node_bdevs] for bdev in node_bdevs: if "aliases" in bdev and bdev["aliases"]: 
node_bdev_names.extend(bdev['aliases']) - ret = rpc_client.subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - if snode.secondary_node_id: sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - ret = sec_rpc_client.get_bdevs() + if sec_node and sec_node.status in [ + StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + sec_rpc_client = sec_node.rpc_client(timeout=3, retry=2) + try: + ret = sec_rpc_client.get_bdevs() + except Exception as e: + logger.error(e) + continue if ret: for bdev in ret: sec_node_bdev_names[bdev['name']] = bdev - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - - if snode.lvstore_status == "ready": - - for snap in db.get_snapshots_by_node_id(snode.get_id()): - if snap.status == SnapShot.STATUS_ONLINE: - - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + for snap in db.get_snapshots_by_node_id(snode.get_id()): + if snap.status == SnapShot.STATUS_ONLINE: + present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + if snode.lvstore_status == "ready": set_snapshot_health_check(snap, present) - elif snap.status == SnapShot.STATUS_IN_DELETION: - - # check leadership - leader_node = None - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, - StorageNode.STATUS_DOWN]: - ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = snode - - if not leader_node and sec_node: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) - if not ret: - raise 
Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = sec_node - - if not leader_node: - raise Exception("Failed to get leader node") - - if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): - - ret, _ = leader_node.rpc_client().delete_lvol(snap.snap_bdev) - if not ret: - logger.error(f"Failed to delete snap from node: {snode.get_id()}") - continue - snap = db.get_snapshot_by_id(snap.get_id()) - snap.deletion_status = leader_node.get_id() - snap.write_to_db() - - time.sleep(3) - - try: - ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) - except Exception as e: - logger.error(e) - # timeout detected, check other node - break - - if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed - process_snap_delete_finish(snap, leader_node) - - elif ret == 1: # Async lvol deletion is in progress or queued - logger.info(f"Snap deletion in progress, id: {snap.get_id()}") - - elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error( - "Async deletion is done, but leadership has changed (sync deletion is now blocked)") - - elif ret == 4: # No async delete request exists for this Snap - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No async delete request exists for this snap") - set_snap_offline(snap) - - elif ret == -1: # Operation not permitted - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Operation not permitted") - process_snap_delete_try_again(snap) - - elif ret == -2: # No such file or directory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such file or directory") - process_snap_delete_finish(snap, leader_node) - - 
elif ret == -5: # I/O error - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("I/O error") - process_snap_delete_try_again(snap) - - elif ret == -11: # Try again - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Try again") - process_snap_delete_try_again(snap) - - elif ret == -12: # Out of memory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Out of memory") - process_snap_delete_try_again(snap) - - elif ret == -16: # Device or resource busy - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Device or resource busy") - process_snap_delete_try_again(snap) - - elif ret == -19: # No such device - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such device") - set_snap_offline(snap) - - elif ret == -35: # Leadership changed - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Leadership changed") - process_snap_delete_try_again(snap) - - elif ret == -36: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - process_snap_delete_try_again(snap) - - else: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - + elif snap.status == SnapShot.STATUS_IN_DELETION: + try: + process_snap_delete(snap, snode) + except Exception as e: + logger.error(e) time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py new file mode 100644 index 000000000..2549b8546 --- /dev/null +++ b/simplyblock_core/services/snapshot_replication.py @@ -0,0 +1,333 @@ +# coding=utf-8 +import time +import uuid 
+ +from simplyblock_core import constants, db_controller, utils +from simplyblock_core.controllers import lvol_controller, snapshot_events +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.pool import Pool +from simplyblock_core.models.snapshot import SnapShot +from simplyblock_core.models.storage_node import StorageNode + +logger = utils.get_logger(__name__) +utils.init_sentry_sdk(__name__) +# get DB controller +db = db_controller.DBController() + + +def process_snap_replicate_start(task, snapshot): + # 1 create lvol on remote node + logger.info("Starting snapshot replication task") + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + replicate_to_source = task.function_params["replicate_to_source"] + if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"]: + if replicate_to_source: + org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) + remote_node_uuid = db.get_storage_node_by_id(task.node_id) + remote_pool_uuid = org_snap.lvol.pool_uuid + else: # replicate to target + remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) + cluster = db.get_cluster_by_id(remote_node_uuid.cluster_id) + remote_pool_uuid = None + if cluster.snapshot_replication_target_pool: + remote_pool_uuid = cluster.snapshot_replication_target_pool + else: + for bool in db.get_pools(remote_node_uuid.cluster_id): + if bool.status == Pool.STATUS_ACTIVE: + remote_pool_uuid = bool.uuid + break + if not remote_pool_uuid: + logger.error(f"Unable to find pool on remote cluster: {remote_node_uuid.cluster_id}") + return + + lv_id, err = lvol_controller.add_lvol_ha( + f"REP_{snapshot.snap_name}", snapshot.size, remote_node_uuid.get_id(), snapshot.lvol.ha_type, + remote_pool_uuid) + if lv_id: + task.function_params["remote_lvol_id"] = lv_id + task.write_to_db() + else: + logger.error(err) + task.function_result = "Error creating remote lvol" + task.write_to_db() + return + + 
remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + remote_lv_node = db.get_storage_node_by_id(remote_lv.node_id) + if remote_lv_node.status != StorageNode.STATUS_ONLINE: + task.function_result = "Target node is not online, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return + + # 2 connect to it + ret = snode.rpc_client().bdev_nvme_controller_list(remote_lv.top_bdev) + if not ret: + remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + for nic in remote_snode.data_nics: + ip = nic.ip4_address + ret = snode.rpc_client().bdev_nvme_attach_controller( + remote_lv.top_bdev, remote_lv.nqn, ip, remote_lv.subsys_port, nic.trtype) + if not ret: + msg = "controller attach failed" + logger.error(msg) + raise RuntimeError(msg) + bdev_name = ret[0] + if not bdev_name: + msg = "Bdev name not returned from controller attach" + logger.error(msg) + raise RuntimeError(msg) + bdev_found = False + for i in range(5): + ret = snode.rpc_client().get_bdevs(bdev_name) + if ret: + bdev_found = True + break + else: + time.sleep(1) + + if not bdev_found: + logger.error("lvol Bdev not found after 5 attempts") + raise RuntimeError(f"Failed to connect to lvol: {remote_lv.get_id()}") + + offset = 0 + if "offset" in task.function_params and task.function_params["offset"]: + offset = task.function_params["offset"] + # 3 start replication + snode.rpc_client().bdev_lvol_transfer( + lvol_name=snapshot.snap_bdev, + offset=offset, + cluster_batch=16, + gateway=f"{remote_lv.top_bdev}n1", + operation="replicate" + ) + task.status = JobSchedule.STATUS_RUNNING + task.function_params["start_time"] = int(time.time()) + task.write_to_db() + + if snapshot.status != SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_IN_REPLICATION + snapshot.write_to_db() + + +def process_snap_replicate_finish(task, snapshot): + + # detach remote lvol + remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + snode = 
db.get_storage_node_by_id(snapshot.lvol.node_id) + snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) + remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + replicate_to_source = task.function_params["replicate_to_source"] + if "replicate_as_snap_instance" in task.function_params: + replicate_as_snap_instance = task.function_params["replicate_as_snap_instance"] + else: + replicate_as_snap_instance = False + target_prev_snap = None + if replicate_to_source: + org_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + try: + target_prev_snap = db.get_snapshot_by_id(org_snap.source_replicated_snap_uuid) + except KeyError as e: + logger.error(e) + else: + if snapshot.snap_ref_id: + try: + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + for sn_inst in prev_snap.instances: + if sn_inst.lvol.node_id == remote_snode.get_id(): + target_prev_snap = sn_inst + break + except KeyError as e: + logger.error(e) + + # chain snaps on primary + if target_prev_snap: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {target_prev_snap.snap_bdev}") + ret = remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + if not ret: + logger.error("Failed to chain replicated snapshot on primary node") + return False + + # convert to snapshot on primary + ret = remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + if not ret: + logger.error("Failed to convert to snapshot on primary node") + return False + + # chain snaps on secondary + sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) + if sec_node.status == StorageNode.STATUS_ONLINE: + if target_prev_snap: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {target_prev_snap.snap_bdev}") + ret = sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + if not ret: + logger.error("Failed to chain replicated snapshot on secondary node") + return False + + # convert to 
snapshot on secondary + ret = sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + if not ret: + logger.error("Failed to convert to snapshot on secondary node") + return False + + new_snapshot_uuid = str(uuid.uuid4()) + + new_snapshot = SnapShot() + new_snapshot.uuid = new_snapshot_uuid + new_snapshot.cluster_id = remote_snode.cluster_id + new_snapshot.lvol = remote_lv + new_snapshot.pool_uuid = remote_lv.pool_uuid + new_snapshot.snap_bdev = remote_lv.top_bdev + new_snapshot.snap_uuid = remote_lv.lvol_uuid + new_snapshot.size = snapshot.size + new_snapshot.used_size = snapshot.used_size + new_snapshot.snap_name = snapshot.snap_name + new_snapshot.blobid = remote_lv.blobid + new_snapshot.created_at = int(time.time()) + new_snapshot.status = SnapShot.STATUS_ONLINE + snapshot.instances.append(new_snapshot) + if not replicate_as_snap_instance: + if replicate_to_source: + new_snapshot.target_replicated_snap_uuid = snapshot.uuid + snapshot.source_replicated_snap_uuid = new_snapshot_uuid + else: + snapshot.target_replicated_snap_uuid = new_snapshot_uuid + new_snapshot.source_replicated_snap_uuid = snapshot.uuid + + if target_prev_snap: + new_snapshot.prev_snap_uuid = target_prev_snap.get_id() + target_prev_snap.next_snap_uuid = new_snapshot_uuid + target_prev_snap.write_to_db() + + new_snapshot.write_to_db() + + if snapshot.status == SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_ONLINE + + snapshot.write_to_db() + + # delete lvol object + remote_lv.bdev_stack = [] + remote_lv.write_to_db() + lvol_controller.delete_lvol(remote_lv.get_id(), True) + remote_lv.remove(db.kv_store) + snapshot_events.replication_task_finished(snapshot) + + return new_snapshot_uuid + + +def task_runner(task: JobSchedule): + snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + if not snapshot: + task.function_result = "snapshot not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + try: + snode = 
db.get_storage_node_by_id(snapshot.lvol.node_id) + except KeyError: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + if snode.status != StorageNode.STATUS_ONLINE: + task.function_result = "node is not online, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db(db.kv_store) + return False + + if task.retry >= task.max_retry or task.canceled is True: + task.function_result = "max retry reached" + if task.canceled is True: + task.function_result = "task cancelled" + + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + if snapshot.status != SnapShot.STATUS_ONLINE: + snapshot.status = SnapShot.STATUS_ONLINE + snapshot.write_to_db() + + remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) + lvol_controller.delete_lvol(remote_lv.get_id(), True) + + return True + + + if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]: + process_snap_replicate_start(task, snapshot) + + elif task.status == JobSchedule.STATUS_RUNNING: + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + ret = snode.rpc_client().bdev_lvol_transfer_stat(snapshot.snap_bdev) + if not ret: + logger.error("Failed to get transfer stat") + return False + status = ret["transfer_state"] + offset = ret["offset"] + if status == "No process": + task.function_result = f"Status: {status}, offset:{offset}, retrying" + task.status = JobSchedule.STATUS_NEW + task.retry += 1 + task.write_to_db() + return False + if status == "In progress": + task.function_result = f"Status: {status}, offset:{offset}" + task.function_params["offset"] = offset + task.write_to_db() + return True + if status == "Failed": + task.function_result = f"Status: {status}, offset:{offset}, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return False + 
if status == "Done": + new_snapshot_uuid = process_snap_replicate_finish(task, snapshot) + if new_snapshot_uuid: + task.function_result = new_snapshot_uuid + task.status = JobSchedule.STATUS_DONE + task.function_params["end_time"] = int(time.time()) + task.write_to_db() + else: + task.function_result = "complete repl failed, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return True + + +logger.info("Starting Tasks runner...") +while True: + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + tasks = db.get_job_tasks(cl.get_id(), reverse=False) + for task in tasks: + delay_seconds = constants.TASK_EXEC_INTERVAL_SEC + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]: + active_task = False + for t in db.get_job_tasks(task.cluster_id): + if t.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and t.function_params["snapshot_id"] == task.function_params['snapshot_id']: + if t.status == JobSchedule.STATUS_RUNNING and t.canceled is False: + active_task = True + break + if active_task: + logger.info("replication task found for same snapshot, retry") + continue + if task.status != JobSchedule.STATUS_DONE: + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + res = task_runner(task) + if not res: + time.sleep(3) + + time.sleep(constants.TASK_EXEC_INTERVAL_SEC) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 06eeee008..46071e408 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -6,19 +6,58 @@ import os import socket import sys +import threading +import time from http.server import HTTPServer from http.server import ThreadingHTTPServer from http.server import 
BaseHTTPRequestHandler -rpc_sock = '/var/tmp/spdk.sock' +rpc_sock = '/mnt/ramdisk/spdk.sock' logger_handler = logging.StreamHandler(stream=sys.stdout) logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s')) logger = logging.getLogger() logger.addHandler(logger_handler) logger.setLevel(logging.INFO) +read_line_time_diff: dict = {} +recv_from_spdk_time_diff: dict = {} +def print_stats(): + while True: + try: + time.sleep(3) + t = time.time_ns() + read_line_time_diff_max = max(list(read_line_time_diff.values())) + read_line_time_diff_avg = int(sum(list(read_line_time_diff.values()))/len(read_line_time_diff)) + last_3_sec = [] + for k,v in read_line_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + if len(last_3_sec) > 0: + read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + read_line_time_diff_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: read_line_time: max={read_line_time_diff_max} ns, avg={read_line_time_diff_avg} ns, last_3s_avg={read_line_time_diff_avg_last_3_sec} ns") + if len(read_line_time_diff) > 10000: + read_line_time_diff.clear() + + recv_from_spdk_time_max = max(list(recv_from_spdk_time_diff.values())) + recv_from_spdk_time_avg = int(sum(list(recv_from_spdk_time_diff.values()))/len(recv_from_spdk_time_diff)) + last_3_sec = [] + for k,v in recv_from_spdk_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + if len(last_3_sec) > 0: + recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + recv_from_spdk_time_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: recv_from_spdk_time: max={recv_from_spdk_time_max} ns, avg={recv_from_spdk_time_avg} ns, last_3s_avg={recv_from_spdk_time_avg_last_3_sec} ns") + if len(recv_from_spdk_time_diff) > 10000: + recv_from_spdk_time_diff.clear() + except Exception as e: + logger.error(e) + def get_env_var(name, default=None, is_required=False): if not name: @@ -30,13 +69,18 
@@ def get_env_var(name, default=None, is_required=False): return os.environ.get(name, default) +unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): + logger.info(f"active threads: {threading.active_count()}") + logger.info(f"active unix sockets: {len(unix_sockets)}") req_data = json.loads(req.decode('ascii')) + req_time = time.time_ns() params = "" if "params" in req_data: params = str(req_data['params']) - logger.info(f"Request function: {str(req_data['method'])}, params: {params}") + logger.info(f"Request:{req_time} function: {str(req_data['method'])}, params: {params}") sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + unix_sockets.append(sock) sock.settimeout(TIMEOUT) sock.connect(rpc_sock) sock.sendall(req) @@ -48,7 +92,7 @@ def rpc_call(req): buf = '' closed = False response = None - + recv_from_spdk_time_start = time.time_ns() while not closed: newdata = sock.recv(1024*1024*1024) if newdata == b'': @@ -59,21 +103,25 @@ def rpc_call(req): except ValueError: continue # incomplete response; keep buffering break + recv_from_spdk_time_end = time.time_ns() + time_diff = recv_from_spdk_time_end - recv_from_spdk_time_start + logger.info(f"recv_from_spdk_time_diff: {time_diff}") + recv_from_spdk_time_diff[recv_from_spdk_time_start] = time_diff sock.close() + unix_sockets.remove(sock) if not response and len(buf) > 0: raise ValueError('Invalid response') - logger.debug(f"Response data: {buf}") + logger.info(f"Response:{req_time}") return buf class ServerHandler(BaseHTTPRequestHandler): - + server_session: list[int] = [] key = "" - def do_HEAD(self): self.send_response(200) self.send_header('Content-type', 'text/html') @@ -96,9 +144,14 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): + req_time = time.time_ns() + self.server_session.append(req_time) + logger.info(f"incoming request at: {req_time}") + logger.info(f"active server session: {len(self.server_session)}") if self.headers['Authorization'] != 'Basic ' + 
self.key: self.do_AUTHHEAD() else: + read_line_time_start = time.time_ns() if "Content-Length" in self.headers: data_string = self.rfile.read(int(self.headers['Content-Length'])) elif "chunked" in self.headers.get("Transfer-Encoding", ""): @@ -118,7 +171,10 @@ def do_POST(self): # Finally, a chunk size of 0 is an end indication if chunk_length == 0: break - + read_line_time_end = time.time_ns() + time_diff = read_line_time_end - read_line_time_start + logger.info(f"read_line_time_diff: {time_diff}") + read_line_time_diff[read_line_time_start] = time_diff try: response = rpc_call(data_string) if response is not None: @@ -129,12 +185,14 @@ def do_POST(self): except ValueError: self.do_INTERNALERROR() + self.server_session.remove(req_time) def run_server(host, port, user, password, is_threading_enabled=False): # encoding user and password key = base64.b64encode((user+':'+password).encode(encoding='ascii')).decode('ascii') - + print_stats_thread = threading.Thread(target=print_stats, ) + print_stats_thread.start() try: ServerHandler.key = key httpd = (ThreadingHTTPServer if is_threading_enabled else HTTPServer)((host, port), ServerHandler) @@ -157,6 +215,7 @@ def run_server(host, port, user, password, is_threading_enabled=False): rpc_port = int(rpc_port) except Exception: rpc_port = 8080 +rpc_sock = f"/mnt/ramdisk/spdk_{rpc_port}/spdk.sock" is_threading_enabled = bool(is_threading_enabled) run_server(server_ip, rpc_port, rpc_username, rpc_password, is_threading_enabled=is_threading_enabled) diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index 17a7d0369..a8d5a08b7 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ b/simplyblock_core/services/storage_node_monitor.py @@ -5,7 +5,8 @@ from simplyblock_core import constants, db_controller, cluster_ops, storage_node_ops, utils -from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events +from 
simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events, \ + cluster_events from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice @@ -74,13 +75,16 @@ def get_next_cluster_status(cluster_id): continue online_nodes += 1 # check for jm rep tasks: - ret = node.rpc_client().jc_get_jm_status(node.jm_vuid) - if ret: - for jm in ret: - if ret[jm] is False: # jm is not ready (has active replication task) - jm_replication_tasks = True - logger.warning("Replication task found!") - break + if node.rpc_client().bdev_lvol_get_lvstores(node.lvstore): + try: + ret = node.rpc_client(timeout=5).jc_get_jm_status(node.jm_vuid) + for jm in ret: + if ret[jm] is False: # jm is not ready (has active replication task) + jm_replication_tasks = True + logger.warning("Replication task found!") + break + except Exception: + logger.warning("Failed to get replication task!") elif node.status == StorageNode.STATUS_REMOVED: pass else: @@ -114,11 +118,12 @@ def get_next_cluster_status(cluster_id): k = cluster.distr_npcs # if number of devices in the cluster unavailable on DIFFERENT nodes > k --> I cannot read and in some cases cannot write (suspended) - if affected_nodes == k and (not cluster.strict_node_anti_affinity or online_nodes >= (n+k)): + if affected_nodes == k and (not cluster.strict_node_anti_affinity or online_nodes >= (n + k)): return Cluster.STATUS_DEGRADED elif jm_replication_tasks: return Cluster.STATUS_DEGRADED - elif (affected_nodes > k or online_devices < (n + k) or (online_nodes < (n+k) and cluster.strict_node_anti_affinity)): + elif (affected_nodes > k or online_devices < (n + k) or ( + online_nodes < (n + k) and cluster.strict_node_anti_affinity)): return Cluster.STATUS_SUSPENDED else: return Cluster.STATUS_ACTIVE @@ -132,12 +137,15 @@ def update_cluster_status(cluster_id): for task in 
db.get_job_tasks(cluster_id): if task.status != JobSchedule.STATUS_DONE and task.function_name in [ JobSchedule.FN_DEV_MIG, JobSchedule.FN_NEW_DEV_MIG, JobSchedule.FN_FAILED_DEV_MIG]: - if task.retry == 0: + if "migration" not in task.function_params: first_iter_task_pending += 1 - + is_re_balancing = first_iter_task_pending > 0 cluster = db.get_cluster_by_id(cluster_id) - cluster.is_re_balancing = first_iter_task_pending > 0 - cluster.write_to_db() + if cluster.is_re_balancing != is_re_balancing: + old_status = cluster.is_re_balancing + cluster.is_re_balancing = is_re_balancing + cluster.write_to_db() + cluster_events.cluster_rebalancing_change(cluster_id, cluster.is_re_balancing, old_status) current_cluster_status = cluster.status logger.info("cluster_status: %s", current_cluster_status) @@ -145,7 +153,7 @@ def update_cluster_status(cluster_id): return if current_cluster_status == Cluster.STATUS_DEGRADED and next_current_status == Cluster.STATUS_ACTIVE: - # if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_UNREADY] and cluster_current_status == Cluster.STATUS_ACTIVE: + # if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_UNREADY] and cluster_current_status == Cluster.STATUS_ACTIVE: # cluster_ops.cluster_activate(cluster_id, True) cluster_ops.set_cluster_status(cluster_id, Cluster.STATUS_ACTIVE) return @@ -186,7 +194,6 @@ def update_cluster_status(cluster_id): cluster_ops.set_cluster_status(cluster_id, next_current_status) - def set_node_online(node): if node.status != StorageNode.STATUS_ONLINE: @@ -211,24 +218,56 @@ def set_node_online(node): if online_devices_list: tasks_controller.add_device_mig_task(online_devices_list, node.cluster_id) -def set_node_offline(node, set_devs_offline=False): - if node.status != StorageNode.STATUS_UNREACHABLE: - # set node unavailable - storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_UNREACHABLE) + update_cluster_status(cluster_id) + - # if set_devs_offline: - # # set devices 
unavailable - # for dev in node.nvme_devices: - # if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]: - # device_controller.device_set_unavailable(dev.get_id()) +def set_node_offline(node): + if node.status != StorageNode.STATUS_OFFLINE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_OFFLINE) + for dev in node.nvme_devices: + if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + device_controller.device_set_unavailable(dev.get_id()) + update_cluster_status(cluster_id) + # initiate restart + tasks_controller.add_node_to_auto_restart(node) + except Exception as e: + logger.debug("Setting node to OFFLINE state failed") + logger.error(e) + + +def set_node_unreachable(node): + if node.status != StorageNode.STATUS_UNREACHABLE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_UNREACHABLE) + update_cluster_status(cluster_id) + except Exception as e: + logger.debug("Setting node to UNREACHABLE state failed") + logger.error(e) + + +def set_node_schedulable(node): + if node.status != StorageNode.STATUS_SCHEDULABLE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_SCHEDULABLE) + # initiate shutdown + # initiate restart + tasks_controller.add_node_to_auto_restart(node) + for dev in node.nvme_devices: + if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + device_controller.device_set_unavailable(dev.get_id()) + update_cluster_status(cluster_id) + except Exception as e: + logger.debug("Setting node to SCHEDULABLE state failed") + logger.error(e) - # # set jm dev offline - # if node.jm_device.status != JMDevice.STATUS_UNAVAILABLE: - # device_controller.set_jm_device_state(node.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) def set_node_down(node): if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: 
storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_DOWN) + update_cluster_status(cluster_id) def node_rpc_timeout_check_and_report(node): @@ -242,10 +281,151 @@ def node_rpc_timeout_check_and_report(node): except Exception as e: logger.debug(e) # RPC timeout detected, send to cluster log - storage_events.snode_rpc_timeout(node, time.time()-start_time) + storage_events.snode_rpc_timeout(node, int(time.time() - start_time)) + return False + + +def node_port_check_fun(snode): + node_port_check = True + if snode.lvstore_status == "ready": + ports = [snode.nvmf_port] + if snode.lvstore_stack_secondary_1: + for n in db.get_primary_storage_nodes_by_secondary_node_id(snode.get_id()): + if n.lvstore_status == "ready": + ports.append(n.lvol_subsys_port) + if not snode.is_secondary_node: + ports.append(snode.lvol_subsys_port) + + for port in ports: + try: + ret = health_controller.check_port_on_node(snode, port) + logger.info(f"Check: node port {snode.mgmt_ip}, {port} ... {ret}") + node_port_check &= ret + except Exception: + logger.error("Check node port failed, connection error") + + node_data_nic_ping_check = False + for data_nic in snode.data_nics: + if data_nic.ip4_address: + data_ping_check = health_controller._check_node_ping(data_nic.ip4_address) + logger.info(f"Check: ping data nic {data_nic.ip4_address} ... 
{data_ping_check}") + node_data_nic_ping_check |= data_ping_check + + node_port_check &= node_data_nic_ping_check + + return node_port_check + + +class State: + counter = 0 +def increment(): + State.counter = 1 +def decrement(): + State.counter = 0 +def value(): + return State.counter + +def check_node(snode): + snode = db.get_storage_node_by_id(snode.get_id()) + + if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, + StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: + logger.info(f"Node status is: {snode.status}, skipping") + return False + + if snode.status == StorageNode.STATUS_ONLINE and snode.lvstore_status == "in_creation": + logger.info(f"Node lvstore is in creation: {snode.get_id()}, skipping") + return False + + logger.info(f"Checking node {snode.hostname}") + + + # 1- check node ping + ping_check = health_controller._check_node_ping(snode.mgmt_ip) + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") + if not ping_check: + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}: FAILED") + set_node_unreachable(snode) + return False + + # 2- check node API + try: + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=10, retry=2) + ret, _ = snode_api.is_live() + logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {ret}") + if not ret: + logger.info("Check: node API failed, setting node unreachable") + set_node_unreachable(snode) + return False + except Exception as e: + logger.debug(e) + set_node_unreachable(snode) + return False + + # 3- check spdk process through node API + try: + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=20, retry=2) + is_up, _ = snode_api.spdk_process_is_up( snode.rpc_port, snode.cluster_id) + logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... 
{bool(is_up)}") + if not is_up: + logger.info("Check: node API failed, setting node offline") + set_node_offline(snode) + return False + except Exception as e: + logger.debug(e) + return False + + # 4- check node rpc interface + node_rpc_check, node_rpc_check_1 = health_controller._check_node_rpc( + snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=20, retry=1) + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") + + #if RPC times out, we dont know if its due to node becoming unavailable or spdk hanging + #so we try it twice. If all other checks pass again, but only this one fails: it's the spdk process + if not node_rpc_check: + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}:TIMEOUT") + if value()==0: + increment() + return False + + decrement() + if not node_rpc_check or not node_rpc_check_1: + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}:FAILED") + set_node_schedulable(snode) + return False + + #if not node_rpc_check and snode.get_id() not in node_rpc_timeout_threads: + # t = threading.Thread(target=node_rpc_timeout_check_and_report, args=(snode,)) + # t.start() + # node_rpc_timeout_threads[snode.get_id()] = t + + node_port_check = node_port_check_fun(snode) + + if not node_port_check: + cluster = db.get_cluster_by_id(snode.cluster_id) + if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + logger.error("Port check failed") + set_node_down(snode) + return True + + set_node_online(snode) + + +def loop_for_node(snode): + # global logger + # logger = logging.getLogger() + # logger_handler = logging.StreamHandler(stream=sys.stdout) + # logger_handler.setFormatter(logging.Formatter(f'%(asctime)s: node:{snode.mgmt_ip} %(levelname)s: %(message)s')) + # logger.addHandler(logger_handler) + while True: + check_node(snode) + logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} 
seconds") + time.sleep(constants.NODE_MONITOR_INTERVAL_SEC) logger.info("Starting node monitor") +threads_maps: dict[str, threading.Thread] = {} + while True: clusters = db.get_clusters() for cluster in clusters: @@ -253,168 +433,20 @@ def node_rpc_timeout_check_and_report(node): if cluster.status == Cluster.STATUS_IN_ACTIVATION: logger.info(f"Cluster status is: {cluster.status}, skipping monitoring") continue - + logger.info(f"Looping for cluster {cluster_id}") nodes = db.get_storage_nodes_by_cluster_id(cluster_id) - for snode in nodes: - - # get fresh node object, something could have changed until the last for loop is reached - snode = db.get_storage_node_by_id(snode.get_id()) - - if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, - StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: - logger.info(f"Node status is: {snode.status}, skipping") - continue - - if snode.status == StorageNode.STATUS_ONLINE and snode.lvstore_status == "in_creation": - logger.info(f"Node lvstore is in creation: {snode.get_id()}, skipping") - continue - - logger.info(f"Checking node {snode.hostname}") - - # 1- check node ping - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") - if not ping_check: - time.sleep(1) - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check 2: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") - - # 2- check node API - node_api_check = health_controller._check_node_api(snode.mgmt_ip) - logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") - - if snode.status == StorageNode.STATUS_SCHEDULABLE and not ping_check and not node_api_check: - continue - - spdk_process = False - if node_api_check: - # 3- check spdk_process - spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port) - logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... 
{spdk_process}") - - # 4- check rpc - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=5, retry=2) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") - - if not node_rpc_check and snode.get_id() not in node_rpc_timeout_threads: - t = threading.Thread(target=node_rpc_timeout_check_and_report, args=(snode,)) + for node in nodes: + node_id = node.get_id() + if node_id not in threads_maps or threads_maps[node_id].is_alive() is False: + logger.info(f"Creating thread for node {node_id}") + t = threading.Thread(target=loop_for_node, args=(node,)) t.start() - node_rpc_timeout_threads[snode.get_id()] = t - - if ping_check and node_api_check and spdk_process and not node_rpc_check: - start_time = time.time() - while time.time() < start_time + 60: - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=5, retry=2) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") - if node_rpc_check: - break - - node_port_check = True - - if spdk_process and node_rpc_check and snode.lvstore_status == "ready": - ports = [snode.nvmf_port] - if snode.lvstore_stack_secondary_1: - for n in db.get_primary_storage_nodes_by_secondary_node_id(snode.get_id()): - if n.lvstore_status == "ready": - ports.append(n.lvol_subsys_port) - if not snode.is_secondary_node: - ports.append(snode.lvol_subsys_port) - - for port in ports: - ret = health_controller._check_port_on_node(snode, port) - logger.info(f"Check: node port {snode.mgmt_ip}, {port} ... {ret}") - node_port_check &= ret - - node_data_nic_ping_check = False - for data_nic in snode.data_nics: - if data_nic.ip4_address: - data_ping_check = health_controller._check_node_ping(data_nic.ip4_address) - logger.info(f"Check: ping data nic {data_nic.ip4_address} ... 
{data_ping_check}") - node_data_nic_ping_check |= data_ping_check - - node_port_check &= node_data_nic_ping_check - - cluster = db.get_cluster_by_id(cluster.get_id()) - - # is_node_online = ping_check and spdk_process and node_rpc_check and node_port_check - is_node_online = spdk_process or node_rpc_check - if is_node_online: - - if snode.status == StorageNode.STATUS_UNREACHABLE: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - # tasks_controller.add_node_to_auto_restart(snode) - set_node_online(snode) - continue - - if not node_port_check: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - logger.error("Port check failed") - set_node_down(snode) - continue - - set_node_online(snode) - - # # check JM device - # if snode.jm_device: - # if snode.jm_device.status in [JMDevice.STATUS_ONLINE, JMDevice.STATUS_UNAVAILABLE]: - # ret = health_controller.check_jm_device(snode.jm_device.get_id()) - # if ret: - # logger.info(f"JM bdev is online: {snode.jm_device.get_id()}") - # if snode.jm_device.status != JMDevice.STATUS_ONLINE: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) - # else: - # logger.error(f"JM bdev is offline: {snode.jm_device.get_id()}") - # if snode.jm_device.status != JMDevice.STATUS_UNAVAILABLE: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), - # JMDevice.STATUS_UNAVAILABLE) - else: - - if not ping_check and not node_api_check and not spdk_process: - # restart on new node - storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_SCHEDULABLE) - - elif ping_check and node_api_check and (not spdk_process or not node_rpc_check): - # add node to auto restart - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - if not spdk_process and not 
node_rpc_check: - logger.info("ping is fine, snodeapi is fine, But no spdk process and no rpc check, " - "So that we set device offline") - set_node_offline(snode, set_devs_offline=(not spdk_process and not node_rpc_check)) - try: - ret = snode.rpc_client(timeout=10).get_version() - if not ret: - logger.debug("False RPC response, adding node to auto restart") - tasks_controller.add_node_to_auto_restart(snode) - except Exception as e: - logger.debug("Timeout to get RPC response, skipping restart") - logger.error(e) - - elif not node_port_check: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - logger.error("Port check failed") - set_node_down(snode) - - else: - set_node_offline(snode, set_devs_offline=not spdk_process) - - if ping_check and node_api_check and spdk_process and not node_rpc_check: - # restart spdk proxy cont - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - logger.info(f"Restarting spdk_proxy_{snode.rpc_port} on {snode.get_id()}") - snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=60, retry=1) - ret, err = snode_api.spdk_proxy_restart(snode.rpc_port) - if ret: - logger.info(f"Restarting spdk_proxy on {snode.get_id()} successfully") - continue - if err: - logger.error(err) - - update_cluster_status(cluster_id) - - logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds") + threads_maps[node_id] = t + logger.debug(threads_maps[node_id]) + + try: + update_cluster_status(cluster_id) + logger.debug("Iteration has been finished...") + except Exception: + logger.error("Error while updating cluster status") time.sleep(constants.NODE_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py index fce4fd8ef..e3baeb7f0 100644 --- a/simplyblock_core/services/tasks_runner_failed_migration.py +++ 
b/simplyblock_core/services/tasks_runner_failed_migration.py @@ -87,8 +87,12 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_failure_start( - distr_name, device.cluster_device_order, qos_high_priority, job_size=1024, jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_failure_start( + distr_name, device.cluster_device_order, qos_high_priority, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(e) + rsp = False if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_jc_comp.py b/simplyblock_core/services/tasks_runner_jc_comp.py index 676156af3..9e1ce2368 100644 --- a/simplyblock_core/services/tasks_runner_jc_comp.py +++ b/simplyblock_core/services/tasks_runner_jc_comp.py @@ -46,9 +46,9 @@ task.write_to_db(db.kv_store) continue - node = db.get_storage_node_by_id(task.node_id) - - if not node: + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) @@ -57,6 +57,7 @@ if node.status != StorageNode.STATUS_ONLINE: msg = f"Node is {node.status}, retry task" logger.info(msg) + task.retry += 1 task.function_result = msg task.status = JobSchedule.STATUS_SUSPENDED task.write_to_db(db.kv_store) @@ -79,6 +80,7 @@ logger.info(msg) task.function_result = msg task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 task.write_to_db(db.kv_store) continue @@ -86,12 +88,16 @@ jm_vuid = node.jm_vuid if "jm_vuid" in task.function_params: jm_vuid = task.function_params["jm_vuid"] - ret, err = rpc_client.jc_compression_start(jm_vuid=jm_vuid) + try: + ret, err = 
rpc_client.jc_suspend_compression(jm_vuid=jm_vuid, suspend=False) + except Exception as e: + logger.error(e) + continue if ret: task.function_result = f"JC {node.jm_vuid} compression resumed on node" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) - elif err and "code" in err and err["code"] == -2: + elif err: task.function_result = f"JC {node.jm_vuid} compression not needed" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) diff --git a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py index fb085e4aa..c00231d2c 100644 --- a/simplyblock_core/services/tasks_runner_migration.py +++ b/simplyblock_core/services/tasks_runner_migration.py @@ -62,16 +62,6 @@ def task_runner(task): except Exception as e: logger.error(f"Failed to get online since: {e}") - for dev in node.nvme_devices: - if dev.status not in [NVMeDevice.STATUS_ONLINE, - NVMeDevice.STATUS_FAILED_AND_MIGRATED, - NVMeDevice.STATUS_CANNOT_ALLOCATE]: - task.function_result = f"Some dev status is {dev.status }, retrying" - task.status = JobSchedule.STATUS_SUSPENDED - task.retry += 1 - task.write_to_db(db.kv_store) - return False - task.status = JobSchedule.STATUS_RUNNING task.function_result = "" task.write_to_db(db.kv_store) @@ -93,8 +83,12 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, - jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=constants.MIG_JOB_SIZE, + jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(e) + rsp = False if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task, retry later" @@ -112,7 +106,7 @@ def 
task_runner(task): allow_all_errors = False for node in db.get_storage_nodes_by_cluster_id(task.cluster_id): for dev in node.nvme_devices: - if dev.status in [NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: + if dev.status in [NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE, NVMeDevice.STATUS_FAILED]: allow_all_errors = True break @@ -219,9 +213,12 @@ def _set_master_task_status(master_task, status): continue rpc_client = RPCClient( node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=2) - ret, err = rpc_client.jc_compression_start(jm_vuid=node.jm_vuid) - if err and "code" in err and err["code"] != -2: - logger.info("Failed to resume JC compression adding task...") - tasks_controller.add_jc_comp_resume_task(task.cluster_id, task.node_id, node.jm_vuid) + try: + ret, err = rpc_client.jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if err: + logger.info("Failed to resume JC compression adding task...") + tasks_controller.add_jc_comp_resume_task(task.cluster_id, task.node_id, node.jm_vuid) + except Exception as e: + logger.error(e) time.sleep(3) diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py index f62a7f210..db4143eec 100644 --- a/simplyblock_core/services/tasks_runner_new_dev_migration.py +++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py @@ -98,8 +98,12 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, - jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_expansion_start( + distr_name, qos_high_priority, job_size=constants.MIG_JOB_SIZE,jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(f"Failed to start migration : {e}") + rsp = False if not rsp: logger.error(f"Failed 
to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_node_add.py b/simplyblock_core/services/tasks_runner_node_add.py index daeba918e..263f2c73e 100644 --- a/simplyblock_core/services/tasks_runner_node_add.py +++ b/simplyblock_core/services/tasks_runner_node_add.py @@ -2,7 +2,7 @@ import time -from simplyblock_core import db_controller, storage_node_ops, utils +from simplyblock_core import db_controller, storage_node_ops, utils, constants from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster @@ -13,46 +13,67 @@ db = db_controller.DBController() -logger.info("Starting Tasks runner...") -while True: +def process_task(task): + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return False + + if task.retry >= task.max_retry: + task.function_result = "max retry reached" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: + task.function_result = "Cluster is in_activation, waiting" + task.status = JobSchedule.STATUS_NEW + task.write_to_db(db.kv_store) + return False + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + try: + res = storage_node_ops.add_node(**task.function_params) + msg = f"Node add result: {res}" + logger.info(msg) + task.function_result = msg + if res: + task.status = JobSchedule.STATUS_DONE + else: + task.retry += 1 + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return True + except Exception as e: + logger.error(e) + return False + + +logger.info("Starting Tasks runner node add...") +while True: clusters = db.get_clusters() if not clusters: logger.error("No 
clusters found!") else: for cl in clusters: - if cl.status == Cluster.STATUS_IN_ACTIVATION: - continue - tasks = db.get_job_tasks(cl.get_id(), reverse=False) for task in tasks: - + delay_seconds = constants.TASK_EXEC_INTERVAL_SEC if task.function_name == JobSchedule.FN_NODE_ADD: - if task.status != JobSchedule.STATUS_DONE: - + while task.status != JobSchedule.STATUS_DONE: # get new task object because it could be changed from cancel task task = db.get_task_by_id(task.uuid) - - if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: - task.function_result = "Cluster is in_activation, waiting" - task.status = JobSchedule.STATUS_NEW - task.write_to_db(db.kv_store) - continue - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - res = storage_node_ops.add_node(**task.function_params) - logger.info(f"Node add result: {res}") - task.function_result = str(res) - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - - time.sleep(5) + res = process_task(task) + if res: + if task.status == JobSchedule.STATUS_DONE: + break + else: + delay_seconds *= 2 + time.sleep(delay_seconds) + + time.sleep(30) diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index a39de42ab..fd706b18a 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -3,13 +3,12 @@ from simplyblock_core import db_controller, utils, storage_node_ops, distr_controller -from simplyblock_core.controllers import tcp_ports_events, health_controller +from simplyblock_core.controllers import tcp_ports_events, health_controller, tasks_controller from simplyblock_core.fw_api_client import FirewallClient from 
simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster -from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core.models.nvme_device import NVMeDevice, RemoteDevice from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.snode_client import SNodeClient logger = utils.get_logger(__name__) @@ -17,9 +16,234 @@ db = db_controller.DBController() +def exec_port_allow_task(task): + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + # check node ping + ping_check = health_controller._check_node_ping(node.mgmt_ip) + logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}") + if not ping_check: + time.sleep(1) + ping_check = health_controller._check_node_ping(node.mgmt_ip) + logger.info(f"Check 2: ping mgmt ip {node.mgmt_ip} ... 
{ping_check}") + + if not ping_check: + msg = "Node ping is false, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + # check node ping + logger.info("connect to remote devices") + nodes = db.get_storage_nodes_by_cluster_id(node.cluster_id) + # connect to remote devs + try: + node_bdevs = node.rpc_client().get_bdevs() + logger.debug(node_bdevs) + if node_bdevs: + node_bdev_names = {} + for b in node_bdevs: + node_bdev_names[b['name']] = b + for al in b['aliases']: + node_bdev_names[al] = b + else: + node_bdev_names = {} + remote_devices = [] + for nd in nodes: + if nd.get_id() == node.get_id() or nd.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + continue + logger.info(f"Connecting to node {nd.get_id()}") + for index, dev in enumerate(nd.nvme_devices): + + if dev.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}") + continue + + if not dev.alceml_bdev: + raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") + + remote_device = RemoteDevice() + remote_device.uuid = dev.uuid + remote_device.alceml_name = dev.alceml_name + remote_device.node_id = dev.node_id + remote_device.size = dev.size + remote_device.nvmf_multipath = dev.nvmf_multipath + remote_device.status = NVMeDevice.STATUS_ONLINE + remote_device.remote_bdev = storage_node_ops.connect_device( + f"remote_{dev.alceml_bdev}", dev, node, + bdev_names=list(node_bdev_names), reattach=False) + + remote_devices.append(remote_device) + if not remote_devices: + msg = "Node unable to connect to remote devs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + else: + node = db.get_storage_node_by_id(task.node_id) + node.remote_devices = remote_devices + 
node.write_to_db() + + logger.info("connect to remote JM devices") + remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(node) + if not remote_jm_devices or len(remote_jm_devices) < 2: + msg = "Node unable to connect to remote JMs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + else: + node = db.get_storage_node_by_id(task.node_id) + node.remote_jm_devices = remote_jm_devices + node.write_to_db() + + + except Exception as e: + logger.error(e) + msg = "Error when connect to remote devs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + logger.info("Sending device status event") + for db_dev in node.nvme_devices: + distr_controller.send_dev_status_event(db_dev, db_dev.status, node) + + logger.info("Finished sending device status and now waiting 5s for JMs to connect") + time.sleep(5) + + sec_node = db.get_storage_node_by_id(node.secondary_node_id) + snode = db.get_storage_node_by_id(node.get_id()) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + try: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if ret: + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + # is_sec_node_leader = True + # check jc_compression status + jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status(snode.jm_vuid) + retries = 10 + while jc_compression_is_active: + if retries <= 0: + logger.warning("Timeout waiting for JC compression task to finish") + break + retries -= 1 + logger.info( + f"JC compression task found on node: {sec_node.get_id()}, retrying in 60 seconds") + time.sleep(60) + jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status( + snode.jm_vuid) + except Exception as e: + logger.error(e) + return + + if node.lvstore_status == "ready": + lvstore_check = 
health_controller._check_node_lvstore(node.lvstore_stack, node, auto_fix=True) + if not lvstore_check: + msg = "Node LVolStore check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if node.secondary_node_id: + primary_hublvol_check = health_controller._check_node_hublvol(node) + if not primary_hublvol_check: + msg = "Node hublvol check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + sec_node = db.get_storage_node_by_id(node.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + secondary_hublvol_check = health_controller._check_sec_node_hublvol(sec_node, auto_fix=True) + if not secondary_hublvol_check: + msg = "Secondary node hublvol check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + try: + # wait for lvol sync delete + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + while lvol_sync_del_found: + logger.info("Lvol sync delete task found, waiting") + time.sleep(3) + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + sec_rpc_client = sec_node.rpc_client() + ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid) + if not ret: + msg = "JM replication task found on secondary" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True) + + except Exception as e: + 
logger.error(e) + return + + port_number = task.function_params["port_number"] + logger.info(f"Allow port {port_number} on node {node.get_id()}") + fw_api = FirewallClient(snode, timeout=5, retry=2) + port_type = "tcp" + if node.active_rdma: + port_type = "udp" + fw_api.firewall_set_port(port_number, port_type, "allow", node.rpc_port) + tcp_ports_events.port_allowed(node, port_number) + + task.function_result = f"Port {port_number} allowed on node" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + logger.info("Starting Tasks runner...") while True: - clusters = db.get_clusters() if not clusters: logger.error("No clusters found!") @@ -27,207 +251,10 @@ for cl in clusters: if cl.status == Cluster.STATUS_IN_ACTIVATION: continue - tasks = db.get_job_tasks(cl.get_id(), reverse=False) for task in tasks: - if task.function_name == JobSchedule.FN_PORT_ALLOW: if task.status != JobSchedule.STATUS_DONE: - - # get new task object because it could be changed from cancel task - task = db.get_task_by_id(task.uuid) - - if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - node = db.get_storage_node_by_id(task.node_id) - - if not node: - task.function_result = "node not found" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: - msg = f"Node is {node.status}, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - # check node ping - ping_check = health_controller._check_node_ping(node.mgmt_ip) - logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}") - if not ping_check: - time.sleep(1) - ping_check = health_controller._check_node_ping(node.mgmt_ip) - logger.info(f"Check 2: ping mgmt ip {node.mgmt_ip} ... 
{ping_check}") - - if not ping_check: - msg = "Node ping is false, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - # check node ping - logger.info("connect to remote devices") - nodes = db.get_storage_nodes_by_cluster_id(node.cluster_id) - # connect to remote devs - try: - node_bdevs = node.rpc_client().get_bdevs() - logger.debug(node_bdevs) - if node_bdevs: - node_bdev_names = {} - for b in node_bdevs: - node_bdev_names[b['name']] = b - for al in b['aliases']: - node_bdev_names[al] = b - else: - node_bdev_names = {} - remote_devices = [] - for nd in nodes: - if nd.get_id() == node.get_id() or nd.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: - continue - logger.info(f"Connecting to node {nd.get_id()}") - for index, dev in enumerate(nd.nvme_devices): - - if dev.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, - NVMeDevice.STATUS_CANNOT_ALLOCATE]: - logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}") - continue - - if not dev.alceml_bdev: - raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") - - dev.remote_bdev = storage_node_ops.connect_device( - f"remote_{dev.alceml_bdev}", dev, node, - bdev_names=list(node_bdev_names), reattach=False) - - remote_devices.append(dev) - if not remote_devices: - msg = "Node unable to connect to remote devs, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - else: - node = db.get_storage_node_by_id(task.node_id) - node.remote_devices = remote_devices - node.write_to_db() - - logger.info("connect to remote JM devices") - remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(node) - if not remote_jm_devices or len(remote_jm_devices) < 2: - msg = "Node unable to connect to remote JMs, retry task" - logger.info(msg) - task.function_result = msg - 
task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - else: - node = db.get_storage_node_by_id(task.node_id) - node.remote_jm_devices = remote_jm_devices - node.write_to_db() - - - except Exception as e: - logger.error(e) - msg = "Error when connect to remote devs, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - logger.info("Sending device status event") - for db_dev in node.nvme_devices: - distr_controller.send_dev_status_event(db_dev, db_dev.status) - - logger.info("Finished sending device status and now waiting 5s for JMs to connect") - time.sleep(5) - - sec_node = db.get_storage_node_by_id(node.secondary_node_id) - snode = db.get_storage_node_by_id(node.get_id()) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if ret: - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - # is_sec_node_leader = True - # check jc_compression status - jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status(snode.jm_vuid) - retries = 10 - while jc_compression_is_active: - if retries <= 0: - logger.warning("Timeout waiting for JC compression task to finish") - break - retries -= 1 - logger.info( - f"JC compression task found on node: {sec_node.get_id()}, retrying in 60 seconds") - time.sleep(60) - jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status( - snode.jm_vuid) - - lvstore_check = True - if node.lvstore_status == "ready": - lvstore_check &= health_controller._check_node_lvstore(node.lvstore_stack, node, auto_fix=True) - if node.secondary_node_id: - lvstore_check &= health_controller._check_node_hublvol(node) - sec_node = db.get_storage_node_by_id(node.secondary_node_id) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - lvstore_check &= 
health_controller._check_sec_node_hublvol(sec_node, auto_fix=True) - - if lvstore_check is False: - msg = "Node LVolStore check fail, retry later" - logger.warning(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error( - f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() - - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - sec_rpc_client = sec_node.rpc_client() - sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True) - - port_number = task.function_params["port_number"] - snode_api = SNodeClient(f"{node.mgmt_ip}:5000", timeout=3, retry=2) - - logger.info(f"Allow port {port_number} on node {node.get_id()}") - - fw_api = FirewallClient(snode, timeout=5, retry=2) - port_type = "tcp" - if node.active_rdma: - port_type = "udp" - fw_api.firewall_set_port(port_number, port_type, "allow", node.rpc_port) - tcp_ports_events.port_allowed(node, port_number) - - task.function_result = f"Port {port_number} allowed on node" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) + exec_port_allow_task(task) time.sleep(5) diff --git a/simplyblock_core/services/tasks_runner_restart.py b/simplyblock_core/services/tasks_runner_restart.py index 2cfc82a53..61f8c5e6b 100644 --- a/simplyblock_core/services/tasks_runner_restart.py +++ 
b/simplyblock_core/services/tasks_runner_restart.py @@ -3,6 +3,7 @@ from simplyblock_core import constants, db_controller, storage_node_ops, utils from simplyblock_core.controllers import device_controller, health_controller, tasks_controller +from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode @@ -127,18 +128,19 @@ def task_runner_device(task): def task_runner_node(task): - node = db.get_storage_node_by_id(task.node_id) - if task.retry >= task.max_retry: - task.function_result = "max retry reached" + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: + task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) - storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_OFFLINE) return True - if not node: - task.function_result = "node not found" + if task.retry >= task.max_retry: + task.function_result = "max retry reached" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) + storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_OFFLINE) return True if node.status in [StorageNode.STATUS_REMOVED, StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: @@ -171,6 +173,13 @@ def task_runner_node(task): task.status = JobSchedule.STATUS_RUNNING task.write_to_db(db.kv_store) + cluster = db.get_cluster_by_id(task.cluster_id) + if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + task.function_result = f"Cluster is not active: {cluster.status}, retry" + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return False + # is node reachable? ping_check = health_controller._check_node_ping(node.mgmt_ip) logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... 
{ping_check}") @@ -191,19 +200,26 @@ def task_runner_node(task): return False - # shutting down node - logger.info(f"Shutdown node {node.get_id()}") - ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True) - if ret: - logger.info("Node shutdown succeeded") - - time.sleep(3) + try: + # shutting down node + logger.info(f"Shutdown node {node.get_id()}") + ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True) + if ret: + logger.info("Node shutdown succeeded") + time.sleep(3) + except Exception as e: + logger.error(e) + return False - # resetting node - logger.info(f"Restart node {node.get_id()}") - ret = storage_node_ops.restart_storage_node(node.get_id(), force=True) - if ret: - logger.info("Node restart succeeded") + try: + # resetting node + logger.info(f"Restart node {node.get_id()}") + ret = storage_node_ops.restart_storage_node(node.get_id(), force=True) + if ret: + logger.info("Node restart succeeded") + except Exception as e: + logger.error(e) + return False time.sleep(3) node = db.get_storage_node_by_id(task.node_id) diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py new file mode 100644 index 000000000..bce8692c3 --- /dev/null +++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py @@ -0,0 +1,87 @@ +# coding=utf-8 +import time + + +from simplyblock_core import db_controller, utils +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.storage_node import StorageNode + +logger = utils.get_logger(__name__) + +# get DB controller +db = db_controller.DBController() + + +logger.info("Starting Tasks runner...") +while True: + + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + if cl.status == Cluster.STATUS_IN_ACTIVATION: + continue + + tasks = db.get_job_tasks(cl.get_id(), reverse=False) + for 
task in tasks: + + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL: + if task.status != JobSchedule.STATUS_DONE: + + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + primary_node.lvol_del_sync_lock_reset() + continue + + node = db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + primary_node.lvol_del_sync_lock_reset() + continue + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + lvol_bdev_name = task.function_params["lvol_bdev_name"] + + logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True) + if not ret: + if "code" in err and err["code"] == -19: + logger.error(f"Sync delete completed with error: {err}") + else: + msg = f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}" + logger.error(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue + + task.function_result = f"bdev {lvol_bdev_name} deleted" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + 
primary_node.lvol_del_sync_lock_reset() + + time.sleep(3) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 2e8504b08..ba9e8b2ad 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -40,8 +40,7 @@ def _request(self, method, path, payload=None): response = self.session.request(method, self.url+path, data=data, timeout=self.timeout, params=params) except Exception as e: - logger.error("Request failed: %s", e) - raise e + raise SNodeClientException(str(e)) logger.debug("Response: status_code: %s, content: %s", response.status_code, response.content) @@ -69,11 +68,14 @@ def _request(self, method, path, payload=None): if ret_code == 422: raise SNodeClientException(f"Request validation failed: '{response.text}'") - logger.error("Unknown http status: %s", ret_code) - return None, None + raise SNodeClientException(f"Unknown http status: {ret_code}") def is_live(self): - return self._request("GET", "/check") + try: + return self._request("GET", "check") + except SNodeClientException: + logger.warning("Failed to call snode/check, trying snode/info") + return self.info() def info(self): return self._request("GET", "info") @@ -81,7 +83,7 @@ def info(self): def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None, fdb_connection=None, namespace=None, server_ip=None, rpc_port=None, rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None, - total_mem=None, system_mem=None, cluster_mode=None): + total_mem=None, system_mem=None, cluster_mode=None, socket=0, cluster_id=None, firewall_port=0): params = { "cluster_ip": cluster_ip, "server_ip": server_ip, @@ -113,6 +115,13 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None params["system_mem"] = system_mem if cluster_mode: params["cluster_mode"] = cluster_mode + params["socket"] = socket + + if cluster_id: + params["cluster_id"] = cluster_id + if 
firewall_port: + params["firewall_port"] = firewall_port + params["socket"] = socket return self._request("POST", "spdk_process_start", params) def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): @@ -124,8 +133,8 @@ def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): # "db_connection": db_connection} # return self._request("POST", "join_swarm", params) - def spdk_process_kill(self, rpc_port): - return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port}) + def spdk_process_kill(self, rpc_port, cluster_id=None): + return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port, "cluster_id": cluster_id}) def leave_swarm(self): return True @@ -148,12 +157,16 @@ def bind_device_to_nvme(self, device_pci): params = {"device_pci": device_pci} return self._request("POST", "bind_device_to_nvme", params) + def format_device_with_4k(self, device_pci): + params = {"device_pci": device_pci} + return self._request("POST", "format_device_with_4k", params) + def bind_device_to_spdk(self, device_pci): params = {"device_pci": device_pci} return self._request("POST", "bind_device_to_spdk", params) - def spdk_process_is_up(self, rpc_port): - params = {"rpc_port": rpc_port} + def spdk_process_is_up(self, rpc_port, cluster_id): + params = {"rpc_port": rpc_port, "cluster_id": cluster_id} return self._request("GET", "spdk_process_is_up", params) def get_file_content(self, file_name): @@ -172,4 +185,11 @@ def ifc_is_roce(self, nic): def ifc_is_tcp(self, nic): params = {"nic": nic} - return self._request("GET", "ifc_is_tcp", params) \ No newline at end of file + return self._request("GET", "ifc_is_tcp", params) + def nvme_connect(self, ip, port, nqn): + params = {"ip": ip, "port": port, "nqn": nqn} + return self._request("POST", "nvme_connect", params) + + def disconnect_nqn(self, nqn): + params = {"nqn": nqn} + return self._request("POST", "disconnect_nqn", params) diff --git a/simplyblock_core/storage_node_ops.py 
b/simplyblock_core/storage_node_ops.py index 3d32dd17a..1b2b23c0d 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1,7 +1,6 @@ # coding=utf- 8 import datetime import json -import os import platform import socket @@ -27,15 +26,18 @@ from simplyblock_core.models.iface import IFace from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.lvol_model import LVol -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice, RemoteJMDevice from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.models.cluster import Cluster +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient, RPCException from simplyblock_core.snode_client import SNodeClient, SNodeClientException from simplyblock_web import node_utils from simplyblock_core.utils import addNvmeDevices from simplyblock_core.utils import pull_docker_image_with_retry +import os + logger = utils.get_logger(__name__) @@ -57,73 +59,88 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names: rpc_client = node.rpc_client() # check connection status - if device.connecting_from_node and device.connecting_from_node != node.get_id(): + if device.is_connection_in_progress_to_node(node.get_id()): logger.warning("This device is being connected to from other node, sleep for 5 seconds") time.sleep(5) - device.connecting_from_node = node.get_id() - device.write_to_db() + device.lock_device_connection(node.get_id()) ret = rpc_client.bdev_nvme_controller_list(name) if ret: - for controller in ret[0]["ctrlrs"]: - controller_state = controller["state"] - logger.info(f"Controller found: {name}, status: {controller_state}") - if controller_state == "deleting": - raise RuntimeError(f"Controller: {name}, status is 
{controller_state}") - - if reattach: - rpc_client.bdev_nvme_detach_controller(name) - time.sleep(1) - - bdev_name = None - - db_ctrl=DBController() - node=db_ctrl.get_storage_node_by_id(device.node_id) - if node.active_rdma: - tr_type="RDMA" - else: - if node.active_tcp: - tr_type="TCP" + counter = 0 + while (counter < 5): + waiting = False + for controller in ret[0]["ctrlrs"]: + controller_state = controller["state"] + logger.info(f"Controller found: {name}, status: {controller_state}") + if controller_state== "failed": + # we can remove the controller only for certain, if its failed. other states are intermediate and require retry. + rpc_client.bdev_nvme_detach_controller(name) + time.sleep(2) + break + elif controller_state == "resetting" or controller_state == "deleting" or controller_state == "reconnect_is_delayed": + if counter < 5: + time.sleep(2) + waiting = True + break + else: # this should never happen. It means controller is "hanging" in an intermediate state for more than 10 seconds. usually if some io is hanging. + raise RuntimeError(f"Controller: {name}, status is {controller_state}") + if not waiting: + counter = 5 + else: + counter += 1 + + # if reattach: + # rpc_client.bdev_nvme_detach_controller(name) + # time.sleep(1) + + # only if the controller is really gone we try to reattach it + if not rpc_client.bdev_nvme_controller_list(name): + bdev_name = None + + db_ctrl = DBController() + node = db_ctrl.get_storage_node_by_id(device.node_id) + if node.active_rdma: + tr_type = "RDMA" else: - msg="target node to connect has no active fabric." - logger.error(msg) - raise RuntimeError(msg) + if node.active_tcp: + tr_type = "TCP" + else: + msg = "target node to connect has no active fabric." 
+ logger.error(msg) + raise RuntimeError(msg) - for ip in device.nvmf_ip.split(","): - ret = rpc_client.bdev_nvme_attach_controller( - name, device.nvmf_nqn, ip, device.nvmf_port,tr_type, + for ip in device.nvmf_ip.split(","): + ret = rpc_client.bdev_nvme_attach_controller( + name, device.nvmf_nqn, ip, device.nvmf_port, tr_type, multipath=device.nvmf_multipath) - if not bdev_name and ret and isinstance(ret, list): - bdev_name = ret[0] - - if device.nvmf_multipath: - rpc_client.bdev_nvme_set_multipath_policy(bdev_name, "active_active") + if not bdev_name and ret and isinstance(ret, list): + bdev_name = ret[0] - # wait 5 seconds after controller attach - time.sleep(5) + if device.nvmf_multipath: + rpc_client.bdev_nvme_set_multipath_policy(bdev_name, "active_active") - if not bdev_name: - msg = "Bdev name not returned from controller attach" - logger.error(msg) - raise RuntimeError(msg) - bdev_found = False - for i in range(5): - ret = rpc_client.get_bdevs(bdev_name) - if ret: - bdev_found = True - break - else: - time.sleep(1) + if not bdev_name: + msg = "Bdev name not returned from controller attach" + logger.error(msg) + raise RuntimeError(msg) + bdev_found = False + for i in range(5): + ret = rpc_client.get_bdevs(bdev_name) + if ret: + bdev_found = True + break + else: + time.sleep(1) - device.connecting_from_node = "" - device.write_to_db() + device.release_device_connection() - if not bdev_found: - logger.error("Bdev not found after 5 attempts") - raise RuntimeError(f"Failed to connect to device: {device.get_id()}") + if not bdev_found: + logger.error("Bdev not found after 5 attempts") + raise RuntimeError(f"Failed to connect to device: {device.get_id()}") - return bdev_name + return bdev_name + return None def get_next_cluster_device_order(db_controller, cluster_id): @@ -170,15 +187,24 @@ def _search_for_partitions(rpc_client, nvme_device): def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): - raid_bdev = f"raid_jm_{snode.get_id()}" - 
if len(jm_nvme_bdevs) > 1: - raid_level = "1" - ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) - if not ret: - logger.error(f"Failed to create raid_jm_{snode.get_id()}") - return False + if snode.jm_device and snode.jm_device.raid_bdev: + raid_bdev = snode.jm_device.raid_bdev + if raid_bdev.startswith("raid_jm_"): + raid_level = "1" + ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) + if not ret: + logger.error(f"Failed to create raid_jm_{snode.get_id()}") + return False else: - raid_bdev = jm_nvme_bdevs[0] + if len(jm_nvme_bdevs) > 1: + raid_bdev = f"raid_jm_{snode.get_id()}" + raid_level = "1" + ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) + if not ret: + logger.error(f"Failed to create raid_jm_{snode.get_id()}") + return False + else: + raid_bdev = jm_nvme_bdevs[0] alceml_id = snode.get_id() alceml_name = f"alceml_jm_{snode.get_id()}" @@ -224,9 +250,9 @@ def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): return False for iface in snode.data_nics: - logger.info(f"adding {iface.trtype} listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address)) - ret = rpc_client.listeners_create(subsystem_nqn, iface.trtype, iface.ip4_address, snode.nvmf_port) - ip_list.append(iface.ip4_address) + logger.info(f"adding {iface.trtype} listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address)) + ret = rpc_client.listeners_create(subsystem_nqn, iface.trtype, iface.ip4_address, snode.nvmf_port) + ip_list.append(iface.ip4_address) if len(ip_list) > 1: IP = ",".join(ip_list) @@ -413,8 +439,8 @@ def _create_storage_device_stack(rpc_client, nvme, snode, after_restart): return nvme -def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size=0): - nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev) +def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size, nbd_index): + nbd_device = 
rpc_client.nbd_start_disk(nvme.nvme_bdev, f"/dev/nbd{nbd_index}") time.sleep(3) if not nbd_device: logger.error("Failed to start nbd dev") @@ -431,9 +457,15 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j return False time.sleep(3) rpc_client.nbd_stop_disk(nbd_device) - time.sleep(1) + for i in range(10): + if not rpc_client.nbd_get_disks(nbd_device): + break + time.sleep(1) rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller) - time.sleep(1) + for i in range(10): + if not rpc_client.bdev_nvme_controller_list(nvme.nvme_controller): + break + time.sleep(1) try: rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address) except RPCException as e: @@ -447,79 +479,84 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j def _prepare_cluster_devices_partitions(snode, devices): db_controller = DBController() - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password) - new_devices = [] - jm_devices = [] - dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id) - bdevs_names = [d['name'] for d in rpc_client.get_bdevs()] + devices_to_partition = [] + thread_list = [] for index, nvme in enumerate(devices): if nvme.status == "not_found": continue - if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_NEW]: logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}") new_devices.append(nvme) continue - if nvme.is_partition: - dev_part = f"{nvme.nvme_bdev[:-2]}p1" - if dev_part in bdevs_names: - if dev_part not in jm_devices: - jm_devices.append(dev_part) - - new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False) - if not new_device: - logger.error("failed to create dev stack") - return False - new_devices.append(new_device) - if new_device.status == NVMeDevice.STATUS_ONLINE: - device_events.device_create(new_device) - + t = 
threading.Thread(target=_create_storage_device_stack, args=(snode.rpc_client(), nvme, snode, False,)) + thread_list.append(t) + new_devices.append(nvme) + t.start() else: - # look for partitions - partitioned_devices = _search_for_partitions(rpc_client, nvme) - logger.debug("partitioned_devices") - logger.debug(partitioned_devices) - if len(partitioned_devices) == (1 + snode.num_partitions_per_dev): - logger.info("Partitioned devices found") - else: + devices_to_partition.append(nvme) + partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme) + if len(partitioned_devices) != (1 + snode.num_partitions_per_dev): logger.info(f"Creating partitions for {nvme.nvme_bdev}") - _create_device_partitions(rpc_client, nvme, snode, snode.num_partitions_per_dev, snode.jm_percent, - snode.partition_size) - partitioned_devices = _search_for_partitions(rpc_client, nvme) - if len(partitioned_devices) == (1 + snode.num_partitions_per_dev): - logger.info("Device partitions created") - else: - logger.error("Failed to create partitions") - return False + t = threading.Thread( + target=_create_device_partitions, + args=(snode.rpc_client(), nvme, snode, snode.num_partitions_per_dev, + snode.jm_percent, snode.partition_size, index + 1,)) + thread_list.append(t) + t.start() - jm_devices.append(partitioned_devices.pop(0).nvme_bdev) + for thread in thread_list: + thread.join() + thread_list = [] + for nvme in devices_to_partition: + partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme) + if len(partitioned_devices) == (1 + snode.num_partitions_per_dev): + logger.info("Device partitions created") + # remove 1st partition for jm + partitioned_devices.pop(0) for dev in partitioned_devices: - ret = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False) - if not ret: - logger.error("failed to create dev stack") - return False - if dev.status == NVMeDevice.STATUS_ONLINE: - if dev.cluster_device_order < 0: - dev.cluster_device_order = dev_order - 
dev_order += 1 - device_events.device_create(dev) + t = threading.Thread(target=_create_storage_device_stack, + args=(snode.rpc_client(), dev, snode, False,)) + thread_list.append(t) new_devices.append(dev) + t.start() + else: + logger.error("Failed to create partitions") + return False - snode.nvme_devices = new_devices + for thread in thread_list: + thread.join() + + # assign device order + dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id) + for nvme in new_devices: + if nvme.status == NVMeDevice.STATUS_ONLINE: + if nvme.cluster_device_order < 0: + nvme.cluster_device_order = dev_order + dev_order += 1 + device_events.device_create(nvme) + + # create jm device + jm_devices = [] + bdevs_names = [d['name'] for d in snode.rpc_client().get_bdevs()] + for nvme in new_devices: + if nvme.status == NVMeDevice.STATUS_ONLINE: + dev_part = f"{nvme.nvme_bdev[:-2]}p1" + if dev_part in bdevs_names: + if dev_part not in jm_devices: + jm_devices.append(dev_part) if jm_devices: - jm_device = _create_jm_stack_on_raid(rpc_client, jm_devices, snode, after_restart=False) + jm_device = _create_jm_stack_on_raid(snode.rpc_client(), jm_devices, snode, after_restart=False) if not jm_device: logger.error("Failed to create JM device") return False snode.jm_device = jm_device + snode.nvme_devices = new_devices return True @@ -599,7 +636,7 @@ def _prepare_cluster_devices_on_restart(snode, clear_data=False): # prepare JM device jm_device = snode.jm_device - if jm_device is None or jm_device.status == JMDevice.STATUS_REMOVED: + if jm_device is None: return True if not jm_device or not jm_device.uuid: @@ -608,20 +645,36 @@ def _prepare_cluster_devices_on_restart(snode, clear_data=False): jm_device.status = JMDevice.STATUS_UNAVAILABLE if jm_device.jm_nvme_bdev_list: - all_bdevs_found = True - for bdev_name in jm_device.jm_nvme_bdev_list: - ret = rpc_client.get_bdevs(bdev_name) + if len(jm_device.jm_nvme_bdev_list) == 1: + ret = 
rpc_client.get_bdevs(jm_device.jm_nvme_bdev_list[0]) if not ret: - logger.error(f"BDev not found: {bdev_name}") - all_bdevs_found = False - break - - if all_bdevs_found: + logger.error(f"BDev not found: {jm_device.jm_nvme_bdev_list[0]}") + jm_device.status = JMDevice.STATUS_REMOVED + return True ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=not clear_data) if not ret: logger.error("Failed to create JM device") return False + return True + + jm_bdevs_found = [] + for bdev_name in jm_device.jm_nvme_bdev_list: + ret = rpc_client.get_bdevs(bdev_name) + if ret: + logger.info(f"JM bdev found: {bdev_name}") + jm_bdevs_found.append(bdev_name) + else: + logger.error(f"JM bdev not found: {bdev_name}") + if len(jm_bdevs_found) > 1: + ret = _create_jm_stack_on_raid(rpc_client, jm_bdevs_found, snode, after_restart=not clear_data) + if not ret: + logger.error("Failed to create JM device") + return False + else: + logger.error("Only one jm nvme bdev found, setting jm device to removed") + jm_device.status = JMDevice.STATUS_REMOVED + return True else: nvme_bdev = jm_device.nvme_bdev @@ -684,7 +737,7 @@ def _connect_to_remote_devs( rpc_client = RPCClient( this_node.mgmt_ip, this_node.rpc_port, - this_node.rpc_username, this_node.rpc_password, timeout=3, retry=1) + this_node.rpc_username, this_node.rpc_password, timeout=5, retry=1) node_bdevs = rpc_client.get_bdevs() if node_bdevs: @@ -701,6 +754,8 @@ def _connect_to_remote_devs( allowed_node_statuses.append(StorageNode.STATUS_RESTARTING) allowed_dev_statuses.append(NVMeDevice.STATUS_UNAVAILABLE) + devices_to_connect = [] + connect_threads = [] nodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id) # connect to remote devs for node_index, node in enumerate(nodes): @@ -715,12 +770,36 @@ def _connect_to_remote_devs( if not dev.alceml_bdev: raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") + devices_to_connect.append(dev) + t = threading.Thread( + 
target=connect_device, + args=(f"remote_{dev.alceml_bdev}", dev, this_node, node_bdev_names, reattach,)) + connect_threads.append(t) + t.start() - dev.remote_bdev = connect_device( - f"remote_{dev.alceml_bdev}", dev, this_node, - bdev_names=node_bdev_names, reattach=reattach, - ) - remote_devices.append(dev) + for t in connect_threads: + t.join() + + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + node_bdev_names = [b['name'] for b in node_bdevs] + + for dev in devices_to_connect: + remote_bdev = RemoteDevice() + remote_bdev.uuid = dev.uuid + remote_bdev.alceml_name = dev.alceml_name + remote_bdev.node_id = dev.node_id + remote_bdev.size = dev.size + remote_bdev.status = NVMeDevice.STATUS_ONLINE + remote_bdev.nvmf_multipath = dev.nvmf_multipath + for bdev in node_bdev_names: + if bdev.startswith(f"remote_{dev.alceml_bdev}"): + remote_bdev.remote_bdev = bdev + break + if not remote_bdev.remote_bdev: + logger.error(f"Failed to connect to remote device {dev.alceml_name}") + continue + remote_devices.append(remote_bdev) return remote_devices @@ -759,6 +838,10 @@ def _connect_to_remote_jm_devs(this_node, jm_ids=None): if jm_dev and jm_dev not in remote_devices: remote_devices.append(jm_dev) + logger.debug(f"remote_devices: {remote_devices}") + allowed_node_statuses = [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_RESTARTING] + allowed_dev_statuses = [NVMeDevice.STATUS_ONLINE] + new_devs = [] for jm_dev in remote_devices: if not jm_dev.jm_bdev: @@ -775,17 +858,34 @@ def _connect_to_remote_jm_devs(this_node, jm_ids=None): if not org_dev or org_dev in new_devs or org_dev_node and org_dev_node.get_id() == this_node.get_id(): continue + if org_dev_node is not None and org_dev_node.status not in allowed_node_statuses: + logger.warning(f"Skipping node:{org_dev_node.get_id()} with status: {org_dev_node.status}") + continue + + if org_dev is not None and org_dev.status not in allowed_dev_statuses: + logger.warning(f"Skipping 
device:{org_dev.get_id()} with status: {org_dev.status}") + continue + + remote_device = RemoteJMDevice() + remote_device.uuid = org_dev.uuid + remote_device.alceml_name = org_dev.alceml_name + remote_device.node_id = org_dev.node_id + remote_device.size = org_dev.size + remote_device.jm_bdev = org_dev.jm_bdev + remote_device.status = NVMeDevice.STATUS_ONLINE + remote_device.nvmf_multipath = org_dev.nvmf_multipath try: - org_dev.remote_bdev = connect_device( - f"remote_{org_dev.jm_bdev}", org_dev, this_node, - bdev_names=node_bdev_names, reattach=True, + remote_device.remote_bdev = connect_device( + f"remote_{org_dev.jm_bdev}", org_dev, this_node, + bdev_names=node_bdev_names, reattach=True, ) except RuntimeError: logger.error(f'Failed to connect to {org_dev.get_id()}') - new_devs.append(org_dev) + new_devs.append(remote_device) return new_devs + def ifc_is_tcp(nic): addrs = psutil.net_if_addrs().get(nic, []) for addr in addrs: @@ -793,6 +893,7 @@ def ifc_is_tcp(nic): return True return False + def ifc_is_roce(nic): rdma_path = "/sys/class/infiniband/" if not os.path.exists(rdma_path): @@ -806,12 +907,14 @@ def ifc_is_roce(nic): return True return False -def add_node(cluster_id, node_addr, iface_name,data_nics_list, + +def add_node(cluster_id, node_addr, iface_name, data_nics_list, max_snap, spdk_image=None, spdk_debug=False, small_bufsize=0, large_bufsize=0, num_partitions_per_dev=0, jm_percent=0, enable_test_device=False, - namespace=None, enable_ha_jm=False, id_device_by_nqn=False, - partition_size="", ha_jm_count=3): + namespace=None, enable_ha_jm=False, cr_name=None, cr_namespace=None, cr_plural=None, + id_device_by_nqn=False, partition_size="", ha_jm_count=3, format_4k=False): + snode_api = SNodeClient(node_addr) node_info, _ = snode_api.info() if node_info.get("nodes_config") and node_info["nodes_config"].get("nodes"): @@ -887,6 +990,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, app_thread_core = 
node_config.get("distribution").get("app_thread_core") jm_cpu_core = node_config.get("distribution").get("jm_cpu_core") number_of_distribs = node_config.get("number_of_distribs") + lvol_poller_core = node_config.get("distribution").get("lvol_poller_core") + lvol_poller_mask = utils.generate_mask(lvol_poller_core) pollers_mask = utils.generate_mask(poller_cpu_cores) app_thread_mask = utils.generate_mask(app_thread_core) @@ -896,9 +1001,10 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, jm_cpu_mask = utils.generate_mask(jm_cpu_core) # Calculate pool count - max_prov = int(utils.parse_size(node_config.get("max_size"))) - - if max_prov <= 0: + max_prov = 0 + if node_config.get("max_size"): + max_prov = int(utils.parse_size(node_config.get("max_size"))) + if max_prov < 0: logger.error(f"Incorrect max-prov value {max_prov}") return False @@ -910,6 +1016,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, minimum_hp_memory = node_config.get("huge_page_memory") + minimum_hp_memory = max(minimum_hp_memory, max_prov) + # check for memory if "memory_details" in node_info and node_info['memory_details']: memory_details = node_info['memory_details'] @@ -918,7 +1026,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.info(f"Free: {utils.humanbytes(memory_details['free'])}") logger.info(f"huge_total: {utils.humanbytes(memory_details['huge_total'])}") logger.info(f"huge_free: {utils.humanbytes(memory_details['huge_free'])}") - logger.info(f"Minimum required huge pages memory is : {utils.humanbytes(minimum_hp_memory)}") + logger.info(f"Set huge pages memory is : {utils.humanbytes(minimum_hp_memory)}") else: logger.error("Cannot get memory info from the instance.. 
Exiting") return False @@ -926,14 +1034,15 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, # Calculate minimum sys memory minimum_sys_memory = node_config.get("sys_memory") - satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, - minimum_sys_memory, - int(memory_details['free']), - int(memory_details['huge_total'])) + # satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, + # minimum_sys_memory, + # int(memory_details['free']), + # int(memory_details['huge_total'])) max_lvol = node_config.get("max_lvol") - if not satisfied: - logger.warning( - f"Not enough memory for the provided max_lvo: {max_lvol}, max_prov: {max_prov}..") + + # if not satisfied: + # logger.warning( + # f"Not enough memory for the provided max_lvo: {max_lvol}, max_prov: {max_prov}..") ssd_pcie = node_config.get("ssd_pcis") if ssd_pcie: @@ -962,6 +1071,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, else: cluster_ip = utils.get_k8s_node_ip() + firewall_port = utils.get_next_fw_port(cluster_id) rpc_port = utils.get_next_rpc_port(cluster_id) rpc_user, rpc_pass = utils.generate_rpc_user_and_pass() mgmt_info = utils.get_mgmt_ip(node_info, iface_name) @@ -980,17 +1090,20 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, if log_config_type and log_config_type != LogConfig.types.GELF: logger.info("SNodeAPI container found but not configured with gelf logger") start_storage_node_api_container(mgmt_ip, cluster_ip) + node_socket = node_config.get("socket") total_mem = minimum_hp_memory for n in db_controller.get_storage_nodes_by_cluster_id(cluster_id): - if n.api_endpoint == node_addr: - total_mem += n.spdk_mem - total_mem += utils.parse_size("500m") + if n.api_endpoint == node_addr and n.socket == node_socket: + total_mem += (n.spdk_mem + 500000000) + logger.info("Deploying SPDK") - results = None l_cores = node_config.get("l-cores") spdk_cpu_mask = node_config.get("cpu_mask") for ssd in ssd_pcie: + if format_4k: + 
snode_api.format_device_with_4k(ssd) + snode_api.bind_device_to_spdk(ssd) snode_api.bind_device_to_spdk(ssd) try: results, err = snode_api.spdk_process_start( @@ -998,7 +1111,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, + socket=node_socket, cluster_id=cluster_id, firewall_port=firewall_port) time.sleep(5) except Exception as e: @@ -1011,8 +1125,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, data_nics = [] - active_tcp=False - active_rdma=False + active_tcp = False + active_rdma = False fabric_tcp = cluster.fabric_tcp fabric_rdma = cluster.fabric_rdma names = data_nics_list or [mgmt_iface] @@ -1021,17 +1135,17 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.debug(f"Data nics ports are: {names}") for nic in names: device = node_info['network_interface'][nic] - base_ifc_cfg={ - 'uuid': str(uuid.uuid4()), - 'if_name': nic, - 'ip4_address': device['ip'], - 'status': device['status'], - 'net_type': device['net_type'],} + base_ifc_cfg = { + 'uuid': str(uuid.uuid4()), + 'if_name': nic, + 'ip4_address': device['ip'], + 'status': device['status'], + 'net_type': device['net_type'], } if fabric_rdma and snode_api.ifc_is_roce(nic): cfg = base_ifc_cfg.copy() cfg['trtype'] = "RDMA" data_nics.append(IFace(cfg)) - active_rdma=True + active_rdma = True if fabric_tcp and snode_api.ifc_is_tcp(nic): active_tcp = True elif fabric_tcp and snode_api.ifc_is_tcp(nic): @@ -1061,6 +1175,9 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.cloud_name = cloud_instance['cloud'] or "" snode.namespace = namespace + snode.cr_name = cr_name + 
snode.cr_namespace = cr_namespace + snode.cr_plural = cr_plural snode.ssd_pcie = ssd_pcie snode.hostname = hostname snode.host_nqn = subsystem_nqn @@ -1080,8 +1197,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.enable_ha_jm = enable_ha_jm snode.ha_jm_count = ha_jm_count snode.minimum_sys_memory = minimum_sys_memory - snode.active_tcp=active_tcp - snode.active_rdma=active_rdma + snode.active_tcp = active_tcp + snode.active_rdma = active_rdma if 'cpu_count' in node_info: snode.cpu = node_info['cpu_count'] @@ -1103,6 +1220,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.write_to_db(kv_store) snode.app_thread_mask = app_thread_mask or "" snode.pollers_mask = pollers_mask or "" + snode.lvol_poller_mask = lvol_poller_mask or "" snode.jm_cpu_mask = jm_cpu_mask snode.alceml_cpu_index = alceml_cpu_index snode.alceml_worker_cpu_index = alceml_worker_cpu_index @@ -1114,11 +1232,14 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.nvmf_port = utils.get_next_dev_port(cluster_id) snode.poller_cpu_cores = poller_cpu_cores or [] + snode.socket = node_socket + snode.iobuf_small_pool_count = small_pool_count or 0 snode.iobuf_large_pool_count = large_pool_count or 0 snode.iobuf_small_bufsize = small_bufsize or 0 snode.iobuf_large_bufsize = large_bufsize or 0 snode.enable_test_device = enable_test_device + snode.firewall_port = firewall_port if cluster.is_single_node: snode.physical_label = 0 @@ -1176,6 +1297,12 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, rpc_client.log_set_print_level("DEBUG") + if snode.lvol_poller_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.lvol_poller_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False + # 5- set app_thread cpu mask if snode.app_thread_mask: ret = rpc_client.thread_get_stats() @@ -1204,15 +1331,15 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, return False if cluster.fabric_tcp: - ret = 
rpc_client.transport_create("TCP", qpair,512*(req_cpu_count+1)) - if not ret: - logger.error(f"Failed to create transport TCP with qpair: {qpair}") - return False + ret = rpc_client.transport_create("TCP", qpair, 512 * (req_cpu_count + 1)) + if not ret: + logger.error(f"Failed to create transport TCP with qpair: {qpair}") + return False if cluster.fabric_rdma: - ret = rpc_client.transport_create("RDMA", qpair,512*(req_cpu_count+1)) - if not ret: - logger.error(f"Failed to create transport RDMA with qpair: {qpair}") - return False + ret = rpc_client.transport_create("RDMA", qpair, 512 * (req_cpu_count + 1)) + if not ret: + logger.error(f"Failed to create transport RDMA with qpair: {qpair}") + return False # 7- set jc singleton mask if snode.jc_singleton_mask: @@ -1262,8 +1389,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.info("Setting Alcemls QOS weights") ret = rpc_client.alceml_set_qos_weights(qos_controller.get_qos_weights_list(cluster_id)) if not ret: - logger.error("Failed to set Alcemls QOS") - return False + logger.error("Failed to set Alcemls QOS") + return False logger.info("Connecting to remote devices") remote_devices = _connect_to_remote_devs(snode) @@ -1277,7 +1404,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode = db_controller.get_storage_node_by_id(snode.get_id()) old_status = snode.status - snode.status = StorageNode.STATUS_ONLINE + snode.status = StorageNode.STATUS_ONLINE snode.updated_at = str(datetime.datetime.now(datetime.timezone.utc)) snode.online_since = str(datetime.datetime.now(datetime.timezone.utc)) snode.write_to_db(db_controller.kv_store) @@ -1297,7 +1424,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, return False node.write_to_db(kv_store) - if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY, Cluster.STATUS_IN_EXPANSION]: + if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, 
Cluster.STATUS_READONLY, + Cluster.STATUS_IN_EXPANSION]: logger.warning( f"The cluster status is not active ({cluster.status}), adding the node without distribs and lvstore") continue @@ -1454,7 +1582,7 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False): if health_controller._check_node_api(snode.mgmt_ip): logger.info("Stopping SPDK container") snode_api = SNodeClient(snode.api_endpoint, timeout=20) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) snode_api.leave_swarm() pci_address = [] for dev in snode.nvme_devices: @@ -1464,7 +1592,6 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False): pci_address.append(dev.pcie_address) except Exception as e: logger.exception(e) - return False set_node_status(node_id, StorageNode.STATUS_REMOVED) @@ -1481,8 +1608,6 @@ def restart_storage_node( spdk_image=None, set_spdk_debug=None, small_bufsize=0, large_bufsize=0, force=False, node_ip=None, reattach_volume=False, clear_data=False, new_ssd_pcie=[], force_lvol_recreate=False): - db_controller = DBController() - kv_store = db_controller.kv_store db_controller = DBController() logger.info("Restarting storage node") @@ -1565,23 +1690,22 @@ def restart_storage_node( snode_api.bind_device_to_spdk(dev['address']) else: node_ip = None - active_tcp=False - active_rdma=False + active_tcp = False + active_rdma = False fabric_tcp = cluster.fabric_tcp fabric_rdma = cluster.fabric_rdma snode_api = SNodeClient(snode.api_endpoint, timeout=5 * 60, retry=3) for nic in snode.data_nics: if fabric_rdma and snode_api.ifc_is_roce(nic["if_name"]): nic.trtype = "RDMA" - active_rdma=True + active_rdma = True if fabric_tcp and snode_api.ifc_is_tcp(nic["if_name"]): active_tcp = True elif fabric_tcp and snode_api.ifc_is_tcp(nic["if_name"]): nic.trtype = "TCP" active_tcp = True - snode.active_tcp=active_tcp - snode.active_rdma=active_rdma - + snode.active_tcp = active_tcp + snode.active_rdma = 
active_rdma logger.info(f"Restarting Storage node: {snode.mgmt_ip}") node_info, _ = snode_api.info() @@ -1602,28 +1726,27 @@ def restart_storage_node( snode.l_cores = node['l-cores'] break - if max_prov: - if not isinstance(max_prov, int): - try: - max_prov = int(max_prov) - max_prov = f"{max_prov}g" - max_prov = int(utils.parse_size(max_prov)) - except Exception: - logger.error(f"Invalid max_prov value: {max_prov}") - return False - - snode.max_prov = max_prov - if snode.max_prov <= 0: - logger.error(f"Incorrect max-prov value {snode.max_prov}") - return False + if max_prov > 0: + try: + max_prov = int(utils.parse_size(max_prov)) + snode.max_prov = max_prov + except Exception as e: + logger.debug(e) + logger.error(f"Invalid max_prov value: {max_prov}") + return False + else: + max_prov = snode.max_prov if spdk_image: snode.spdk_image = spdk_image # Calculate minimum huge page memory - minimum_hp_memory = utils.calculate_minimum_hp_memory(snode.iobuf_small_pool_count, snode.iobuf_large_pool_count, snode.max_lvol, - snode.max_prov, + minimum_hp_memory = utils.calculate_minimum_hp_memory(snode.iobuf_small_pool_count, snode.iobuf_large_pool_count, + snode.max_lvol, + max_prov, len(utils.hexa_to_cpu_list(snode.spdk_cpu_mask))) + minimum_hp_memory = max(minimum_hp_memory, max_prov) + # check for memory if "memory_details" in node_info and node_info['memory_details']: memory_details = node_info['memory_details'] @@ -1636,22 +1759,26 @@ def restart_storage_node( return False # Calculate minimum sys memory - #minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov, memory_details['total']) - minimum_sys_memory = snode.minimum_sys_memory - satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, - minimum_sys_memory, - int(memory_details['free']), - int(memory_details['huge_total'])) - if not satisfied: - logger.error( - f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: 
{utils.humanbytes(snode.max_prov)}.. Exiting") - - snode.spdk_mem = spdk_mem + # minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov, memory_details['total']) + # minimum_sys_memory = snode.minimum_sys_memory + # satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, + # minimum_sys_memory, + # int(memory_details['free']), + # int(memory_details['huge_total'])) + # if not satisfied: + # logger.error( + # f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: {utils.humanbytes(snode.max_prov)}.. Exiting") + minimum_sys_memory = snode.minimum_sys_memory or 0 + snode.spdk_mem = minimum_hp_memory + spdk_debug = snode.spdk_debug if set_spdk_debug: spdk_debug = True snode.spdk_debug = spdk_debug + if minimum_sys_memory: + snode.minimum_sys_memory = minimum_sys_memory + cluster = db_controller.get_cluster_by_id(snode.cluster_id) if cluster.mode == "docker": @@ -1661,22 +1788,29 @@ def restart_storage_node( else: cluster_ip = utils.get_k8s_node_ip() - total_mem = 0 + total_mem = minimum_hp_memory for n in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id): - if n.api_endpoint == snode.api_endpoint: - total_mem += n.spdk_mem - total_mem+= utils.parse_size("500m") + if n.api_endpoint == snode.api_endpoint and n.socket == snode.socket and n.uuid != snode.uuid: + total_mem += (n.spdk_mem + 500000000) results = None try: if new_ssd_pcie and type(new_ssd_pcie) is list: - snode.ssd_pcie.extend(new_ssd_pcie) + for new_ssd in new_ssd_pcie: + if new_ssd not in snode.ssd_pcie: + try: + snode_api.bind_device_to_spdk(new_ssd) + except Exception as e: + logger.error(e) + snode.ssd_pcie.append(new_ssd) + fdb_connection = cluster.db_connection results, err = snode_api.spdk_process_start( snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection, snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, 
multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, + cluster_id=snode.cluster_id, socket=snode.socket, firewall_port=snode.firewall_port) except Exception as e: logger.error(e) @@ -1737,6 +1871,12 @@ def restart_storage_node( rpc_client.log_set_print_level("DEBUG") + if snode.lvol_poller_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.lvol_poller_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False + # 5- set app_thread cpu mask if snode.app_thread_mask: ret = rpc_client.thread_get_stats() @@ -1761,12 +1901,12 @@ def restart_storage_node( qpair = cluster.qpair_count req_cpu_count = len(utils.hexa_to_cpu_list(snode.spdk_cpu_mask)) if cluster.fabric_tcp: - ret = rpc_client.transport_create("TCP", qpair, 512*(req_cpu_count+1)) + ret = rpc_client.transport_create("TCP", qpair, 512 * (req_cpu_count + 1)) if not ret: logger.error(f"Failed to create transport TCP with qpair: {qpair}") return False if cluster.fabric_rdma: - ret = rpc_client.transport_create("RDMA", qpair, 512*(req_cpu_count+1)) + ret = rpc_client.transport_create("RDMA", qpair, 512 * (req_cpu_count + 1)) if not ret: logger.error(f"Failed to create transport RDMA with qpair: {qpair}") return False @@ -1778,11 +1918,14 @@ def restart_storage_node( logger.error("Failed to set jc singleton mask") return False + node_info, _ = snode_api.info() if not snode.ssd_pcie: - node_info, _ = snode_api.info() ssds = node_info['spdk_pcie_list'] else: - ssds = snode.ssd_pcie + ssds = [] + for ssd in snode.ssd_pcie: + if ssd in node_info['spdk_pcie_list']: + ssds.append(ssd) nvme_devs = addNvmeDevices(rpc_client, snode, ssds) if not nvme_devs: @@ -1799,10 +1942,11 @@ def restart_storage_node( active_devices = [] 
removed_devices = [] known_devices_sn = [] - devices_sn_dict = {d.serial_number:d for d in nvme_devs} + devices_sn_dict = {d.serial_number: d for d in nvme_devs} for db_dev in snode.nvme_devices: known_devices_sn.append(db_dev.serial_number) - if db_dev.status in [NVMeDevice.STATUS_FAILED_AND_MIGRATED, NVMeDevice.STATUS_FAILED, NVMeDevice.STATUS_REMOVED]: + if db_dev.status in [NVMeDevice.STATUS_FAILED_AND_MIGRATED, NVMeDevice.STATUS_FAILED, + NVMeDevice.STATUS_REMOVED]: removed_devices.append(db_dev) continue if db_dev.serial_number in devices_sn_dict.keys(): @@ -1811,7 +1955,7 @@ def restart_storage_node( if not db_dev.is_partition and not found_dev.is_partition: db_dev.device_name = found_dev.device_name db_dev.nvme_bdev = found_dev.nvme_bdev - db_dev.nvme_controller =found_dev.nvme_controller + db_dev.nvme_controller = found_dev.nvme_controller db_dev.pcie_address = found_dev.pcie_address # if db_dev.status in [ NVMeDevice.STATUS_ONLINE]: @@ -1819,9 +1963,11 @@ def restart_storage_node( active_devices.append(db_dev) else: logger.info(f"Device not found: {db_dev.get_id()}") - db_dev.status = NVMeDevice.STATUS_REMOVED - removed_devices.append(db_dev) - # distr_controller.send_dev_status_event(db_dev, db_dev.status) + if db_dev.status == NVMeDevice.STATUS_NEW: + snode.nvme_devices.remove(db_dev) + else: + db_dev.status = NVMeDevice.STATUS_REMOVED + removed_devices.append(db_dev) jm_dev_sn = "" if snode.jm_device and "serial_number" in snode.jm_device.device_data_dict: @@ -1840,7 +1986,7 @@ def restart_storage_node( snode.nvme_devices.append(dev) snode.write_to_db(db_controller.kv_store) - if node_ip and len(new_devices)>0: + if node_ip and len(new_devices) > 0: # prepare devices on new node if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0: @@ -1883,11 +2029,9 @@ def restart_storage_node( return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.health_check = True snode.lvstore_status = "" 
snode.write_to_db(db_controller.kv_store) - snode = db_controller.get_storage_node_by_id(snode.get_id()) for db_dev in snode.nvme_devices: if db_dev.status in [NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_ONLINE, @@ -1900,23 +2044,6 @@ def restart_storage_node( db_dev.health_check = True device_events.device_restarted(db_dev) snode.write_to_db(db_controller.kv_store) - # - # # make other nodes connect to the new devices - # logger.info("Make other nodes connect to the node devices") - # snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id) - # for node in snodes: - # if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: - # continue - # node.remote_devices = _connect_to_remote_devs(node, force_connect_restarting_nodes=True) - # node.write_to_db(kv_store) - # - # logger.info(f"Sending device status event") - # snode = db_controller.get_storage_node_by_id(snode.get_id()) - # for db_dev in snode.nvme_devices: - # distr_controller.send_dev_status_event(db_dev, db_dev.status) - # - # if snode.jm_device and snode.jm_device.status in [JMDevice.STATUS_UNAVAILABLE, JMDevice.STATUS_ONLINE]: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) cluster = db_controller.get_cluster_by_id(snode.cluster_id) if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: @@ -1932,7 +2059,7 @@ def restart_storage_node( except RuntimeError: logger.error('Failed to connect to remote devices') return False - node.write_to_db(kv_store) + node.write_to_db() logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) @@ -1979,9 +2106,7 @@ def restart_storage_node( except RuntimeError: logger.error('Failed to connect to remote devices') return False - node.write_to_db(kv_store) - - + node.write_to_db() logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) for db_dev in 
snode.nvme_devices: @@ -2004,11 +2129,11 @@ def restart_storage_node( pools = db_controller.get_pools() for pool in pools: ret = rpc_client.bdev_lvol_set_qos_limit(pool.numeric_id, - pool.max_rw_ios_per_sec, - pool.max_rw_mbytes_per_sec, - pool.max_r_mbytes_per_sec, - pool.max_w_mbytes_per_sec, - ) + pool.max_rw_ios_per_sec, + pool.max_rw_mbytes_per_sec, + pool.max_r_mbytes_per_sec, + pool.max_w_mbytes_per_sec, + ) if not ret: logger.error("RPC failed bdev_lvol_set_qos_limit") return False @@ -2137,51 +2262,28 @@ def list_storage_devices(node_id, is_json): "Health": snode.jm_device.health_check }) - for jm_id in snode.jm_ids: - try: - jm_device = db_controller.get_jm_device_by_id(jm_id) - except KeyError: - continue - - jm_devices.append({ - "UUID": jm_device.uuid, - "Name": jm_device.device_name, - "Size": utils.humanbytes(jm_device.size), - "Status": jm_device.status, - "IO Err": jm_device.io_error, - "Health": jm_device.health_check - }) - - for device in snode.remote_devices: - logger.debug(device) + for remote_device in snode.remote_devices: + logger.debug(remote_device) logger.debug("*" * 20) - name = device.alceml_name - status = device.status - if device.remote_bdev: - name = device.remote_bdev - try: - org_dev = db_controller.get_storage_device_by_id(device.get_id()) - status = org_dev.status - except KeyError: - pass + name = remote_device.alceml_name remote_devices.append({ - "UUID": device.uuid, + "UUID": remote_device.uuid, "Name": name, - "Size": utils.humanbytes(device.size), - "Node ID": device.node_id, - "Status": status, + "Size": utils.humanbytes(remote_device.size), + "Node ID": remote_device.node_id, + "Status": remote_device.status, }) - for device in snode.remote_jm_devices: - logger.debug(device) + for remote_jm_device in snode.remote_jm_devices: + logger.debug(remote_jm_device) logger.debug("*" * 20) remote_devices.append({ - "UUID": device.uuid, - "Name": device.remote_bdev, - "Size": utils.humanbytes(device.size), - "Node ID": 
device.node_id, - "Status": device.status, + "UUID": remote_jm_device.uuid, + "Name": remote_jm_device.remote_bdev, + "Size": utils.humanbytes(remote_jm_device.size), + "Node ID": remote_jm_device.node_id, + "Status": remote_jm_device.status, }) data: dict[str, List[Any]] = { @@ -2228,7 +2330,8 @@ def shutdown_storage_node(node_id, force=False): if force is False: return False for task in tasks: - if task.function_name != JobSchedule.FN_NODE_RESTART: + if task.function_name not in [ + JobSchedule.FN_NODE_RESTART, JobSchedule.FN_SNAPSHOT_REPLICATION, JobSchedule.FN_LVOL_SYNC_DEL]: tasks_controller.cancel_task(task.uuid) logger.info("Shutting down node") @@ -2250,16 +2353,19 @@ def shutdown_storage_node(node_id, force=False): logger.info("Stopping SPDK") try: - SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port) + SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port, snode.cluster_id) except SNodeClientException: logger.error('Failed to kill SPDK') return False pci_address = [] for dev in snode.nvme_devices: if dev.pcie_address not in pci_address: - ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_nvme(dev.pcie_address) - logger.debug(ret) - pci_address.append(dev.pcie_address) + try: + ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_nvme(dev.pcie_address) + logger.debug(ret) + pci_address.append(dev.pcie_address) + except Exception as e: + logger.debug(e) logger.info("Setting node status to offline") set_node_status(node_id, StorageNode.STATUS_OFFLINE) @@ -2358,34 +2464,32 @@ def suspend_storage_node(node_id, force=False): if snode.lvstore_stack_secondary_1: nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: - for node in nodes: + for node in nodes: try: fw_api.firewall_set_port( node.hublvol.nvmf_port, port_type, "block", snode.rpc_port, is_reject=True) fw_api.firewall_set_port( node.lvol_subsys_port, 
port_type, "block", snode.rpc_port, is_reject=True) + time.sleep(0.5) + rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) + rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) except Exception as e: logger.error(e) return False - time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) try: fw_api.firewall_set_port( snode.hublvol.nvmf_port, port_type, "block", snode.rpc_port, is_reject=True) fw_api.firewall_set_port( snode.lvol_subsys_port, port_type, "block", snode.rpc_port, is_reject=True) + time.sleep(0.5) + rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) + rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) + time.sleep(1) except Exception as e: logger.error(e) return False - time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) - time.sleep(1) - - logger.info("Done") return True @@ -2429,7 +2533,7 @@ def resume_storage_node(node_id): return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.write_to_db(db_controller.kv_store) + snode.write_to_db() fw_api = FirewallClient(snode, timeout=20, retry=1) port_type = "tcp" @@ -2437,7 +2541,7 @@ def resume_storage_node(node_id): port_type = "udp" nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: - for node in nodes: + for node in nodes: try: fw_api.firewall_set_port( node.lvol_subsys_port, port_type, "allow", snode.rpc_port) @@ -2465,20 +2569,11 @@ def resume_storage_node(node_id): def get_node_capacity(node_id, history, records_count=20, parse_sizes=True): db_controller = DBController() try: - this_node = db_controller.get_storage_node_by_id(node_id) + node = db_controller.get_storage_node_by_id(node_id) except KeyError: logger.error("Storage node Not found") return - if history: - records_number = utils.parse_history_param(history) - 
if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records = db_controller.get_node_capacity(this_node, records_number) cap_stats_keys = [ "date", "size_total", @@ -2488,6 +2583,8 @@ def get_node_capacity(node_id, history, records_count=20, parse_sizes=True): "size_util", "size_prov_util", ] + prom_client = PromClient(node.cluster_id) + records = prom_client.get_node_metrics(node_id, cap_stats_keys, history) new_records = utils.process_records(records, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -2514,17 +2611,6 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru except KeyError: logger.error("node not found") return False - - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records = db_controller.get_node_stats(node, records_number) - io_stats_keys = [ "date", "read_bytes", @@ -2562,6 +2648,8 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru "write_latency_ticks", ] ) + prom_client = PromClient(node.cluster_id) + records = prom_client.get_node_metrics(node_id, io_stats_keys, history) # combine records new_records = utils.process_records(records, records_count, keys=io_stats_keys) @@ -2669,8 +2757,8 @@ def upgrade_automated_deployment_config(): return False -def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage=0): - +def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, + cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None, k8s=False): # we need minimum of 6 VPCs. RAM 4GB min. Plus 0.2% of the storage. 
total_cores = os.cpu_count() or 0 if total_cores < 6: @@ -2681,7 +2769,8 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod utils.load_kernel_module("uio_pci_generic") nodes_config, system_info = utils.generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, - pci_allowed, pci_blocked, cores_percentage) + pci_allowed, pci_blocked, cores_percentage, force=force, + device_model=device_model, size_range=size_range, nvme_names=nvme_names) if not nodes_config or not nodes_config.get("nodes"): return False utils.store_config_file(nodes_config, constants.NODES_CONFIG_FILE, create_read_only_file=True) @@ -2693,11 +2782,14 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod for node_config in nodes_config["nodes"]: numa = node_config["socket"] huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + node_config["huge_page_memory"] - for numa, huge_page_memory in huge_page_memory_dict.items(): - num_pages = huge_page_memory // (2048 * 1024) - utils.set_hugepages_if_needed(numa, num_pages) + if not k8s: + utils.create_rpc_socket_mount() + # for numa, huge_page_memory in huge_page_memory_dict.items(): + # num_pages = huge_page_memory // (2048 * 1024) + # utils.set_hugepages_if_needed(numa, num_pages) return True + def deploy(ifname, isolate_cores=False): if not ifname: ifname = "eth0" @@ -2721,7 +2813,8 @@ def deploy(ifname, isolate_cores=False): logger.info("Config Validated successfully.") logger.info("NVMe SSD devices found on node:") - stream = os.popen(f"lspci -Dnn | grep -i '\\[{LINUX_DRV_MASS_STORAGE_ID:02}{LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID:02}\\]'") + stream = os.popen( + f"lspci -Dnn | grep -i '\\[{LINUX_DRV_MASS_STORAGE_ID:02}{LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID:02}\\]'") for line in stream.readlines(): logger.info(line.strip()) @@ -2790,6 +2883,10 @@ def deploy_cleaner(): scripts.deploy_cleaner() +def clean_devices(config_path, format=True, force=False): + 
utils.clean_devices(config_path, format=format, force=force) + + def get_host_secret(node_id): db_controller = DBController() try: @@ -2879,12 +2976,12 @@ def health_check(node_id): # subsystem = rpc_client.subsystem_list(dev.nvmf_nqn) - # dev.testing_bdev = test_name - # dev.alceml_bdev = alceml_name - # dev.pt_bdev = pt_name - # # nvme.nvmf_nqn = subsystem_nqn - # # nvme.nvmf_ip = IP - # # nvme.nvmf_port = 4420 + # dev.testing_bdev = test_name + # dev.alceml_bdev = alceml_name + # dev.pt_bdev = pt_name + # # nvme.nvmf_nqn = subsystem_nqn + # # nvme.nvmf_ip = IP + # # nvme.nvmf_port = 4420 except Exception as e: logger.error(f"Failed to connect to node's SPDK: {e}") @@ -2975,9 +3072,9 @@ def set_node_status(node_id, status, reconnect_on_online=True): return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.health_check = True snode.write_to_db(db_controller.kv_store) - distr_controller.send_cluster_map_to_node(snode) + for device in snode.nvme_devices: + distr_controller.send_dev_status_event(device, device.status, target_node=snode) for node in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id): if node.get_id() == snode.get_id(): @@ -2986,7 +3083,8 @@ def set_node_status(node_id, status, reconnect_on_online=True): try: node.remote_devices = _connect_to_remote_devs(node) node.write_to_db() - distr_controller.send_cluster_map_to_node(node) + for device in node.nvme_devices: + distr_controller.send_dev_status_event(device, device.status, target_node=node) except RuntimeError: logger.error(f'Failed to connect to remote devices from node: {node.get_id()}') continue @@ -3009,7 +3107,6 @@ def set_node_status(node_id, status, reconnect_on_online=True): except Exception as e: logger.error("Error establishing hublvol: %s", e) - return True @@ -3040,7 +3137,7 @@ def recreate_lvstore_on_sec(secondary_node): return False # sending to the node that is being restarted (secondary_node) with the secondary group jm_vuid 
(primary_node.jm_vuid) - ret = secondary_node.rpc_client().jc_suspend_compression(jm_vuid=primary_node.jm_vuid, suspend=False) + ret, err = secondary_node.rpc_client().jc_suspend_compression(jm_vuid=primary_node.jm_vuid, suspend=False) if not ret: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task( @@ -3057,7 +3154,6 @@ def recreate_lvstore_on_sec(secondary_node): port_type = "udp" if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_RESTARTING]: - fw_api = FirewallClient(primary_node, timeout=5, retry=2) ### 3- block primary port fw_api.firewall_set_port(primary_node.lvol_subsys_port, port_type, "block", primary_node.rpc_port) @@ -3084,7 +3180,6 @@ def recreate_lvstore_on_sec(secondary_node): logger.error("Error connecting to hublvol: %s", e) # return False - fw_api = FirewallClient(primary_node, timeout=5, retry=2) ### 8- allow port on primary fw_api.firewall_set_port(primary_node.lvol_subsys_port, port_type, "allow", primary_node.rpc_port) @@ -3127,6 +3222,7 @@ def recreate_lvstore(snode, force=False): ### 1- create distribs and raid ret, err = _create_bdev_stack(snode, []) + if err: logger.error(f"Failed to recreate lvstore on node {snode.get_id()}") logger.error(err) @@ -3178,6 +3274,13 @@ def recreate_lvstore(snode, force=False): port_type = "tcp" if sec_node.active_rdma: port_type = "udp" + + ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid) + if not ret: + msg = f"JM replication task found for jm {snode.jm_vuid}" + logger.error(msg) + storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid) + fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port) tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port) @@ -3196,7 +3299,8 @@ def recreate_lvstore(snode, force=False): logger.info("Inflight IO NOT found, continuing") break else: - logger.error(f"Timeout while checking for inflight IO after 10 seconds on node {snode.secondary_node_id}") + 
logger.error( + f"Timeout while checking for inflight IO after 10 seconds on node {snode.secondary_node_id}") if sec_node.status in [StorageNode.STATUS_UNREACHABLE, StorageNode.STATUS_DOWN]: logger.info(f"Secondary node is not online, forcing journal replication on node: {snode.get_id()}") @@ -3214,7 +3318,7 @@ def recreate_lvstore(snode, force=False): def _kill_app(): storage_events.snode_restart_failed(snode) snode_api = SNodeClient(snode.api_endpoint, timeout=5, retry=5) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) # If LVol Store recovery failed then stop spdk process @@ -3331,10 +3435,10 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): logger.error(msg) return False, msg - logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id) + logger.info("Add BDev to subsystem "+f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id, eui64=f"{lvol.vuid:016X}") for iface in snode.data_nics: - if iface.ip4_address and lvol.fabric==iface.trtype.lower(): + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) ret = rpc_client.listeners_create( lvol.nqn, iface.trtype, iface.ip4_address, lvol.subsys_port, ana_state=lvol_ana_state) @@ -3349,9 +3453,9 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): lvol_obj.health_check = True lvol_obj.write_to_db() # set QOS - if lvol.rw_ios_per_sec or lvol.rw_mbytes_per_sec or lvol.r_mbytes_per_sec or lvol.w_mbytes_per_sec : + if lvol.rw_ios_per_sec or lvol.rw_mbytes_per_sec or lvol.r_mbytes_per_sec or lvol.w_mbytes_per_sec: lvol_controller.set_lvol(lvol.uuid, lvol.rw_ios_per_sec, lvol.rw_mbytes_per_sec, - lvol.r_mbytes_per_sec , lvol.w_mbytes_per_sec) 
+ lvol.r_mbytes_per_sec, lvol.w_mbytes_per_sec) return True, None @@ -3388,7 +3492,7 @@ def get_sorted_ha_jms(current_node): continue mgmt_ips.append(jm_dev_to_mgmt_ip[jm_id]) out.append(jm_id) - return out[:constants.HA_JM_COUNT-1] + return out[:current_node.ha_jm_count - 1] def get_node_jm_names(current_node, remote_node=None): @@ -3410,16 +3514,11 @@ def get_node_jm_names(current_node, remote_node=None): if remote_node.jm_device.get_id() == jm_id: jm_list.append(remote_node.jm_device.jm_bdev) continue - for jm_dev in remote_node.remote_jm_devices: - if jm_dev.get_id() == jm_id: - jm_list.append(jm_dev.remote_bdev) - break - else: - for jm_dev in current_node.remote_jm_devices: - if jm_dev.get_id() == jm_id: - jm_list.append(jm_dev.remote_bdev) - break - return jm_list[:constants.HA_JM_COUNT] + + jm_dev = DBController().get_jm_device_by_id(jm_id) + jm_list.append(f"remote_{jm_dev.jm_bdev}n1") + + return jm_list[:current_node.ha_jm_count] def get_secondary_nodes(current_node): @@ -3436,8 +3535,8 @@ def get_secondary_nodes(current_node): if node.get_id() == current_node.get_id(): nod_found = True continue - elif node.status == StorageNode.STATUS_ONLINE and node.mgmt_ip != current_node.mgmt_ip : - # elif node.status == StorageNode.STATUS_ONLINE : + elif node.status == StorageNode.STATUS_ONLINE and node.mgmt_ip != current_node.mgmt_ip: + # elif node.status == StorageNode.STATUS_ONLINE : if node.is_secondary_node: nodes.append(node.get_id()) @@ -3575,7 +3674,7 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo return False # sending to the other node (sec_node) with the primary group jm_vuid (snode.jm_vuid) - ret = sec_node.rpc_client().jc_suspend_compression(jm_vuid=snode.jm_vuid, suspend=False) + ret, err = sec_node.rpc_client().jc_suspend_compression(jm_vuid=snode.jm_vuid, suspend=False) if not ret: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task(sec_node.cluster_id, 
sec_node.get_id(), jm_vuid=snode.jm_vuid) @@ -3600,10 +3699,21 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo sec_node.write_to_db() + storage_events.node_ports_changed(snode) return True + def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): + def _create_distr(snode, name, params): + try: + rpc_client.bdev_distrib_create(**params) + except Exception: + logger.error("Failed to create bdev distrib") + ret = distr_controller.send_cluster_map_to_distr(snode, name) + if not ret: + logger.error("Failed to send cluster map") + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) db_controller = DBController() cluster = db_controller.get_cluster_by_id(snode.cluster_id) @@ -3620,11 +3730,11 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): else: node_bdev_names = [] + thread_list = [] for bdev in stack: type = bdev['type'] name = bdev['name'] params = bdev['params'] - if name in node_bdev_names: continue @@ -3640,23 +3750,21 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): snode.distrib_cpu_index = (snode.distrib_cpu_index + 1) % len(snode.distrib_cpu_cores) params['full_page_unmap'] = cluster.full_page_unmap - ret = rpc_client.bdev_distrib_create(**params) - if ret: - ret = distr_controller.send_cluster_map_to_distr(snode, name) - if not ret: - return False, "Failed to send cluster map" - # time.sleep(1) + t = threading.Thread(target=_create_distr, args=(snode, name, params,)) + thread_list.append(t) + t.start() + ret = True elif type == "bdev_lvstore" and lvstore_stack and not primary_node: ret = rpc_client.create_lvstore(**params) - # if ret and snode.jm_vuid > 0: - # rpc_client.bdev_lvol_set_lvs_ops(snode.lvstore, snode.jm_vuid, snode.lvol_subsys_port) elif type == "bdev_ptnonexcl": ret = rpc_client.bdev_PT_NoExcl_create(**params) elif type == "bdev_raid": - + if thread_list: + for t in thread_list: + t.join() distribs_list = 
bdev["distribs_list"] strip_size_kb = params["strip_size_kb"] ret = rpc_client.bdev_raid_create(name, distribs_list, strip_size_kb=strip_size_kb) @@ -3674,6 +3782,9 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): _remove_bdev_stack(created_bdevs[::-1], rpc_client) return False, f"Failed to create BDev: {name}" + if thread_list: + for t in thread_list: + t.join() return True, None @@ -3792,7 +3903,7 @@ def dump_lvstore(node_id): logger.error("Storage node does not have lvstore") return False - rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=3, retry=0) + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=120) logger.info(f"Dumping lvstore data on node: {snode.get_id()}") file_name = f"LVS_dump_{snode.hostname}_{snode.lvstore}_{str(datetime.datetime.now().isoformat())}.txt" file_path = f"/etc/simplyblock/{file_name}" diff --git a/simplyblock_core/test/test_utils.py b/simplyblock_core/test/test_utils.py index da22a73ba..37b3cb267 100644 --- a/simplyblock_core/test/test_utils.py +++ b/simplyblock_core/test/test_utils.py @@ -1,8 +1,13 @@ +import uuid from typing import ContextManager +from unittest.mock import patch import pytest -from simplyblock_core import utils +from simplyblock_core import utils, storage_node_ops +from simplyblock_core.db_controller import DBController +from simplyblock_core.models.nvme_device import JMDevice, RemoteJMDevice +from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.utils import helpers, parse_thread_siblings_list @@ -146,3 +151,51 @@ def test_parse_thread_siblings_list(input, expected): parse_thread_siblings_list(input) else: assert parse_thread_siblings_list(input) == expected + + +@patch.object(DBController, 'get_jm_device_by_id') +def test_get_node_jm_names(db_controller_get_jm_device_by_id): + + node_1_jm = JMDevice() + node_1_jm.uuid = "node_1_jm_id" + node_1_jm.jm_bdev = 
"node_1_jm" + + node_2_jm = JMDevice() + node_2_jm.uuid = "node_2_jm_id" + node_2_jm.jm_bdev = "node_2_jm" + + node_3_jm = JMDevice() + node_3_jm.uuid = "node_3_jm_id" + node_3_jm.jm_bdev = "node_3_jm" + + node_4_jm = JMDevice() + node_4_jm.uuid = "node_4_jm_id" + node_4_jm.jm_bdev = "node_4_jm" + + def get_jm_device_by_id(jm_id): + for jm in [node_1_jm, node_2_jm, node_3_jm, node_4_jm]: + if jm.uuid == jm_id: + return jm + + db_controller_get_jm_device_by_id.side_effect = get_jm_device_by_id + + node_1 = StorageNode() + node_1.uuid = str(uuid.uuid4()) + node_1.enable_ha_jm = True + node_1.ha_jm_count = 4 + node_1.jm_device = node_1_jm + node_1.jm_ids = ["node_2_jm_id", "node_3_jm_id", "node_4_jm_id"] + + remote_node = StorageNode() + remote_node.uuid = str(uuid.uuid4()) + remote_node.enable_ha_jm = True + remote_node.jm_ids = [] + remote_node.jm_device = node_2_jm + remote_node.remote_jm_devices = [ + RemoteJMDevice({"uuid": node_1_jm.uuid, "remote_bdev": f"rem_{node_1_jm.jm_bdev}"}), + RemoteJMDevice({"uuid": node_3_jm.uuid, "remote_bdev": f"rem_{node_3_jm.jm_bdev}"}), + RemoteJMDevice({"uuid": node_4_jm.uuid, "remote_bdev": f"rem_{node_4_jm.jm_bdev}"})] + + jm_names = storage_node_ops.get_node_jm_names(node_1, remote_node=remote_node) + print(f"jm_names: {len(jm_names)}", jm_names) + diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 941414708..6e15fba9c 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1,4 +1,5 @@ # coding=utf-8 +import glob import json import logging import math @@ -10,10 +11,14 @@ import sys import uuid import time -import socket -from typing import Union, Any, Optional, Tuple +from datetime import datetime, timezone +from typing import Union, Any, Optional, Tuple, List, Dict, Iterable +from docker import DockerClient from kubernetes import client, config -from kubernetes.client import ApiException +from kubernetes.client import ApiException, V1Deployment, 
V1DeploymentSpec, V1ObjectMeta, \ + V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \ + V1LabelSelector, V1ResourceRequirements + import docker from prettytable import PrettyTable from docker.errors import APIError, DockerException, ImageNotFound, NotFound @@ -145,7 +150,7 @@ def print_table(data: list, title=None): } -def humanbytes(size: int, mode: str = 'iec') -> str: # show size using 1024 base +def humanbytes(size: int, mode: str = 'iec') -> str: # show size using 1024 base """Return the given bytes as a human friendly including the appropriate unit.""" if not size or size < 0: return '0 B' @@ -194,16 +199,8 @@ def get_k8s_node_ip(): logger.error("No mgmt nodes was found in the cluster!") return False - mgmt_ips = [node.mgmt_ip for node in nodes] - - for ip in mgmt_ips: - try: - with socket.create_connection((ip, 10250), timeout=2): - return ip - except Exception as e: - print(e) - raise e - return False + for node in nodes: + return node.mgmt_ip def dict_agg(data, mean=False, keys=None): @@ -447,7 +444,8 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - assigned["alceml_worker_cpu_cores"] = vcpu + assigned["lvol_poller_core"] = vcpu + # assigned["alceml_worker_cpu_cores"] = vcpu vcpu = reserve_n(1) assigned["alceml_cpu_cores"] = vcpu elif (len(vcpu_list) < 22): @@ -455,8 +453,10 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - vcpus = reserve_n(1) - assigned["alceml_worker_cpu_cores"] = vcpus + vcpu = reserve_n(1) + assigned["lvol_poller_core"] = vcpu + # vcpus = reserve_n(1) + # assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(2) assigned["alceml_cpu_cores"] = vcpus else: @@ -464,20 +464,35 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpus vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - vcpus = reserve_n(int(alceml_count / 3) + ((alceml_count % 
3) > 0)) - assigned["alceml_worker_cpu_cores"] = vcpus + # vcpus = reserve_n(int(alceml_count / 3) + ((alceml_count % 3) > 0)) + # assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(alceml_count) assigned["alceml_cpu_cores"] = vcpus + vcpus = reserve_n(2) + assigned["lvol_poller_core"] = vcpus dp = int(len(remaining) / 2) - vcpus = reserve_n(dp) - assigned["distrib_cpu_cores"] = vcpus - vcpus = reserve_n(dp) - assigned["poller_cpu_cores"] = vcpus + if 17 > dp >= 12: + poller_n = len(remaining) - 12 + vcpus = reserve_n(12) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(poller_n) + assigned["poller_cpu_cores"] = vcpus + elif dp >= 17: + poller_n = len(remaining) - 24 + vcpus = reserve_n(24) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(poller_n) + assigned["poller_cpu_cores"] = vcpus + else: + vcpus = reserve_n(dp) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(dp) + assigned["poller_cpu_cores"] = vcpus if len(remaining) > 0: if len(assigned["poller_cpu_cores"]) == 0: assigned["distrib_cpu_cores"] = assigned["poller_cpu_cores"] = reserve_n(1) else: - assigned["distrib_cpu_cores"] = assigned["distrib_cpu_cores"] + reserve_n(1) + assigned["poller_cpu_cores"] = assigned["poller_cpu_cores"] + reserve_n(1) # Return the individual threads as separate values return ( assigned.get("app_thread_core", []), @@ -486,7 +501,8 @@ def reserve_n(count): assigned.get("alceml_cpu_cores", []), assigned.get("alceml_worker_cpu_cores", []), assigned.get("distrib_cpu_cores", []), - assigned.get("jc_singleton_core", []) + assigned.get("jc_singleton_core", []), + assigned.get("lvol_poller_core", []), ) @@ -536,11 +552,12 @@ def calculate_pool_count(alceml_count, number_of_distribs, cpu_count, poller_cou poller_number = poller_count if poller_count else cpu_count small_pool_count = 384 * (alceml_count + number_of_distribs + 3 + poller_count) + ( - 6 + alceml_count + number_of_distribs) * 256 + poller_number * 127 + 384 + 128 * poller_number + 
constants.EXTRA_SMALL_POOL_COUNT + + 6 + alceml_count + number_of_distribs) * + poller_number * 127 + 384 + 128 * poller_number + constants.EXTRA_SMALL_POOL_COUNT large_pool_count = 48 * (alceml_count + number_of_distribs + 3 + poller_count) + ( 6 + alceml_count + number_of_distribs) * 32 + poller_number * 15 + 384 + 16 * poller_number + constants.EXTRA_LARGE_POOL_COUNT - return int(4.0 * small_pool_count), int(2.5 * large_pool_count) + return int(small_pool_count), int(large_pool_count) def calculate_minimum_hp_memory(small_pool_count, large_pool_count, lvol_count, max_prov, cpu_count): @@ -551,9 +568,9 @@ def calculate_minimum_hp_memory(small_pool_count, large_pool_count, lvol_count, extra buffer 2GB return: minimum_hp_memory in bytes ''' - pool_consumption = (small_pool_count * 8 + large_pool_count * 128) / 1024 + 1092 - memory_consumption = (4 * cpu_count + 1.0277 * pool_consumption + 25 * lvol_count) * (1024 * 1024) + ( - 250 * 1024 * 1024) * 1.1 * convert_size(max_prov, 'TiB') + constants.EXTRA_HUGE_PAGE_MEMORY + pool_consumption = (small_pool_count * 8 + large_pool_count * 128) / 1024 + memory_consumption = (4 * cpu_count + 1.1 * pool_consumption + 22 * lvol_count) * ( + 1024 * 1024) + constants.EXTRA_HUGE_PAGE_MEMORY return int(1.2 * memory_consumption) @@ -626,7 +643,7 @@ def get_logger(name=""): if not logg.hasHandlers(): logger_handler = logging.StreamHandler(stream=sys.stdout) - logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s')) + logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(thread)d: %(levelname)s: %(message)s')) logg.addHandler(logger_handler) # gelf_handler = GELFTCPHandler('0.0.0.0', constants.GELF_PORT) # logg.addHandler(gelf_handler) @@ -712,6 +729,7 @@ def get_total_cpu_cores(mapping: str) -> int: items = [pair for pair in mapping.split(",") if "@" in pair] return len(items) + def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> int: """Convert the given number of 
bytes to target unit @@ -726,6 +744,14 @@ def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> in return math.ceil(raw) if round_up else int(raw) +def first_six_chars(s: str) -> str: + """ + Returns the first six characters of a given string. + If the string is shorter than six characters, returns the entire string. + """ + return s[:6] + + def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: @@ -735,7 +761,10 @@ def nearest_upper_power_of_2(n): def strfdelta(tdelta): - remainder = int(tdelta.total_seconds()) + return strfdelta_seconds(int(tdelta.total_seconds())) + + +def strfdelta_seconds(remainder: int) -> str: possible_fields = ('W', 'D', 'H', 'M', 'S') constants = {'W': 604800, 'D': 86400, 'H': 3600, 'M': 60, 'S': 1} values = {} @@ -819,7 +848,7 @@ def get_next_rpc_port(cluster_id): from simplyblock_core.db_controller import DBController db_controller = DBController() - port = 8080 + port = constants.RPC_PORT_RANGE_START used_ports = [] for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id): if node.rpc_port > 0: @@ -834,6 +863,22 @@ def get_next_rpc_port(cluster_id): return 0 +def get_next_fw_port(cluster_id): + from simplyblock_core.db_controller import DBController + db_controller = DBController() + + port = constants.FW_PORT_START + used_ports = [] + for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id): + if node.firewall_port > 0: + used_ports.append(node.firewall_port) + next_port = port + while True: + if next_port not in used_ports: + return next_port + next_port += 1 + + def get_next_dev_port(cluster_id): from simplyblock_core.db_controller import DBController db_controller = DBController() @@ -1093,7 +1138,7 @@ def addNvmeDevices(rpc_client, snode, devs): serial_number = nvme_driver_data['ctrlr_data']['serial_number'] if snode.id_device_by_nqn: if "ns_data" in nvme_driver_data: - serial_number = nvme_driver_data['pci_address'] + 
nvme_driver_data['ns_data']['id'] + serial_number = nvme_driver_data['pci_address'] + str(nvme_driver_data['ns_data']['id']) else: logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}") @@ -1231,9 +1276,10 @@ def get_nvme_pci_devices(): return [], [] -def detect_nvmes(pci_allowed, pci_blocked): +def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range, nvme_names): pci_addresses, blocked_devices = get_nvme_pci_devices() ssd_pci_set = set(pci_addresses) + claim_devices_to_nvme() # Normalize SSD PCI addresses and user PCI list if pci_allowed: @@ -1245,10 +1291,20 @@ def detect_nvmes(pci_allowed, pci_blocked): # Check for unmatched addresses unmatched = user_pci_set - ssd_pci_set if unmatched: - logger.error(f"Invalid PCI addresses: {', '.join(unmatched)}") - return [] - - pci_addresses = list(user_pci_set) + logger.warn(f"Invalid PCI addresses: {', '.join(unmatched)}") + pci_addresses = user_pci_set & ssd_pci_set + else: + pci_addresses = list(user_pci_set) + for pci in pci_addresses: + pci_utils.ensure_driver(pci, 'nvme', override=True) + logger.debug(f"Found nvme devices are {pci_addresses}") + elif device_model and size_range: + pci_addresses = query_nvme_ssd_by_model_and_size(device_model, size_range) + logger.debug(f"Found nvme devices are {pci_addresses}") + pci_allowed = pci_addresses + elif nvme_names: + pci_addresses = query_nvme_ssd_by_namespace_names(nvme_names) + pci_allowed = pci_addresses elif pci_blocked: user_pci_set = set( addr if len(addr.split(":")[0]) == 4 else f"0000:{addr}" @@ -1259,19 +1315,14 @@ def detect_nvmes(pci_allowed, pci_blocked): for pci in pci_addresses: pci_utils.ensure_driver(pci, 'nvme') - nvme_base_path = '/sys/class/nvme/' nvme_devices = [dev for dev in os.listdir(nvme_base_path) if dev.startswith('nvme')] nvmes = {} for dev in nvme_devices: - dev_name = os.path.basename(dev) - pattern = re.compile(rf"^{re.escape(dev_name)}n\d+$") - if any(pattern.match(block_device) for block_device in 
blocked_devices): - logger.debug(f"device {dev_name} is busy.. skipping") - continue - device_symlink = os.path.join(nvme_base_path, dev) try: - pci_address = "unknown" + dev_name = os.path.basename(dev) + pattern = re.compile(rf"^{re.escape(dev_name)}n\d+$") + device_symlink = os.path.join(nvme_base_path, dev) # Resolve the real path to get the actual device path real_path = os.path.realpath(device_symlink) @@ -1280,12 +1331,15 @@ def detect_nvmes(pci_allowed, pci_blocked): address_file = os.path.join(real_path, 'address') with open(address_file, 'r') as f: pci_address = f.read().strip() - + if any(pattern.match(block_device) for block_device in blocked_devices): + if pci_address not in pci_allowed: + logger.debug(f"device {dev_name} is busy.. skipping") + continue + logger.warning(f"PCI {pci_address} passed as allowed PCI, even it has partitions.. Formatting it now") # Read the NUMA node information numa_node_file = os.path.join(real_path, 'numa_node') with open(numa_node_file, 'r') as f: numa_node = f.read().strip() - if pci_address not in pci_addresses: continue nvmes[dev_name] = {"pci_address": pci_address, "numa_node": numa_node} @@ -1300,11 +1354,11 @@ def calculate_unisolated_cores(cores, cores_percentage=0): if cores_percentage: return math.ceil(total * (100 - cores_percentage) / 100) if total <= 10: - return 1 - if total <= 20: return 2 - if total <= 28: + if total <= 20: return 3 + if total <= 28: + return 4 return math.ceil(total * 0.15) @@ -1312,6 +1366,103 @@ def get_core_indexes(core_to_index, list_of_cores): return [core_to_index[core] for core in list_of_cores if core in core_to_index] +def build_unisolated_stride( + all_cores: List[int], + num_unisolated: int, + client_qpair_count: int, + pool_stride: int = 2, +) -> List[int]: + """ + Build a list of 'unisolated' CPUs by picking from per-qpair pools. 
+ + Pools are contiguous slices of all_cores: + total=30, q=3 -> [0..9], [10..19], [20..29] + + Selection: + round-robin across pools, and within each pool advance by pool_stride + e.g. stride=2 -> 0,2,4,... then 10,12,14,... then 20,22,24,... + + If hyper_thread=True, append sibling right after each core: + sibling = cpu +/- (total//2) + """ + hyper_thread = is_hyperthreading_enabled_via_siblings() + if num_unisolated <= 0: + return [] + if client_qpair_count <= 0: + raise ValueError("client_qpair_count must be > 0") + if pool_stride <= 0: + raise ValueError("pool_stride must be > 0") + + cores = sorted(all_cores) + total = len(cores) + if total == 0: + return [] + + core_set = set(cores) + + half: int = 0 + if hyper_thread: + if total % 2 != 0: + raise ValueError(f"hyper_thread=True but total logical CPUs ({total}) is not even") + half = total // 2 + + # Build pools + pool_size = math.ceil(total / client_qpair_count) + pools = [cores[i * pool_size: min((i + 1) * pool_size, total)] for i in range(client_qpair_count)] + pools = [p for p in pools if p] # drop empties + + # Per-pool index (within each pool) + idx = [0] * len(pools) + + out: List[int] = [] + used = set() + + def add_cpu(cpu: int) -> None: + if cpu in core_set and cpu not in used and len(out) < num_unisolated: + out.append(cpu) + used.add(cpu) + + while len(out) < num_unisolated: + progress = False + + for pi, pool in enumerate(pools): + if len(out) >= num_unisolated: + break + + # find next candidate in this pool using stride + j = idx[pi] + while j < len(pool) and pool[j] in used: + j += pool_stride + if j >= len(pool): + continue + + cpu = pool[j] + idx[pi] = j + pool_stride + + add_cpu(cpu) + progress = True + + if hyper_thread and len(out) < num_unisolated: + sib = cpu + half if cpu < half else cpu - half + add_cpu(sib) + + if progress: + continue + + # Fallback: fill any remaining from whatever is unused (should rarely happen) + for cpu in cores: + if len(out) >= num_unisolated: + break + if cpu 
not in used: + add_cpu(cpu) + if hyper_thread and len(out) < num_unisolated: + sib = cpu + half if cpu < half else cpu - half + add_cpu(sib) + break + + return out[:num_unisolated] + + def generate_core_allocation(cores_by_numa, sockets_to_use, nodes_per_socket, cores_percentage=0): node_distribution: dict = {} # Iterate over each NUMA node @@ -1319,20 +1470,8 @@ def generate_core_allocation(cores_by_numa, sockets_to_use, nodes_per_socket, co if numa_node not in cores_by_numa: continue all_cores = sorted(cores_by_numa[numa_node]) - total_cores = len(all_cores) num_unisolated = calculate_unisolated_cores(all_cores, cores_percentage) - - unisolated = [] - half = total_cores // 2 - for i in range(num_unisolated): - if i % 2 == 0: - index = i // 2 - else: - index = (i - 1) // 2 - if i % 2 == 0: - unisolated.append(all_cores[index]) - else: - unisolated.append(all_cores[half + index]) + unisolated = build_unisolated_stride(all_cores, num_unisolated, constants.CLIENT_QPAIR_COUNT) available_cores = [c for c in all_cores if c not in unisolated] q1 = len(available_cores) // 4 @@ -1420,14 +1559,17 @@ def regenerate_config(new_config, old_config, force=False): "alceml_cpu_cores": get_core_indexes(core_to_index, distribution[3]), "alceml_worker_cpu_cores": get_core_indexes(core_to_index, distribution[4]), "distrib_cpu_cores": get_core_indexes(core_to_index, distribution[5]), - "jc_singleton_core": get_core_indexes(core_to_index, distribution[6])} + "jc_singleton_core": get_core_indexes(core_to_index, distribution[6]), + "lvol_poller_core": get_core_indexes(core_to_index, distribution[7])} isolated_cores = old_config["nodes"][i]["isolated"] number_of_distribs = 2 number_of_distribs_cores = len(old_config["nodes"][i]["distribution"]["distrib_cpu_cores"]) number_of_poller_cores = len(old_config["nodes"][i]["distribution"]["poller_cpu_cores"]) - if number_of_distribs_cores > 2: + if 12 >= number_of_distribs_cores > 2: number_of_distribs = number_of_distribs_cores + else: + 
number_of_distribs = 12 old_config["nodes"][i]["number_of_distribs"] = number_of_distribs old_config["nodes"][i]["ssd_pcis"] = new_config["nodes"][i]["ssd_pcis"] old_config["nodes"][i]["nic_ports"] = new_config["nodes"][i]["nic_ports"] @@ -1457,7 +1599,7 @@ def regenerate_config(new_config, old_config, force=False): all_isolated_cores = set() for node in old_config["nodes"]: if len(node["ssd_pcis"]) == 0: - logger.error(f"There are not enough SSD devices on numa node {node['socket']}") + logger.error(f"There are no enough SSD devices on numa node {node['socket']}") return False total_required_memory += node["huge_page_memory"] + node["sys_memory"] node_cores_set = set(node["isolated"]) @@ -1471,7 +1613,7 @@ def regenerate_config(new_config, old_config, force=False): def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0): + cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None): system_info = {} nodes_config: dict = {"nodes": []} @@ -1479,7 +1621,25 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a validate_sockets(sockets_to_use, cores_by_numa) logger.debug(f"Cores by numa {cores_by_numa}") nics = detect_nics() - nvmes = detect_nvmes(pci_allowed, pci_blocked) + nvmes = detect_nvmes(pci_allowed, pci_blocked, device_model, size_range, nvme_names) + if not nvmes: + logger.error( + "There are no enough SSD devices on system, you may run 'sbctl sn clean-devices', to clean devices stored in /etc/simplyblock/sn_config_file") + return False, False + if force: + nvme_devices = " ".join([f"/dev/{d}n1" for d in nvmes.keys()]) + logger.warning(f"Formating Nvme devices {nvme_devices}") + answer = input("Type YES/Y to continue: ").strip().lower() + if answer not in ("yes", "y"): + logger.warning("Aborted by user.") + exit(1) + logger.info("OK, continuing formating...") + for nvme_device in nvmes.keys(): + nvme_device_path = f"/dev/{nvme_device}n1" + 
clean_partitions(nvme_device_path) + nvme_json_string = get_idns(nvme_device_path) + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + format_nvme_device(nvme_device_path, lbaf_id) for nid in sockets_to_use: if nid in cores_by_numa: @@ -1497,7 +1657,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a for nvme, val in nvmes.items(): pci = val["pci_address"] - numa = val["numa_node"] + numa = int(val["numa_node"]) pci_utils.unbind_driver(pci) if numa in sockets_to_use: system_info[numa]["nvmes"].append(pci) @@ -1550,10 +1710,11 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a "jm_cpu_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][1]), "poller_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][2]), "alceml_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][3]), - "alceml_worker_cpu_cores": get_core_indexes(core_group["core_to_index"], - core_group["distribution"][4]), + # "alceml_worker_cpu_cores": get_core_indexes(core_group["core_to_index"], + # core_group["distribution"][4]), "distrib_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][5]), - "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]) + "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]), + "lvol_poller_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][7]) }, "ssd_pcis": [], "nic_ports": system_info[nid]["nics"] @@ -1583,7 +1744,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a node_info["large_pool_count"] = large_pool_count node_info["max_lvol"] = max_lvol node_info["max_size"] = max_prov - node_info["huge_page_memory"] = minimum_hp_memory + node_info["huge_page_memory"] = max(minimum_hp_memory, max_prov) minimum_sys_memory = 
calculate_minimum_sys_memory(max_prov) node_info["sys_memory"] = minimum_sys_memory all_nodes.append(node_info) @@ -1596,7 +1757,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a all_isolated_cores = set() for node in all_nodes: if len(node["ssd_pcis"]) == 0: - logger.error(f"There are not enough SSD devices on numa node {node['socket']}") + logger.error(f"There are no enough SSD devices on numa node {node['socket']}") return False, False total_required_memory += node["huge_page_memory"] + node["sys_memory"] node_cores_set = set(node["isolated"]) @@ -1611,6 +1772,29 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a return final_config, system_info +def get_nvme_name_from_pci(pci_address): + # Search for the PCI address in the sysfs tree for NVMe devices + path = f"/sys/bus/pci/devices/{pci_address}/nvme/nvme*" + matches = glob.glob(path) + + if matches: + # returns 'nvme0' + return os.path.basename(matches[0]) + return None + + +def format_device_with_4k(pci_device): + try: + nvme_device = get_nvme_name_from_pci(pci_device) + nvme_device_path = f"/dev/{nvme_device}n1" + clean_partitions(nvme_device_path) + nvme_json_string = get_idns(nvme_device_path) + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + format_nvme_device(nvme_device_path, lbaf_id) + except Exception as e: + logger.error(f"Failed to format device with 4K {e}") + + def set_hugepages_if_needed(node, hugepages_needed, page_size_kb=2048): """Set hugepages for a specific NUMA node if current number is less than needed.""" hugepage_path = f"/sys/devices/system/node/node{node}/hugepages/hugepages-{page_size_kb}kB/nr_hugepages" @@ -1657,8 +1841,7 @@ def validate_node_config(node): required_distribution_fields = [ "app_thread_core", "jm_cpu_core", "poller_cpu_cores", - "alceml_cpu_cores", "alceml_worker_cpu_cores", - "distrib_cpu_cores", "jc_singleton_core" + "alceml_cpu_cores", "distrib_cpu_cores", "jc_singleton_core" ] # Check top-level 
fields @@ -1928,6 +2111,258 @@ def load_kube_config_with_fallback(): config.load_kube_config() +def patch_cr_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + status_patch: dict, +): + """ + Patch the status subresource of a Custom Resource. + + status_patch example: + {"": "": } + """ + + load_kube_config_with_fallback() + + api = client.CustomObjectsApi() + + body = { + "status": status_patch + } + + try: + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body=body, + ) + except ApiException as e: + logger.error( + f"Failed to patch status for {name}: {e.reason} {e.body}" + ) + + +def patch_cr_node_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + node_uuid: str, + node_mgmt_ip: str, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, +): + """ + Patch status.nodes[*] fields for a specific node identified by UUID. 
+ + Operations: + - Update a node (by uuid or mgmtIp) + - Remove a node (by uuid or mgmtIp) + + updates example: + {"health": "true"} + {"status": "offline"} + {"capacity": {"sizeUsed": 1234}} + """ + load_kube_config_with_fallback() + api = client.CustomObjectsApi() + + try: + cr = api.get_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + ) + + status_nodes = cr.get("status", {}).get("nodes", []) + if not status_nodes: + raise RuntimeError("CR has no status.nodes") + + spec_worker_nodes = cr.get("spec", {}).get("workerNodes", []) + + found = False + new_status_nodes = [] + removed_hostname = None + + for node in status_nodes: + match = ( + node.get("uuid") == node_uuid or + node.get("mgmtIp") == node_mgmt_ip + ) + + if match: + found = True + removed_hostname = node.get("hostname") + + if remove: + continue + + if updates: + node.update(updates) + + new_status_nodes.append(node) + + if not found: + raise RuntimeError( + f"Node not found (uuid={node_uuid}, mgmtIp={node_mgmt_ip})" + ) + + if remove and removed_hostname: + new_worker_nodes = [ + n for n in spec_worker_nodes if n != removed_hostname + ] + + api.patch_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body={ + "spec": { + "workerNodes": new_worker_nodes + } + }, + ) + + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body={ + "status": { + "nodes": new_status_nodes + } + }, + ) + + except ApiException as e: + logger.error( + f"Failed to patch node for {name}: {e.reason} {e.body}" + ) + + +def patch_cr_lvol_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + lvol_uuid: Optional[str] = None, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, + add: Optional[Dict[str, Any]] = None, +): + """ + Patch status.lvols[*] for an LVOL 
CustomResource. + + Operations: + - Update an existing LVOL (by uuid) + - Remove an LVOL (by uuid) + - Add a new LVOL entry + + Parameters: + lvol_uuid: + UUID of the lvol entry to update or remove + + updates: + Dict of fields to update on the matched lvol + Example: + {"status": "offline", "health": False} + + remove: + If True, remove the lvol identified by lvol_uuid + + add: + Full lvol dict to append to status.lvols + """ + + load_kube_config_with_fallback() + api = client.CustomObjectsApi() + + now = datetime.now(timezone.utc).isoformat() + + try: + cr = api.get_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + ) + + status = cr.get("status", {}) + lvols = status.get("lvols", []) + + # Ensure list exists + if lvols is None: + lvols = [] + + # ---- ADD ---- + if add is not None: + add.setdefault("createDt", now) + add["updateDt"] = now + lvols.append(add) + + # ---- UPDATE / REMOVE ---- + if lvol_uuid: + found = False + new_lvols = [] + + for lvol in lvols: + if lvol.get("uuid") == lvol_uuid: + found = True + + if remove: + continue + + if updates: + lvol.update(updates) + lvol["updateDt"] = now + + new_lvols.append(lvol) + + if not found: + raise RuntimeError(f"LVOL not found (uuid={lvol_uuid})") + + lvols = new_lvols + + body = { + "status": { + "lvols": lvols + } + } + + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body=body, + ) + + except ApiException as e: + logger.error( + f"Failed to patch lvol status for {name}: {e.reason} {e.body}" + ) + + def get_node_name_by_ip(target_ip: str) -> str: load_kube_config_with_fallback() v1 = client.CoreV1Api() @@ -2031,17 +2466,438 @@ def patch_prometheus_configmap(username: str, password: str): load_kube_config_with_fallback() v1 = client.CoreV1Api() - cm = v1.read_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE) - 
prometheus_yml = cm.data.get("prometheus.yml", "") + try: + cm = v1.read_namespaced_config_map( + name="sbcli-simplyblock-prometheus-config", + namespace=constants.K8S_NAMESPACE + ) + except client.exceptions.ApiException as e: + logger.error(f"Failed to read ConfigMap: {e}") + return False + + try: + prometheus_yml = cm.data.get("prometheus.yml", "") + if not prometheus_yml: + logger.error("prometheus.yml key not found in ConfigMap.") + return False - prometheus_yml = re.sub(r"username:*", f"username: '{username}'", prometheus_yml) - prometheus_yml = re.sub(r"password:*", f"password: '{password}'", prometheus_yml) + try: + prometheus_yml = re.sub(r"username:.*", f"username: '{username}'", prometheus_yml) + prometheus_yml = re.sub(r"password:.*", f"password: '{password}'", prometheus_yml) + except re.error as e: + logger.error(f"Regex error while patching Prometheus YAML: {e}") + return False - patch_body = { - "data": { - "prometheus.yml": prometheus_yml + patch_body = { + "data": { + "prometheus.yml": prometheus_yml + } } - } - v1.patch_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE, body=patch_body) - logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.") + v1.patch_namespaced_config_map( + name="sbcli-simplyblock-prometheus-config", + namespace=constants.K8S_NAMESPACE, + body=patch_body + ) + + logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.") + return True + + except client.exceptions.ApiException as e: + logger.error(f"Failed to patch ConfigMap: {e}") + return False + + except Exception as e: + logger.error(f"Unexpected error while patching ConfigMap: {e}") + return False + + +def create_docker_service(cluster_docker: DockerClient, service_name: str, service_file: str, service_image: str): + logger.info(f"Creating service: {service_name}") + cluster_docker.services.create( + image=service_image, + command=service_file, + 
name=service_name, + mounts=["/etc/foundationdb:/etc/foundationdb"], + env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], + networks=["host"], + constraints=["node.role == manager"], + labels={ + "com.docker.stack.image": service_image, + "com.docker.stack.namespace": "app"} + ) + + +def create_k8s_service(namespace: str, deployment_name: str, + container_name: str, service_file: str, container_image: str): + logger.info(f"Creating deployment: {deployment_name} in namespace {namespace}") + load_kube_config_with_fallback() + apps_v1 = client.AppsV1Api() + + env_list = [ + V1EnvVar( + name="SIMPLYBLOCK_LOG_LEVEL", + value_from={"config_map_key_ref": {"name": "simplyblock-config", "key": "LOG_LEVEL"}} + ) + ] + + volume_mounts = [ + V1VolumeMount( + name="fdb-cluster-file", + mount_path="/etc/foundationdb/fdb.cluster", + sub_path="fdb.cluster" + ) + ] + + volumes = [ + V1Volume( + name="fdb-cluster-file", + config_map=V1ConfigMapVolumeSource( + name="simplyblock-fdb-cluster-config", + items=[{"key": "cluster-file", "path": "fdb.cluster"}] + ) + ) + ] + + container = V1Container( + name=container_name, + image=container_image, + command=["python", service_file], + env=env_list, + volume_mounts=volume_mounts, + resources=V1ResourceRequirements( + requests={"cpu": "200m", "memory": "256Mi"}, + limits={"cpu": "400m", "memory": "1Gi"} + ) + ) + + pod_spec = V1PodSpec( + containers=[container], + volumes=volumes, + host_network=True, + dns_policy="ClusterFirstWithHostNet" + ) + + pod_template = V1PodTemplateSpec( + metadata=V1ObjectMeta(labels={"app": deployment_name}), + spec=pod_spec + ) + + deployment_spec = V1DeploymentSpec( + replicas=1, + selector=V1LabelSelector(match_labels={"app": deployment_name}), + template=pod_template + ) + + deployment = V1Deployment( + api_version="apps/v1", + kind="Deployment", + metadata=V1ObjectMeta(name=deployment_name, namespace=namespace), + spec=deployment_spec + ) + + apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment) + 
logger.info(f"Deployment {deployment_name} created successfully.") + + +def clean_partitions(nvme_device: str): + command = ['wipefs', '-a', nvme_device] + print(" ".join(command)) + try: + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" + + +def find_lbaf_id(json_data: str, target_ms: int, target_ds: int) -> int: + try: + data = json.loads(json_data) + except json.JSONDecodeError: + print("Error: Invalid JSON format provided.") + return 0 + + lbafs_list: List[Dict[str, int]] = data.get('lbafs', []) + + # LBAF IDs are 1-based, so we use enumerate starting from 1 + for index, lbaf in enumerate(lbafs_list, start=0): + if lbaf.get('ms') == target_ms and lbaf.get('ds') == target_ds: + return index + + return 0 + + +def get_idns(nvme_device: str): + command = ['nvme', 'id-ns', nvme_device, '--output-format', 'json'] + try: + # Run the command + # capture_output=True captures stdout and stderr. + # text=True decodes the output as text (using default encoding, typically UTF-8). 
+ result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + # Return the captured standard output + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" + + +def is_namespace_4k_from_nvme_list(device_path: str) -> bool: + """ + Returns True if nvme list JSON shows SectorSize == 4096 for the given DevicePath + (e.g. '/dev/nvme3n1'). + """ + try: + out = subprocess.check_output(["nvme", "list", "--output-format", "json"], text=True) + data = json.loads(out) + + for dev in data.get("Devices", []): + if dev.get("DevicePath") == device_path: + return int(dev.get("SectorSize", 0)) == 4096 + + # Not found in list + return False + + except subprocess.CalledProcessError: + print("Error: nvme list failed") + return False + except (ValueError, json.JSONDecodeError) as e: + print(f"Error parsing nvme list output: {e}") + return False + + +def format_nvme_device(nvme_device: str, lbaf_id: int): + if is_namespace_4k_from_nvme_list(nvme_device): + logger.debug(f"Device {nvme_device} already formatted with 4K...skipping") + return + command = ['nvme', 'format', nvme_device, f"--lbaf={lbaf_id}", '--force'] + print(" ".join(command)) + try: + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard 
Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" + + +def get_nvme_list_verbose() -> str: + """ + Executes the 'nvme list -v' command and returns the output. + + Returns: + str: The standard output of the command, or an error message + if the command fails. + """ + command = ['nvme', 'list', '-v', '--output-format', 'json'] + + try: + # Run the command + # capture_output=True captures stdout and stderr. + # text=True decodes the output as text (using default encoding, typically UTF-8). + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + # Return the captured standard output + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" 
+ + +def query_nvme_ssd_by_model_and_size(model: str, size_range: str) -> list: + if not model: + print("No model specified.") + return [] + if not size_range: + print("No size range specified.") + return [] + + size_from = 0 + size_to = 0 + try: + range_split = size_range.split('-') + if len(range_split) == 1: + size_from = parse_size(range_split[0]) + elif len(range_split) == 2: + size_from = parse_size(range_split[0]) + size_to = parse_size(range_split[1]) + else: + raise ValueError("Invalid size range") + except Exception as e: + print(e) + return [] + + json_string = get_nvme_list_verbose() + data = json.loads(json_string) + + pci_lst = [] + for device_entry in data.get('Devices', []): + for subsystem in device_entry.get('Subsystems', []): + for controller in subsystem.get('Controllers', []): + model_number = controller.get("ModelNumber") + if model_number != model: + continue + address = controller.get("Address") + if len(controller.get("Namespaces")) > 0: + size = controller.get("Namespaces")[0].get("PhysicalSize") + if size > size_from: + if size_to > 0 and size < size_to: + pci_lst.append(address) + return pci_lst + + +def query_nvme_ssd_by_namespace_names(nvme_names: Iterable[str]) -> List[str]: + """ + Match NVMe devices by namespace names (e.g. nvme0n1, nvme1n1) using nvme list -v JSON output. + Returns a de-duplicated list of PCI addresses (e.g. 0000:00:03.0). 
+ """ + nvme_names = list(nvme_names or []) + if not nvme_names: + print("No NVMe device names specified.") + return [] + + wanted = set(nvme_names) + + json_string = get_nvme_list_verbose() # should return the JSON string shown in your example + data = json.loads(json_string) + + out: List[str] = [] + seen = set() + + for dev in data.get("Devices", []): + for subsys in dev.get("Subsystems", []): + for ctrl in subsys.get("Controllers", []): + addr = ctrl.get("Address") + for ns in ctrl.get("Namespaces", []) or []: + ns_name = ns.get("NameSpace") # <-- exact key in your JSON + if ns_name in wanted and addr and addr not in seen: + seen.add(addr) + out.append(addr) + break + + return out + + +def claim_devices_to_nvme(config_path=""): + config_path = config_path or constants.NODES_CONFIG_FILE + nvme_devices_list = [] + try: + with open(config_path) as f: + cfg = json.load(f) + nvme_devices_list = [ + pci + for node in cfg.get("nodes", []) + for pci in node.get("ssd_pcis", []) + ] + for pci in nvme_devices_list: + pci_utils.ensure_driver(pci, 'nvme') + except Exception as e: + print(f"An unexpected error occurred: {e}") + return nvme_devices_list + + +def clean_devices(config_path, format, force): + nvme_devices_list = claim_devices_to_nvme(config_path) + try: + json_string = get_nvme_list_verbose() + data = json.loads(json_string) + controllers_list = [] + + # The structure is Devices[0] -> Subsystems[] -> Controllers[] + nvme_devices = "" + for device_entry in data.get('Devices', []): + for subsystem in device_entry.get('Subsystems', []): + for controller in subsystem.get('Controllers', []): + # 3. 
Pull out the desired fields + if len(controller.get("Namespaces")) > 0 and controller.get("Address") in nvme_devices_list: + controllers_list.append({ + "NVMe_Controller": controller.get("Controller"), + "PCI_Address": controller.get("Address"), + "NAMESPACE": controller.get("Namespaces")[0].get("NameSpace") + }) + nvme_devices += f"/dev/{controller.get('Namespaces')[0].get('NameSpace')} " + if format: + logger.warning(f"Formating Nvme devices {nvme_devices}") + if not force: + answer = input("Type YES/Y to continue: ").strip().lower() + if answer not in ("yes", "y"): + logger.warning("Aborted by user.") + exit(1) + + for mapping in controllers_list: + if mapping['PCI_Address'] in nvme_devices_list: + nvme_device_path = f"/dev/{mapping['NAMESPACE']}" + clean_partitions(nvme_device_path) + + except json.JSONDecodeError as e: + logger.error(f"Error decoding JSON: {e}") + + +def create_rpc_socket_mount(): + try: + + logger.info("create RPC socket mount") + mount_point = "/mnt/ramdisk" + size = "1G" + fstab_entry = f"tmpfs {mount_point} tmpfs size={size},mode=1777,noatime 0 0\n" + + # Create the mount point if it doesn't exist + os.makedirs(mount_point, exist_ok=True) + + # Add to /etc/fstab if not already present + with open("/etc/fstab", "r+") as fstab: + lines = fstab.readlines() + if not any(mount_point in line for line in lines): + fstab.write(fstab_entry) + print(f"Added fstab entry for {mount_point}") + else: + print(f"fstab entry for {mount_point} already exists") + + # Mount the RAM disk immediately + subprocess.run(["mount", mount_point], check=True) + + # Verify + subprocess.run(["df", "-h", mount_point]) + except Exception as e: + logger.error(e) diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 8e18fc276..cfaf79e15 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -4,7 +4,6 @@ import math import os from 
pathlib import Path -import subprocess import time from typing import List, Optional, Union @@ -19,6 +18,7 @@ from simplyblock_core import scripts, constants, shell_utils, utils as core_utils import simplyblock_core.utils.pci as pci_utils +import simplyblock_core.utils as init_utils from simplyblock_web import utils, node_utils logger = core_utils.get_logger(__name__) @@ -129,7 +129,7 @@ def scan_devices(): class SPDKParams(BaseModel): server_ip: str = Field(pattern=utils.IP_PATTERN) - rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=1, le=65536) + rpc_port: int = Field(constants.RPC_PORT_RANGE_START, ge=1, le=65536) rpc_username: str rpc_password: str ssd_pcie: Optional[List[str]] = Field(None) @@ -142,6 +142,9 @@ class SPDKParams(BaseModel): spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN) cluster_mode: str + socket: Optional[int] = Field(None, ge=0) + cluster_id: str + firewall_port: int = Field(constants.FW_PORT_START) @api.post('/spdk_process_start', responses={ @@ -154,7 +157,8 @@ def spdk_process_start(body: SPDKParams): ssd_pcie_list = " ".join(body.ssd_pcie) if body.ssd_pcie else "none" spdk_debug = '1' if body.spdk_debug else '' total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MiB') if body.total_mem else '' - spdk_mem_mib = core_utils.convert_size(body.spdk_mem, 'MiB') + # spdk_mem_mib = core_utils.convert_size(body.spdk_mem, 'MiB') + spdk_mem_mib = 0 node_docker = get_docker_client(timeout=60 * 3) for name in {f"/spdk_{body.rpc_port}", f"/spdk_proxy_{body.rpc_port}"}: @@ -180,24 +184,29 @@ def spdk_process_start(body: SPDKParams): f'/tmp/shm_{body.rpc_port}/:/dev/shm/', '/lib/modules/:/lib/modules/', '/var/lib/systemd/coredump/:/var/lib/systemd/coredump/', - '/sys:/sys'], + '/sys:/sys', + '/mnt/ramdisk:/mnt/ramdisk', + ], environment=[ f"RPC_PORT={body.rpc_port}", f"ssd_pcie={ssd_pcie_params}", 
f"PCI_ALLOWED={ssd_pcie_list}", f"TOTAL_HP={total_mem_mib}", + f"NSOCKET={body.socket}", + f"FW_PORT={body.firewall_port}", ] # restart_policy={"Name": "on-failure", "MaximumRetryCount": 99} ) node_docker.containers.run( constants.SIMPLY_BLOCK_DOCKER_IMAGE, - "python simplyblock_core/services/spdk_http_proxy_server.py", + "python simplyblock_core/services/spdk_http_proxy_server.py ", name=f"spdk_proxy_{body.rpc_port}", detach=True, network_mode="host", log_config=log_config, volumes=[ f'/var/tmp/spdk_{body.rpc_port}:/var/tmp', + '/mnt/ramdisk:/mnt/ramdisk', ], environment=[ f"SERVER_IP={body.server_ip}", @@ -508,8 +517,10 @@ def bind_device_to_nvme(body: utils.DeviceParams): def delete_gpt_partitions_for_dev(body: utils.DeviceParams): bind_device_to_nvme(body) device_name = pci_utils.nvme_device_name(body.device_pci) - subprocess.check_call(['parted', '-fs', f'/dev/{device_name}', 'mklabel' 'gpt']) - return utils.get_response(True) + cmd = f"parted -fs /dev/{device_name} mklabel gpt" + out, err, ret_code = shell_utils.run_command(cmd) + logger.info(f"out: {out}, err: {err}, ret_code: {ret_code}") + return utils.get_response(ret_code==0, error=err) CPU_INFO = cpuinfo.get_cpu_info() @@ -528,6 +539,13 @@ def delete_gpt_partitions_for_dev(body: utils.DeviceParams): SYSTEM_ID = CLOUD_INFO["id"] +@api.post('/format_device_with_4k') +def format_device_with_4k(body: utils.DeviceParams): + pci_utils.ensure_driver(body.device_pci, 'nvme') + init_utils.format_device_with_4k(body.device_pci) + return utils.get_response(True) + + @api.post('/bind_device_to_spdk') def bind_device_to_spdk(body: utils.DeviceParams): device_path = pci_utils.device(body.device_pci) @@ -697,3 +715,44 @@ def ifc_is_tcp(query: NicQuery): }) def is_alive(): return utils.get_response(True) + + +@api.post('/nvme_connect', + summary='Connect NVMe-oF target', + responses={ + 200: {'content': {'application/json': {'schema': utils.response_schema({ + 'type': 'boolean', + })}}, + }, +}) +def 
connect_to_nvme(body: utils.NVMEConnectParams): + """Connect to the indicated NVMe-oF target. + """ + st = f"nvme connect --transport=tcp --traddr={body.ip} --trsvcid={body.port} --nqn={body.nqn}" + logger.debug(st) + out, err, ret_code = shell_utils.run_command(st) + logger.debug(ret_code) + logger.debug(out) + logger.debug(err) + if ret_code == 0: + return utils.get_response(True) + else: + return utils.get_response(ret_code, error=err) + + +@api.post('/disconnect_nqn', + summary='Disconnect NVMe-oF device by NQN', + responses={ + 200: {'content': {'application/json': {'schema': utils.response_schema({ + 'type': 'integer', + })}}}, +}) +def disconnect_nqn(body: utils.DisconnectParams): + """Disconnect from indicated NVMe-oF target + """ + st = f"nvme disconnect --nqn={body.nqn}" + out, err, ret_code = shell_utils.run_command(st) + logger.debug(ret_code) + logger.debug(out) + logger.debug(err) + return utils.get_response(ret_code) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index be3193138..65ad28a3b 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -268,6 +268,9 @@ class SPDKParams(BaseModel): spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: str = Field(pattern=utils.IP_PATTERN) cluster_mode: str + socket: Optional[int] = Field(None, ge=0) + firewall_port: Optional[int] = Field(constants.FW_PORT_START) + cluster_id: str @api.post('/spdk_process_start', responses={ @@ -286,9 +289,10 @@ def spdk_process_start(body: SPDKParams): total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MB') if body.total_mem else "" - if _is_pod_up(body.rpc_port) or _is_pod_present(body.rpc_port): + first_six_cluster_id = core_utils.first_six_chars(body.cluster_id) + if _is_pod_up(body.rpc_port, first_six_cluster_id) or _is_pod_present(body.rpc_port, 
first_six_cluster_id): logger.info("SPDK pod found, removing...") - query = utils.RPCPortParams(rpc_port=body.rpc_port) + query = utils.RPCPortParams(rpc_port=body.rpc_port, cluster_id=body.cluster_id) spdk_process_kill(query) node_prepration_job_name = "snode-spdk-job-" @@ -336,8 +340,8 @@ def spdk_process_start(body: SPDKParams): "L_CORES": body.l_cores, "CORES": core_utils.get_total_cpu_cores(body.l_cores), 'SPDK_MEM': core_utils.convert_size(body.spdk_mem, 'MiB'), - 'MEM_GEGA': core_utils.convert_size(body.spdk_mem, 'GiB', round_up=True), - 'MEM2_GEGA': core_utils.convert_size(body.system_mem, 'GiB', round_up=True), + 'MEM_MEGA': (core_utils.convert_size(body.spdk_mem, 'MiB', round_up=True) // 2) * 2 + 512, + 'MEM2_MEGA': (core_utils.convert_size(body.system_mem, 'MiB', round_up=True) // 2) * 2, 'SERVER_IP': body.server_ip, 'RPC_PORT': body.rpc_port, 'RPC_USERNAME': body.rpc_username, @@ -351,9 +355,12 @@ def spdk_process_start(body: SPDKParams): 'SIMPLYBLOCK_DOCKER_IMAGE': constants.SIMPLY_BLOCK_DOCKER_IMAGE, 'GRAYLOG_SERVER_IP': body.cluster_ip, 'MODE': body.cluster_mode, + 'CLUSTER_ID': first_six_cluster_id, 'SSD_PCIE': ssd_pcie_params, 'PCI_ALLOWED': ssd_pcie_list, - 'TOTAL_HP': total_mem_mib + 'TOTAL_HP': total_mem_mib, + 'NSOCKET': body.socket, + 'FW_PORT': body.firewall_port } if ubuntu_host: @@ -420,9 +427,35 @@ def spdk_process_start(body: SPDKParams): logger.info(f"Job deleted: '{core_resp.metadata.name}' in namespace '{namespace}") elif core_isolate and openshift: + batch_v1 = core_utils.get_k8s_batch_client() + try: + batch_v1.read_namespaced_job( + name=node_prepration_core_name, + namespace=namespace + ) + logger.info(f"Existing Job '{node_prepration_core_name}' found — deleting it first...") + + batch_v1.delete_namespaced_job( + name=node_prepration_core_name, + namespace=namespace, + body=V1DeleteOptions( + propagation_policy='Foreground', + grace_period_seconds=0 + ) + ) + + node_utils_k8s.wait_for_job_deletion(node_prepration_core_name, 
namespace) + + logger.info(f"Old Job '{node_prepration_core_name}' fully deleted.") + + except ApiException as e: + if e.status == 404: + logger.info(f"No pre-existing Job '{node_prepration_core_name}' found. Proceeding.") + else: + raise + core_template = env.get_template('oc_storage_core_isolation.yaml.j2') core_yaml = yaml.safe_load(core_template.render(values)) - batch_v1 = core_utils.get_k8s_batch_client() core_resp = batch_v1.create_namespaced_job(namespace=namespace, body=core_yaml) msg = f"Job created: '{core_resp.metadata.name}' in namespace '{namespace}" logger.info(msg) @@ -463,7 +496,11 @@ def spdk_process_kill(query: utils.RPCPortParams): k8s_core_v1 = core_utils.get_k8s_core_client() try: namespace = node_utils_k8s.get_namespace() - pod_name = f"snode-spdk-pod-{query.rpc_port}" + if not query.cluster_id: + return utils.get_response(False, "param required: cluster_id") + + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace) retries = 10 while retries > 0: @@ -486,9 +523,9 @@ def spdk_process_kill(query: utils.RPCPortParams): return utils.get_response(True) -def _is_pod_up(rpc_port): +def _is_pod_up(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -502,9 +539,9 @@ def _is_pod_up(rpc_port): return False return False -def _is_pod_present(rpc_port): +def _is_pod_present(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -525,7 +562,11 @@ def _is_pod_present(rpc_port): })}}}, }) 
def spdk_process_is_up(query: utils.RPCPortParams): - if _is_pod_up(query.rpc_port): + if not query.cluster_id: + return utils.get_response(False, "param required: cluster_id") + + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + if _is_pod_up(query.rpc_port, first_six_cluster_id): return utils.get_response(True) else: return utils.get_response(False, "SPDK container is not running") @@ -602,10 +643,13 @@ def apply_config(): # Set Huge page memory huge_page_memory_dict: dict = {} for node_config in nodes: + hg_memory = node_config["huge_page_memory"] + if int(node_config["max_size"]) > 0: + hg_memory = max(hg_memory , node_config["max_size"]) numa = node_config["socket"] - huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + node_config["huge_page_memory"] + huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + hg_memory + 1000000000 for numa, huge_page_memory in huge_page_memory_dict.items(): - num_pages = huge_page_memory // (2048 * 1024) + num_pages = huge_page_memory // 2000000 core_utils.set_hugepages_if_needed(numa, num_pages) return utils.get_response(True) @@ -628,6 +672,7 @@ def is_alive(): def spdk_proxy_restart(query: utils.RPCPortParams): return utils.get_response(True) +api.post('/bind_device_to_nvme')(snode_ops.bind_device_to_nvme) api.post('/bind_device_to_spdk')(snode_ops.bind_device_to_spdk) @@ -635,3 +680,5 @@ def spdk_proxy_restart(query: utils.RPCPortParams): api.get('/ifc_is_roce')(snode_ops.ifc_is_roce) +api.post('/format_device_with_4k')(snode_ops.format_device_with_4k) + diff --git a/simplyblock_web/api/v1/__init__.py b/simplyblock_web/api/v1/__init__.py index 4bcc5ba41..6df2a2db5 100644 --- a/simplyblock_web/api/v1/__init__.py +++ b/simplyblock_web/api/v1/__init__.py @@ -1,9 +1,12 @@ import logging +import fdb +from flask import jsonify from flask import Flask from simplyblock_web.auth_middleware import token_required from simplyblock_web import utils +from simplyblock_core import constants 
from . import cluster from . import mgmt_node @@ -39,3 +42,24 @@ def before_request(): @api.route('/', methods=['GET']) def status(): return utils.get_response("Live") + +@api.route('/health/fdb', methods=['GET']) +def health_fdb(): + try: + fdb.api_version(constants.KVD_DB_VERSION) + + db = fdb.open(constants.KVD_DB_FILE_PATH) + tr = db.create_transaction() + + tr.get(b"\x00") + tr.commit().wait() + + return jsonify({ + "fdb_connected": True + }), 200 + + except Exception as e: + return jsonify({ + "fdb_connected": False, + "error": str(e) + }), 503 diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index 698d9582d..759cdbd31 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -47,6 +47,9 @@ def add_cluster(): qpair_count = cl_data.get('qpair_count', 256) name = cl_data.get('name', None) fabric = cl_data.get('fabric', "tcp") + cr_name = cl_data.get('cr_name', None) + cr_namespace = cl_data.get('cr_namespace', None) + cr_plural = cl_data.get('cr_plural', None) max_queue_size = cl_data.get('max_queue_size', 128) inflight_io_threshold = cl_data.get('inflight_io_threshold', 4) @@ -56,10 +59,62 @@ def add_cluster(): return utils.get_response(cluster_ops.add_cluster( blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, - qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric + qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, + cr_name, cr_namespace, cr_plural, fabric )) +@bp.route('/cluster/create_first', methods=['POST']) +def create_first_cluster(): + cl_data = request.get_json() + + if db.get_clusters(): + return utils.get_response_error("Cluster found!", 400) + + blk_size = 512 + if 'blk_size' in cl_data: + if cl_data['blk_size'] not in [512, 4096]: + return 
utils.get_response_error("blk_size can be 512 or 4096", 400) + else: + blk_size = cl_data['blk_size'] + page_size_in_blocks = cl_data.get('page_size_in_blocks', 2097152) + distr_ndcs = cl_data.get('distr_ndcs', 1) + distr_npcs = cl_data.get('distr_npcs', 1) + distr_bs = cl_data.get('distr_bs', 4096) + distr_chunk_bs = cl_data.get('distr_chunk_bs', 4096) + ha_type = cl_data.get('ha_type', 'ha') + enable_node_affinity = cl_data.get('enable_node_affinity', False) + qpair_count = cl_data.get('qpair_count', 256) + name = cl_data.get('name', None) + fabric = cl_data.get('fabric', "tcp") + cap_warn = cl_data.get('cap_warn', 0) + cap_crit = cl_data.get('cap_crit', 0) + prov_cap_warn = cl_data.get('prov_cap_warn', 0) + prov_cap_crit = cl_data.get('prov_cap_crit', 0) + max_queue_size = cl_data.get('max_queue_size', 128) + inflight_io_threshold = cl_data.get('inflight_io_threshold', 4) + strict_node_anti_affinity = cl_data.get('strict_node_anti_affinity', False) + is_single_node = cl_data.get('is_single_node', False) + cr_name = cl_data.get('cr_name', None) + cr_namespace = cl_data.get('cr_namespace', None) + cr_plural = cl_data.get('cr_plural', None) + cluster_ip = cl_data.get('cluster_ip', None) + grafana_secret = cl_data.get('grafana_secret', None) + + try: + cluster_id = cluster_ops.add_cluster( + blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, + distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, + qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, + cr_name, cr_namespace, cr_plural, fabric, cluster_ip=cluster_ip, grafana_secret=grafana_secret) + if cluster_id: + return utils.get_response(db.get_cluster_by_id(cluster_id).to_dict()) + else: + return utils.get_response(False, "Failed to create cluster", 400) + except Exception as e: + return utils.get_response(False, str(e), 404) + + @bp.route('/cluster', methods=['GET'], defaults={'uuid': None}) 
@bp.route('/cluster/', methods=['GET']) def list_clusters(uuid): @@ -227,6 +282,23 @@ def cluster_activate(uuid): # FIXME: Any failure within the thread are not handled return utils.get_response(True), 202 +@bp.route('/cluster/addreplication/', methods=['PUT']) +def cluster_add_replication(uuid): + req_data = request.get_json() + target_cluster_uuid = req_data.get("target_cluster_uuid", None) + replication_timeout = req_data.get("replication_timeout", 0) + target_pool_uuid = req_data.get("target_pool_uuid", None) + + try: + db.get_cluster_by_id(uuid) + except KeyError: + return utils.get_response_error(f"Cluster not found: {uuid}", 404) + + cluster_ops.add_replication(source_cl_id=uuid, target_cl_id=target_cluster_uuid, + timeout=replication_timeout, target_pool=target_pool_uuid) + return utils.get_response(True), 202 + + @bp.route('/cluster/allstats//history/', methods=['GET']) @bp.route('/cluster/allstats/', methods=['GET'], defaults={'history': None}) diff --git a/simplyblock_web/api/v1/lvol.py b/simplyblock_web/api/v1/lvol.py index dbb77f6a2..31443e6c8 100644 --- a/simplyblock_web/api/v1/lvol.py +++ b/simplyblock_web/api/v1/lvol.py @@ -158,6 +158,7 @@ def add_lvol(): ndcs = utils.get_value_or_default(cl_data, "ndcs", 0) npcs = utils.get_value_or_default(cl_data, "npcs", 0) fabric = utils.get_value_or_default(cl_data, "fabric", "tcp") + do_replicate = utils.get_value_or_default(cl_data, "do_replicate", False) ret, error = lvol_controller.add_lvol_ha( name=name, @@ -186,7 +187,8 @@ def add_lvol(): max_namespace_per_subsys=max_namespace_per_subsys, ndcs=ndcs, npcs=npcs, - fabric=fabric + fabric=fabric, + do_replicate=do_replicate ) return utils.get_response(ret, error, http_code=400) @@ -306,3 +308,24 @@ def inflate_lvol(uuid): ret = lvol_controller.inflate_lvol(uuid) return utils.get_response(ret) + +@bp.route('/lvol/replication_start/', methods=['PUT']) +def replication_start(uuid): + try: + db.get_lvol_by_id(uuid) + except KeyError as e: + return 
utils.get_response_error(str(e), 404) + + ret = lvol_controller.replication_trigger(uuid) + return utils.get_response(ret) + +@bp.route('/lvol/replication_stop/', methods=['PUT']) +def replication_stop(uuid): + try: + db.get_lvol_by_id(uuid) + except KeyError as e: + return utils.get_response_error(str(e), 404) + + ret = lvol_controller.replication_stop(uuid) + return utils.get_response(ret) + \ No newline at end of file diff --git a/simplyblock_web/api/v1/pool.py b/simplyblock_web/api/v1/pool.py index a24a9e9b7..3b4fe5f72 100644 --- a/simplyblock_web/api/v1/pool.py +++ b/simplyblock_web/api/v1/pool.py @@ -184,21 +184,10 @@ def pool_iostats(uuid, history): except KeyError: return utils.get_response_error(f"Pool not found: {uuid}", 404) - if history: - records_number = core_utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - out = db.get_pool_stats(pool, records_number) - records_count = 20 - new_records = core_utils.process_records(out, records_count) - + data = pool_controller.get_io_stats(uuid, history) ret = { "object_data": pool.get_clean_dict(), - "stats": new_records or [] + "stats": data or [] } return utils.get_response(ret) @@ -207,21 +196,13 @@ def pool_iostats(uuid, history): @bp.route('/pool/iostats-all-lvols/', methods=['GET']) def lvol_iostats(pool_uuid): try: - db.get_pool_by_id(pool_uuid) + pool = db.get_pool_by_id(pool_uuid) except KeyError: return utils.get_response_error(f"Pool not found: {pool_uuid}", 404) - ret = [] - for lvol in db.get_lvols_by_pool_id(pool_uuid): - - records_list = db.get_lvol_stats(lvol, limit=1) - - if records_list: - data = records_list[0].get_clean_dict() - else: - data = {} - ret.append({ - "object_data": lvol.get_clean_dict(), - "stats": data - }) + data = pool_controller.get_capacity(pool_uuid) + ret = { + "object_data": pool.get_clean_dict(), + "stats": data or [] + } return utils.get_response(ret) diff 
--git a/simplyblock_web/api/v1/storage_node.py b/simplyblock_web/api/v1/storage_node.py index b44313c11..b3f0925bf 100644 --- a/simplyblock_web/api/v1/storage_node.py +++ b/simplyblock_web/api/v1/storage_node.py @@ -249,6 +249,19 @@ def storage_node_add(): if 'iobuf_large_pool_count' in req_data: iobuf_large_pool_count = int(req_data['iobuf_large_pool_count']) + ha_jm_count = 3 + if 'ha_jm_count' in req_data: + ha_jm_count = int(req_data['ha_jm_count']) + + format_4k = False + param = req_data.get('format_4k') + if param: + if isinstance(param, bool): + format_4k = param + elif isinstance(param, str): + format_4k = param == "true" + + tasks_controller.add_node_add_task(cluster_id, { "cluster_id": cluster_id, "node_addr": node_addr, @@ -264,6 +277,8 @@ def storage_node_add(): "enable_test_device": enable_test_device, "namespace": namespace, "enable_ha_jm": not disable_ha_jm, + "ha_jm_count": ha_jm_count, + "format_4k": format_4k }) return utils.get_response(True) diff --git a/simplyblock_web/api/v2/__init__.py b/simplyblock_web/api/v2/__init__.py index ff8511e1c..c4c0168c7 100644 --- a/simplyblock_web/api/v2/__init__.py +++ b/simplyblock_web/api/v2/__init__.py @@ -10,6 +10,7 @@ from . import pool from . import snapshot from . import storage_node +from . 
import task from simplyblock_core.db_controller import DBController @@ -38,6 +39,9 @@ def _verify_api_token( cluster.instance_api.include_router(storage_node.api) +task.api.include_router(task.instance_api) + +cluster.instance_api.include_router(task.api) volume.api.include_router(volume.instance_api) pool.instance_api.include_router(volume.api) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 422766246..9447ff718 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -17,6 +17,11 @@ db = DBController() +class _ReplicationParams(BaseModel): + snapshot_replication_target_cluster: str + snapshot_replication_timeout: int = 0 + target_pool: Optional[str] = None + class _UpdateParams(BaseModel): management_image: Optional[str] spdk_image: Optional[str] @@ -24,7 +29,7 @@ class _UpdateParams(BaseModel): class ClusterParams(BaseModel): - name: Optional[str] = None + name: str = "" blk_size: Literal[512, 4096] = 512 page_size_in_blocks: int = Field(2097152, gt=0) cap_warn: util.Percent = 0 @@ -35,22 +40,30 @@ class ClusterParams(BaseModel): distr_npcs: int = 1 distr_bs: int = 4096 distr_chunk_bs: int = 4096 - ha_type: Literal['single', 'ha'] = 'single' + ha_type: Literal['single', 'ha'] = 'ha' qpair_count: int = 256 max_queue_size: int = 128 inflight_io_threshold: int = 4 enable_node_affinity: bool = False strict_node_anti_affinity: bool = False - + is_single_node: bool = False + fabric: str = "tcp" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" + cluster_ip: str = "" + grafana_secret: str = "" @api.get('/', name='clusters:list') def list() -> List[ClusterDTO]: - return [ - ClusterDTO.from_model(cluster) - for cluster - in db.get_clusters() - ] - + data = [] + for cluster in db.get_clusters(): + stat_obj = None + ret = db.get_cluster_capacity(cluster, 1) + if ret: + stat_obj = ret[0] + data.append(ClusterDTO.from_model(cluster, stat_obj)) + return data @api.post('/', 
name='clusters:create', status_code=201, responses={201: {"content": None}}) def add(request: Request, parameters: ClusterParams): @@ -58,8 +71,8 @@ def add(request: Request, parameters: ClusterParams): if not cluster_id_or_false: raise ValueError('Failed to create cluster') - entity_url = request.app.url_path_for('get', cluster_id=cluster_id_or_false) - return Response(status_code=201, headers={'Location': entity_url}) + cluster = db.get_cluster_by_id(cluster_id_or_false) + return ClusterDTO.from_model(cluster) instance_api = APIRouter(prefix='/{cluster_id}') @@ -77,7 +90,11 @@ def _lookup_cluster(cluster_id: UUID): @instance_api.get('/', name='clusters:detail') def get(cluster: Cluster) -> ClusterDTO: - return ClusterDTO.from_model(cluster) + stat_obj = None + ret = db.get_cluster_capacity(cluster, 1) + if ret: + stat_obj = ret[0] + return ClusterDTO.from_model(cluster, stat_obj) class UpdatableClusterParameters(BaseModel): @@ -154,6 +171,23 @@ def activate(cluster: Cluster) -> Response: ).start() return Response(status_code=202) # FIXME: Provide URL for checking task status +@instance_api.post('/addreplication', name='clusters:addreplication', status_code=202, responses={202: {"content": None}}) +def cluster_add_replication(cluster: Cluster, parameters: _ReplicationParams) -> Response: + cluster_ops.add_replication( + source_cl_id=cluster.get_id(), + target_cl_id=parameters.snapshot_replication_target_cluster, + timeout=parameters.snapshot_replication_timeout, + target_pool=parameters.target_pool + ) + return Response(status_code=202) + +@instance_api.post('/expand', name='clusters:expand', status_code=202, responses={202: {"content": None}}) +def expand(cluster: Cluster) -> Response: + Thread( + target=cluster_ops.cluster_expand, + args=(cluster.get_id(),), + ).start() + return Response(status_code=202) # FIXME: Provide URL for checking task status @instance_api.post('/update', name='clusters:upgrade', status_code=204, responses={204: {"content": None}}) def 
update_cluster( cluster: Cluster, parameters: _UpdateParams) -> Response: diff --git a/simplyblock_web/api/v2/device.py b/simplyblock_web/api/v2/device.py index 1c7b40d7e..b0015b69b 100644 --- a/simplyblock_web/api/v2/device.py +++ b/simplyblock_web/api/v2/device.py @@ -18,10 +18,14 @@ @api.get('/', name='clusters:storage_nodes:devices:list') def list(cluster: Cluster, storage_node: StorageNode) -> List[DeviceDTO]: - return [ - DeviceDTO.from_model(device) - for device in storage_node.nvme_devices - ] + data = [] + for device in storage_node.nvme_devices: + stat_obj = None + ret = db.get_device_stats(device, 1) + if ret: + stat_obj = ret[0] + data.append(DeviceDTO.from_model(device, stat_obj)) + return data instance_api = APIRouter(prefix='/{device_id}') @@ -38,16 +42,26 @@ def _lookup_device(storage_node: StorageNode, device_id: UUID) -> NVMeDevice: @instance_api.get('/', name='clusters:storage_nodes:devices:detail') def get(cluster: Cluster, storage_node: StorageNode, device: Device) -> DeviceDTO: - return DeviceDTO.from_model(device) + stat_obj = None + ret = db.get_device_stats(device, 1) + if ret: + stat_obj = ret[0] + return DeviceDTO.from_model(device, stat_obj) -@instance_api.delete('/', name='clusters:storage_nodes:devices:delete', status_code=204, responses={204: {"content": None}}) -def delete(cluster: Cluster, storage_node: StorageNode, device: Device) -> Response: - if not device_controller.device_remove(device.get_id()): +@instance_api.post('/remove', name='clusters:storage_nodes:devices:remove', status_code=204, responses={204: {"content": None}}) +def remove(cluster: Cluster, storage_node: StorageNode, device: Device, force: bool = False) -> Response: + if not device_controller.device_remove(device.get_id(), force): raise ValueError('Failed to remove device') return Response(status_code=204) +@instance_api.post('/restart', name='clusters:storage_nodes:devices:restart', status_code=204, responses={204: {"content": None}}) +def restart(cluster: 
Cluster, storage_node: StorageNode, device: Device, force: bool = False) -> Response: + if not device_controller.restart_device(device.get_id(), force): + raise ValueError('Failed to restart device') + + return Response(status_code=204) @instance_api.get('/capacity', name='clusters:storage_nodes:devices:capacity') def capacity( diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 54c1b5b01..ca29bfae0 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -12,19 +12,40 @@ from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot +from simplyblock_core.models.stats import StatsObject from simplyblock_core.models.storage_node import StorageNode from . import util +class CapacityStatDTO(BaseModel): + date: int + size_total: int + size_prov: int + size_used: int + size_free: int + size_util: int + + @staticmethod + def from_model(model: StatsObject): + return CapacityStatDTO( + date=model.date, + size_total=model.size_total, + size_prov=model.size_prov, + size_used=model.size_used, + size_free=model.size_free, + size_util=model.size_util, + ) + + + class ClusterDTO(BaseModel): id: UUID name: Optional[str] nqn: str status: Literal['active', 'read_only', 'inactive', 'suspended', 'degraded', 'unready', 'in_activation', 'in_expansion'] - rebalancing: bool + is_re_balancing: bool block_size: util.Unsigned - coding: Tuple[util.Unsigned, util.Unsigned] ha: bool utliziation_critical: util.Percent utilization_warning: util.Percent @@ -33,17 +54,21 @@ class ClusterDTO(BaseModel): node_affinity: bool anti_affinity: bool secret: str + distr_ndcs: int + distr_npcs: int + capacity: CapacityStatDTO @staticmethod - def from_model(model: Cluster): + def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None): return ClusterDTO( id=UUID(model.get_id()), name=model.cluster_name, nqn=model.nqn, status=model.status, # type: 
ignore - rebalancing=model.is_re_balancing, + is_re_balancing=model.is_re_balancing, block_size=model.blk_size, - coding=(model.distr_ndcs, model.distr_npcs), + distr_ndcs=model.distr_ndcs, + distr_npcs=model.distr_npcs, ha=model.ha_type == 'ha', utilization_warning=model.cap_warn, utliziation_critical=model.cap_crit, @@ -52,6 +77,7 @@ def from_model(model: Cluster): node_affinity=model.enable_node_affinity, anti_affinity=model.strict_node_anti_affinity, secret=model.secret, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -65,9 +91,10 @@ class DeviceDTO(BaseModel): nvmf_ips: List[IPv4Address] nvmf_nqn: str = "" nvmf_port: int = 0 + capacity: CapacityStatDTO @staticmethod - def from_model(model: NVMeDevice): + def from_model(model: NVMeDevice, stat_obj: Optional[StatsObject]=None): return DeviceDTO( id=UUID(model.get_id()), status=model.status, @@ -78,6 +105,7 @@ def from_model(model: NVMeDevice): nvmf_ips=[IPv4Address(ip) for ip in model.nvmf_ip.split(',')], nvmf_nqn=model.nvmf_nqn, nvmf_port=model.nvmf_port, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -107,9 +135,10 @@ class StoragePoolDTO(BaseModel): max_rw_mbytes: util.Unsigned max_r_mbytes: util.Unsigned max_w_mbytes: util.Unsigned + capacity: CapacityStatDTO @staticmethod - def from_model(model: Pool): + def from_model(model: Pool, stat_obj: Optional[StatsObject]=None): return StoragePoolDTO( id=UUID(model.get_id()), name=model.pool_name, @@ -120,6 +149,7 @@ def from_model(model: Pool): max_rw_mbytes=model.max_rw_mbytes_per_sec, max_r_mbytes=model.max_r_mbytes_per_sec, max_w_mbytes=model.max_w_mbytes_per_sec, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -153,14 +183,34 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id class StorageNodeDTO(BaseModel): id: UUID status: str - ip: IPv4Address + hostname: str + cpu: int + spdk_mem: int + lvols: int + rpc_port: int + 
lvol_subsys_port: int + nvmf_port: int + mgmt_ip: IPv4Address + health_check: bool + online_devices: str + capacity: CapacityStatDTO @staticmethod - def from_model(model: StorageNode): + def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None): return StorageNodeDTO( id=UUID(model.get_id()), status=model.status, - ip=IPv4Address(model.mgmt_ip), + hostname=model.hostname, + cpu=model.cpu, + spdk_mem=model.spdk_mem, + lvols=model.lvols, + rpc_port=model.rpc_port, + lvol_subsys_port=model.lvol_subsys_port, + nvmf_port=model.nvmf_port, + mgmt_ip=IPv4Address(model.mgmt_ip), + health_check=model.health_check, + online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}", + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -177,7 +227,7 @@ class TaskDTO(BaseModel): @staticmethod def from_model(model: JobSchedule): return TaskDTO( - id=UUID(model.get_id()), + id=UUID(model.uuid), status=model.status, canceled=model.canceled, function_name=model.function_name, @@ -194,25 +244,44 @@ class VolumeDTO(BaseModel): status: str health_check: bool nqn: str + hostname: str + fabric: str nodes: List[util.UrlPath] port: util.Port size: util.Unsigned + ndcs: int + npcs: int + pool_uuid: str + pool_name: str + pvc_name: str = "" + snapshot_name: str = "" + blobid: int + ns_id: int cloned_from: Optional[util.UrlPath] crypto_key: Optional[Tuple[str, str]] high_availability: bool + lvol_priority_class: util.Unsigned + do_replicate: bool = False + max_namespace_per_subsys: int max_rw_iops: util.Unsigned max_rw_mbytes: util.Unsigned max_r_mbytes: util.Unsigned max_w_mbytes: util.Unsigned + capacity: CapacityStatDTO + rep_info: Optional[dict] = None + from_source: bool = True + @staticmethod - def from_model(model: LVol, request: Request, cluster_id: str): + def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None, rep_info=None): return VolumeDTO( 
id=UUID(model.get_id()), name=model.lvol_name, status=model.status, health_check=model.health_check, nqn=model.nqn, + hostname=model.hostname, + fabric=model.fabric, nodes=[ str(request.url_for( 'clusters:storage-nodes:detail', @@ -235,8 +304,22 @@ def from_model(model: LVol, request: Request, cluster_id: str): else None ), high_availability=model.ha_type == 'ha', + pool_uuid=model.pool_uuid, + pool_name=model.pool_name, + pvc_name=model.pvc_name, + snapshot_name=model.snapshot_name, + ndcs=model.ndcs, + npcs=model.npcs, + blobid=model.blobid, + ns_id=model.ns_id, + lvol_priority_class=model.lvol_priority_class, + do_replicate=model.do_replicate, + max_namespace_per_subsys=model.max_namespace_per_subsys, max_rw_iops=model.rw_ios_per_sec, max_rw_mbytes=model.rw_mbytes_per_sec, max_r_mbytes=model.r_mbytes_per_sec, max_w_mbytes=model.w_mbytes_per_sec, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), + rep_info=rep_info, + from_source=model.from_source ) diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py index c779f70ca..4bc201a28 100644 --- a/simplyblock_web/api/v2/pool.py +++ b/simplyblock_web/api/v2/pool.py @@ -20,12 +20,15 @@ @api.get('/', name='clusters:storage-pools:list') def list(cluster: Cluster) -> List[StoragePoolDTO]: - return [ - StoragePoolDTO.from_model(pool) - for pool - in db.get_pools() - if pool.cluster_id == cluster.get_id() - ] + data = [] + for pool in db.get_pools(): + if pool.cluster_id == cluster.get_id(): + stat_obj = None + ret = db.get_pool_stats(pool, 1) + if ret: + stat_obj = ret[0] + data.append(StoragePoolDTO.from_model(pool, stat_obj)) + return data class StoragePoolParams(BaseModel): @@ -36,6 +39,9 @@ class StoragePoolParams(BaseModel): max_rw_mbytes: util.Unsigned = 0 max_r_mbytes: util.Unsigned = 0 max_w_mbytes: util.Unsigned = 0 + cr_name: str + cr_namespace: str + cr_plural: str @api.post('/', name='clusters:storage-pools:create', status_code=201, responses={201: {"content": 
None}}) @@ -49,14 +55,13 @@ def add(request: Request, cluster: Cluster, parameters: StoragePoolParams) -> Re id_or_false = pool_controller.add_pool( parameters.name, parameters.pool_max, parameters.volume_max_size, parameters.max_rw_iops, parameters.max_rw_mbytes, - parameters.max_r_mbytes, parameters.max_w_mbytes, cluster.get_id() + parameters.max_r_mbytes, parameters.max_w_mbytes, cluster.get_id(), parameters.cr_name, parameters.cr_namespace, parameters.cr_plural ) if not id_or_false: raise ValueError('Failed to create pool') - - entity_url = request.app.url_path_for('clusters:storage-pools:detail', cluster_id=cluster.get_id(), pool_id=id_or_false) - return Response(status_code=201, headers={'Location': entity_url}) + pool = db.get_pool_by_id(id_or_false) + return pool.to_dict() instance_api = APIRouter(prefix='/{pool_id}') @@ -74,7 +79,11 @@ def _lookup_storage_pool(pool_id: UUID) -> PoolModel: @instance_api.get('/', name='clusters:storage-pools:detail') def get(cluster: Cluster, pool: StoragePool) -> StoragePoolDTO: - return StoragePoolDTO.from_model(pool) + stat_obj = None + ret = db.get_pool_stats(pool, 1) + if ret: + stat_obj = ret[0] + return StoragePoolDTO.from_model(pool, stat_obj) @instance_api.delete('/', name='clusters:storage-pools:delete', status_code=204, responses={204: {"content": None}}) @@ -96,6 +105,9 @@ class UpdatableStoragePoolParams(BaseModel): max_rw_mbytes: Optional[util.Unsigned] = None max_r_mbytes: Optional[util.Unsigned] = None max_w_mbytes: Optional[util.Unsigned] = None + lvols_cr_name: Optional[str] = None + lvols_cr_namespace: Optional[str] = None + lvols_cr_plural: Optional[str] = None @instance_api.put('/', name='clusters:storage-pools:update', status_code=204, responses={204: {"content": None}}) @@ -122,5 +134,5 @@ def update(cluster: Cluster, pool: StoragePool, parameters: UpdatableStoragePool @instance_api.get('/iostats', name='clusters:storage-pools:iostats') def iostats(cluster: Cluster, pool: StoragePool, limit: int = 20): 
- records = db.get_pool_stats(pool, limit) - return core_utils.process_records(records, 20) + data = pool_controller.get_io_stats(pool.get_id(), history="") + return core_utils.process_records(data, 20) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index f93fa5250..7d27ecc5e 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -22,32 +22,40 @@ @api.get('/', name='clusters:storage-nodes:list') def list(cluster: Cluster) -> List[StorageNodeDTO]: - return [ - StorageNodeDTO.from_model(storage_node) - for storage_node - in db.get_storage_nodes_by_cluster_id(cluster.get_id()) - ] + data = [] + for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + node_stat_obj = None + ret = db.get_node_capacity(storage_node, 1) + if ret: + node_stat_obj = ret[0] + data.append(StorageNodeDTO.from_model(storage_node, node_stat_obj)) + return data class StorageNodeParams(BaseModel): node_address: Annotated[str, Field(web_utils.IP_PATTERN)] interface_name: str - max_snapshots: int = Field(500) - ha_jm: bool = Field(True) - test_device: bool = Field(False) - spdk_image: Optional[str] + max_snapshots: Optional[int] = Field(500) + ha_jm: Optional[bool] = Field(True) + test_device: Optional[bool] = Field(False) + spdk_image: Optional[str] = Field("") spdk_debug: bool = Field(False) - full_page_unmap: bool = Field(False) data_nics: List[str] = Field([]) namespace: str = Field('default') + id_device_by_nqn: Optional[bool] = Field(False) jm_percent: util.Percent = Field(3) partitions: int = Field(1) iobuf_small_pool_count: int = Field(0) iobuf_large_pool_count: int = Field(0) + cr_name: str + cr_namespace: str + cr_plural: str + ha_jm_count: int = Field(3) + format_4k: bool = Field(False) @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}}) -def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Response: 
+def add(request: Request, cluster: Cluster, parameters: StorageNodeParams): task_id_or_false = tasks_controller.add_node_add_task( cluster.get_id(), { @@ -65,14 +73,17 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re 'enable_test_device': parameters.test_device, 'namespace': parameters.namespace, 'enable_ha_jm': parameters.ha_jm, - 'full_page_unmap': parameters.full_page_unmap, + 'id_device_by_nqn': parameters.id_device_by_nqn, + 'cr_name': parameters.cr_name, + 'cr_namespace': parameters.cr_namespace, + 'cr_plural': parameters.cr_plural, + "ha_jm_count": parameters.ha_jm_count, + "format_4k": parameters.format_4k, } ) if not task_id_or_false: raise ValueError('Failed to create add-node task') - - task_url = request.app.url_path_for('clusters:storage-nodes:detail', cluster_id=cluster.get_id(), task_id=task_id_or_false) - return Response(status_code=201, headers={'Location': task_url}) + return task_id_or_false instance_api = APIRouter(prefix='/{storage_node_id}') @@ -90,18 +101,29 @@ def _lookup_storage_node(storage_node_id: UUID) -> StorageNodeModel: @instance_api.get('/', name='clusters:storage-nodes:detail') def get(cluster: Cluster, storage_node: StorageNode): - return StorageNodeDTO.from_model(storage_node) + node_stat_obj = None + ret = db.get_node_capacity(storage_node, 1) + if ret: + node_stat_obj = ret[0] + return StorageNodeDTO.from_model(storage_node, node_stat_obj) @instance_api.delete('/', name='clusters:storage-nodes:delete') def delete( - cluster: Cluster, storage_node: StorageNode, force_remove: bool = False, force_migrate: bool = False) -> Response: + cluster: Cluster, storage_node: StorageNode, force_remove: bool = False, force_migrate: bool = False, force_delete: bool = False ) -> Response: none_or_false = storage_node_ops.remove_storage_node( storage_node.get_id(), force_remove=force_remove, force_migrate=force_migrate ) if none_or_false == False: # noqa raise ValueError('Failed to remove storage node') + if 
force_delete: + none_or_false = storage_node_ops.delete_storage_node( + storage_node.get_id(), force=force_delete + ) + if none_or_false == False: # noqa + raise ValueError('Failed to delete storage node') + return Response(status_code=204) @@ -198,17 +220,20 @@ def shutdown(cluster: Cluster, storage_node: StorageNode, force: bool = False) - class _RestartParams(BaseModel): force: bool = False reattach_volume: bool = False + node_address: Optional[Annotated[str, Field(pattern=web_utils.IP_PATTERN)]] = None + @instance_api.post('/start', name='clusters:storage-nodes:start', status_code=202, responses={202: {"content": None}}) # Same as restart for now @instance_api.post('/restart', name='clusters:storage-nodes:restart', status_code=202, responses={202: {"content": None}}) -def restart(cluster: Cluster, storage_node: StorageNode, parameters: _RestartParams = _RestartParams()) -> Response: +def restart(cluster: Cluster, storage_node: StorageNode, parameters: _RestartParams) -> Response: storage_node = storage_node Thread( target=storage_node_ops.restart_storage_node, kwargs={ "node_id": storage_node.get_id(), "force": parameters.force, + "node_ip": parameters.node_address, "reattach_volume": parameters.reattach_volume, } ).start() diff --git a/simplyblock_web/api/v2/task.py b/simplyblock_web/api/v2/task.py index c17bec3b7..94ecccce3 100644 --- a/simplyblock_web/api/v2/task.py +++ b/simplyblock_web/api/v2/task.py @@ -5,7 +5,6 @@ from simplyblock_core.db_controller import DBController from simplyblock_core.models.job_schedule import JobSchedule -from simplyblock_core.controllers import tasks_controller from .cluster import Cluster from .dtos import TaskDTO @@ -16,13 +15,13 @@ @api.get('/', name='clusters:tasks:list') def list(cluster: Cluster) -> List[TaskDTO]: - return [ - TaskDTO.from_model(task) - for task - in tasks_controller.list_tasks(cluster.get_id()) - if task.cluster_id == cluster.get_id() - ] - + cluster_tasks = db.get_job_tasks(cluster.get_id(), limit=0) + 
data=[] + for t in cluster_tasks: + if t.function_name == JobSchedule.FN_DEV_MIG: + continue + data.append(t) + return [TaskDTO.from_model(task) for task in data] instance_api = APIRouter(prefix='/{task_id}') diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 698788718..ba342f071 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -11,7 +11,7 @@ from .cluster import Cluster from .pool import StoragePool -from .dtos import VolumeDTO, SnapshotDTO +from .dtos import VolumeDTO, SnapshotDTO, TaskDTO from . import util @@ -21,11 +21,14 @@ @api.get('/', name='clusters:storage-pools:volumes:list') def list(request: Request, cluster: Cluster, pool: StoragePool) -> List[VolumeDTO]: - return [ - VolumeDTO.from_model(lvol, request, cluster.get_id()) - for lvol - in db.get_lvols_by_pool_id(pool.get_id()) - ] + data = [] + for lvol in db.get_lvols_by_pool_id(pool.get_id()): + stat_obj = None + ret = db.get_lvol_stats(lvol, 1) + if ret: + stat_obj = ret[0] + data.append(VolumeDTO.from_model(lvol, request, cluster.get_id(), stat_obj)) + return data class _CreateParams(BaseModel): @@ -43,6 +46,10 @@ class _CreateParams(BaseModel): pvc_name: Optional[str] = None ndcs: util.Unsigned = 0 npcs: util.Unsigned = 0 + fabric: str = "tcp" + max_namespace_per_subsys: int = 1 + do_replicate: bool = False + replication_cluster_id: Optional[str] = None class _CloneParams(BaseModel): @@ -85,6 +92,11 @@ def add( pvc_name=data.pvc_name, ndcs=data.ndcs, npcs=data.npcs, + fabric=data.fabric, + max_namespace_per_subsys=data.max_namespace_per_subsys, + do_replicate=data.do_replicate, + replication_cluster_id=data.replication_cluster_id, + ) elif isinstance(data, _CloneParams): volume_id_or_false, error = snapshot_controller.clone( @@ -122,7 +134,12 @@ def _lookup_volume(volume_id: UUID) -> LVol: @instance_api.get('/', name='clusters:storage-pools:volumes:detail') def get(request: Request, cluster: Cluster, pool: StoragePool, 
volume: Volume) -> VolumeDTO: - return VolumeDTO.from_model(volume, request, cluster.get_id()) + stat_obj = None + ret = db.get_lvol_stats(volume, 1) + if ret: + stat_obj = ret[0] + rep_info = lvol_controller.get_replication_info(volume.get_id()) + return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj, rep_info) class UpdatableLVolParams(BaseModel): @@ -171,6 +188,26 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: return Response(status_code=204) +@instance_api.post('/replication_trigger', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) +def replication_trigger(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_trigger(volume.get_id()): + raise ValueError('Failed to start volume snapshot replication') + + return Response(status_code=204) + +@instance_api.post('/replication_start', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) +def replication_start(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_start(volume.get_id(), cluster.get_id()): + raise ValueError('Failed to start volume snapshot replication') + + return Response(status_code=204) + +@instance_api.post('/replication_stop', name='clusters:storage-pools:volumes:replication_stop', status_code=204, responses={204: {"content": None}}) +def replication_stop(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_stop(volume.get_id()): + raise ValueError('Failed to stop volume snapshot replication') + + return Response(status_code=204) @instance_api.get('/connect', name='clusters:storage-pools:volumes:connect') def connect(cluster: Cluster, pool: StoragePool, volume: Volume): @@ -232,3 +269,27 @@ def create_snapshot( cluster_id=cluster.get_id(), pool_id=pool.get_id(), 
snapshot_id=snapshot_id, ) return Response(status_code=201, headers={'Location': entity_url}) + + +@instance_api.post('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') +def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): + return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) + + +@instance_api.post('/replicate_lvol_on_source_cluster', name='clusters:storage-pools:volumes:replicate_lvol_on_source_cluster') +def replicate_lvol_on_source_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): + return lvol_controller.replicate_lvol_on_source_cluster(volume.get_id()) + + +@instance_api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') +def list_replication_tasks(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: + tasks = lvol_controller.list_replication_tasks(volume.get_id()) + return [TaskDTO.from_model(task) for task in tasks] + +@instance_api.get('/suspend', name='clusters:storage-pools:volumes:suspend') +def suspend(cluster: Cluster, pool: StoragePool, volume: Volume) -> bool: + return lvol_controller.suspend_lvol(volume.get_id()) + +@instance_api.get('/resume', name='clusters:storage-pools:volumes:resume') +def resume(cluster: Cluster, pool: StoragePool, volume: Volume) -> bool: + return lvol_controller.resume_lvol(volume.get_id()) diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py index 8a1a9e83a..87449cb64 100644 --- a/simplyblock_web/auth_middleware.py +++ b/simplyblock_web/auth_middleware.py @@ -34,6 +34,10 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType: # Skip authentication for Swagger UI if request.method == "GET" and request.path.startswith("/swagger"): return cast(ResponseType, f(*args, **kwargs)) + if request.method == "POST" and request.path.startswith("/cluster/create_first"): + return cast(ResponseType, f(*args, **kwargs)) + if request.method == "GET" and 
request.path.startswith("/health/fdb"): + return cast(ResponseType, f(*args, **kwargs)) cluster_id: str = "" cluster_secret: str = "" diff --git a/simplyblock_web/node_configure.py b/simplyblock_web/node_configure.py index 6b69ee347..ff5a2434d 100755 --- a/simplyblock_web/node_configure.py +++ b/simplyblock_web/node_configure.py @@ -1,54 +1,55 @@ -#!/usr/bin/env python -# encoding: utf-8 - -import argparse -import logging -import os -import sys -from typing import List, Optional, cast - -from kubernetes.client import ApiException, CoreV1Api - -from simplyblock_core import constants, utils -from simplyblock_core.storage_node_ops import ( - generate_automated_deployment_config, - upgrade_automated_deployment_config, -) -from simplyblock_cli.clibase import range_type -from simplyblock_web import node_utils_k8s - - -logger = logging.getLogger(__name__) -logger.setLevel(constants.LOG_LEVEL) - -POD_PREFIX: str = "snode-spdk-pod" - -def _is_pod_present_for_node() -> bool: - """ - Check if a pod with the specified prefix is already running on the current node. 
- - Returns: - bool: True if a matching pod is found, False otherwise - - Raises: - RuntimeError: If there's an error communicating with the Kubernetes API - """ - k8s_core_v1: CoreV1Api = cast(CoreV1Api, utils.get_k8s_core_client()) - namespace: str = node_utils_k8s.get_namespace() - node_name: Optional[str] = os.environ.get("HOSTNAME") - - if not node_name: +#!/usr/bin/env python +# encoding: utf-8 + +import argparse +import logging +import sys +from typing import List, Optional, cast + +from kubernetes.client import ApiException, CoreV1Api + +from simplyblock_core import constants, utils +from simplyblock_core.storage_node_ops import ( + generate_automated_deployment_config, + upgrade_automated_deployment_config, +) +from simplyblock_cli.clibase import range_type +from simplyblock_web import node_utils_k8s +import os +import subprocess + +logger = logging.getLogger(__name__) +logger.setLevel(constants.LOG_LEVEL) + +POD_PREFIX: str = "snode-spdk-pod" + + +def _is_pod_present_for_node() -> bool: + """ + Check if a pod with the specified prefix is already running on the current node. 
+ + Returns: + bool: True if a matching pod is found, False otherwise + + Raises: + RuntimeError: If there's an error communicating with the Kubernetes API + """ + k8s_core_v1: CoreV1Api = cast(CoreV1Api, utils.get_k8s_core_client()) + namespace: str = node_utils_k8s.get_namespace() + node_name: Optional[str] = os.environ.get("HOSTNAME") + + if not node_name: raise RuntimeError("HOSTNAME environment variable not set") try: resp = k8s_core_v1.list_namespaced_pod(namespace) for pod in resp.items: if ( - pod.metadata and - pod.metadata.name and - pod.spec and - pod.spec.node_name == node_name and - pod.metadata.name.startswith(POD_PREFIX) + pod.metadata and + pod.metadata.name and + pod.spec and + pod.spec.node_name == node_name and + pod.metadata.name.startswith(POD_PREFIX) ): return True except ApiException as e: @@ -66,7 +67,7 @@ def parse_arguments() -> argparse.Namespace: argparse.Namespace: Parsed command line arguments """ parser = argparse.ArgumentParser(description="Automated Deployment Configuration Script") - + # Define command line arguments parser.add_argument( '--max-lvol', @@ -121,16 +122,45 @@ def parse_arguments() -> argparse.Namespace: ) parser.add_argument( '--cores-percentage', - help='The percentage of cores to be used for spdk (0-100)', - type=range_type(0, 100), + help='The percentage of cores to be used for spdk (0-99)', + type=range_type(0, 99), dest='cores_percentage', required=False, default=0 ) - + parser.add_argument( + '--force', + help='Force format detected or passed nvme pci address to 4K and clean partitions', + action='store_true', + dest='force', + required=False + ) + parser.add_argument( + '--device-model', + help='NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together', + type=str, + default='', + dest='device_model', + required=False + ) + parser.add_argument( + '--size-range', + help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: 
--size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together', + type=str, + default='', + dest='size_range', + required=False + ) + parser.add_argument( + '--nvme-devices', + help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1...', + type=str, + default='', + dest='nvme_names', + required=False + ) return parser.parse_args() - def validate_arguments(args: argparse.Namespace) -> None: """ Validate the provided command line arguments. @@ -145,8 +175,7 @@ def validate_arguments(args: argparse.Namespace) -> None: if not args.max_lvol: raise argparse.ArgumentError(None, '--max-lvol is required') if not args.max_prov: - raise argparse.ArgumentError(None, '--max-size is required') - + args.max_prov=0 try: max_lvol = int(args.max_lvol) if max_lvol <= 0: @@ -156,15 +185,15 @@ def validate_arguments(args: argparse.Namespace) -> None: None, f"Invalid value for max-lvol '{args.max_lvol}': {str(e)}" ) - + if args.pci_allowed and args.pci_blocked: raise argparse.ArgumentError( None, "pci-allowed and pci-blocked cannot be both specified" ) - + max_prov = utils.parse_size(args.max_prov, assume_unit='G') - if max_prov <= 0: + if max_prov < 0: raise argparse.ArgumentError( None, f"Invalid storage size: {args.max_prov}. 
Must be a positive value with optional unit (e.g., 100G, 1T)" @@ -175,17 +204,19 @@ def main() -> None: """Main entry point for the node configuration script.""" try: args = parse_arguments() - + if args.upgrade: upgrade_automated_deployment_config() return - + + if not args.max_prov: + args.max_prov=0 validate_arguments(args) - + if _is_pod_present_for_node(): logger.info("Skipped generating automated deployment configuration — pod already present.") sys.exit(0) - + # Process socket configuration sockets_to_use: List[int] = [0] if args.sockets_to_use: @@ -196,7 +227,7 @@ def main() -> None: None, f"Invalid value for sockets-to-use '{args.sockets_to_use}': {str(e)}" ) - + nodes_per_socket: int = 1 if args.nodes_per_socket: try: @@ -208,16 +239,19 @@ def main() -> None: None, f"Invalid value for nodes-per-socket '{args.nodes_per_socket}': {str(e)}" ) - + # Process PCI device filters pci_allowed: List[str] = [] pci_blocked: List[str] = [] - + nvme_names: List[str] = [] + if args.pci_allowed: pci_allowed = [pci.strip() for pci in args.pci_allowed.split(',') if pci.strip()] if args.pci_blocked: pci_blocked = [pci.strip() for pci in args.pci_blocked.split(',') if pci.strip()] - + if args.nvme_names: + nvme_names = [nvme_name.strip() for nvme_name in args.nvme_names.split(',') if nvme_name.strip()] + # Generate the deployment configuration generate_automated_deployment_config( max_lvol=int(args.max_lvol), @@ -226,9 +260,36 @@ def main() -> None: sockets_to_use=sockets_to_use, pci_allowed=pci_allowed, pci_blocked=pci_blocked, - cores_percentage=args.cores_percentage + cores_percentage=args.cores_percentage, + force=args.force, + device_model=args.device_model, + size_range=args.size_range, + nvme_names=nvme_names, + k8s=True ) - + + logger.info("create RPC socket mount") + mount_point = "/mnt/ramdisk" + size = "1G" + fstab_entry = f"tmpfs {mount_point} tmpfs size={size},mode=1777,noatime 0 0\n" + + # 1️⃣ Create the mount point if it doesn't exist + 
os.makedirs(mount_point, exist_ok=True) + + # 2️⃣ Add to /etc/fstab if not already present + with open("/etc/fstab", "r+") as fstab: + lines = fstab.readlines() + if not any(mount_point in line for line in lines): + fstab.write(fstab_entry) + print(f"Added fstab entry for {mount_point}") + else: + print(f"fstab entry for {mount_point} already exists") + + # 3️⃣ Mount the RAM disk immediately + subprocess.run(["mount", mount_point], check=True) + + # 4️⃣ Verify + subprocess.run(["df", "-h", mount_point]) except argparse.ArgumentError as e: logger.error(f"Argument error: {e}") sys.exit(1) diff --git a/simplyblock_web/node_utils_k8s.py b/simplyblock_web/node_utils_k8s.py index 4626a89c9..b1440744d 100644 --- a/simplyblock_web/node_utils_k8s.py +++ b/simplyblock_web/node_utils_k8s.py @@ -5,6 +5,7 @@ import time from simplyblock_core.utils import get_k8s_batch_client +from kubernetes.client import ApiException node_name = os.environ.get("HOSTNAME") @@ -23,7 +24,7 @@ def get_namespace(): return out return default_namespace -def wait_for_job_completion(job_name, namespace, timeout=60): +def wait_for_job_completion(job_name, namespace, timeout=180): batch_v1 = get_k8s_batch_client() for _ in range(timeout): job = batch_v1.read_namespaced_job(job_name, namespace) @@ -33,3 +34,19 @@ def wait_for_job_completion(job_name, namespace, timeout=60): raise RuntimeError(f"Job '{job_name}' failed") time.sleep(3) raise TimeoutError(f"Timeout waiting for Job '{job_name}' to complete") + +def wait_for_job_deletion(job_name, namespace, timeout=60): + batch_v1 = get_k8s_batch_client() + + for _ in range(timeout): + try: + batch_v1.read_namespaced_job(job_name, namespace) + except ApiException as e: + if e.status == 404: + return True + else: + raise + + time.sleep(2) + + raise TimeoutError(f"Timeout waiting for Job '{job_name}' to be deleted") diff --git a/simplyblock_web/static/openapi.json b/simplyblock_web/static/openapi.json new file mode 100644 index 000000000..3e2a05130 --- 
/dev/null +++ b/simplyblock_web/static/openapi.json @@ -0,0 +1 @@ +{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/api/v2/clusters/":{"get":{"summary":"Clusters:List","operationId":"clusters_list_api_v2_clusters__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ClusterDTO"},"title":"Response Clusters List Api V2 Clusters Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Create","operationId":"clusters_create_api_v2_clusters__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/":{"get":{"summary":"Clusters:Detail","operationId":"clusters_detail_api_v2_clusters__cluster_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterDTO"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Update","operationId":"clusters_update_api_v2_clusters__cluster_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableClusterParameters"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Delete","operationId":"clusters_delete_api_v2_clusters__cluster_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/capacity":{"get":{"summary":"Clusters:Capacity","operationId":"clusters_capacity_api_v2_clusters__cluster_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/iostats":{"get":{"summary":"Clusters:Iostats","operationId":"clusters_iostats_api_v2_clusters__cluster_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/logs":{"get":{"summary":"Clusters:Logs","operationId":"clusters_logs_api_v2_clusters__cluster_id__logs_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":50,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/start":{"post":{"summary":"Clusters:Start","operationId":"clusters_start_api_v2_clusters__cluster_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/shutdown":{"post":{"summary":"Clusters:Shutdown","operationId":"clusters_shutdown_api_v2_clusters__cluster_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/activate":{"post":{"summary":"Clusters:Activate","operationId":"clusters_activate_api_v2_clusters__cluster_id__activate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/update":{"post":{"summary":"Clusters:Upgrade","operationId":"clusters_upgrade_api_v2_clusters__cluster_id__update_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_UpdateParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/":{"get":{"summary":"Clusters:Storage-Nodes:List","operationId":"clusters_storage_nodes_list_api_v2_clusters__cluster_id__storage_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StorageNodeDTO"},"title":"Response Clusters Storage Nodes List Api V2 Clusters Cluster Id Storage Nodes Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Nodes:Create","operationId":"clusters_storage_nodes_create_api_v2_clusters__cluster_id__storage_nodes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StorageNodeParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/":{"get":{"summary":"Clusters:Storage-Nodes:Detail","operationId":"clusters_storage_nodes_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node 
Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Nodes:Delete","operationId":"clusters_storage_nodes_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force_remove","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Remove"}},{"name":"force_migrate","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Migrate"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/capacity":{"get":{"summary":"Clusters:Storage-Nodes:Capacity","operationId":"clusters_storage_nodes_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Iostats","operationId":"clusters_storage_nodes_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics":{"get":{"summary":"Clusters:Storage-Nodes:Nics:List","operationId":"clusters_storage_nodes_nics_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics/{nic_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Nics:Iostats","operationId":"clusters_storage_nodes_nics_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics__nic_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"nic_id","in":"path","required":true,"schema":{"type":"string","title":"Nic Id"}},{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/suspend":{"post":{"summary":"Clusters:Storage-Nodes:Suspend","operationId":"clusters_storage_nodes_suspend_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__suspend_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/resume":{"post":{"summary":"Clusters:Storage-Nodes:Resume","operationId":"clusters_storage_nodes_resume_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__resume_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/shutdown":{"post":{"summary":"Clusters:Storage-Nodes:Shutdown","operationId":"clusters_storage_nodes_shutdown_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/restart":{"post":{"summary":"Clusters:Storage-Nodes:Restart","operationId":"clusters_storage_nodes_restart_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__restart_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/start":{"post":{"summary":"Clusters:Storage-Nodes:Start","operationId":"clusters_storage_nodes_start_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/":{"get":{"summary":"Clusters:Storage 
Nodes:Devices:List","operationId":"clusters_storage_nodes_devices_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/DeviceDTO"},"title":"Response Clusters Storage Nodes Devices List Api V2 Clusters Cluster Id Storage Nodes Storage Node Id Devices Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/":{"get":{"summary":"Clusters:Storage Nodes:Devices:Detail","operationId":"clusters_storage_nodes_devices_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/DeviceDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage 
Nodes:Devices:Delete","operationId":"clusters_storage_nodes_devices_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/capacity":{"get":{"summary":"Clusters:Storage Nodes:Devices:Capacity","operationId":"clusters_storage_nodes_devices_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/iostats":{"get":{"summary":"Clusters:Storage 
Nodes:Devices:Iostats","operationId":"clusters_storage_nodes_devices_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/reset":{"post":{"summary":"Clusters:Storage Nodes:Devices:Reset","operationId":"clusters_storage_nodes_devices_reset_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__reset_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/":{"get":{"summary":"Clusters:Storage-Pools:List","operationId":"clusters_storage_pools_list_api_v2_clusters__cluster_id__storage_pools__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StoragePoolDTO"},"title":"Response Clusters Storage Pools List Api V2 Clusters Cluster Id Storage Pools Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Create","operationId":"clusters_storage_pools_create_api_v2_clusters__cluster_id__storage_pools__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/":{"get":{"summary":"Clusters:Storage-Pools:Detail","operationId":"clusters_storage_pools_detail_api_v2_clusters__cluster_id__storage_pools__pool_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool 
Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Delete","operationId":"clusters_storage_pools_delete_api_v2_clusters__cluster_id__storage_pools__pool_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Update","operationId":"clusters_storage_pools_update_api_v2_clusters__cluster_id__storage_pools__pool_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableStoragePoolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Iostats","operationId":"clusters_storage_pools_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":20,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:List","operationId":"clusters_storage_pools_volumes_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/VolumeDTO"},"title":"Response Clusters Storage Pools Volumes List Api V2 Clusters Cluster Id Storage Pools Pool Id Volumes Get"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Create","operationId":"clusters_storage_pools_volumes_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RootModel_Union__CreateParams___CloneParams__"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Detail","operationId":"clusters_storage_pools_volumes_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VolumeDTO"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Volumes:Update","operationId":"clusters_storage_pools_volumes_update_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableLVolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Volumes:Delete","operationId":"clusters_storage_pools_volumes_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/inflate":{"post":{"summary":"Clusters:Storage-Pools:Volumes:Inflate","operationId":"clusters_storage_pools_volumes_inflate_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__inflate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/connect":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Connect","operationId":"clusters_storage_pools_volumes_connect_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__connect_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/capacity":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Capacity","operationId":"clusters_storage_pools_volumes_capacity_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Iostats","operationId":"clusters_storage_pools_volumes_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/snapshots":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:List","operationId":"clusters_storage_pools_volumes_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Volumes Snapshots List Api V2 Clusters Cluster Id Storage Pools Pool Id Volumes Volume Id Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:Create","operationId":"clusters_storage_pools_volumes_snapshots_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume 
Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_SnapshotParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:List","operationId":"clusters_storage_pools_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Snapshots List Api V2 Clusters Cluster Id Storage Pools Pool Id Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/{snapshot_id}/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:Detail","operationId":"clusters_storage_pools_snapshots_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SnapshotDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Snapshots:Delete","operationId":"clusters_storage_pools_snapshots_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/":{"get":{"summary":"Management Nodes:List","operationId":"management_nodes_list_api_v2_management_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ManagementNodeDTO"},"title":"Response Management Nodes List Api V2 Management Nodes Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/{management_node_id}/":{"get":{"summary":"Management 
Node:Detail","operationId":"management_node_detail_api_v2_management_nodes__management_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"management_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Management Node Id"}},{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ManagementNodeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"ClusterDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"nqn":{"type":"string","title":"Nqn"},"status":{"type":"string","enum":["active","read_only","inactive","suspended","degraded","unready","in_activation","in_expansion"],"title":"Status"},"rebalancing":{"type":"boolean","title":"Rebalancing"},"block_size":{"type":"integer","minimum":0.0,"title":"Block Size"},"coding":{"prefixItems":[{"type":"integer","minimum":0.0},{"type":"integer","minimum":0.0}],"type":"array","maxItems":2,"minItems":2,"title":"Coding"},"ha":{"type":"boolean","title":"Ha"},"utliziation_critical":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utliziation Critical"},"utilization_warning":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utilization Warning"},"provisioned_cacacity_critical":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Critical"},"provisioned_cacacity_warning":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Warning"},"node_affinity":{"type":"boolean","title":"Node Affinity"},"anti_affinity":{"type":"boolean","title":"Anti 
Affinity"},"secret":{"type":"string","title":"Secret"}},"type":"object","required":["id","name","nqn","status","rebalancing","block_size","coding","ha","utliziation_critical","utilization_warning","provisioned_cacacity_critical","provisioned_cacacity_warning","node_affinity","anti_affinity","secret"],"title":"ClusterDTO"},"ClusterParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"blk_size":{"type":"integer","enum":[512,4096],"title":"Blk Size","default":512},"page_size_in_blocks":{"type":"integer","exclusiveMinimum":0.0,"title":"Page Size In Blocks","default":2097152},"cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Warn","default":0},"cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Crit","default":0},"prov_cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Warn","default":0},"prov_cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Crit","default":0},"distr_ndcs":{"type":"integer","title":"Distr Ndcs","default":1},"distr_npcs":{"type":"integer","title":"Distr Npcs","default":1},"distr_bs":{"type":"integer","title":"Distr Bs","default":4096},"distr_chunk_bs":{"type":"integer","title":"Distr Chunk Bs","default":4096},"ha_type":{"type":"string","enum":["single","ha"],"title":"Ha Type","default":"single"},"qpair_count":{"type":"integer","title":"Qpair Count","default":256},"max_queue_size":{"type":"integer","title":"Max Queue Size","default":128},"inflight_io_threshold":{"type":"integer","title":"Inflight Io Threshold","default":4},"enable_node_affinity":{"type":"boolean","title":"Enable Node Affinity","default":false},"strict_node_anti_affinity":{"type":"boolean","title":"Strict Node Anti Affinity","default":false}},"type":"object","title":"ClusterParams"},"DeviceDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health 
Check"},"size":{"type":"integer","title":"Size"},"io_error":{"type":"boolean","title":"Io Error"},"is_partition":{"type":"boolean","title":"Is Partition"},"nvmf_ips":{"items":{"type":"string","format":"ipv4"},"type":"array","title":"Nvmf Ips"},"nvmf_nqn":{"type":"string","title":"Nvmf Nqn","default":""},"nvmf_port":{"type":"integer","title":"Nvmf Port","default":0}},"type":"object","required":["id","status","health_check","size","io_error","is_partition","nvmf_ips"],"title":"DeviceDTO"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ManagementNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"hostname":{"type":"string","title":"Hostname"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","hostname","ip"],"title":"ManagementNodeDTO"},"RootModel_Union__CreateParams___CloneParams__":{"anyOf":[{"$ref":"#/components/schemas/_CreateParams"},{"$ref":"#/components/schemas/_CloneParams"}],"title":"RootModel[Union[_CreateParams, _CloneParams]]"},"SnapshotDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"used_size":{"type":"integer","minimum":0.0,"title":"Used 
Size"},"lvol":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Lvol"}},"type":"object","required":["id","name","status","health_check","size","used_size","lvol"],"title":"SnapshotDTO"},"StorageNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","ip"],"title":"StorageNodeDTO"},"StorageNodeParams":{"properties":{"node_address":{"type":"string","title":"Node Address","default":"^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$"},"interface_name":{"type":"string","title":"Interface Name"},"max_snapshots":{"type":"integer","title":"Max Snapshots","default":500},"ha_jm":{"type":"boolean","title":"Ha Jm","default":true},"test_device":{"type":"boolean","title":"Test Device","default":false},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"spdk_debug":{"type":"boolean","title":"Spdk Debug","default":false},"full_page_unmap":{"type":"boolean","title":"Full Page Unmap","default":false},"data_nics":{"items":{"type":"string"},"type":"array","title":"Data Nics","default":[]},"namespace":{"type":"string","title":"Namespace","default":"default"},"jm_percent":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Jm Percent","default":3},"partitions":{"type":"integer","title":"Partitions","default":1},"iobuf_small_pool_count":{"type":"integer","title":"Iobuf Small Pool Count","default":0},"iobuf_large_pool_count":{"type":"integer","title":"Iobuf Large Pool Count","default":0},"ha_jm_count":{"type":"integer","title":"Ha Jm Count","default":3}},"type":"object","required":["interface_name","spdk_image"],"title":"StorageNodeParams"},"StoragePoolDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","enum":["active","inactive"],"title":"Status"},"max_size":{"type":"integer","minimum":0.0,"title":"Max 
Size"},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","max_size","volume_max_size","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"StoragePoolDTO"},"StoragePoolParams":{"properties":{"name":{"type":"string","title":"Name"},"pool_max":{"type":"integer","minimum":0.0,"title":"Pool Max","default":0},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size","default":0},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0}},"type":"object","required":["name"],"title":"StoragePoolParams"},"UpdatableClusterParameters":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"}},"type":"object","title":"UpdatableClusterParameters"},"UpdatableLVolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W 
Mbytes","default":0},"size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Size"}},"type":"object","title":"UpdatableLVolParams"},"UpdatableStoragePoolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Size"},"volume_max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Volume Max Size"},"max_rw_iops":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Iops"},"max_rw_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Mbytes"},"max_r_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max R Mbytes"},"max_w_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max W Mbytes"}},"type":"object","title":"UpdatableStoragePoolParams"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VolumeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"nqn":{"type":"string","title":"Nqn"},"nodes":{"items":{"type":"string"},"type":"array","title":"Nodes"},"port":{"type":"integer","exclusiveMaximum":65536.0,"minimum":0.0,"title":"Port"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"cloned_from":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cloned From"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"high_availability":{"type":"boolean","title":"High 
Availability"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","health_check","nqn","nodes","port","size","cloned_from","crypto_key","high_availability","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"VolumeDTO"},"_CloneParams":{"properties":{"name":{"type":"string","title":"Name"},"snapshot_id":{"anyOf":[{"type":"string","pattern":"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"},{"type":"null"}],"title":"Snapshot Id"},"size":{"type":"integer","minimum":0.0,"title":"Size","default":0}},"type":"object","required":["name","snapshot_id"],"title":"_CloneParams"},"_CreateParams":{"properties":{"name":{"type":"string","title":"Name"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"ha_type":{"anyOf":[{"type":"string","enum":["single","ha"]},{"type":"null"}],"title":"Ha Type"},"host_id":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Host Id"},"priority_class":{"type":"integer","enum":[0,1],"title":"Priority Class","default":0},"namespace":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Namespace"},"pvc_name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Pvc 
Name"},"ndcs":{"type":"integer","minimum":0.0,"title":"Ndcs","default":0},"npcs":{"type":"integer","minimum":0.0,"title":"Npcs","default":0}},"type":"object","required":["name","size"],"title":"_CreateParams"},"_RestartParams":{"properties":{"force":{"type":"boolean","title":"Force","default":false},"reattach_volume":{"type":"boolean","title":"Reattach Volume","default":false}},"type":"object","title":"_RestartParams"},"_SnapshotParams":{"properties":{"name":{"type":"string","title":"Name"}},"type":"object","required":["name"],"title":"_SnapshotParams"},"_UpdateParams":{"properties":{"management_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Management Image"},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"restart":{"type":"boolean","title":"Restart","default":false}},"type":"object","required":["management_image","spdk_image"],"title":"_UpdateParams"}},"securitySchemes":{"HTTPBearer":{"type":"http","scheme":"bearer"}}}} \ No newline at end of file diff --git a/simplyblock_web/templates/Untitled-1.j2 b/simplyblock_web/templates/Untitled-1.j2 new file mode 100644 index 000000000..e69de29bb diff --git a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 index 734d9c59e..85bfd0f7b 100644 --- a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 @@ -34,9 +34,18 @@ spec: - | set -e + MARKER="/var/simplyblock/.cpu_isolation_applied" + echo "--- Installing jq ---" apk add --no-cache jq + echo "--- Checking if node was already configured ---" + + if [[ -f "$MARKER" ]]; then + echo "[INFO] Node already configured. Skipping sleep and exiting..." 
+ exit 0 + fi + echo "--- Reading isolated cores from config ---" CONFIG_FILE="/var/simplyblock/sn_config_file" @@ -65,7 +74,7 @@ spec: operator: In values: - worker - - worker-isolated + - worker-isolated-{{ HOSTNAME }} nodeSelector: matchLabels: kubernetes.io/hostname: {{ HOSTNAME }} @@ -78,7 +87,7 @@ spec: metadata: name: worker-isolated-{{ HOSTNAME }} labels: - machineconfiguration.openshift.io/role: worker-isolated + machineconfiguration.openshift.io/role: worker-isolated-{{ HOSTNAME }} spec: kernelArguments: - "nohz_full=${ISOLATED_CORES}" @@ -93,11 +102,11 @@ spec: apiVersion: machineconfiguration.openshift.io/v1 kind: KubeletConfig metadata: - name: set-static-cpu-manager + name: set-static-cpu-manager-{{ HOSTNAME }} spec: machineConfigPoolSelector: matchLabels: - machineconfiguration.openshift.io/role: worker-isolated + machineconfiguration.openshift.io/role: worker-isolated-{{ HOSTNAME }} kubeletConfig: cpuManagerPolicy: static cpuManagerReconcilePeriod: 5s @@ -105,4 +114,8 @@ spec: echo "[INFO] Init setup and CPU isolation complete." - echo "--- Init setup complete ---" + echo "[INFO] Marking node as configured." + touch "$MARKER" + + echo "[INFO] Node is rebooting. Sleeping for 5 minutes to stop pipeline gracefully..." 
+ sleep 300 \ No newline at end of file diff --git a/simplyblock_web/templates/storage_core_isolation.yaml.j2 b/simplyblock_web/templates/storage_core_isolation.yaml.j2 index b6fafe2ee..30bbf8809 100644 --- a/simplyblock_web/templates/storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/storage_core_isolation.yaml.j2 @@ -91,7 +91,7 @@ spec: - name: etc mountPath: /etc - name: rootfs - mountPath: / + mountPath: /host - name: var-simplyblock mountPath: /var/simplyblock command: ["/bin/sh", "-c"] @@ -113,13 +113,13 @@ spec: apt update && apt install -y grep jq nvme-cli tuned ;; ubuntu) - apt update && apt install -y grep jq nvme-cli tuned - apt-get install -y linux-modules-extra-$(uname -r) + chroot /host apt update && chroot /host apt install -y grep jq nvme-cli tuned + chroot /host apt-get install -y linux-modules-extra-$(uname -r) ;; - centos|rhel|rocky|almalinux) + centos|rhel|rocky|almalinux|ol) export YUM_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) export DNF_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) - dnf install -y grep jq nvme-cli kernel-modules-extra tuned \ + chroot /host dnf install -y grep jq nvme-cli kernel-modules-extra tuned \ --setopt=tsflags=nocontexts,noscripts --setopt=install_weak_deps=False ;; *) @@ -204,7 +204,7 @@ spec: tuned-adm profile "$TUNED_PROFILE" case "$OS_ID" in centos|rhel|rocky|almalinux) - grubby --update-kernel=ALL --args="isolcpus=$ISOLATED_CORES nohz_full=$ISOLATED_CORES rcu_nocbs=$ISOLATED_CORES" + chroot /host grubby --update-kernel=ALL --args="isolcpus=$ISOLATED_CORES nohz_full=$ISOLATED_CORES rcu_nocbs=$ISOLATED_CORES" ;; *) echo "" diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index f10478c75..105ee1157 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -1,7 +1,7 @@ apiVersion: v1 
kind: Pod metadata: - name: snode-spdk-pod-{{ RPC_PORT }} + name: snode-spdk-pod-{{ RPC_PORT }}-{{ CLUSTER_ID }} namespace: {{ NAMESPACE }} labels: app: spdk-app-{{ RPC_PORT }} @@ -16,6 +16,7 @@ spec: nodeSelector: kubernetes.io/hostname: {{ HOSTNAME }} hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet tolerations: - effect: NoSchedule operator: Exists @@ -24,7 +25,8 @@ spec: volumes: - name: socket-dir emptyDir: - medium: "Memory" + medium: Memory + sizeLimit: 1Gi - name: host-sys hostPath: path: /sys @@ -57,6 +59,11 @@ spec: - name: dockercontainerlogdirectory hostPath: path: /var/log/pods + {% if MODE == "kubernetes" %} + - name: config + configMap: + name: simplyblock-fluent-bit-config + {% endif %} initContainers: - name: copy-script @@ -87,16 +94,10 @@ spec: value: "{{ TOTAL_HP }}" - name: RPC_PORT value: "{{ RPC_PORT }}" - - name: SPDKCSI_SECRET - valueFrom: - secretKeyRef: - name: simplyblock-csi-secret - key: secret.json - - name: CLUSTER_CONFIG - valueFrom: - configMapKeyRef: - name: simplyblock-csi-cm - key: config.json + - name: NSOCKET + value: "{{ NSOCKET }}" + - name: FW_PORT + value: "{{ FW_PORT }}" lifecycle: postStart: exec: @@ -105,7 +106,7 @@ spec: privileged: true volumeMounts: - name: socket-dir - mountPath: /var/tmp + mountPath: /mnt/ramdisk - name: host-sys mountPath: /sys - name: host-modules @@ -118,10 +119,10 @@ spec: mountPath: /etc/simplyblock resources: limits: - hugepages-2Mi: {{ MEM_GEGA }}Gi + hugepages-2Mi: {{ MEM_MEGA }}Mi cpu: {{ CORES }} requests: - hugepages-2Mi: {{ MEM_GEGA }}Gi + hugepages-2Mi: {{ MEM_MEGA }}Mi - name: spdk-proxy-container image: {{ SIMPLYBLOCK_DOCKER_IMAGE }} @@ -129,7 +130,7 @@ spec: command: ["python", "simplyblock_core/services/spdk_http_proxy_server.py"] volumeMounts: - name: socket-dir - mountPath: /var/tmp + mountPath: /mnt/ramdisk env: - name: SERVER_IP value: "{{ SERVER_IP }}" @@ -170,4 +171,19 @@ spec: - name: dockercontainerlogdirectory mountPath: /var/log/pods readOnly: true + {% elif MODE == 
"kubernetes" %} + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 + volumeMounts: + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ + resources: + requests: + cpu: "100m" + memory: "200Mi" + limits: + cpu: "400m" + memory: "2Gi" {% endif %} diff --git a/simplyblock_web/templates/storage_init_job.yaml.j2 b/simplyblock_web/templates/storage_init_job.yaml.j2 index 6432d4500..2d59571c7 100644 --- a/simplyblock_web/templates/storage_init_job.yaml.j2 +++ b/simplyblock_web/templates/storage_init_job.yaml.j2 @@ -17,18 +17,34 @@ spec: operator: Exists - effect: NoExecute operator: Exists + + volumes: + - name: etc-systemd + hostPath: + path: /etc/systemd/ + - name: host-proc + hostPath: + path: /proc containers: - name: init-setup image: simplyblock/ubuntu-tools:22.04 securityContext: privileged: true + volumeMounts: + - name: etc-systemd + mountPath: /etc/systemd/ + - name: host-proc + mountPath: /proc command: ["/bin/sh", "-c"] args: - | set -e echo "--- Starting init setup ---" - + + HUGEPAGES_BEFORE=$(grep HugePages_Total /proc/meminfo | awk '{print $2}') + echo "[INFO] Hugepages before: $HUGEPAGES_BEFORE" + NODE_IP=$(ip route get 1.1.1.1 | grep -oE 'src [0-9.]+' | awk '{print $2}') echo "Detected node IP: $NODE_IP" @@ -44,18 +60,63 @@ spec: OS_ID="$(cat /proc/version | awk '{print $3}' | awk -F'-' '{print $NF}')" if [ "$OS_ID" != "talos" ]; then - echo "--- Restarting kubelet ---" + + echo "--- Creating RAM disk systemd unit on host ---" + + + UNIT_PATH="/etc/systemd/system/var-mnt-ramdisk.mount" + + echo "Writing systemd unit to $UNIT_PATH" + + + cat < "$UNIT_PATH" + [Unit] + Description=1G RAM disk at /var/mnt/ramdisk + After=local-fs-pre.target + Before=local-fs.target + + [Mount] + What=tmpfs + Where=/var/mnt/ramdisk + Type=tmpfs + Options=size=1G,mode=1777 + + [Install] + WantedBy=local-fs.target + EOF + + echo "Starting RAM disk mounting." 
nsenter --target 1 --mount --uts --ipc --net --pid -- /bin/sh -c ' if command -v systemctl >/dev/null 2>&1; then - echo "Restarting kubelet..." - systemctl restart kubelet && echo "Kubelet restarted" || echo "Kubelet restart failed" + echo "Reloading systemd..." + systemctl daemon-reload || echo "systemd reload failed" + + echo "Enabling mount unit..." + systemctl enable var-mnt-ramdisk.mount || echo "enable failed" + + echo "Starting mount unit..." + systemctl start var-mnt-ramdisk.mount || echo "start failed (check logs or unit file)" else - echo "systemctl not found; skipping kubelet restart" + echo "systemctl not found; skipping RAM disk mounting" fi ' + + HUGEPAGES_AFTER=$(grep HugePages_Total /proc/meminfo | awk '{print $2}') + echo "[INFO] Hugepages after: $HUGEPAGES_AFTER" + + if [ "$HUGEPAGES_BEFORE" != "$HUGEPAGES_AFTER" ]; then + echo "[INFO] Hugepages changed, restarting kubelet..." + nsenter --target 1 --mount --uts --ipc --net --pid -- /bin/sh -c ' + if command -v systemctl >/dev/null 2>&1; then + systemctl restart kubelet && echo "Kubelet restarted" || echo "Kubelet restart failed" + fi + ' + else + echo "[INFO] Hugepages unchanged, skipping kubelet restart." + fi else echo "Talos detected - skipping nsenter and kubelet restart." - echo "Use 'talosctl service kubelet restart -n $NODE_IP' to restart the node kubelet" + echo "Use '\''talosctl service kubelet restart -n $NODE_IP'\'' to restart the node kubelet" fi echo "--- Init setup complete ---" diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py index b0d1795df..de72db274 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -148,7 +148,8 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): - rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) + rpc_port: int = Field(constants.RPC_PORT_RANGE_START, ge=0, le=65536) + cluster_id: Optional[str] class DeviceParams(BaseModel):