From 5e2ce252d67292368d0b7ae30b0e5cbb1adf3f78 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 3 Oct 2025 14:49:29 +0300 Subject: [PATCH 001/192] wip --- simplyblock_cli/cli-reference.yaml | 9 ++++++ simplyblock_core/rpc_client.py | 45 ++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 5d90a51bd..8dead88c4 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1267,6 +1267,15 @@ commands: help: "Name" dest: name type: str + - name: add-replication + help: Assigns the snapshot replication target cluster + arguments: + - name: "cluster_id" + help: "Cluster id" + dest: cluster_id + type: str + completer: _completer_get_cluster_list + - name: "volume" help: "Logical volume commands" aliases: diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 551099d0b..1657b038f 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -1176,3 +1176,48 @@ def bdev_distrib_check_inflight_io(self, jm_vuid): "jm_vuid": jm_vuid, } return self._request("bdev_distrib_check_inflight_io", params) + + def bdev_lvol_create_poller_group(self, cpu_mask): + params = { + "cpu_mask": cpu_mask, + } + return self._request("bdev_lvol_create_poller_group", params) + + def bdev_lvol_transfer(self, lvol_name, offset, cluster_batch, gateway, operation): + # --operation {migrate,replicate} + params = { + "lvol_name": lvol_name, + "offset": offset, + "cluster_batch": cluster_batch, + "gateway": gateway, + "operation": operation, + } + return self._request("bdev_lvol_transfer", params) + + def bdev_lvol_transfer_stat(self, lvol_name): + """ + example: + ./rpc.py bdev_lvol_transfer_stat lvs_raid0_lvol/snapshot_1 + { + "transfer_state": "No process", + "offset": 0 + } + transfer_state values: + - No process + - In progress + - Failed + - Done + """ + params = { + "lvol_name": lvol_name, + } + return 
self._request("bdev_lvol_transfer_stat", params) + + def bdev_lvol_convert(self, lvol_name): + """ + convert lvol to snapshot + """ + params = { + "lvol_name": lvol_name, + } + return self._request("bdev_lvol_convert", params) From 68486bf431472e674a487eeb113270cd87ae03c8 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 7 Oct 2025 16:41:30 +0300 Subject: [PATCH 002/192] wip 2 --- simplyblock_cli/cli-reference.yaml | 6 +- simplyblock_core/db_controller.py | 10 +- simplyblock_core/models/snapshot.py | 14 + .../scripts/docker-compose-swarm.yml | 14 + .../services/snapshot_replication.py | 242 ++++++++++++++++++ 5 files changed, 284 insertions(+), 2 deletions(-) create mode 100644 simplyblock_core/services/snapshot_replication.py diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 8dead88c4..e2d87cd61 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1275,7 +1275,11 @@ commands: dest: cluster_id type: str completer: _completer_get_cluster_list - + - name: "--timeout" + help: "Snapshot replication network timeout" + dest: timeout + type: int + default: "3600" - name: "volume" help: "Logical volume commands" aliases: diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 2dd873ae8..1ca8d88d1 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -13,7 +13,7 @@ from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice from simplyblock_core.models.pool import Pool from simplyblock_core.models.port_stat import PortStat -from simplyblock_core.models.snapshot import SnapShot +from simplyblock_core.models.snapshot import SnapShot, SnapshotReplication from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject, LVolStatObject, \ PoolStatObject, CachedLVolStatObject from simplyblock_core.models.storage_node import StorageNode @@ -298,3 +298,11 @@ def 
get_primary_storage_nodes_by_secondary_node_id(self, node_id) -> List[Storag if node.secondary_node_id == node_id and node.lvstore: nodes.append(node) return sorted(nodes, key=lambda x: x.create_dt) + + def get_snapshot_replication_tasks(self, cluster_id) -> List[StorageNode]: + ret = SnapshotReplication().read_from_db(self.kv_store) + out = [] + for n in ret: + if n.source_cluster_id == cluster_id: + out.append(n) + return sorted(out, key=lambda x: x.create_dt) diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index 5476df3c5..39dbb226a 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -28,3 +28,17 @@ class SnapShot(BaseModel): vuid: int = 0 deletion_status: str = "" status: str = "" + + +class SnapshotReplication(BaseModel): + + STATUS_NEW = 'new' + STATUS_IN_PROGRESS = 'in-progress' + STATUS_IN_DONE = 'done' + STATUS_IN_FAILED = 'failed' + + snapshot: SnapShot = None + source_cluster_id: str = "" + target_cluster_id: str = "" + status: str = "" + date: str = "" diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index dc710bfea..1e6025bcc 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -335,6 +335,20 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + SnapshotReplication: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/snapshot_replication.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py new file mode 100644 index 000000000..fad2479b9 --- /dev/null +++ 
b/simplyblock_core/services/snapshot_replication.py @@ -0,0 +1,242 @@ +# coding=utf-8 +import time +from datetime import datetime + + +from simplyblock_core import constants, db_controller, utils +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.controllers import health_controller, snapshot_events +from simplyblock_core.models.snapshot import SnapShot, SnapshotReplication +from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.rpc_client import RPCClient + +logger = utils.get_logger(__name__) + +utils.init_sentry_sdk(__name__) + + +def set_snapshot_health_check(snap, health_check_status): + snap = db.get_snapshot_by_id(snap.get_id()) + if snap.health_check == health_check_status: + return + snap.health_check = health_check_status + snap.updated_at = str(datetime.now()) + snap.write_to_db() + + +def process_snap_delete_finish(snap, leader_node): + logger.info(f"Snapshot deleted successfully, id: {snap.get_id()}") + + snode = db.get_storage_node_by_id(snap.lvol.node_id) + # 3-1 async delete snap bdev from primary + if snode.get_id() == leader_node.get_id(): + primary_node = snode + secondary_node = db.get_storage_node_by_id(snode.secondary_node_id) + else: + primary_node = db.get_storage_node_by_id(snode.secondary_node_id) + secondary_node = snode + + if primary_node.status == StorageNode.STATUS_ONLINE: + ret = primary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) + if not ret: + logger.error(f"Failed to delete snap from primary_node node: {primary_node.get_id()}") + + # 3-2 async delete lvol bdev from secondary + if secondary_node: + if secondary_node.status == StorageNode.STATUS_ONLINE: + ret = secondary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) + if not ret: + logger.error(f"Failed to delete lvol from sec node: {secondary_node.get_id()}") + # what to do here ? 
+ + snapshot_events.snapshot_delete(snap) + snap.remove(db.kv_store) + + +def process_snap_delete_try_again(snap): + snap = db.get_snapshot_by_id(snap.get_id()) + snap.deletion_status = "" + snap.write_to_db() + + +def set_snap_offline(snap): + sn = db.get_snapshot_by_id(snap.get_id()) + sn.deletion_status = "" + sn.status = SnapShot.STATUS_OFFLINE + sn.write_to_db() + + +# get DB controller +db = db_controller.DBController() + +logger.info("Starting snapshot replication service...") +while True: + + for cluster in db.get_clusters(): + + if cluster.status in [Cluster.STATUS_INACTIVE, Cluster.STATUS_UNREADY, Cluster.STATUS_IN_ACTIVATION]: + logger.warning(f"Cluster {cluster.get_id()} is in {cluster.status} state, skipping") + continue + + # for task in db.get_snapshot_replication_tasks(cluster.get_id()): + # if task.status == SnapshotReplication.STATUS_NEW: + # + # + for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + node_bdev_names = [] + node_lvols_nqns = {} + sec_node_bdev_names = {} + sec_node_lvols_nqns = {} + sec_node = None + + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + + rpc_client = RPCClient( + snode.mgmt_ip, snode.rpc_port, + snode.rpc_username, snode.rpc_password, timeout=3, retry=2) + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + node_bdev_names = [b['name'] for b in node_bdevs] + for bdev in node_bdevs: + if "aliases" in bdev and bdev["aliases"]: + node_bdev_names.extend(bdev['aliases']) + + ret = rpc_client.subsystem_list() + if ret: + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + + if snode.secondary_node_id: + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: + sec_rpc_client = RPCClient( + sec_node.mgmt_ip, sec_node.rpc_port, + sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) + ret = sec_rpc_client.get_bdevs() + if ret: + for bdev in ret: + 
sec_node_bdev_names[bdev['name']] = bdev + + ret = sec_rpc_client.subsystem_list() + if ret: + for sub in ret: + sec_node_lvols_nqns[sub['nqn']] = sub + + if snode.lvstore_status == "ready": + + for snap in db.get_snapshots_by_node_id(snode.get_id()): + if snap.status == SnapShot.STATUS_ONLINE: + + present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + set_snapshot_health_check(snap, present) + + elif snap.status == SnapShot.STATUS_IN_DELETION: + + # check leadership + leader_node = None + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN]: + ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = snode + + if not leader_node and sec_node: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = sec_node + + if not leader_node: + raise Exception("Failed to get leader node") + + if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): + + ret = leader_node.rpc_client().delete_lvol(snap.snap_bdev) + if not ret: + logger.error(f"Failed to delete snap from node: {snode.get_id()}") + continue + snap = db.get_snapshot_by_id(snap.get_id()) + snap.deletion_status = leader_node.get_id() + snap.write_to_db() + + time.sleep(3) + + try: + ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) + except Exception as e: + logger.error(e) + # timeout detected, check other node + break + + if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed + process_snap_delete_finish(snap, leader_node) + + elif ret == 1: # Async lvol deletion is in progress or queued + 
logger.info(f"Snap deletion in progress, id: {snap.get_id()}") + + elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error( + "Async deletion is done, but leadership has changed (sync deletion is now blocked)") + + elif ret == 4: # No async delete request exists for this Snap + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No async delete request exists for this snap") + set_snap_offline(snap) + + elif ret == -1: # Operation not permitted + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Operation not permitted") + set_snap_offline(snap) + + elif ret == -2: # No such file or directory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such file or directory") + process_snap_delete_finish(snap, leader_node) + + elif ret == -5: # I/O error + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("I/O error") + process_snap_delete_try_again(snap) + + elif ret == -11: # Try again + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Try again") + process_snap_delete_try_again(snap) + + elif ret == -12: # Out of memory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Out of memory") + process_snap_delete_try_again(snap) + + elif ret == -16: # Device or resource busy + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Device or resource busy") + process_snap_delete_try_again(snap) + + elif ret == -19: # No such device + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such device") + set_snap_offline(snap) + + elif ret == -35: # Leadership changed + logger.info(f"Snap deletion error, id: 
{snap.get_id()}, error code: {ret}") + logger.error("Leadership changed") + process_snap_delete_try_again(snap) + + elif ret == -36: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + process_snap_delete_try_again(snap) + + else: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + + + time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) From 93397211cedad46988bc732d21a87f42d8608d7c Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 13 Oct 2025 16:48:24 +0300 Subject: [PATCH 003/192] implement snapshot replication --- simplyblock_cli/cli-reference.yaml | 10 ++++++ simplyblock_cli/cli.py | 10 ++++++ simplyblock_cli/clibase.py | 3 ++ simplyblock_core/cluster_ops.py | 20 +++++++++++ .../controllers/lvol_controller.py | 8 +++-- simplyblock_core/db_controller.py | 2 +- simplyblock_core/models/cluster.py | 2 ++ simplyblock_core/models/lvol_model.py | 2 ++ .../services/snapshot_replication.py | 33 ++++++++++++++++--- simplyblock_core/snode_client.py | 4 +++ 10 files changed, 86 insertions(+), 8 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index e2d87cd61..b82a4a566 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1275,6 +1275,11 @@ commands: dest: cluster_id type: str completer: _completer_get_cluster_list + - name: "target_cluster_id" + help: "Target Cluster id" + dest: target_cluster_id + type: str + completer: _completer_get_cluster_list - name: "--timeout" help: "Snapshot replication network timeout" dest: timeout @@ -1399,6 +1404,11 @@ commands: removed. Please exchange the use of `--pvc_name` with `--pvc-name`. 
dest: pvc_name type: str + - name: "--replicate" + help: "Replicate LVol snapshot" + dest: replicate + type: bool + action: store_true - name: qos-set help: "Changes QoS settings for an active logical volume" arguments: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 4b25bd771..79742a54a 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -329,6 +329,7 @@ def init_cluster(self): if self.developer_mode: self.init_cluster__set(subparser) self.init_cluster__change_name(subparser) + self.init_cluster__add_replication(subparser) def init_cluster__create(self, subparser): @@ -510,6 +511,12 @@ def init_cluster__change_name(self, subparser): subcommand.add_argument('cluster_id', help='Cluster id', type=str).completer = self._completer_get_cluster_list subcommand.add_argument('name', help='Name', type=str) + def init_cluster__add_replication(self, subparser): + subcommand = self.add_sub_command(subparser, 'add-replication', 'Assigns the snapshot replication target cluster') + subcommand.add_argument('cluster_id', help='Cluster id', type=str).completer = self._completer_get_cluster_list + subcommand.add_argument('target_cluster_id', help='Target Cluster id', type=str).completer = self._completer_get_cluster_list + argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', type=int, default=3600, dest='timeout') + def init_volume(self): subparser = self.add_command('volume', 'Logical volume commands', aliases=['lvol',]) @@ -557,6 +564,7 @@ def init_volume__add(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--uid', help='Set logical volume id', type=str, dest='uid') argument = subcommand.add_argument('--pvc-name', '--pvc_name', help='Set logical volume PVC name for k8s clients', type=str, dest='pvc_name') + argument = subcommand.add_argument('--replicate', help='Replicate LVol snapshot', dest='replicate', action='store_true') def init_volume__qos_set(self, subparser): subcommand 
= self.add_sub_command(subparser, 'qos-set', 'Changes QoS settings for an active logical volume') @@ -979,6 +987,8 @@ def run(self): ret = self.cluster__set(sub_command, args) elif sub_command in ['change-name']: ret = self.cluster__change_name(sub_command, args) + elif sub_command in ['add-replication']: + ret = self.cluster__add_replication(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 673fe10fc..cc681e022 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -443,6 +443,9 @@ def cluster__complete_expand(self, sub_command, args): cluster_ops.cluster_expand(args.cluster_id) return True + def cluster__add_replication(self, sub_command, args): + return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout) + def volume__add(self, sub_command, args): name = args.name size = args.size diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 4bd55a513..3850497e9 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -783,6 +783,7 @@ def list() -> t.List[dict]: "#storage": len(st), "Mod": f"{cl.distr_ndcs}x{cl.distr_npcs}", "Status": status.upper(), + "Replicate": cl.snapshot_replication_target_cluster, }) return data @@ -1323,3 +1324,22 @@ def set(cl_id, attr, value) -> None: logger.info(f"Setting {attr} to {value}") setattr(cluster, attr, value) cluster.write_to_db() + + +def add_replication(source_cl_id, target_cl_id, timeout=0) -> bool: + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(source_cl_id) + if not cluster: + raise ValueError(f"Cluster not found: {source_cl_id}") + + target_cluster = db_controller.get_cluster_by_id(target_cl_id) + if not target_cluster: + raise ValueError(f"Target cluster not found: {target_cl_id}") + + logger.info("Updating Cluster replication target") + cluster.snapshot_replication_target_cluster = target_cl_id + if timeout 
and timeout > 0: + cluster.snapshot_replication_timeout = timeout + cluster.write_to_db() + logger.info("Done") + return True diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 8fbaa5a5d..633cbff63 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -265,7 +265,7 @@ def validate_aes_xts_keys(key1: str, key2: str) -> Tuple[bool, str]: def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, use_crypto, distr_vuid, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, - uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="TCP"): + uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="TCP", do_replicate=False): db_controller = DBController() logger.info(f"Adding LVol: {name}") @@ -450,6 +450,10 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, lvol.subsys_port = host_node.lvol_subsys_port lvol.top_bdev = f"{lvol.lvs_name}/{lvol.lvol_bdev}" lvol.base_bdev = lvol.top_bdev + lvol.do_replicate = bool(do_replicate) + if lvol.do_replicate: + random_nodes = _get_next_3_nodes(cl.snapshot_replication_target_cluster, lvol.size) + lvol.replication_node_id = random_nodes[0].get_id() lvol_count = len(db_controller.get_lvols_by_node_id(host_node.get_id())) if lvol_count > host_node.max_lvol: @@ -1154,11 +1158,11 @@ def list_lvols(is_json, cluster_id, pool_id_or_name, all=False): "HA": lvol.ha_type, "BlobID": lvol.blobid or "", "LVolUUID": lvol.lvol_uuid or "", - # "Priority": lvol.lvol_priority_class, "Status": lvol.status, "IO Err": lvol.io_error, "Health": lvol.health_check, "NS ID": lvol.ns_id, + "Replicated On": lvol.replication_node_id, }) for snap, count in snap_dict.items(): ref_snap = db_controller.get_snapshot_by_id(snap) diff --git 
a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 1ca8d88d1..664ab6e5b 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -299,7 +299,7 @@ def get_primary_storage_nodes_by_secondary_node_id(self, node_id) -> List[Storag nodes.append(node) return sorted(nodes, key=lambda x: x.create_dt) - def get_snapshot_replication_tasks(self, cluster_id) -> List[StorageNode]: + def get_snapshot_replication_tasks(self, cluster_id) -> List[SnapshotReplication]: ret = SnapshotReplication().read_from_db(self.kv_store) out = [] for n in ret: diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index a61eaf632..12106f787 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -70,6 +70,8 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + snapshot_replication_target_cluster: str = "" + snapshot_replication_timeout: int = 60*10 def get_status_code(self): if self.status in self.STATUS_CODE_MAP: diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py index b5762b513..a8d69e7bc 100644 --- a/simplyblock_core/models/lvol_model.py +++ b/simplyblock_core/models/lvol_model.py @@ -64,6 +64,8 @@ class LVol(BaseModel): vuid: int = 0 w_mbytes_per_sec: int = 0 fabric: str = "TCP" + do_replicate: bool = False + replication_node_id: str = "" def has_qos(self): return (self.rw_ios_per_sec > 0 or self.rw_mbytes_per_sec > 0 or self.r_mbytes_per_sec > 0 or self.w_mbytes_per_sec > 0) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index fad2479b9..cb5734f7e 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -5,10 +5,11 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.cluster import Cluster -from 
simplyblock_core.controllers import health_controller, snapshot_events +from simplyblock_core.controllers import health_controller, snapshot_events, lvol_controller from simplyblock_core.models.snapshot import SnapShot, SnapshotReplication from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.rpc_client import RPCClient +from simplyblock_core.snode_client import SNodeClient logger = utils.get_logger(__name__) @@ -78,10 +79,32 @@ def set_snap_offline(snap): logger.warning(f"Cluster {cluster.get_id()} is in {cluster.status} state, skipping") continue - # for task in db.get_snapshot_replication_tasks(cluster.get_id()): - # if task.status == SnapshotReplication.STATUS_NEW: - # - # + for task in db.get_snapshot_replication_tasks(cluster.get_id()): + if task.status == SnapshotReplication.STATUS_NEW: + # start_replication + #1 create lvol on remote node + logger.info("Starting snapshot replication task") + lv_id, err = lvol_controller.add_lvol_ha(f"REP_{task.snapshot.name}", task.snapshot.size, task.snapshot.lvol.replication_node_id, task.snapshot.lvol.ha_type) + remote_lv = db.get_lvol_by_id(lv_id) + #2 connect to it + snode_api = SNodeClient(f"{ip}:5000", timeout=5, retry=2) + snode = db.get_storage_node_by_id(remote_lv.node_id) + for nic in snode.data_nics: + ip = nic.ip4_address + snode_api.nvme_connect(ip, remote_lv.subsys_port, remote_lv.nqn) + #3 start replication + snode.rpc_client().bdev_lvol_transfer( + lvol_name=task.snapshot.snap_bdev, + offset=0, + cluster_batch=16, + gateway=f"{remote_lv.top_bdev}n1", + operation="replicate" + ) + task.status = SnapshotReplication.STATUS_IN_PROGRESS + task.write_to_db() + + # if task.status == SnapshotReplication.STATUS_IN_PROGRESS: + for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): node_bdev_names = [] node_lvols_nqns = {} diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index fac50854f..0b343ad64 100644 --- a/simplyblock_core/snode_client.py +++ 
b/simplyblock_core/snode_client.py @@ -163,3 +163,7 @@ def spdk_proxy_restart(self,rpc_port=None): def set_hugepages(self): return self._request("POST", "set_hugepages") + + def nvme_connect(self, ip, port, nqn): + params = {"ip": ip, "port": port, "nqn": nqn} + return self._request("POST", "nvme_connect", params) From 0e046ec5f49da7075e840319d96b5e3cb88d6b9e Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 15 Oct 2025 11:56:35 +0300 Subject: [PATCH 004/192] implement snapshot replication 2 --- .../controllers/snapshot_controller.py | 3 +- .../controllers/tasks_controller.py | 20 + simplyblock_core/db_controller.py | 10 +- simplyblock_core/models/job_schedule.py | 1 + simplyblock_core/models/snapshot.py | 14 +- .../services/snapshot_replication.py | 374 ++++++------------ simplyblock_core/snode_client.py | 4 + 7 files changed, 150 insertions(+), 276 deletions(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 1cb28c7b9..fe4d7310c 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -3,7 +3,7 @@ import time import uuid -from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller, tasks_controller from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController @@ -218,6 +218,7 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) + tasks_controller.add_snapshot_replication_task(snap) return snap.uuid, False diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 291c53033..343d5cb2a 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -61,6 +61,12 @@ def _add_task(function_name, cluster_id, 
node_id, device_id, logger.info(f"Task found, skip adding new task: {task_id}") return False + elif function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + task_id = get_snapshot_replication_task(cluster_id, function_params['snapshot_id']) + if task_id: + logger.info(f"Task found, skip adding new task: {task_id}") + return False + task_obj = JobSchedule() task_obj.uuid = str(uuid.uuid4()) task_obj.cluster_id = cluster_id @@ -349,3 +355,17 @@ def get_failed_device_mig_task(cluster_id, device_id): def add_port_allow_task(cluster_id, node_id, port_number): return _add_task(JobSchedule.FN_PORT_ALLOW, cluster_id, node_id, "", function_params={"port_number": port_number}) + + +def get_snapshot_replication_task(cluster_id, snapshot_id): + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.function_params["snapshot_id"] == snapshot_id: + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + return task.uuid + return False + + +def add_snapshot_replication_task(snapshot): + return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, snapshot.cluster_id, snapshot.lvol.node_id, "", + function_params={"snapshot_id": snapshot.get_id()}) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 664ab6e5b..2dd873ae8 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -13,7 +13,7 @@ from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice from simplyblock_core.models.pool import Pool from simplyblock_core.models.port_stat import PortStat -from simplyblock_core.models.snapshot import SnapShot, SnapshotReplication +from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject, LVolStatObject, \ PoolStatObject, CachedLVolStatObject from simplyblock_core.models.storage_node import StorageNode @@ -298,11 +298,3 @@ def 
get_primary_storage_nodes_by_secondary_node_id(self, node_id) -> List[Storag if node.secondary_node_id == node_id and node.lvstore: nodes.append(node) return sorted(nodes, key=lambda x: x.create_dt) - - def get_snapshot_replication_tasks(self, cluster_id) -> List[SnapshotReplication]: - ret = SnapshotReplication().read_from_db(self.kv_store) - out = [] - for n in ret: - if n.source_cluster_id == cluster_id: - out.append(n) - return sorted(out, key=lambda x: x.create_dt) diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index 9d3eb5aae..676575f84 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -21,6 +21,7 @@ class JobSchedule(BaseModel): FN_BALANCING_AFTER_NODE_RESTART = "balancing_on_restart" FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem" FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" + FN_SNAPSHOT_REPLICATION = "snapshot_replication" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index 39dbb226a..4355ae9b7 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -9,6 +9,7 @@ class SnapShot(BaseModel): STATUS_ONLINE = 'online' STATUS_OFFLINE = 'offline' STATUS_IN_DELETION = 'in_deletion' + STATUS_IN_REPLICATION = 'in_replication' base_bdev: str = "" blobid: int = 0 @@ -29,16 +30,3 @@ class SnapShot(BaseModel): deletion_status: str = "" status: str = "" - -class SnapshotReplication(BaseModel): - - STATUS_NEW = 'new' - STATUS_IN_PROGRESS = 'in-progress' - STATUS_IN_DONE = 'done' - STATUS_IN_FAILED = 'failed' - - snapshot: SnapShot = None - source_cluster_id: str = "" - target_cluster_id: str = "" - status: str = "" - date: str = "" diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index cb5734f7e..89570c171 100644 --- a/simplyblock_core/services/snapshot_replication.py 
+++ b/simplyblock_core/services/snapshot_replication.py @@ -1,265 +1,133 @@ # coding=utf-8 import time -from datetime import datetime - +import uuid from simplyblock_core import constants, db_controller, utils -from simplyblock_core.models.cluster import Cluster -from simplyblock_core.controllers import health_controller, snapshot_events, lvol_controller -from simplyblock_core.models.snapshot import SnapShot, SnapshotReplication -from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient +from simplyblock_core.controllers import lvol_controller +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.snode_client import SNodeClient -logger = utils.get_logger(__name__) +logger = utils.get_logger(__name__) utils.init_sentry_sdk(__name__) - - -def set_snapshot_health_check(snap, health_check_status): - snap = db.get_snapshot_by_id(snap.get_id()) - if snap.health_check == health_check_status: - return - snap.health_check = health_check_status - snap.updated_at = str(datetime.now()) - snap.write_to_db() - - -def process_snap_delete_finish(snap, leader_node): - logger.info(f"Snapshot deleted successfully, id: {snap.get_id()}") - - snode = db.get_storage_node_by_id(snap.lvol.node_id) - # 3-1 async delete snap bdev from primary - if snode.get_id() == leader_node.get_id(): - primary_node = snode - secondary_node = db.get_storage_node_by_id(snode.secondary_node_id) - else: - primary_node = db.get_storage_node_by_id(snode.secondary_node_id) - secondary_node = snode - - if primary_node.status == StorageNode.STATUS_ONLINE: - ret = primary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) - if not ret: - logger.error(f"Failed to delete snap from primary_node node: {primary_node.get_id()}") - - # 3-2 async delete lvol bdev from secondary - if secondary_node: - if secondary_node.status == StorageNode.STATUS_ONLINE: - ret = 
secondary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) - if not ret: - logger.error(f"Failed to delete lvol from sec node: {secondary_node.get_id()}") - # what to do here ? - - snapshot_events.snapshot_delete(snap) - snap.remove(db.kv_store) - - -def process_snap_delete_try_again(snap): - snap = db.get_snapshot_by_id(snap.get_id()) - snap.deletion_status = "" - snap.write_to_db() - - -def set_snap_offline(snap): - sn = db.get_snapshot_by_id(snap.get_id()) - sn.deletion_status = "" - sn.status = SnapShot.STATUS_OFFLINE - sn.write_to_db() - - # get DB controller db = db_controller.DBController() -logger.info("Starting snapshot replication service...") -while True: - - for cluster in db.get_clusters(): - - if cluster.status in [Cluster.STATUS_INACTIVE, Cluster.STATUS_UNREADY, Cluster.STATUS_IN_ACTIVATION]: - logger.warning(f"Cluster {cluster.get_id()} is in {cluster.status} state, skipping") - continue - - for task in db.get_snapshot_replication_tasks(cluster.get_id()): - if task.status == SnapshotReplication.STATUS_NEW: - # start_replication - #1 create lvol on remote node - logger.info("Starting snapshot replication task") - lv_id, err = lvol_controller.add_lvol_ha(f"REP_{task.snapshot.name}", task.snapshot.size, task.snapshot.lvol.replication_node_id, task.snapshot.lvol.ha_type) - remote_lv = db.get_lvol_by_id(lv_id) - #2 connect to it - snode_api = SNodeClient(f"{ip}:5000", timeout=5, retry=2) - snode = db.get_storage_node_by_id(remote_lv.node_id) - for nic in snode.data_nics: - ip = nic.ip4_address - snode_api.nvme_connect(ip, remote_lv.subsys_port, remote_lv.nqn) - #3 start replication - snode.rpc_client().bdev_lvol_transfer( - lvol_name=task.snapshot.snap_bdev, - offset=0, - cluster_batch=16, - gateway=f"{remote_lv.top_bdev}n1", - operation="replicate" - ) - task.status = SnapshotReplication.STATUS_IN_PROGRESS - task.write_to_db() - - # if task.status == SnapshotReplication.STATUS_IN_PROGRESS: - - for snode in 
db.get_storage_nodes_by_cluster_id(cluster.get_id()): - node_bdev_names = [] - node_lvols_nqns = {} - sec_node_bdev_names = {} - sec_node_lvols_nqns = {} - sec_node = None - - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=3, retry=2) - node_bdevs = rpc_client.get_bdevs() - if node_bdevs: - node_bdev_names = [b['name'] for b in node_bdevs] - for bdev in node_bdevs: - if "aliases" in bdev and bdev["aliases"]: - node_bdev_names.extend(bdev['aliases']) - - ret = rpc_client.subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - - if snode.secondary_node_id: - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - ret = sec_rpc_client.get_bdevs() - if ret: - for bdev in ret: - sec_node_bdev_names[bdev['name']] = bdev - - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - if snode.lvstore_status == "ready": - - for snap in db.get_snapshots_by_node_id(snode.get_id()): - if snap.status == SnapShot.STATUS_ONLINE: - - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) - set_snapshot_health_check(snap, present) - - elif snap.status == SnapShot.STATUS_IN_DELETION: - - # check leadership - leader_node = None - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, - StorageNode.STATUS_DOWN]: - ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = snode - - if not leader_node and sec_node: - ret = 
sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) - if not ret: - raise Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = sec_node - - if not leader_node: - raise Exception("Failed to get leader node") - - if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): - - ret = leader_node.rpc_client().delete_lvol(snap.snap_bdev) - if not ret: - logger.error(f"Failed to delete snap from node: {snode.get_id()}") - continue - snap = db.get_snapshot_by_id(snap.get_id()) - snap.deletion_status = leader_node.get_id() - snap.write_to_db() - - time.sleep(3) - - try: - ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) - except Exception as e: - logger.error(e) - # timeout detected, check other node - break - - if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed - process_snap_delete_finish(snap, leader_node) - - elif ret == 1: # Async lvol deletion is in progress or queued - logger.info(f"Snap deletion in progress, id: {snap.get_id()}") - - elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error( - "Async deletion is done, but leadership has changed (sync deletion is now blocked)") - - elif ret == 4: # No async delete request exists for this Snap - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No async delete request exists for this snap") - set_snap_offline(snap) - - elif ret == -1: # Operation not permitted - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Operation not permitted") - set_snap_offline(snap) - - elif ret == -2: # No such file or directory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such 
file or directory") - process_snap_delete_finish(snap, leader_node) - - elif ret == -5: # I/O error - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("I/O error") - process_snap_delete_try_again(snap) - - elif ret == -11: # Try again - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Try again") - process_snap_delete_try_again(snap) - - elif ret == -12: # Out of memory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Out of memory") - process_snap_delete_try_again(snap) - - elif ret == -16: # Device or resource busy - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Device or resource busy") - process_snap_delete_try_again(snap) - - elif ret == -19: # No such device - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such device") - set_snap_offline(snap) - - elif ret == -35: # Leadership changed - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Leadership changed") - process_snap_delete_try_again(snap) - - elif ret == -36: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - process_snap_delete_try_again(snap) - - else: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - - - time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) +def process_snap_replicate_start(task, snapshot): + # 1 create lvol on remote node + logger.info("Starting snapshot replication task") + lv_id, err = lvol_controller.add_lvol_ha(f"REP_{snapshot.name}", snapshot.size, + snapshot.lvol.replication_node_id, snapshot.lvol.ha_type) + remote_lv = db.get_lvol_by_id(lv_id) + # 2 connect to it + snode = 
db.get_storage_node_by_id(remote_lv.node_id) + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=5, retry=2) + for nic in snode.data_nics: + ip = nic.ip4_address + snode_api.nvme_connect(ip, remote_lv.subsys_port, remote_lv.nqn) + # 3 start replication + snode.rpc_client().bdev_lvol_transfer( + lvol_name=snapshot.snap_bdev, + offset=0, + cluster_batch=16, + gateway=f"{remote_lv.top_bdev}n1", + operation="replicate" + ) + task.function_params["remote_lvol_id"] = lv_id + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db() + + if snapshot.status != SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_IN_REPLICATION + snapshot.write_to_db() + + +def process_snap_replicate_finish(task, snapshot): + + task.function_result = "Done" + task.status = JobSchedule.STATUS_DONE + task.write_to_db() + if snapshot.status != SnapShot.STATUS_ONLINE: + snapshot.status = SnapShot.STATUS_ONLINE + snapshot.write_to_db() + + remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + snode = db.get_storage_node_by_id(remote_lv.node_id) + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=5, retry=2) + snode_api.disconnect_nqn(remote_lv.nqn) + + snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + + new_snapshot = snapshot + new_snapshot.uuid = str(uuid.uuid4()) + new_snapshot.cluster_id = snode.cluster_id + new_snapshot.lvol = remote_lv + new_snapshot.snap_bdev = remote_lv.top_bdev + new_snapshot.write_to_db() + lvol_controller.delete_lvol(remote_lv.get_id(), True) + + return True + + +def task_runner(task: JobSchedule): + + snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + + if task.status == JobSchedule.STATUS_NEW: + process_snap_replicate_start(task, snapshot) + + elif task.status == JobSchedule.STATUS_IN_PROGRESS: + remote_lv = db.get_lvol_by_id(snapshot.lvol.node_id) + snode = db.get_storage_node_by_id(remote_lv.node_id) + ret = snode.rpc_client().bdev_lvol_transfer_stat(snapshot.snap_bdev) + if not ret: + 
logger.error("Failed to get transfer stat") + status = ret["transfer_state"] + offset = ret["offset"] + if status == "No process": + task.function_result = f"Status: {status}, offset:{offset}, retrying" + task.status = JobSchedule.STATUS_NEW + task.write_to_db() + return False + if status == "In progress": + task.function_result = f"Status: {status}, offset:{offset}" + task.write_to_db() + return True + if status == "Failed": + task.function_result = f"Status: {status}, offset:{offset}, retrying" + task.status = JobSchedule.STATUS_NEW + task.write_to_db() + return False + if status == "Done": + process_snap_replicate_finish(task, snapshot) + return True + + +logger.info("Starting Tasks runner...") +while True: + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + tasks = db.get_job_tasks(cl.get_id(), reverse=False) + for task in tasks: + delay_seconds = constants.TASK_EXEC_INTERVAL_SEC + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + while task.status != JobSchedule.STATUS_DONE: + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + res = task_runner(task) + if res: + if task.status == JobSchedule.STATUS_DONE: + break + else: + if task.retry <= 3: + delay_seconds *= 1 + else: + delay_seconds *= 2 + time.sleep(delay_seconds) + + time.sleep(constants.TASK_EXEC_INTERVAL_SEC) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 0b343ad64..09b891c48 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -167,3 +167,7 @@ def set_hugepages(self): def nvme_connect(self, ip, port, nqn): params = {"ip": ip, "port": port, "nqn": nqn} return self._request("POST", "nvme_connect", params) + + def disconnect_nqn(self, nqn): + params = {"nqn": nqn} + return self._request("POST", "disconnect_nqn", params) From efe11b62f384f61e975d50d99273251a03988988 Mon Sep 17 00:00:00 2001 From: 
hamdykhader Date: Sat, 18 Oct 2025 02:44:05 +0300 Subject: [PATCH 005/192] Fix env_var --- simplyblock_core/env_var | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 6b716ceb5..fa511bb03 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -2,5 +2,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.17 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-sfam-2359 +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:transfer-feature-latest From f5456e9bb9cc62262bedd34bfc63f5c8f416cd58 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 03:09:34 +0300 Subject: [PATCH 006/192] Fix service --- simplyblock_cli/clibase.py | 3 ++- simplyblock_core/services/snapshot_replication.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index cc681e022..4eb2409c6 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -471,7 +471,8 @@ def volume__add(self, sub_command, args): crypto_key2=args.crypto_key2, lvol_priority_class=lvol_priority_class, uid=args.uid, pvc_name=args.pvc_name, namespace=args.namespace, - max_namespace_per_subsys=args.max_namespace_per_subsys) + max_namespace_per_subsys=args.max_namespace_per_subsys, + do_replicate=args.replicate) if results: return results else: diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 89570c171..3d143ccd6 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -18,8 +18,10 @@ def process_snap_replicate_start(task, snapshot): # 1 create lvol on remote node logger.info("Starting snapshot replication 
task") - lv_id, err = lvol_controller.add_lvol_ha(f"REP_{snapshot.name}", snapshot.size, - snapshot.lvol.replication_node_id, snapshot.lvol.ha_type) + lv_id, err = lvol_controller.add_lvol_ha( + f"REP_{snapshot.name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, + snapshot.pool_id, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, + max_r_mbytes=0, max_w_mbytes=0) remote_lv = db.get_lvol_by_id(lv_id) # 2 connect to it snode = db.get_storage_node_by_id(remote_lv.node_id) From de9500351912752fa8606034560e2c376a099240 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 03:22:06 +0300 Subject: [PATCH 007/192] Fix service --- .../services/snapshot_replication.py | 2 +- .../api/internal/storage_node/docker.py | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 3d143ccd6..581c54404 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -20,7 +20,7 @@ def process_snap_replicate_start(task, snapshot): logger.info("Starting snapshot replication task") lv_id, err = lvol_controller.add_lvol_ha( f"REP_{snapshot.name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, - snapshot.pool_id, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, + snapshot.lvol.pool_uuid, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0) remote_lv = db.get_lvol_by_id(lv_id) # 2 connect to it diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 555b89191..1a1306869 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -567,3 +567,44 @@ def set_hugepages(): 
core_utils.set_hugepages_if_needed(numa, num_pages) return utils.get_response(True) + + +@api.post('/nvme_connect', + summary='Connect NVMe-oF target', + responses={ + 200: {'content': {'application/json': {'schema': utils.response_schema({ + 'type': 'boolean', + })}}, + }, +}) +def connect_to_nvme(body: utils.NVMEConnectParams): + """Connect to the indicated NVMe-oF target. + """ + st = f"nvme connect --transport=tcp --traddr={body.ip} --trsvcid={body.port} --nqn={body.nqn}" + logger.debug(st) + out, err, ret_code = shell_utils.run_command(st) + logger.debug(ret_code) + logger.debug(out) + logger.debug(err) + if ret_code == 0: + return utils.get_response(True) + else: + return utils.get_response(ret_code, error=err) + + +@api.post('/disconnect_nqn', + summary='Disconnect NVMe-oF device by NQN', + responses={ + 200: {'content': {'application/json': {'schema': utils.response_schema({ + 'type': 'integer', + })}}}, +}) +def disconnect_nqn(body: utils.DisconnectParams): + """Disconnect from indicated NVMe-oF target + """ + st = f"nvme disconnect --nqn={body.nqn}" + out, err, ret_code = shell_utils.run_command(st) + logger.debug(ret_code) + logger.debug(out) + logger.debug(err) + return utils.get_response(ret_code) From 02077a1cd1016befd1f61d5988d3350972bf8a94 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 03:35:07 +0300 Subject: [PATCH 008/192] Fix service --- simplyblock_core/services/snapshot_replication.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 581c54404..440ea7439 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -80,12 +80,13 @@ def task_runner(task: JobSchedule): if task.status == JobSchedule.STATUS_NEW: process_snap_replicate_start(task, snapshot) - elif task.status == JobSchedule.STATUS_IN_PROGRESS: - remote_lv = 
db.get_lvol_by_id(snapshot.lvol.node_id) + elif task.status == JobSchedule.STATUS_RUNNING: + remote_lv = db.get_lvol_by_id(snapshot.lvol.get_id()) snode = db.get_storage_node_by_id(remote_lv.node_id) ret = snode.rpc_client().bdev_lvol_transfer_stat(snapshot.snap_bdev) if not ret: logger.error("Failed to get transfer stat") + return False status = ret["transfer_state"] offset = ret["offset"] if status == "No process": From d0db64f84d350984bb995136ae2e069be801470a Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 03:59:19 +0300 Subject: [PATCH 009/192] Fix service --- simplyblock_core/services/snapshot_replication.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 440ea7439..5b0b45d17 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -28,7 +28,9 @@ def process_snap_replicate_start(task, snapshot): snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=5, retry=2) for nic in snode.data_nics: ip = nic.ip4_address - snode_api.nvme_connect(ip, remote_lv.subsys_port, remote_lv.nqn) + ret = snode.rpc_client().bdev_nvme_attach_controller( + remote_lv.top_bdev, remote_lv.nqn, ip, remote_lv.subsys_port, nic.trtype) + # 3 start replication snode.rpc_client().bdev_lvol_transfer( lvol_name=snapshot.snap_bdev, @@ -57,8 +59,7 @@ def process_snap_replicate_finish(task, snapshot): remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) snode = db.get_storage_node_by_id(remote_lv.node_id) - snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=5, retry=2) - snode_api.disconnect_nqn(remote_lv.nqn) + snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) From b1561bbf7e781d6055e9dd8475eab919b9ae16b1 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 04:25:40 +0300 
Subject: [PATCH 010/192] Fix service 2 --- simplyblock_core/services/snapshot_replication.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 5b0b45d17..5cb046dad 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -24,13 +24,14 @@ def process_snap_replicate_start(task, snapshot): max_r_mbytes=0, max_w_mbytes=0) remote_lv = db.get_lvol_by_id(lv_id) # 2 connect to it - snode = db.get_storage_node_by_id(remote_lv.node_id) - snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=5, retry=2) - for nic in snode.data_nics: + remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + for nic in remote_snode.data_nics: ip = nic.ip4_address ret = snode.rpc_client().bdev_nvme_attach_controller( remote_lv.top_bdev, remote_lv.nqn, ip, remote_lv.subsys_port, nic.trtype) + # 3 start replication snode.rpc_client().bdev_lvol_transfer( lvol_name=snapshot.snap_bdev, @@ -58,10 +59,11 @@ def process_snap_replicate_finish(task, snapshot): snapshot.write_to_db() remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) - snode = db.get_storage_node_by_id(remote_lv.node_id) + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) - snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) new_snapshot = snapshot new_snapshot.uuid = str(uuid.uuid4()) @@ -82,8 +84,7 @@ def task_runner(task: JobSchedule): process_snap_replicate_start(task, snapshot) elif task.status == JobSchedule.STATUS_RUNNING: - remote_lv = db.get_lvol_by_id(snapshot.lvol.get_id()) - snode = db.get_storage_node_by_id(remote_lv.node_id) + snode = 
db.get_storage_node_by_id(snapshot.lvol.node_id) ret = snode.rpc_client().bdev_lvol_transfer_stat(snapshot.snap_bdev) if not ret: logger.error("Failed to get transfer stat") From 2fde4519ba14a9624d305b530fbd8aadeede80f8 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 04:46:06 +0300 Subject: [PATCH 011/192] Fix service 3 --- simplyblock_core/services/snapshot_replication.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 5cb046dad..3656844b3 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -6,7 +6,6 @@ from simplyblock_core.controllers import lvol_controller from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.snapshot import SnapShot -from simplyblock_core.snode_client import SNodeClient logger = utils.get_logger(__name__) @@ -71,6 +70,8 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.lvol = remote_lv new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.write_to_db() + remote_lv.bdev_stack = [] + remote_lv.write_to_db() lvol_controller.delete_lvol(remote_lv.get_id(), True) return True From d084c067c7530de404609be6a420c676f24703ff Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 04:46:59 +0300 Subject: [PATCH 012/192] Fix service 4 --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 3656844b3..dd7319ffa 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -18,7 +18,7 @@ def process_snap_replicate_start(task, snapshot): # 1 create lvol on remote node logger.info("Starting snapshot replication task") lv_id, err = 
lvol_controller.add_lvol_ha( - f"REP_{snapshot.name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, + f"REP_{snapshot.snap_name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, snapshot.lvol.pool_uuid, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0) remote_lv = db.get_lvol_by_id(lv_id) From d7e7c209f60ffbe341ec1d9eb0d2b08300f6a6c7 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 06:37:22 +0300 Subject: [PATCH 013/192] Fix service 6 --- simplyblock_cli/cli-reference.yaml | 4 + simplyblock_cli/cli.py | 1 + simplyblock_cli/clibase.py | 2 +- simplyblock_core/cluster_ops.py | 10 ++- .../controllers/lvol_controller.py | 4 +- .../controllers/snapshot_controller.py | 4 +- .../controllers/snapshot_events.py | 7 ++ simplyblock_core/models/cluster.py | 1 + .../services/snapshot_replication.py | 81 +++++++++++++++---- 9 files changed, 93 insertions(+), 21 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 73e4d1e08..9c5b9fb7d 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1273,6 +1273,10 @@ commands: dest: timeout type: int default: "3600" + - name: "--target-pool" + help: "Target cluster pool ID or name" + dest: target_pool + type: str - name: "volume" help: "Logical volume commands" aliases: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 066f361d0..8891ec979 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -513,6 +513,7 @@ def init_cluster__add_replication(self, subparser): subcommand.add_argument('cluster_id', help='Cluster id', type=str).completer = self._completer_get_cluster_list subcommand.add_argument('target_cluster_id', help='Target Cluster id', type=str).completer = self._completer_get_cluster_list argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', 
type=int, default=3600, dest='timeout') + argument = subcommand.add_argument('--target-pool', help='Target cluster pool ID or name', type=str, dest='target_pool') def init_volume(self): diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index a6101727c..7b2abb5b7 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -444,7 +444,7 @@ def cluster__complete_expand(self, sub_command, args): return True def cluster__add_replication(self, sub_command, args): - return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout) + return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout, args.target_pool) def volume__add(self, sub_command, args): name = args.name diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 4bf051a52..661cf60f7 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1322,7 +1322,7 @@ def set(cl_id, attr, value) -> None: cluster.write_to_db() -def add_replication(source_cl_id, target_cl_id, timeout=0) -> bool: +def add_replication(source_cl_id, target_cl_id, timeout=0, target_pool=None) -> bool: db_controller = DBController() cluster = db_controller.get_cluster_by_id(source_cl_id) if not cluster: @@ -1334,6 +1334,14 @@ def add_replication(source_cl_id, target_cl_id, timeout=0) -> bool: logger.info("Updating Cluster replication target") cluster.snapshot_replication_target_cluster = target_cl_id + if target_pool: + pool = db_controller.get_pool_by_id(target_pool) + if not pool: + raise ValueError(f"Pool not found: {target_pool}") + if pool.status != Pool.STATUS_ACTIVE: + raise ValueError(f"Pool not active: {target_pool}") + cluster.snapshot_replication_target_pool = target_pool + if timeout and timeout > 0: cluster.snapshot_replication_timeout = timeout cluster.write_to_db() diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 
7f9d8980c..3ee8caf5c 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -262,8 +262,8 @@ def validate_aes_xts_keys(key1: str, key2: str) -> Tuple[bool, str]: return True, "" -def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, use_crypto, - distr_vuid, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, +def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp=False, use_crypto=False, + distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="TCP", do_replicate=False): diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index ed9633a73..14cad25ac 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -218,7 +218,9 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) - tasks_controller.add_snapshot_replication_task(snap) + task = tasks_controller.add_snapshot_replication_task(snap) + if task: + snapshot_events.replication_task_created(snap) return snap.uuid, False diff --git a/simplyblock_core/controllers/snapshot_events.py b/simplyblock_core/controllers/snapshot_events.py index 4cb107dcd..839da25af 100644 --- a/simplyblock_core/controllers/snapshot_events.py +++ b/simplyblock_core/controllers/snapshot_events.py @@ -31,3 +31,10 @@ def snapshot_delete(snapshot, caused_by=ec.CAUSED_BY_CLI): def snapshot_clone(snapshot, lvol_clone, caused_by=ec.CAUSED_BY_CLI): _snapshot_event(snapshot, f"Snapshot cloned: {snapshot.get_id()} clone id: {lvol_clone.get_id()}", caused_by, ec.EVENT_STATUS_CHANGE) + +def replication_task_created(snapshot, caused_by=ec.CAUSED_BY_CLI): + 
_snapshot_event(snapshot, f"Snapshot replication task created", caused_by, ec.EVENT_OBJ_CREATED) + + +def replication_task_finished(snapshot, caused_by=ec.CAUSED_BY_CLI): + _snapshot_event(snapshot, f"Snapshot replication task finished", caused_by, ec.EVENT_OBJ_CREATED) diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 3b2b052ba..5d435dbb8 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -70,6 +70,7 @@ class Cluster(BaseModel): full_page_unmap: bool = True is_single_node: bool = False snapshot_replication_target_cluster: str = "" + snapshot_replication_target_pool: str = "" snapshot_replication_timeout: int = 60*10 def get_status_code(self): diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index dd7319ffa..13b0e6980 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -3,8 +3,9 @@ import uuid from simplyblock_core import constants, db_controller, utils -from simplyblock_core.controllers import lvol_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot @@ -17,29 +18,70 @@ def process_snap_replicate_start(task, snapshot): # 1 create lvol on remote node logger.info("Starting snapshot replication task") - lv_id, err = lvol_controller.add_lvol_ha( - f"REP_{snapshot.snap_name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, - snapshot.lvol.pool_uuid, use_comp=False, use_crypto=False, distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, - max_r_mbytes=0, max_w_mbytes=0) - remote_lv = db.get_lvol_by_id(lv_id) - # 2 connect to it - remote_snode = db.get_storage_node_by_id(remote_lv.node_id) snode = db.get_storage_node_by_id(snapshot.lvol.node_id) - 
for nic in remote_snode.data_nics: - ip = nic.ip4_address - ret = snode.rpc_client().bdev_nvme_attach_controller( - remote_lv.top_bdev, remote_lv.nqn, ip, remote_lv.subsys_port, nic.trtype) - + if not task.function_params["remote_lvol_id"] : + remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) + cluster = db.get_cluster_by_id(remote_node_uuid.cluster_id) + remote_pool_uuid = None + if cluster.snapshot_replication_target_pool: + remote_pool_uuid = cluster.snapshot_replication_target_pool + else: + for bool in db.get_pools(remote_node_uuid.cluster_id): + if bool.status == Pool.STATUS_ACTIVE: + remote_pool_uuid = bool.uuid + break + if not remote_pool_uuid: + logger.error(f"Unable to find pool on remote cluster: {remote_node_uuid.cluster_id}") + return + + lv_id, err = lvol_controller.add_lvol_ha( + f"REP_{snapshot.snap_name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, + remote_pool_uuid) + task.function_params["remote_lvol_id"] = lv_id + task.write_to_db() + remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + # 2 connect to it + ret = snode.rpc_client().bdev_nvme_controller_list(remote_lv.top_bdev) + if not ret: + remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + for nic in remote_snode.data_nics: + ip = nic.ip4_address + ret = snode.rpc_client().bdev_nvme_attach_controller( + remote_lv.top_bdev, remote_lv.nqn, ip, remote_lv.subsys_port, nic.trtype) + if not ret: + msg = "controller attach failed" + logger.error(msg) + raise RuntimeError(msg) + bdev_name = ret[0] + if not bdev_name: + msg = "Bdev name not returned from controller attach" + logger.error(msg) + raise RuntimeError(msg) + bdev_found = False + for i in range(5): + ret = snode.rpc_client().get_bdevs(bdev_name) + if ret: + bdev_found = True + break + else: + time.sleep(1) + + if not bdev_found: + logger.error("lvol Bdev not found after 5 attempts") + raise RuntimeError(f"Failed to connect to lvol: 
{remote_lv.get_id()}") + + offset = 0 + if "offset" in task.function_params and task.function_params["offset"]: + offset = task.function_params["offset"] # 3 start replication snode.rpc_client().bdev_lvol_transfer( lvol_name=snapshot.snap_bdev, - offset=0, + offset=offset, cluster_batch=16, gateway=f"{remote_lv.top_bdev}n1", operation="replicate" ) - task.function_params["remote_lvol_id"] = lv_id task.status = JobSchedule.STATUS_RUNNING task.write_to_db() @@ -66,13 +108,19 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot = snapshot new_snapshot.uuid = str(uuid.uuid4()) - new_snapshot.cluster_id = snode.cluster_id + new_snapshot.cluster_id = remote_snode.cluster_id new_snapshot.lvol = remote_lv + new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev + new_snapshot.snap_uuid = remote_lv.lvol_uuid + new_snapshot.blobid = remote_lv.blobid + new_snapshot.created_at = int(time.time()) new_snapshot.write_to_db() remote_lv.bdev_stack = [] remote_lv.write_to_db() lvol_controller.delete_lvol(remote_lv.get_id(), True) + remote_lv.remove(db.kv_store) + snapshot_events.replication_task_finished(snapshot) return True @@ -99,6 +147,7 @@ def task_runner(task: JobSchedule): return False if status == "In progress": task.function_result = f"Status: {status}, offset:{offset}" + task.function_params["offset"] = offset task.write_to_db() return True if status == "Failed": From 6770f32952cc79a0134602d8db0fbb4b934e7ff7 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 06:53:38 +0300 Subject: [PATCH 014/192] Fix service 5 --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 13b0e6980..c0764e7bc 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -19,7 +19,7 @@ def 
process_snap_replicate_start(task, snapshot): # 1 create lvol on remote node logger.info("Starting snapshot replication task") snode = db.get_storage_node_by_id(snapshot.lvol.node_id) - if not task.function_params["remote_lvol_id"] : + if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"] : remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) cluster = db.get_cluster_by_id(remote_node_uuid.cluster_id) remote_pool_uuid = None From 08d7edce141dfebefb759ae0b6a6007e2bbaa3c5 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 07:35:59 +0300 Subject: [PATCH 015/192] Fix service 7 --- .../services/snapshot_replication.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index c0764e7bc..53a8ffece 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -126,9 +126,28 @@ def process_snap_replicate_finish(task, snapshot): def task_runner(task: JobSchedule): - snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + if task.retry >= task.max_retry or task.canceled is True: + task.function_result = "max retry reached" + if task.canceled is True: + task.function_result = "task cancelled" + + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + if snapshot.status != SnapShot.STATUS_ONLINE: + snapshot.status = SnapShot.STATUS_ONLINE + snapshot.write_to_db() + + remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) + lvol_controller.delete_lvol(remote_lv.get_id(), True) + + return True + + if task.status == JobSchedule.STATUS_NEW: process_snap_replicate_start(task, snapshot) @@ -143,6 +162,7 @@ def task_runner(task: 
JobSchedule): if status == "No process": task.function_result = f"Status: {status}, offset:{offset}, retrying" task.status = JobSchedule.STATUS_NEW + task.retry += 1 task.write_to_db() return False if status == "In progress": @@ -153,6 +173,7 @@ def task_runner(task: JobSchedule): if status == "Failed": task.function_result = f"Status: {status}, offset:{offset}, retrying" task.status = JobSchedule.STATUS_NEW + task.retry += 1 task.write_to_db() return False if status == "Done": From 08064892064c62859d2154fcfe227331f390f0de Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 08:32:40 +0300 Subject: [PATCH 016/192] Fix service 8 --- .../services/snapshot_replication.py | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 53a8ffece..ff23d9b87 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -3,11 +3,11 @@ import uuid from simplyblock_core import constants, db_controller, utils -from simplyblock_core.controllers import lvol_controller, snapshot_events +from simplyblock_core.controllers import lvol_controller, snapshot_events, tasks_controller from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot - +from simplyblock_core.models.storage_node import StorageNode logger = utils.get_logger(__name__) utils.init_sentry_sdk(__name__) @@ -127,6 +127,19 @@ def process_snap_replicate_finish(task, snapshot): def task_runner(task: JobSchedule): snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + + if not snode: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + if snode.status != 
StorageNode.STATUS_ONLINE: + task.function_result = "node is not online, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return False if task.retry >= task.max_retry or task.canceled is True: task.function_result = "max retry reached" @@ -141,14 +154,13 @@ def task_runner(task: JobSchedule): snapshot.write_to_db() remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) - snode = db.get_storage_node_by_id(snapshot.lvol.node_id) snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) lvol_controller.delete_lvol(remote_lv.get_id(), True) return True - if task.status == JobSchedule.STATUS_NEW: + if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]: process_snap_replicate_start(task, snapshot) elif task.status == JobSchedule.STATUS_RUNNING: @@ -172,7 +184,7 @@ def task_runner(task: JobSchedule): return True if status == "Failed": task.function_result = f"Status: {status}, offset:{offset}, retrying" - task.status = JobSchedule.STATUS_NEW + task.status = JobSchedule.STATUS_SUSPENDED task.retry += 1 task.write_to_db() return False @@ -192,18 +204,17 @@ def task_runner(task: JobSchedule): for task in tasks: delay_seconds = constants.TASK_EXEC_INTERVAL_SEC if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: - while task.status != JobSchedule.STATUS_DONE: + if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]: + active_task = tasks_controller.get_snapshot_replication_task( + task.cluster_id, task.function_params['snapshot_id']) + if active_task and active_task != task.get_id(): + logger.info("task found on same snapshot, retry") + continue + if task.status != JobSchedule.STATUS_DONE: # get new task object because it could be changed from cancel task task = db.get_task_by_id(task.uuid) res = task_runner(task) - if res: - if task.status == JobSchedule.STATUS_DONE: - break - else: - if task.retry <= 3: - delay_seconds *= 1 - else: - delay_seconds *= 2 - 
time.sleep(delay_seconds) + if not res: + time.sleep(3) time.sleep(constants.TASK_EXEC_INTERVAL_SEC) From a6eb900b40d48a2b260cfaee37e05aafc49dad2b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 18 Oct 2025 08:52:35 +0300 Subject: [PATCH 017/192] Fix service 8 --- simplyblock_core/services/snapshot_replication.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index ff23d9b87..c90323f6a 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -205,10 +205,14 @@ def task_runner(task: JobSchedule): delay_seconds = constants.TASK_EXEC_INTERVAL_SEC if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]: - active_task = tasks_controller.get_snapshot_replication_task( - task.cluster_id, task.function_params['snapshot_id']) - if active_task and active_task != task.get_id(): - logger.info("task found on same snapshot, retry") + active_task = False + for t in db.get_job_tasks(task.cluster_id): + if t.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and t.function_params["snapshot_id"] == task.function_params['snapshot_id']: + if t.status == JobSchedule.STATUS_RUNNING and t.canceled is False: + active_task = True + break + if active_task: + logger.info("replication task found for same snapshot, retry") continue if task.status != JobSchedule.STATUS_DONE: # get new task object because it could be changed from cancel task From 1f5568f3b7102677aa97783331e05d828aa4c766 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 5 Nov 2025 14:21:53 +0100 Subject: [PATCH 018/192] do not auto add default qos class (#720) * added 3 secs sleep before add qos class * wip * Revert "wip" This reverts commit b0f2ba695f778f5eb177fab8a474210e12b0c69a. 
* increase sleep time * remove auto add default qos class --------- Co-authored-by: hamdykhader --- simplyblock_core/cluster_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 103123934..fbe91a58f 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -371,8 +371,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.write_to_db(db_controller.kv_store) - qos_controller.add_class("Default", 100, cluster.get_id()) - cluster_events.cluster_create(cluster) mgmt_node_ops.add_mgmt_node(dev_ip, mode, cluster.uuid) From 3af633b009d706c07e7526e36c5ab744c28e5041 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 5 Nov 2025 14:30:53 +0100 Subject: [PATCH 019/192] Update env_var (#721) --- simplyblock_core/env_var | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index f3e377ee4..e1d2e2f8b 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,5 +1,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.23 +SIMPLY_BLOCK_VERSION=19.2.24 SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest From d02beba1a846009f3b3c3e121b7cdfa5c3e9e6dd Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 00:28:13 +0300 Subject: [PATCH 020/192] wip --- simplyblock_cli/cli-reference.yaml | 7 +++++ simplyblock_cli/cli.py | 9 +++++- simplyblock_cli/clibase.py | 3 ++ .../controllers/snapshot_controller.py | 28 +++++++++++++++++++ .../controllers/tasks_controller.py | 1 + .../services/snapshot_replication.py | 11 ++++++-- simplyblock_core/utils/__init__.py | 5 +++- 7 files changed, 60 insertions(+), 4 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 9a528c57e..016553854 100644 --- 
a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1882,6 +1882,13 @@ commands: dest: resize type: size default: "0" + - name: replication-status + help: "Lists snapshots replication status" + arguments: + - name: "cluster_id" + help: "Cluster UUID" + dest: cluster_id + type: str - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 6f13af8f0..1a6d64543 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -568,9 +568,9 @@ def init_volume__add(self, subparser): if self.developer_mode: argument = subcommand.add_argument('--uid', help='Set logical volume id', type=str, dest='uid') argument = subcommand.add_argument('--pvc-name', '--pvc_name', help='Set logical volume PVC name for k8s clients', type=str, dest='pvc_name') - argument = subcommand.add_argument('--replicate', help='Replicate LVol snapshot', dest='replicate', action='store_true') argument = subcommand.add_argument('--data-chunks-per-stripe', help='Erasure coding schema parameter k (distributed raid), default: 1', type=int, default=0, dest='ndcs') argument = subcommand.add_argument('--parity-chunks-per-stripe', help='Erasure coding schema parameter n (distributed raid), default: 1', type=int, default=0, dest='npcs') + argument = subcommand.add_argument('--replicate', help='Replicate LVol snapshot', dest='replicate', action='store_true') def init_volume__qos_set(self, subparser): subcommand = self.add_sub_command(subparser, 'qos-set', 'Changes QoS settings for an active logical volume') @@ -748,6 +748,7 @@ def init_snapshot(self): self.init_snapshot__list(subparser) self.init_snapshot__delete(subparser) self.init_snapshot__clone(subparser) + self.init_snapshot__replication_status(subparser) def init_snapshot__add(self, subparser): @@ -770,6 +771,10 @@ def init_snapshot__clone(self, subparser): subcommand.add_argument('lvol_name', help='Logical volume name', type=str) argument = 
subcommand.add_argument('--resize', help='New logical volume size: 10M, 10G, 10(bytes). Can only increase.', type=size_type(), default='0', dest='resize') + def init_snapshot__replication_status(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists snapshots replication status') + subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -1113,6 +1118,8 @@ def run(self): ret = self.snapshot__delete(sub_command, args) elif sub_command in ['clone']: ret = self.snapshot__clone(sub_command, args) + elif sub_command in ['replication-status']: + ret = self.snapshot__replication_status(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index dcc81c7a6..43ca2c471 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -638,6 +638,9 @@ def snapshot__clone(self, sub_command, args): success, details = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) return details + def snapshot__replication_status(self, sub_command, args): + return snapshot_controller.list_replication_tasks(args.cluster_id) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 9b27511ab..e6ccf366c 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -7,6 +7,7 @@ from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController +from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.lvol_model import LVol @@ -590,3 +591,30 @@ def clone(snapshot_id, 
clone_name, new_size=0, pvc_name=None, pvc_namespace=None if new_size: lvol_controller.resize_lvol(lvol.get_id(), new_size) return lvol.uuid, False + + +def list_replication_tasks(cluster_id): + tasks = db_controller.get_job_tasks(cluster_id) + + data = [] + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + remote_lv = db_controller.get_lvol_by_id(task.function_params["remote_lvol_id"]) + duration = "" + if task.status == JobSchedule.STATUS_RUNNING: + try: + duration = utils.strfdelta_seconds(time.time() - task.function_params["start_time"]) + except Exception: + pass + data.append({ + "Task ID": task.uuid, + "Snapshot ID": snap.uuid, + "Size": utils.humanbytes(snap.used_size), + "Duration": duration, + "Offset": task.function_params["offset"], + "Status": task.status, + "Replicate on node": remote_lv.node_id, + }) + return utils.print_table(data) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 0bc87c8b6..15e38e808 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -168,6 +168,7 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): for task in tasks: if task.function_name == JobSchedule.FN_DEV_MIG: continue + logger.debug(task) if task.max_retry > 0: retry = f"{task.retry}/{task.max_retry}" else: diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index c90323f6a..b4155c3e9 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -37,8 +37,14 @@ def process_snap_replicate_start(task, snapshot): lv_id, err = lvol_controller.add_lvol_ha( f"REP_{snapshot.snap_name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, remote_pool_uuid) - 
task.function_params["remote_lvol_id"] = lv_id - task.write_to_db() + if lv_id: + task.function_params["remote_lvol_id"] = lv_id + task.write_to_db() + else: + logger.error(err) + task.function_result = "Error creating remote lvol" + task.write_to_db() + return remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) # 2 connect to it @@ -83,6 +89,7 @@ def process_snap_replicate_start(task, snapshot): operation="replicate" ) task.status = JobSchedule.STATUS_RUNNING + task.function_params["start_time"] = time.time() task.write_to_db() if snapshot.status != SnapShot.STATUS_IN_REPLICATION: diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 941414708..c6313802e 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -735,7 +735,10 @@ def nearest_upper_power_of_2(n): def strfdelta(tdelta): - remainder = int(tdelta.total_seconds()) + return strfdelta_seconds(int(tdelta.total_seconds())) + + +def strfdelta_seconds(remainder: int) -> str: possible_fields = ('W', 'D', 'H', 'M', 'S') constants = {'W': 604800, 'D': 86400, 'H': 3600, 'M': 60, 'S': 1} values = {} From 4267f1c94e36b569d21d2949543e0496db04f9de Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 01:12:53 +0300 Subject: [PATCH 021/192] wip 2 --- .../controllers/snapshot_controller.py | 21 ++++++++++++------- .../controllers/snapshot_events.py | 4 ++-- .../services/snapshot_replication.py | 5 +++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index e6ccf366c..bcdc66bab 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -601,20 +601,25 @@ def list_replication_tasks(cluster_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) snap = 
db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) - remote_lv = db_controller.get_lvol_by_id(task.function_params["remote_lvol_id"]) duration = "" - if task.status == JobSchedule.STATUS_RUNNING: - try: - duration = utils.strfdelta_seconds(time.time() - task.function_params["start_time"]) - except Exception: - pass + try: + if task.status == JobSchedule.STATUS_RUNNING: + duration = utils.strfdelta_seconds(int(time.time()) - task.function_params["start_time"]) + elif "end_time" in task.function_params: + duration = utils.strfdelta_seconds( + task.function_params["end_time"] - task.function_params["start_time"]) + except Exception as e: + logger.error(e) + offset = "" + if "offset" in task.function_params: + offset = task.function_params["offset"] data.append({ "Task ID": task.uuid, "Snapshot ID": snap.uuid, "Size": utils.humanbytes(snap.used_size), "Duration": duration, - "Offset": task.function_params["offset"], + "Offset": offset, "Status": task.status, - "Replicate on node": remote_lv.node_id, + "Replicated on node": snap.lvol.node_id, }) return utils.print_table(data) diff --git a/simplyblock_core/controllers/snapshot_events.py b/simplyblock_core/controllers/snapshot_events.py index 839da25af..9b29f8b6f 100644 --- a/simplyblock_core/controllers/snapshot_events.py +++ b/simplyblock_core/controllers/snapshot_events.py @@ -33,8 +33,8 @@ def snapshot_clone(snapshot, lvol_clone, caused_by=ec.CAUSED_BY_CLI): def replication_task_created(snapshot, caused_by=ec.CAUSED_BY_CLI): - _snapshot_event(snapshot, f"Snapshot replication task created", caused_by, ec.EVENT_OBJ_CREATED) + _snapshot_event(snapshot, "Snapshot replication task created", caused_by, ec.EVENT_OBJ_CREATED) def replication_task_finished(snapshot, caused_by=ec.CAUSED_BY_CLI): - _snapshot_event(snapshot, f"Snapshot replication task finished", caused_by, ec.EVENT_OBJ_CREATED) + _snapshot_event(snapshot, "Snapshot replication task finished", caused_by, ec.EVENT_OBJ_CREATED) diff --git 
a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index b4155c3e9..e726c20b9 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -3,7 +3,7 @@ import uuid from simplyblock_core import constants, db_controller, utils -from simplyblock_core.controllers import lvol_controller, snapshot_events, tasks_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot @@ -89,7 +89,7 @@ def process_snap_replicate_start(task, snapshot): operation="replicate" ) task.status = JobSchedule.STATUS_RUNNING - task.function_params["start_time"] = time.time() + task.function_params["start_time"] = int(time.time()) task.write_to_db() if snapshot.status != SnapShot.STATUS_IN_REPLICATION: @@ -101,6 +101,7 @@ def process_snap_replicate_finish(task, snapshot): task.function_result = "Done" task.status = JobSchedule.STATUS_DONE + task.function_params["end_time"] = int(time.time()) task.write_to_db() if snapshot.status != SnapShot.STATUS_ONLINE: snapshot.status = SnapShot.STATUS_ONLINE From ef130f2aa8a03e8ab5097804b3b05efc4968ea81 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 02:05:04 +0300 Subject: [PATCH 022/192] wip 3 --- simplyblock_cli/cli-reference.yaml | 28 +++++++++++++++++++ simplyblock_cli/cli.py | 28 +++++++++++++++++++ simplyblock_cli/clibase.py | 12 ++++++++ .../controllers/lvol_controller.py | 25 +++++++++++++++++ .../controllers/snapshot_controller.py | 22 +++++++++++++++ .../services/snapshot_replication.py | 14 ++++++++++ 6 files changed, 129 insertions(+) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 016553854..e9e276b41 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ 
-1630,6 +1630,27 @@ commands: help: "Logical volume id" dest: volume_id type: str + - name: replication-start + help: "Start snapshot replication taken from lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: replication-stop + help: "Stop snapshot replication taken from lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: replication-status + help: "Lists replication status" + arguments: + - name: "cluster_id" + help: "Cluster UUID" + dest: cluster_id + type: str - name: "control-plane" help: "Control plane commands" aliases: @@ -1889,6 +1910,13 @@ commands: help: "Cluster UUID" dest: cluster_id type: str + - name: delete-replication-only + help: "Delete replicated version of a snapshot" + arguments: + - name: "snapshot_id" + help: "Snapshot UUID" + dest: snapshot_id + type: str - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 1a6d64543..0c9677096 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -541,6 +541,9 @@ def init_volume(self): self.init_volume__get_io_stats(subparser) self.init_volume__check(subparser) self.init_volume__inflate(subparser) + self.init_volume__replication_start(subparser) + self.init_volume__replication_stop(subparser) + self.init_volume__replication_status(subparser) def init_volume__add(self, subparser): @@ -648,6 +651,18 @@ def init_volume__inflate(self, subparser): subcommand = self.add_sub_command(subparser, 'inflate', 'Inflate a logical volume') subcommand.add_argument('volume_id', help='Logical volume id', type=str) + def init_volume__replication_start(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-start', 'Start snapshot replication taken from lvol') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__replication_stop(self, subparser): + subcommand = 
self.add_sub_command(subparser, 'replication-stop', 'Stop snapshot replication taken from lvol') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__replication_status(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists replication status') + subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + def init_control_plane(self): subparser = self.add_command('control-plane', 'Control plane commands', aliases=['cp','mgmt',]) @@ -749,6 +764,7 @@ def init_snapshot(self): self.init_snapshot__delete(subparser) self.init_snapshot__clone(subparser) self.init_snapshot__replication_status(subparser) + self.init_snapshot__delete_replication_only(subparser) def init_snapshot__add(self, subparser): @@ -775,6 +791,10 @@ def init_snapshot__replication_status(self, subparser): subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists snapshots replication status') subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + def init_snapshot__delete_replication_only(self, subparser): + subcommand = self.add_sub_command(subparser, 'delete-replication-only', 'Delete replicated version of a snapshot') + subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -1071,6 +1091,12 @@ def run(self): ret = self.volume__check(sub_command, args) elif sub_command in ['inflate']: ret = self.volume__inflate(sub_command, args) + elif sub_command in ['replication-start']: + ret = self.volume__replication_start(sub_command, args) + elif sub_command in ['replication-stop']: + ret = self.volume__replication_stop(sub_command, args) + elif sub_command in ['replication-status']: + ret = self.volume__replication_status(sub_command, args) else: self.parser.print_help() @@ -1120,6 +1146,8 @@ def run(self): ret = self.snapshot__clone(sub_command, args) elif sub_command in ['replication-status']: 
ret = self.snapshot__replication_status(sub_command, args) + elif sub_command in ['delete-replication-only']: + ret = self.snapshot__delete_replication_only(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 43ca2c471..60bebece3 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -559,6 +559,15 @@ def volume__check(self, sub_command, args): def volume__inflate(self, sub_command, args): return lvol_controller.inflate_lvol(args.volume_id) + def volume__replication_start(self, sub_command, args): + return lvol_controller.volume__replication_start(args.lvol_id) + + def volume__replication_stop(self, sub_command, args): + return lvol_controller.volume__replication_stop(args.lvol_id) + + def volume__replication_status(self, sub_command, args): + return snapshot_controller.list_replication_tasks(args.cluster_id) + def control_plane__add(self, sub_command, args): cluster_id = args.cluster_id cluster_ip = args.cluster_ip @@ -641,6 +650,9 @@ def snapshot__clone(self, sub_command, args): def snapshot__replication_status(self, sub_command, args): return snapshot_controller.list_replication_tasks(args.cluster_id) + def snapshot__delete_replication_only(self, sub_command, args): + return snapshot_controller.delete_replicated(args.snapshot_id) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 11a8dcbeb..e1236d629 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1771,3 +1771,28 @@ def inflate_lvol(lvol_id): else: logger.error(f"Failed to inflate LVol: {lvol_id}") return ret + +def volume__replication_start(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + 
logger.error(e) + return False + + logger.info("Setting LVol do_replicate: True") + lvol.do_replicate = True + lvol.write_to_db() + + +def volume__replication_stop(lvol_id, delete=False): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info("Setting LVol do_replicate: False") + lvol.do_replicate = False + lvol.write_to_db() diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index bcdc66bab..1768b1147 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -254,6 +254,10 @@ def delete(snapshot_uuid, force_delete=False): logger.error(f"Snapshot not found {snapshot_uuid}") return False + if snap.status == SnapShot.STATUS_IN_REPLICATION: + logger.error(f"Snapshot is in replication") + return False + try: snode = db_controller.get_storage_node_by_id(snap.lvol.node_id) except KeyError: @@ -623,3 +627,21 @@ def list_replication_tasks(cluster_id): "Replicated on node": snap.lvol.node_id, }) return utils.print_table(data) + + +def delete_replicated(snapshot_id): + try: + snap = db_controller.get_snapshot_by_id(snapshot_id) + except KeyError: + logger.error(f"Snapshot not found {snapshot_id}") + return False + + snaps = db_controller.get_snapshots_by_node_id(snap.lvol.replication_node_id) + for sn in snaps: + if sn.snap_name == snap.snap_name: + logger.info("Deleting replicated snapshot %s", sn.uuid) + ret = delete(sn.uuid) + if not ret: + logger.error("Failed to delete snapshot %s", sn.uuid) + return False + return True diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index e726c20b9..8a74d3260 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -47,6 +47,14 @@ def process_snap_replicate_start(task, 
snapshot): return remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) + remote_lv_node = db.get_storage_node_by_id(remote_lv.node_id) + if remote_lv_node.status != StorageNode.STATUS_ONLINE: + task.function_result = "Target node is not online, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() + return + # 2 connect to it ret = snode.rpc_client().bdev_nvme_controller_list(remote_lv.top_bdev) if not ret: @@ -135,6 +143,12 @@ def process_snap_replicate_finish(task, snapshot): def task_runner(task: JobSchedule): snapshot = db.get_snapshot_by_id(task.function_params["snapshot_id"]) + if not snapshot: + task.function_result = "snapshot not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) if not snode: From d68ebf672767b88d26c83874d8821e7e1e503471 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 02:06:21 +0300 Subject: [PATCH 023/192] wip 3 --- simplyblock_core/controllers/snapshot_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 1768b1147..4bd49c0bd 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -255,7 +255,7 @@ def delete(snapshot_uuid, force_delete=False): return False if snap.status == SnapShot.STATUS_IN_REPLICATION: - logger.error(f"Snapshot is in replication") + logger.error("Snapshot is in replication") return False try: From 54cc929a7b958181b35c7cb9f449fcc8df2467e9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 14:27:51 +0300 Subject: [PATCH 024/192] wip 4 --- simplyblock_core/rpc_client.py | 7 +++++++ simplyblock_core/services/snapshot_replication.py | 15 ++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git 
a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index bd5824650..f041d3f2c 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -1274,3 +1274,10 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") + + def bdev_lvol_add_clone(self, lvol_name, child_name): + params = { + "lvol_name": lvol_name, + "child_name": child_name, + } + return self._request("bdev_lvol_add_clone", params) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 8a74d3260..8d0f2bd08 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -115,11 +115,22 @@ def process_snap_replicate_finish(task, snapshot): snapshot.status = SnapShot.STATUS_ONLINE snapshot.write_to_db() + # detach remote lvol remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) snode = db.get_storage_node_by_id(snapshot.lvol.node_id) snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) - remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + + # chain snaps + snaps = db.get_snapshots_by_node_id(remote_lv.replication_node_id) + snaps = sorted(snaps, key=lambda x: x.create_dt) + for sn in snaps: + if sn.snap_name == snapshot.snap_name: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") + remote_snode.rpc_client().bdev_lvol_add_clone(remote_lv.top_bdev, sn.snap_bdev) + break + + # convert to snapshot remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) new_snapshot = snapshot @@ -132,6 +143,8 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.blobid = remote_lv.blobid new_snapshot.created_at = int(time.time()) new_snapshot.write_to_db() + + # delete lvol object remote_lv.bdev_stack = [] remote_lv.write_to_db() lvol_controller.delete_lvol(remote_lv.get_id(), True) From 
4e59c73a55207c99ed71cee214a5095fdcbcefd4 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 14:32:28 +0300 Subject: [PATCH 025/192] wip 4 --- simplyblock_core/storage_node_ops.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 3d32dd17a..115f3afea 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1167,6 +1167,10 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, if not ret: logger.error("Failed to set pollers mask") return False + ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False # 4- start spdk framework ret = rpc_client.framework_start_init() @@ -1728,6 +1732,10 @@ def restart_storage_node( if not ret: logger.error("Failed to set pollers mask") return False + ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False # 4- start spdk framework ret = rpc_client.framework_start_init() From 296a2462a2cef9ed7927bdfdd83f7cb14a004478 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 14:56:27 +0300 Subject: [PATCH 026/192] wip 5 --- .../services/snapshot_replication.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 8d0f2bd08..70dca66d1 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -121,18 +121,30 @@ def process_snap_replicate_finish(task, snapshot): snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) remote_snode = db.get_storage_node_by_id(remote_lv.node_id) - # chain snaps + # chain snaps on primary snaps = db.get_snapshots_by_node_id(remote_lv.replication_node_id) snaps = sorted(snaps, 
key=lambda x: x.create_dt) for sn in snaps: if sn.snap_name == snapshot.snap_name: logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") - remote_snode.rpc_client().bdev_lvol_add_clone(remote_lv.top_bdev, sn.snap_bdev) + remote_snode.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, remote_lv.top_bdev) break - # convert to snapshot + # convert to snapshot on primary remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) + # chain snaps on secondary + if sec_node.status == SnapShot.STATUS_ONLINE: + for sn in snaps: + if sn.snap_name == snapshot.snap_name: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") + sec_node.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, remote_lv.top_bdev) + break + + # convert to snapshot on secondary + sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + new_snapshot = snapshot new_snapshot.uuid = str(uuid.uuid4()) new_snapshot.cluster_id = remote_snode.cluster_id From 80035ed45535352c1cfef94c7bf1bb0e31826759 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 19:24:21 +0300 Subject: [PATCH 027/192] wip 7 --- simplyblock_cli/clibase.py | 4 +-- .../controllers/lvol_controller.py | 26 ++++++++++++++++--- .../controllers/snapshot_controller.py | 23 ++++++++++------ simplyblock_core/models/snapshot.py | 2 ++ .../services/snapshot_replication.py | 11 +++++--- .../services/storage_node_monitor.py | 2 +- simplyblock_core/storage_node_ops.py | 20 ++++++++------ 7 files changed, 63 insertions(+), 25 deletions(-) diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 60bebece3..540b4e91b 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -560,10 +560,10 @@ def volume__inflate(self, sub_command, args): return lvol_controller.inflate_lvol(args.volume_id) def volume__replication_start(self, sub_command, args): - return 
lvol_controller.volume__replication_start(args.lvol_id) + return lvol_controller.replication_start(args.lvol_id) def volume__replication_stop(self, sub_command, args): - return lvol_controller.volume__replication_stop(args.lvol_id) + return lvol_controller.replication_stop(args.lvol_id) def volume__replication_status(self, sub_command, args): return snapshot_controller.list_replication_tasks(args.cluster_id) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index e1236d629..472bb616f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -10,8 +10,10 @@ from typing import List, Tuple from simplyblock_core import utils, constants -from simplyblock_core.controllers import snapshot_controller, pool_controller, lvol_events +from simplyblock_core.controllers import snapshot_controller, pool_controller, lvol_events, tasks_controller, \ + snapshot_events from simplyblock_core.db_controller import DBController +from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.lvol_model import LVol from simplyblock_core.models.storage_node import StorageNode @@ -1772,7 +1774,7 @@ def inflate_lvol(lvol_id): logger.error(f"Failed to inflate LVol: {lvol_id}") return ret -def volume__replication_start(lvol_id): +def replication_start(lvol_id): db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_id) @@ -1784,8 +1786,14 @@ def volume__replication_start(lvol_id): lvol.do_replicate = True lvol.write_to_db() + for snap in db_controller.get_snapshots(): + if snap.lvol.uuid == lvol.uuid: + if not snap.target_replicated_snap_uuid: + task = tasks_controller.add_snapshot_replication_task(snap) + if task: + snapshot_events.replication_task_created(snap) -def volume__replication_stop(lvol_id, delete=False): +def replication_stop(lvol_id, delete=False): db_controller = 
DBController() try: lvol = db_controller.get_lvol_by_id(lvol_id) @@ -1796,3 +1804,15 @@ def volume__replication_stop(lvol_id, delete=False): logger.info("Setting LVol do_replicate: False") lvol.do_replicate = False lvol.write_to_db() + + snode = db_controller.get_storage_node_by_id(lvol.node_id) + tasks = db_controller.get_job_tasks(snode.cluster_id) + + + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.status != JobSchedule.STATUS_DONE: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + if snap.lvol.uuid == lvol.uuid: + tasks_controller.cancel_task(task.uuid) + + return True \ No newline at end of file diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 4bd49c0bd..d2babfc5b 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -359,6 +359,9 @@ def delete(snapshot_uuid, force_delete=False): except KeyError: pass + if snap.target_replicated_snap_uuid: + delete_replicated(snap.uuid) + logger.info("Done") return True @@ -636,12 +639,16 @@ def delete_replicated(snapshot_id): logger.error(f"Snapshot not found {snapshot_id}") return False - snaps = db_controller.get_snapshots_by_node_id(snap.lvol.replication_node_id) - for sn in snaps: - if sn.snap_name == snap.snap_name: - logger.info("Deleting replicated snapshot %s", sn.uuid) - ret = delete(sn.uuid) - if not ret: - logger.error("Failed to delete snapshot %s", sn.uuid) - return False + try: + target_replicated_snap = db_controller.get_snapshot_by_id(snap.target_replicated_snap_uuid) + logger.info("Deleting replicated snapshot %s", target_replicated_snap.uuid) + ret = delete(target_replicated_snap.uuid) + if not ret: + logger.error("Failed to delete snapshot %s", target_replicated_snap.uuid) + return False + + except KeyError: + logger.error(f"Snapshot not found {snap.target_replicated_snap_uuid}") + 
return False + return True diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index 93becaf7b..fb7056b9c 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -30,3 +30,5 @@ class SnapShot(BaseModel): deletion_status: str = "" status: str = "" fabric: str = "tcp" + target_replicated_snap_uuid: str = "" + source_replicated_snap_uuid: str = "" \ No newline at end of file diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 70dca66d1..947eb5e92 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -111,9 +111,6 @@ def process_snap_replicate_finish(task, snapshot): task.status = JobSchedule.STATUS_DONE task.function_params["end_time"] = int(time.time()) task.write_to_db() - if snapshot.status != SnapShot.STATUS_ONLINE: - snapshot.status = SnapShot.STATUS_ONLINE - snapshot.write_to_db() # detach remote lvol remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) @@ -153,9 +150,17 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.snap_uuid = remote_lv.lvol_uuid new_snapshot.blobid = remote_lv.blobid + new_snapshot.blobid = remote_lv.blobid new_snapshot.created_at = int(time.time()) + new_snapshot.source_replicated_snap_uuid = snapshot.uuid + new_snapshot.status = SnapShot.STATUS_ONLINE new_snapshot.write_to_db() + if snapshot.status == SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_ONLINE + snapshot.target_replicated_snap_uuid = new_snapshot.uuid + snapshot.write_to_db() + # delete lvol object remote_lv.bdev_stack = [] remote_lv.write_to_db() diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index 17a7d0369..b694ed904 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ 
b/simplyblock_core/services/storage_node_monitor.py @@ -74,7 +74,7 @@ def get_next_cluster_status(cluster_id): continue online_nodes += 1 # check for jm rep tasks: - ret = node.rpc_client().jc_get_jm_status(node.jm_vuid) + ret = node.rpc_client(timeout=5).jc_get_jm_status(node.jm_vuid) if ret: for jm in ret: if ret[jm] is False: # jm is not ready (has active replication task) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 115f3afea..d9951df2f 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1167,10 +1167,6 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, if not ret: logger.error("Failed to set pollers mask") return False - ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) - if not ret: - logger.error("Failed to set pollers mask") - return False # 4- start spdk framework ret = rpc_client.framework_start_init() @@ -1180,6 +1176,12 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, rpc_client.log_set_print_level("DEBUG") + if snode.pollers_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False + # 5- set app_thread cpu mask if snode.app_thread_mask: ret = rpc_client.thread_get_stats() @@ -1732,10 +1734,6 @@ def restart_storage_node( if not ret: logger.error("Failed to set pollers mask") return False - ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) - if not ret: - logger.error("Failed to set pollers mask") - return False # 4- start spdk framework ret = rpc_client.framework_start_init() @@ -1745,6 +1743,12 @@ def restart_storage_node( rpc_client.log_set_print_level("DEBUG") + if snode.pollers_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if not ret: + logger.error("Failed to set pollers mask") + return False + # 5- set app_thread cpu mask if snode.app_thread_mask: ret = 
rpc_client.thread_get_stats() From 9b49217283d6ea775208608638d6409213727992 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 21:23:57 +0300 Subject: [PATCH 028/192] Fix lvol poller cpu mask --- simplyblock_core/models/storage_node.py | 1 + simplyblock_core/storage_node_ops.py | 11 +++++++---- simplyblock_core/utils/__init__.py | 11 +++++++++-- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 8c76d3649..eb05f30bb 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -103,6 +103,7 @@ class StorageNode(BaseNodeObject): active_tcp: bool = True active_rdma: bool = False lvol_sync_del_queue: List[str] = [] + lvol_poller_mask: str = "" def rpc_client(self, **kwargs): """Return rpc client to this node diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index d9951df2f..cd04f978e 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -887,6 +887,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, app_thread_core = node_config.get("distribution").get("app_thread_core") jm_cpu_core = node_config.get("distribution").get("jm_cpu_core") number_of_distribs = node_config.get("number_of_distribs") + lvol_poller_core = node_config.get("lvol_poller_core") + lvol_poller_mask = utils.generate_mask(lvol_poller_core) pollers_mask = utils.generate_mask(poller_cpu_cores) app_thread_mask = utils.generate_mask(app_thread_core) @@ -1103,6 +1105,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.write_to_db(kv_store) snode.app_thread_mask = app_thread_mask or "" snode.pollers_mask = pollers_mask or "" + snode.lvol_poller_mask = lvol_poller_mask or "" snode.jm_cpu_mask = jm_cpu_mask snode.alceml_cpu_index = alceml_cpu_index snode.alceml_worker_cpu_index = alceml_worker_cpu_index @@ -1176,8 +1179,8 @@ def 
add_node(cluster_id, node_addr, iface_name,data_nics_list, rpc_client.log_set_print_level("DEBUG") - if snode.pollers_mask: - ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if snode.lvol_poller_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.lvol_poller_mask) if not ret: logger.error("Failed to set pollers mask") return False @@ -1743,8 +1746,8 @@ def restart_storage_node( rpc_client.log_set_print_level("DEBUG") - if snode.pollers_mask: - ret = rpc_client.bdev_lvol_create_poller_group(snode.pollers_mask) + if snode.lvol_poller_mask: + ret = rpc_client.bdev_lvol_create_poller_group(snode.lvol_poller_mask) if not ret: logger.error("Failed to set pollers mask") return False diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index c6313802e..3851738bb 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -447,6 +447,7 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu + assigned["lvol_poller_core"] = vcpu assigned["alceml_worker_cpu_cores"] = vcpu vcpu = reserve_n(1) assigned["alceml_cpu_cores"] = vcpu @@ -455,6 +456,8 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu + vcpu = reserve_n(1) + assigned["lvol_poller_core"] = vcpu vcpus = reserve_n(1) assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(2) @@ -468,6 +471,8 @@ def reserve_n(count): assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(alceml_count) assigned["alceml_cpu_cores"] = vcpus + vcpus = reserve_n(2) + assigned["lvol_poller_core"] = vcpus dp = int(len(remaining) / 2) vcpus = reserve_n(dp) assigned["distrib_cpu_cores"] = vcpus @@ -486,7 +491,8 @@ def reserve_n(count): assigned.get("alceml_cpu_cores", []), assigned.get("alceml_worker_cpu_cores", []), assigned.get("distrib_cpu_cores", []), - assigned.get("jc_singleton_core", []) + 
assigned.get("jc_singleton_core", []), + assigned.get("lvol_poller_core", []), ) @@ -1423,7 +1429,8 @@ def regenerate_config(new_config, old_config, force=False): "alceml_cpu_cores": get_core_indexes(core_to_index, distribution[3]), "alceml_worker_cpu_cores": get_core_indexes(core_to_index, distribution[4]), "distrib_cpu_cores": get_core_indexes(core_to_index, distribution[5]), - "jc_singleton_core": get_core_indexes(core_to_index, distribution[6])} + "jc_singleton_core": get_core_indexes(core_to_index, distribution[6]), + "lvol_poller_core": get_core_indexes(core_to_index, distribution[7])} isolated_cores = old_config["nodes"][i]["isolated"] number_of_distribs = 2 From 149fcd5b350e0d9d533384c7cc2e6cb4f268efcf Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 6 Nov 2025 22:51:33 +0300 Subject: [PATCH 029/192] Fix target snap name --- .../services/snapshot_replication.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 947eb5e92..d00fcef0e 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -142,24 +142,29 @@ def process_snap_replicate_finish(task, snapshot): # convert to snapshot on secondary sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) - new_snapshot = snapshot - new_snapshot.uuid = str(uuid.uuid4()) + new_snapshot_uuid = str(uuid.uuid4()) + + if snapshot.status == SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_ONLINE + snapshot.target_replicated_snap_uuid = new_snapshot_uuid + snapshot.write_to_db() + + new_snapshot = SnapShot() + new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id new_snapshot.lvol = remote_lv new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.snap_uuid = remote_lv.lvol_uuid - new_snapshot.blobid = 
remote_lv.blobid + new_snapshot.size = snapshot.size + new_snapshot.used_size = snapshot.used_size + new_snapshot.snap_name = snapshot.snap_name new_snapshot.blobid = remote_lv.blobid new_snapshot.created_at = int(time.time()) new_snapshot.source_replicated_snap_uuid = snapshot.uuid new_snapshot.status = SnapShot.STATUS_ONLINE new_snapshot.write_to_db() - if snapshot.status == SnapShot.STATUS_IN_REPLICATION: - snapshot.status = SnapShot.STATUS_ONLINE - snapshot.target_replicated_snap_uuid = new_snapshot.uuid - snapshot.write_to_db() # delete lvol object remote_lv.bdev_stack = [] From c0b869f21909269822a47a19b100836c27c501b9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 02:24:37 +0300 Subject: [PATCH 030/192] fix poller mask --- simplyblock_core/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 3851738bb..bf20c6c53 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1563,7 +1563,8 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a "alceml_worker_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][4]), "distrib_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][5]), - "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]) + "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]), + "lvol_poller_core": get_core_indexes(core_group["lvol_poller_core"], core_group["distribution"][7]) }, "ssd_pcis": [], "nic_ports": system_info[nid]["nics"] From c6d7d1aea0f2a77213ec3c3ef1d3ecbbb6f262c0 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 03:30:45 +0300 Subject: [PATCH 031/192] fix poller mask --- simplyblock_core/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index bf20c6c53..6f7c8e5c3 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1564,7 +1564,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a core_group["distribution"][4]), "distrib_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][5]), "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]), - "lvol_poller_core": get_core_indexes(core_group["lvol_poller_core"], core_group["distribution"][7]) + "lvol_poller_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][7]) }, "ssd_pcis": [], "nic_ports": system_info[nid]["nics"] From 9900a6a7706240d1f852cf617878b6e655708ae2 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 04:11:43 +0300 Subject: [PATCH 032/192] fix poller mask --- simplyblock_core/storage_node_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index cd04f978e..af485d4b9 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -887,7 +887,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, app_thread_core = node_config.get("distribution").get("app_thread_core") jm_cpu_core = node_config.get("distribution").get("jm_cpu_core") number_of_distribs = node_config.get("number_of_distribs") - lvol_poller_core = node_config.get("lvol_poller_core") + lvol_poller_core = node_config.get("distribution").get("lvol_poller_core") lvol_poller_mask = utils.generate_mask(lvol_poller_core) pollers_mask = utils.generate_mask(poller_cpu_cores) From fa4bfb7d4be38e5d66eeb26d21dab08151c9476b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 07:48:43 +0300 Subject: [PATCH 033/192] fix chain --- simplyblock_core/controllers/snapshot_controller.py | 8 
++++++-- simplyblock_core/db_controller.py | 6 ++++-- simplyblock_core/rpc_client.py | 2 +- simplyblock_core/services/snapshot_replication.py | 5 ++--- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index d2babfc5b..7b77e4f40 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -607,7 +607,11 @@ def list_replication_tasks(cluster_id): for task in tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) - snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + duration = "" try: if task.status == JobSchedule.STATUS_RUNNING: @@ -617,7 +621,7 @@ def list_replication_tasks(cluster_id): task.function_params["end_time"] - task.function_params["start_time"]) except Exception as e: logger.error(e) - offset = "" + offset = 0 if "offset" in task.function_params: offset = task.function_params["offset"] data.append({ diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 277d1b68a..ba832e655 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -258,7 +258,9 @@ def get_events(self, event_id=" ", limit=0, reverse=False) -> List[EventObj]: return EventObj().read_from_db(self.kv_store, id=event_id, limit=limit, reverse=reverse) def get_job_tasks(self, cluster_id, reverse=True, limit=0) -> List[JobSchedule]: - return JobSchedule().read_from_db(self.kv_store, id=cluster_id, reverse=reverse, limit=limit) + ret = JobSchedule().read_from_db(self.kv_store, id=cluster_id, reverse=reverse, limit=limit) + return sorted(ret, key=lambda x: x.date) + def get_task_by_id(self, task_id) -> JobSchedule: for task in self.get_job_tasks(" "): @@ -272,7 +274,7 @@ def 
get_snapshots_by_node_id(self, node_id) -> List[SnapShot]: for snap in snaps: if snap.lvol.node_id == node_id: ret.append(snap) - return ret + return sorted(ret, key=lambda x: x.create_dt) def get_snode_size(self, node_id) -> int: snode = self.get_storage_node_by_id(node_id) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index f041d3f2c..2ccdfa1a4 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -109,7 +109,7 @@ def _request2(self, method, params=None): if params: payload['params'] = params try: - logger.debug("Requesting method: %s, params: %s", method, params) + logger.debug("Requesting from: %s, method: %s, params: %s",self.ip_address, method, params) response = self.session.post(self.url, data=json.dumps(payload), timeout=self.timeout) except Exception as e: logger.error(e) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index d00fcef0e..645fe5737 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -120,7 +120,6 @@ def process_snap_replicate_finish(task, snapshot): # chain snaps on primary snaps = db.get_snapshots_by_node_id(remote_lv.replication_node_id) - snaps = sorted(snaps, key=lambda x: x.create_dt) for sn in snaps: if sn.snap_name == snapshot.snap_name: logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") @@ -132,9 +131,9 @@ def process_snap_replicate_finish(task, snapshot): sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) # chain snaps on secondary - if sec_node.status == SnapShot.STATUS_ONLINE: + if sec_node.status == StorageNode.STATUS_ONLINE: for sn in snaps: - if sn.snap_name == snapshot.snap_name: + if sn.lvol.get_id() == snapshot.lvol.get_id(): logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") sec_node.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, 
remote_lv.top_bdev) break From 4781a770fa82a1291d416bee1891986bd622638d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 07:55:34 +0300 Subject: [PATCH 034/192] fix chain --- simplyblock_core/services/snapshot_replication.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 645fe5737..060f08a97 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -119,9 +119,9 @@ def process_snap_replicate_finish(task, snapshot): remote_snode = db.get_storage_node_by_id(remote_lv.node_id) # chain snaps on primary - snaps = db.get_snapshots_by_node_id(remote_lv.replication_node_id) + snaps = db.get_snapshots_by_node_id(remote_lv.node_id) for sn in snaps: - if sn.snap_name == snapshot.snap_name: + if sn.lvol.get_id() == snapshot.lvol.get_id(): logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") remote_snode.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, remote_lv.top_bdev) break From cd6cab758696d219ae79ab906144ef959646b9d4 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 08:02:55 +0300 Subject: [PATCH 035/192] fix chain --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 060f08a97..3c944b206 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -151,7 +151,7 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot = SnapShot() new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id - new_snapshot.lvol = remote_lv + new_snapshot.lvol = snapshot.lvol new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev 
new_snapshot.snap_uuid = remote_lv.lvol_uuid From 3c9dd460279736d990f451de5b96e8b0d3f088b7 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 08:28:16 +0300 Subject: [PATCH 036/192] fix chain --- .../services/snapshot_replication.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 3c944b206..98640745f 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -122,9 +122,13 @@ def process_snap_replicate_finish(task, snapshot): snaps = db.get_snapshots_by_node_id(remote_lv.node_id) for sn in snaps: if sn.lvol.get_id() == snapshot.lvol.get_id(): - logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") - remote_snode.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, remote_lv.top_bdev) - break + try: + target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") + remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + break + except KeyError: + logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") # convert to snapshot on primary remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) @@ -134,9 +138,13 @@ def process_snap_replicate_finish(task, snapshot): if sec_node.status == StorageNode.STATUS_ONLINE: for sn in snaps: if sn.lvol.get_id() == snapshot.lvol.get_id(): - logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") - sec_node.rpc_client().bdev_lvol_add_clone(sn.snap_bdev, remote_lv.top_bdev) - break + try: + target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") + 
sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + break + except KeyError: + logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") # convert to snapshot on secondary sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) @@ -151,7 +159,7 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot = SnapShot() new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id - new_snapshot.lvol = snapshot.lvol + new_snapshot.lvol = remote_lv new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.snap_uuid = remote_lv.lvol_uuid From f46047976d2158509e23ef0bd3e0db8b1bdd17c2 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 08:35:05 +0300 Subject: [PATCH 037/192] fix chain --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 98640745f..f61bea819 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -159,7 +159,7 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot = SnapShot() new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id - new_snapshot.lvol = remote_lv + new_snapshot.lvol = snapshot.lvol new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.snap_uuid = remote_lv.lvol_uuid From 659857d620d62f2c3a328d4903ee671782d5d828 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 7 Nov 2025 08:54:50 +0300 Subject: [PATCH 038/192] fix chain --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index f61bea819..8100c80f5 
100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -119,7 +119,7 @@ def process_snap_replicate_finish(task, snapshot): remote_snode = db.get_storage_node_by_id(remote_lv.node_id) # chain snaps on primary - snaps = db.get_snapshots_by_node_id(remote_lv.node_id) + snaps = db.get_snapshots() for sn in snaps: if sn.lvol.get_id() == snapshot.lvol.get_id(): try: From ad546ca5fe667a74a5559109fb0e7c58d3a707b0 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Tue, 11 Nov 2025 10:04:15 +0300 Subject: [PATCH 039/192] Enable ndcs and npcs when creating lvol (#729) --- simplyblock_core/rpc_client.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 62f37b1e9..66ef478f8 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -379,11 +379,11 @@ def create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0 "clear_method": "unmap", "lvol_priority_class": lvol_priority_class, } - # if ndcs or npcs: - # params.update({ - # 'ndcs' : ndcs, - # 'npcs' : npcs, - # }) + if ndcs or npcs: + params.update({ + 'ndcs' : ndcs, + 'npcs' : npcs, + }) return self._request("bdev_lvol_create", params) def delete_lvol(self, name, del_async=False): From 5f6382b008c437d67b8e92d9a7324a1391b3dd7a Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Tue, 11 Nov 2025 11:05:47 +0300 Subject: [PATCH 040/192] Fix sfam-2450 cluster update issues (#726) - set cluster mode to default "docker" - remove service "app_CachingNodeMonitor" from services during cluster update --- simplyblock_core/cluster_ops.py | 10 +++++++--- simplyblock_core/models/cluster.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index fbe91a58f..ff07e6634 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1174,9 +1174,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, for service in cluster_docker.services.list(): if image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \ "simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']: - logger.info(f"Updating service {service.name}") - service.update(image=service_image, force_update=True) - service_names.append(service.attrs['Spec']['Name']) + if service.name == "app_CachingNodeMonitor": + logger.info(f"Removing service {service.name}") + service.remove() + else: + logger.info(f"Updating service {service.name}") + service.update(image=service_image, force_update=True) + service_names.append(service.attrs['Spec']['Name']) if "app_SnapshotMonitor" not in service_names: logger.info("Creating snapshot monitor service") diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index fd4802771..620309f77 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -45,7 +45,7 @@ class Cluster(BaseModel): distr_npcs: int = 0 enable_node_affinity: bool = False grafana_endpoint: str = "" - mode: str = "" + mode: str = "docker" grafana_secret: str = "" contact_point: str = "" ha_type: str = "single" From 4a6a4d70dc771fc502fdd94b501c93b1bfaba75f Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Tue, 11 Nov 2025 09:24:24 +0100 Subject: [PATCH 
041/192] Update Dockerfile_base (#730) * Update Dockerfile_base * Update Dockerfile_base --- docker/Dockerfile_base | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base index 226188c96..201d92759 100644 --- a/docker/Dockerfile_base +++ b/docker/Dockerfile_base @@ -38,3 +38,5 @@ RUN pip3 install setuptools --upgrade COPY requirements.txt requirements.txt RUN pip3 install -r requirements.txt + +RUN rm -rf /usr/share/terminfo/ From bf56cb67efb05db6aada28bf7d05b42268a8acc8 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 12 Nov 2025 19:22:03 +0100 Subject: [PATCH 042/192] inherit default cluster mode in new cluster (#733) * inherit default cluster mode in new cluster * added first six cluster id char to spdk pod name * added first six cluster id char to spdk pod name * update env_var --- simplyblock_core/cluster_ops.py | 1 + simplyblock_core/env_var | 2 +- simplyblock_core/snode_client.py | 8 ++++--- simplyblock_core/storage_node_ops.py | 10 ++++----- simplyblock_core/utils/__init__.py | 8 ++++++- .../api/internal/storage_node/docker.py | 1 + .../api/internal/storage_node/kubernetes.py | 21 ++++++++++++------- .../templates/storage_deploy_spdk.yaml.j2 | 2 +- simplyblock_web/utils.py | 1 + 9 files changed, 35 insertions(+), 19 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index ff07e6634..536546eab 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -457,6 +457,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.strict_node_anti_affinity = strict_node_anti_affinity default_cluster = clusters[0] + cluster.mode = default_cluster.mode cluster.db_connection = default_cluster.db_connection cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret cluster.grafana_endpoint = default_cluster.grafana_endpoint diff --git 
a/simplyblock_core/env_var b/simplyblock_core/env_var index e1d2e2f8b..cf8093b7c 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.24 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inherit_default_cluster_mode SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 2e8504b08..5e5f66f60 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -81,7 +81,7 @@ def info(self): def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None, fdb_connection=None, namespace=None, server_ip=None, rpc_port=None, rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None, - total_mem=None, system_mem=None, cluster_mode=None): + total_mem=None, system_mem=None, cluster_mode=None, cluster_id=None): params = { "cluster_ip": cluster_ip, "server_ip": server_ip, @@ -113,6 +113,8 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None params["system_mem"] = system_mem if cluster_mode: params["cluster_mode"] = cluster_mode + if cluster_id: + params["cluster_id"] = cluster_id return self._request("POST", "spdk_process_start", params) def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): @@ -124,8 +126,8 @@ def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): # "db_connection": db_connection} # return self._request("POST", "join_swarm", params) - def spdk_process_kill(self, rpc_port): - return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port}) + def spdk_process_kill(self, rpc_port, cluster_id=None): + return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port, "cluster_id": cluster_id}) def 
leave_swarm(self): return True diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 3d32dd17a..162f0dd1a 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -998,7 +998,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=cluster_id) time.sleep(5) except Exception as e: @@ -1454,7 +1454,7 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False): if health_controller._check_node_api(snode.mgmt_ip): logger.info("Stopping SPDK container") snode_api = SNodeClient(snode.api_endpoint, timeout=20) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) snode_api.leave_swarm() pci_address = [] for dev in snode.nvme_devices: @@ -1676,7 +1676,7 @@ def restart_storage_node( snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection, snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode) + ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=snode.cluster_id) except Exception as e: logger.error(e) @@ -2250,7 +2250,7 @@ def shutdown_storage_node(node_id, force=False): logger.info("Stopping SPDK") try: - SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port) + 
SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port, snode.cluster_id) except SNodeClientException: logger.error('Failed to kill SPDK') return False @@ -3214,7 +3214,7 @@ def recreate_lvstore(snode, force=False): def _kill_app(): storage_events.snode_restart_failed(snode) snode_api = SNodeClient(snode.api_endpoint, timeout=5, retry=5) - snode_api.spdk_process_kill(snode.rpc_port) + snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id) set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE) # If LVol Store recovery failed then stop spdk process diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 941414708..0892db54a 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -725,7 +725,13 @@ def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> in raw = size / (base ** exponent) return math.ceil(raw) if round_up else int(raw) - +def first_six_chars(s: str) -> str: + """ + Returns the first six characters of a given string. + If the string is shorter than six characters, returns the entire string. 
+ """ + return s[:6] + def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 8e18fc276..d1ee4f9f0 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -142,6 +142,7 @@ class SPDKParams(BaseModel): spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN) cluster_mode: str + cluster_id: str @api.post('/spdk_process_start', responses={ diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index be3193138..56b4ca563 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -268,6 +268,7 @@ class SPDKParams(BaseModel): spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: str = Field(pattern=utils.IP_PATTERN) cluster_mode: str + cluster_id: str @api.post('/spdk_process_start', responses={ @@ -286,9 +287,10 @@ def spdk_process_start(body: SPDKParams): total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MB') if body.total_mem else "" - if _is_pod_up(body.rpc_port) or _is_pod_present(body.rpc_port): + first_six_cluster_id = core_utils.first_six_chars(body.cluster_id) + if _is_pod_up(body.rpc_port, first_six_cluster_id) or _is_pod_present(body.rpc_port, first_six_cluster_id): logger.info("SPDK pod found, removing...") - query = utils.RPCPortParams(rpc_port=body.rpc_port) + query = utils.RPCPortParams(rpc_port=body.rpc_port, cluster_id=body.cluster_id) spdk_process_kill(query) node_prepration_job_name = "snode-spdk-job-" @@ -351,6 +353,7 @@ def spdk_process_start(body: SPDKParams): 'SIMPLYBLOCK_DOCKER_IMAGE': constants.SIMPLY_BLOCK_DOCKER_IMAGE, 
'GRAYLOG_SERVER_IP': body.cluster_ip, 'MODE': body.cluster_mode, + 'CLUSTER_ID': first_six_cluster_id, 'SSD_PCIE': ssd_pcie_params, 'PCI_ALLOWED': ssd_pcie_list, 'TOTAL_HP': total_mem_mib @@ -463,7 +466,8 @@ def spdk_process_kill(query: utils.RPCPortParams): k8s_core_v1 = core_utils.get_k8s_core_client() try: namespace = node_utils_k8s.get_namespace() - pod_name = f"snode-spdk-pod-{query.rpc_port}" + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace) retries = 10 while retries > 0: @@ -486,9 +490,9 @@ def spdk_process_kill(query: utils.RPCPortParams): return utils.get_response(True) -def _is_pod_up(rpc_port): +def _is_pod_up(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -502,9 +506,9 @@ def _is_pod_up(rpc_port): return False return False -def _is_pod_present(rpc_port): +def _is_pod_present(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() - pod_name = f"snode-spdk-pod-{rpc_port}" + pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: @@ -525,7 +529,8 @@ def _is_pod_present(rpc_port): })}}}, }) def spdk_process_is_up(query: utils.RPCPortParams): - if _is_pod_up(query.rpc_port): + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) + if _is_pod_up(query.rpc_port, first_six_cluster_id): return utils.get_response(True) else: return utils.get_response(False, "SPDK container is not running") diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index f10478c75..e49aca2e2 100644 --- 
a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: snode-spdk-pod-{{ RPC_PORT }} + name: snode-spdk-pod-{{ RPC_PORT }}-{{ CLUSTER_ID }} namespace: {{ NAMESPACE }} labels: app: spdk-app-{{ RPC_PORT }} diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py index b0d1795df..27ff2ce18 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -149,6 +149,7 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) + cluster_id: str class DeviceParams(BaseModel): From 0e72282a7ea442e8a5b1d8ce4ecf882edbc1a1c0 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Wed, 12 Nov 2025 21:49:16 +0300 Subject: [PATCH 043/192] Update environment variables for Simply Block (#737) --- simplyblock_core/env_var | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index cf8093b7c..fe494ca34 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.24 +SIMPLY_BLOCK_VERSION=19.2.25 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inherit_default_cluster_mode +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest From 25e3dd29b5fa5b345e73cb905cf169fc021d6eff Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 13 Nov 2025 09:44:28 +0300 Subject: [PATCH 044/192] Main lvol sync delete (#734) * Add lvol sync delete task runner * fix linter issues * fix --- simplyblock_core/cluster_ops.py | 12 +++ .../controllers/tasks_controller.py | 23 ++++++ simplyblock_core/env_var | 2 +- simplyblock_core/models/job_schedule.py | 1 + simplyblock_core/models/storage_node.py | 1 - .../scripts/docker-compose-swarm.yml | 14 ++++ simplyblock_core/services/lvol_monitor.py | 18 +---- simplyblock_core/services/snapshot_monitor.py | 5 +- .../services/tasks_runner_port_allow.py | 22 ++---- .../services/tasks_runner_sync_lvol_del.py | 77 +++++++++++++++++++ 10 files changed, 140 insertions(+), 35 deletions(-) create mode 100644 simplyblock_core/services/tasks_runner_sync_lvol_del.py diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 536546eab..dc429b8f9 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1194,6 +1194,18 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, networks=["host"], constraints=["node.role == manager"] ) + + if "app_TasksRunnerLVolSyncDelete" not in service_names: + logger.info("Creating lvol sync delete service") + cluster_docker.services.create( + image=service_image, + command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py", + name="app_TasksRunnerLVolSyncDelete", + mounts=["/etc/foundationdb:/etc/foundationdb"], + env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], + networks=["host"], + constraints=["node.role == manager"] + ) logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 689027d08..b7c434f63 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -70,6 +70,11 @@ def _add_task(function_name, cluster_id, node_id, device_id, if task_id: 
logger.info(f"Task found, skip adding new task: {task_id}") return False + elif function_name == JobSchedule.FN_LVOL_SYNC_DEL: + task_id = get_lvol_sync_del_task(cluster_id, node_id, function_params['lvol_bdev_name']) + if task_id: + logger.info(f"Task found, skip adding new task: {task_id}") + return False task_obj = JobSchedule() task_obj.uuid = str(uuid.uuid4()) @@ -386,3 +391,21 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0): if jm_vuid and "jm_vuid" in task.function_params and task.function_params["jm_vuid"] == jm_vuid: return task.uuid return False + + +def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name): + return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "", + function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10) + +def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id : + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + if lvol_bdev_name: + if "lvol_bdev_name" in task.function_params and task.function_params["lvol_bdev_name"] == lvol_bdev_name: + return task.uuid + else: + return task.uuid + return False + diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index fe494ca34..468ba7a02 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.25 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-lvol-sync-delete SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index 3d87a9aca..bbdcd7871 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -22,6 +22,7 @@ class 
JobSchedule(BaseModel): FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem" FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" FN_JC_COMP_RESUME = "jc_comp_resume" + FN_LVOL_SYNC_DEL = "lvol_sync_del" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 8c76d3649..81639c556 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -102,7 +102,6 @@ class StorageNode(BaseNodeObject): hublvol: HubLVol = None # type: ignore[assignment] active_tcp: bool = True active_rdma: bool = False - lvol_sync_del_queue: List[str] = [] def rpc_client(self, **kwargs): """Return rpc client to this node diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index ba0f8b61d..fd79f43c1 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -349,6 +349,20 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + TasksRunnerLVolSyncDelete: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/tasks_runner_sync_lvol_del.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py index 884b67396..8486f3a32 100644 --- a/simplyblock_core/services/lvol_monitor.py +++ b/simplyblock_core/services/lvol_monitor.py @@ -132,8 +132,7 @@ def process_lvol_delete_finish(lvol): sec_node = db.get_storage_node_by_id(snode.get_id()) if sec_node: - sec_node.lvol_sync_del_queue.append(f"{lvol.lvs_name}/{lvol.lvol_bdev}") - sec_node.write_to_db() + tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, 
sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}") lvol_events.lvol_delete(lvol) lvol.remove(db.kv_store) @@ -349,19 +348,6 @@ def process_lvol_delete_try_again(lvol): present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) set_snapshot_health_check(snap, present) - snode = db.get_storage_node_by_id(snode.get_id()) - if snode.status == StorageNode.STATUS_ONLINE: - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error(f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() + time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py index c82476e7b..a99ed89f3 100644 --- a/simplyblock_core/services/snapshot_monitor.py +++ b/simplyblock_core/services/snapshot_monitor.py @@ -5,7 +5,7 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.cluster import Cluster -from simplyblock_core.controllers import health_controller, snapshot_events +from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.rpc_client import RPCClient @@ -76,8 +76,7 @@ def process_snap_delete_finish(snap, leader_node): non_leader = db.get_storage_node_by_id(non_leader_id) if non_leader: - non_leader.lvol_sync_del_queue.append(snap.snap_bdev) - non_leader.write_to_db() + tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev) 
snapshot_events.snapshot_delete(snap) snap.remove(db.kv_store) diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index a39de42ab..e95dbdf94 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -3,7 +3,7 @@ from simplyblock_core import db_controller, utils, storage_node_ops, distr_controller -from simplyblock_core.controllers import tcp_ports_events, health_controller +from simplyblock_core.controllers import tcp_ports_events, health_controller, tasks_controller from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster @@ -196,19 +196,13 @@ task.status = JobSchedule.STATUS_RUNNING task.write_to_db(db.kv_store) - not_deleted = [] - for bdev_name in snode.lvol_sync_del_queue: - logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}") - ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: - logger.error( - f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}") - not_deleted.append(bdev_name) - snode.lvol_sync_del_queue = not_deleted - snode.write_to_db() + # wait for lvol sync delete + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + while lvol_sync_del_found: + logger.info("Lvol sync delete task found, waiting") + can_continue = False + time.sleep(3) + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: sec_rpc_client = sec_node.rpc_client() diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py new file mode 100644 index 
000000000..fbf0c1ee4 --- /dev/null +++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py @@ -0,0 +1,77 @@ +# coding=utf-8 +import time + + +from simplyblock_core import db_controller, utils +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.cluster import Cluster +from simplyblock_core.models.storage_node import StorageNode + +logger = utils.get_logger(__name__) + +# get DB controller +db = db_controller.DBController() + + +logger.info("Starting Tasks runner...") +while True: + + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + for cl in clusters: + if cl.status == Cluster.STATUS_IN_ACTIVATION: + continue + + tasks = db.get_job_tasks(cl.get_id(), reverse=False) + for task in tasks: + + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL: + if task.status != JobSchedule.STATUS_DONE: + + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + node = db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + lvol_bdev_name = task.function_params["lvol_bdev_name"] + + logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True) + if not ret: + if "code" in err and err["code"] == -19: + 
logger.error(f"Sync delete completed with error: {err}") + else: + logger.error( + f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + + task.function_result = f"bdev {lvol_bdev_name} deleted" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + time.sleep(3) From cd68c603133ccef6709fc792acb52b648bdca009 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 13 Nov 2025 09:56:35 +0100 Subject: [PATCH 045/192] added fdb multi AZ support (#736) * added fdb and storageclass multi AZ support * use ipv4 for fdb controller manager healthcheck * updated fdb controller manager resource name --- simplyblock_core/constants.py | 3 +- .../charts/templates/foundationdb.yaml | 45 ++++++++++++------- .../charts/templates/storage_class.yaml | 10 ++++- simplyblock_core/scripts/charts/values.yaml | 8 +++- 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index 41824c73a..d82275954 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -133,7 +133,8 @@ def get_config_var(name, default=None): LVOL_NVME_CONNECT_NR_IO_QUEUES=3 LVOL_NVME_KEEP_ALIVE_TO=10 LVOL_NVME_KEEP_ALIVE_TO_TCP=7 -LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100)) +LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "") +LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100 QPAIR_COUNT=32 CLIENT_QPAIR_COUNT=3 NVME_TIMEOUT_US=8000000 diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index 1a3134e58..ddcdf9e92 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -2,20 +2,20 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager labels: - control-plane: controller-manager - app: 
controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: selector: matchLabels: - app: controller-manager + app: simplyblock-fdb-controller-manager replicas: 1 template: metadata: labels: - control-plane: controller-manager - app: controller-manager + control-plane: simplyblock-fdb-controller-manager + app: simplyblock-fdb-controller-manager spec: securityContext: runAsUser: 4059 @@ -28,7 +28,7 @@ spec: emptyDir: {} - name: fdb-binaries emptyDir: {} - serviceAccountName: controller-manager + serviceAccountName: simplyblock-fdb-controller-manager initContainers: - name: foundationdb-kubernetes-init-7-3 image: foundationdb/fdb-kubernetes-monitor:7.3.63 @@ -51,6 +51,8 @@ spec: containers: - command: - /manager + args: + - "--health-probe-bind-address=:9443" image: foundationdb/fdb-kubernetes-operator:v2.13.0 name: manager env: @@ -86,13 +88,13 @@ spec: apiVersion: v1 kind: ServiceAccount metadata: - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: manager-role + name: simplyblock-fdb-manager-role rules: - apiGroups: - "" @@ -164,7 +166,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: creationTimestamp: null - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole rules: - apiGroups: - "" @@ -179,27 +181,27 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: creationTimestamp: null - name: manager-rolebinding + name: simplyblock-fdb-manager-rolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-role + name: simplyblock-fdb-manager-role subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: creationTimestamp: null - name: manager-clusterrolebinding + name: 
simplyblock-fdb-manager-clusterrolebinding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: manager-clusterrole + name: simplyblock-fdb-manager-clusterrole subjects: - kind: ServiceAccount - name: controller-manager + name: simplyblock-fdb-controller-manager namespace: metadata.namespace ##### cluster file ################# @@ -213,7 +215,11 @@ spec: replacements: enabled: true faultDomain: + {{- if .Values.foundationdb.multiAZ }} + key: topology.kubernetes.io/zone + {{- else }} key: foundationdb.org/none + {{- end }} imageType: split labels: filterOnOwnerReference: false @@ -225,10 +231,17 @@ spec: - foundationdb.org/fdb-process-group-id minimumUptimeSecondsForBounce: 60 processCounts: + {{- if .Values.foundationdb.multiAZ }} + cluster_controller: 1 + log: 4 + storage: 4 + stateless: -1 + {{- else }} cluster_controller: 1 log: 3 storage: 3 stateless: -1 + {{- end }} processes: general: customParameters: diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml index 64e5e6280..9b6a2c9ce 100644 --- a/simplyblock_core/scripts/charts/templates/storage_class.yaml +++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml @@ -7,4 +7,12 @@ provisioner: openebs.io/local allowVolumeExpansion: true reclaimPolicy: Retain volumeBindingMode: WaitForFirstConsumer - +{{- if .Values.storageclass.allowedTopologyZones }} +allowedTopologies: +- matchLabelExpressions: + - key: topology.kubernetes.io/zone + values: +{{- range .Values.storageclass.allowedTopologyZones }} + - {{ . 
}} +{{- end }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 467734176..994e9a21f 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -24,10 +24,16 @@ image: ports: lvolNvmfPortStart: - + +storageclass: + allowedTopologyZones: [] + openebs: enabled: true +foundationdb: + multiAZ: false + mongodb: name: "simplyblock-mongodb" deployment_name: "simplyblock-mongodb" From 1c38b6eead3cd1fc0d59b6bd7255955016783ef8 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Fri, 14 Nov 2025 13:34:42 +0100 Subject: [PATCH 046/192] increased k8s fdb memory limit (#740) --- .../scripts/charts/templates/foundationdb.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index ddcdf9e92..a3b2d8ccb 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -298,10 +298,10 @@ spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi securityContext: runAsUser: 0 affinity: @@ -321,10 +321,10 @@ spec: resources: limits: cpu: 500m - memory: 2Gi + memory: 4Gi requests: cpu: 100m - memory: 512Mi + memory: 1Gi securityContext: runAsUser: 0 affinity: From 5d9e0a47e330e5e5d0a3a7ccd980f09978c2535d Mon Sep 17 00:00:00 2001 From: noctarius aka Christoph Engelbert Date: Fri, 14 Nov 2025 14:17:24 +0100 Subject: [PATCH 047/192] Added MIT License (#742) --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..37d1834ca --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2025 simplyblock GmbH + +Permission is hereby granted, free of charge, to any person 
obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From ee8d4605b491af786457794d76b330e152a8074a Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Sat, 15 Nov 2025 12:28:16 +0300 Subject: [PATCH 048/192] Update constants.py (#744) --- simplyblock_core/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index d82275954..30902d839 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -225,4 +225,4 @@ def get_config_var(name, default=None): qos_class_meta_and_migration_weight_percent = 25 -MIG_PARALLEL_JOBS = 16 \ No newline at end of file +MIG_PARALLEL_JOBS = 64 From 2b144912434668ce3502e5386af75828f997460a Mon Sep 17 00:00:00 2001 From: schmidt-scaled Date: Sat, 15 Nov 2025 12:50:21 +0300 Subject: [PATCH 049/192] set size of lvstore cluster in constants (as ratio to distrib page size) --- simplyblock_core/constants.py | 2 +- simplyblock_core/rpc_client.py | 4 ++-- 
simplyblock_core/services/tasks_runner_failed_migration.py | 2 +- simplyblock_core/services/tasks_runner_migration.py | 2 +- simplyblock_core/services/tasks_runner_new_dev_migration.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index d82275954..36ba14a9e 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -225,4 +225,4 @@ def get_config_var(name, default=None): qos_class_meta_and_migration_weight_percent = 25 -MIG_PARALLEL_JOBS = 16 \ No newline at end of file +MIG_PARALLEL_JOBS = 64 \ No newline at end of file diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 66ef478f8..ce48e1796 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -922,7 +922,7 @@ def distr_migration_status(self, name): params = {"name": name} return self._request("distr_migration_status", params) - def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=64, jobs=64): params = { "name": name, "storage_ID": storage_ID, @@ -935,7 +935,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals params["jobs"] = jobs return self._request("distr_migration_failure_start", params) - def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=1024, jobs=4): + def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=64, jobs=64): params = { "name": name, } diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py index fce4fd8ef..7d0b3e89f 100644 --- a/simplyblock_core/services/tasks_runner_failed_migration.py +++ b/simplyblock_core/services/tasks_runner_failed_migration.py @@ -88,7 +88,7 @@ def task_runner(task): if 
db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True rsp = rpc_client.distr_migration_failure_start( - distr_name, device.cluster_device_order, qos_high_priority, job_size=1024, jobs=constants.MIG_PARALLEL_JOBS) + distr_name, device.cluster_device_order, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py index fb085e4aa..e325e3d7e 100644 --- a/simplyblock_core/services/tasks_runner_migration.py +++ b/simplyblock_core/services/tasks_runner_migration.py @@ -93,7 +93,7 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py index f62a7f210..9feec7a56 100644 --- a/simplyblock_core/services/tasks_runner_new_dev_migration.py +++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py @@ -98,7 +98,7 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024, + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) if not rsp: logger.error(f"Failed to start device 
migration task, storage_ID: {device.cluster_device_order}") From 314c4cfe60cfaf11c3dafb8e856f94bd17940878 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 17 Nov 2025 11:21:31 +0100 Subject: [PATCH 050/192] Update sc name (#746) * migrated to k8s csi hostpath * added more permission * added more permission * updated talos docs --- docs/talos.md | 14 -- simplyblock_core/scripts/charts/Chart.yaml | 5 - .../templates/csi-hostpath-controller.yaml | 217 ++++++++++++++++++ .../templates/csi-hostpath-driverinfo.yaml | 24 ++ .../charts/templates/csi-hostpath-node.yaml | 163 +++++++++++++ .../charts/templates/foundationdb.yaml | 2 +- .../scripts/charts/templates/mongodb.yaml | 4 +- .../charts/templates/storage_class.yaml | 9 +- .../scripts/charts/values-template.yaml | 194 ---------------- simplyblock_core/scripts/charts/values.yaml | 7 +- 10 files changed, 416 insertions(+), 223 deletions(-) create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml delete mode 100644 simplyblock_core/scripts/charts/values-template.yaml diff --git a/docs/talos.md b/docs/talos.md index 47ff817d5..f1406ef38 100644 --- a/docs/talos.md +++ b/docs/talos.md @@ -19,26 +19,12 @@ kubectl label namespace simplyblock \ --overwrite ``` - -Patch the host machine so that OpenEBS could work - Create a machine config patch with the contents below and save as patch.yaml ``` cat > patch.yaml <<'EOF' machine: sysctls: vm.nr_hugepages: "1024" - nodeLabels: - openebs.io/engine: mayastor - kubelet: - extraMounts: - - destination: /var/openebs/local - type: bind - source: /var/openebs/local - options: - - rbind - - rshared - - rw EOF talosctl -e -n patch mc -p @patch.yaml diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 9d1b62643..380f67bcd 
100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -26,11 +26,6 @@ dependencies: version: "25.18.0" repository: "https://prometheus-community.github.io/helm-charts" condition: monitoring.enabled - - name: openebs - version: 3.9.0 - repository: https://openebs.github.io/charts - alias: openebs - condition: openebs.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml new file mode 100644 index 000000000..153c29bda --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml @@ -0,0 +1,217 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpathplugin +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["get", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csistoragecapacities"] + verbs: ["get", "list", "watch", "create", "update", "delete"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch", "update", "get", "list", "watch"] + - apiGroups: [""] 
+ resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpathplugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: csi-hostpathplugin +subjects: + - kind: ServiceAccount + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +kind: StatefulSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + serviceName: "csi-hostpathplugin" + # One replica only: + # Host path driver only works when everything runs + # on a single node. + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + serviceAccountName: csi-hostpathplugin-sa + containers: + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0 + args: + - "--drivername=hostpath.csi.k8s.io" + - "--v=5" + - "--endpoint=$(CSI_ENDPOINT)" + - "--nodeid=$(KUBE_NODE_NAME)" + # end hostpath args + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: 
/var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0 + args: + - --csi-address=/csi/csi.sock + - --health-port=9898 + + - name: csi-provisioner + image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 + args: + - -v=5 + - --csi-address=/csi/csi.sock + - --feature-gates=Topology=true + - --enable-capacity + - --capacity-ownerref-level=0 # pod is owner + - --node-deployment=true + - --strict-topology=true + - --immediate-topology=false + - --worker-threads=5 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + # end csi-provisioner args + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + - name: csi-resizer + image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 + args: + - -v=5 + - -csi-address=/csi/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. 
+ privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + volumes: + - hostPath: + path: /var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. + # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir + # end csi volumes diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml new file mode 100644 index 000000000..c02431500 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -0,0 +1,24 @@ +apiVersion: storage.k8s.io/v1 +kind: CSIDriver +metadata: + name: hostpath.csi.k8s.io + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: hostpath.csi.k8s.io + app.kubernetes.io/component: csi-driver +spec: + # Supports persistent and ephemeral inline volumes. + volumeLifecycleModes: + - Persistent + - Ephemeral + # To determine at runtime which mode a volume uses, pod info and its + # "csi.storage.k8s.io/ephemeral" entry are needed. + podInfoOnMount: true + # No attacher needed. 
+ attachRequired: false + storageCapacity: true + # Kubernetes may use fsGroup to change permissions and ownership + # of the volume to match user requested fsGroup in the pod's SecurityPolicy + fsGroupPolicy: File + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml new file mode 100644 index 000000000..07e08f36e --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml @@ -0,0 +1,163 @@ + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpath-node-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpath-node +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpath-node +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: csi-hostpath-node +subjects: + - kind: ServiceAccount + name: csi-hostpath-node-sa + namespace: {{ .Release.Namespace }} +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + 
serviceAccountName: csi-hostpath-node-sa + containers: + - name: node-driver-registrar + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 + args: + - --v=5 + - --csi-address=/csi/csi.sock + - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + env: + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /registration + name: registration-dir + - mountPath: /csi-data-dir + name: csi-data-dir + + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 + args: + - --drivername=hostpath.csi.k8s.io + - --v=5 + - --endpoint=$(CSI_ENDPOINT) + - --nodeid=$(KUBE_NODE_NAME) + - --capacity=slow=10Gi + - --capacity=fast=100Gi + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 + args: + - --csi-address=/csi/csi.sock + - --health-port=9898 + + volumes: + - hostPath: + path: 
/var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. + # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index a3b2d8ccb..4eb7f1410 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -283,7 +283,7 @@ spec: runAsUser: 0 volumeClaimTemplate: spec: - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath accessModes: - ReadWriteOnce resources: diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml index 740dd7642..815df6505 100644 --- a/simplyblock_core/scripts/charts/templates/mongodb.yaml +++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml @@ -14,7 +14,7 @@ spec: name: data-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi @@ -22,7 +22,7 @@ spec: name: logs-volume spec: accessModes: [ "ReadWriteOnce" ] - storageClassName: openebs-local-hostpath + storageClassName: local-hostpath resources: requests: storage: 5Gi diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml index 9b6a2c9ce..b23cb4a07 
100644 --- a/simplyblock_core/scripts/charts/templates/storage_class.yaml +++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml @@ -2,8 +2,13 @@ apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: - name: openebs-local-hostpath -provisioner: openebs.io/local + name: local-hostpath + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpath-fast + app.kubernetes.io/component: storageclass +provisioner: hostpath.csi.k8s.io allowVolumeExpansion: true reclaimPolicy: Retain volumeBindingMode: WaitForFirstConsumer diff --git a/simplyblock_core/scripts/charts/values-template.yaml b/simplyblock_core/scripts/charts/values-template.yaml deleted file mode 100644 index 79693e7cd..000000000 --- a/simplyblock_core/scripts/charts/values-template.yaml +++ /dev/null @@ -1,194 +0,0 @@ -graylog: - rootPasswordSha2: "${GRAYLOG_ROOT_PASSWORD_SHA2}" - passwordSecret: "${GRAYLOG_PASSWORD_SECRET}" - -cluster: - secret: "${CLUSTER_SECRET}" - id: "${CLUSTER_ID}" - ip: "${CLUSTER_IP}" - -monitoring: - enabled: ${ENABLE_MONITORING} - -log: - deletionInterval: "${LOG_DELETION_INTERVAL}" - retentionPeriod: "${RETENTION_PERIOD}" - level: "${LOG_LEVEL}" - maxNumberIndex: "${MAX_NUMBER_OF_INDICES}" - -grafana: - endpoint: "${GRAFANA_ENDPOINT}" - contactPoint: "${CONTACT_POINT}" - -image: - simplyblock: - repository: "${SIMPLYBLOCK_REPOSITORY}" - tag: "${SIMPLYBLOCK_TAG}" - pullPolicy: "Always" - -openebs: - enabled: true - -mongodb: - name: "simplyblock-mongodb" - deployment_name: "simplyblock-mongodb" - resources: - requests: - cpu: 100m - memory: 300Mi - limits: - cpu: 250m - memory: 1Gi - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - mongodb - topologyKey: "kubernetes.io/hostname" - -opensearch: - fullnameOverride: "simplyblock-opensearch" - 
singleNode: true - replicas: 1 - - antiAffinity: "hard" - persistence: - enabled: true - storageClass: openebs-local-hostpath - size: 10Gi - - resources: - requests: - cpu: "100m" - memory: "512Mi" - limits: - cpu: "500m" - memory: "3Gi" - - extraEnvs: - - name: OPENSEARCH_JAVA_OPTS - value: "-Xms1g -Xmx1g" - - name: bootstrap.memory_lock - value: "true" - - name: action.auto_create_index - value: "false" - - name: plugins.security.ssl.http.enabled - value: "false" - - name: plugins.security.disabled - value: "true" - - securityConfig: - enabled: false - -prometheus: - server: - fullnameOverride: simplyblock-prometheus - enabled: true - statefulSet: - enabled: true - name: simplyblock-prometheus - replicaCount: 1 - podLabels: - app: simplyblock-prometheus - podAnnotations: {} - affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app.kubernetes.io/component - operator: In - values: - - simplyblock-prometheus - topologyKey: "kubernetes.io/hostname" - service: - servicePort: 9090 - type: ClusterIP - gRPC: - enabled: true - servicePort: 10901 - additionalPorts: - - name: http-thanos - port: 10902 - targetPort: 10902 - protocol: TCP - securityContext: - fsGroup: 65534 - persistentVolume: - enabled: true - size: 5Gi - storageClass: openebs-local-hostpath - extraArgs: - storage.tsdb.min-block-duration: 2h - storage.tsdb.max-block-duration: 2h - sidecarContainers: - thanos-sidecar: - image: thanosio/thanos:v0.31.0 - args: - - sidecar - - --tsdb.path=/prometheus - - --prometheus.url=http://localhost:9090 - - --objstore.config-file=/etc/thanos/objstore.yml - ports: - - name: grpc - containerPort: 10901 - - name: http - containerPort: 10902 - volumeMounts: - - name: storage-volume - mountPath: /prometheus - - name: objstore-config - mountPath: /etc/thanos - resources: - requests: - cpu: "100m" - memory: "256Mi" - limits: - cpu: "250m" - memory: "1Gi" - resources: - requests: - cpu: "100m" - memory: 
"512Mi" - limits: - cpu: "500m" - memory: "1Gi" - configMapOverrideName: simplyblock-prometheus-config - extraVolumes: - - name: objstore-config - configMap: - name: simplyblock-objstore-config - alertmanager: - enabled: false - - prometheus-pushgateway: - enabled: false - - prometheus-node-exporter: - enabled: false - - kube-state-metrics: - enabled: false - -ingress: - enabled: true - ingressClassName: nginx - useDNS: ${USE_DNS} - host: "${DNS_NAME}" - tlsSecret: ${TLS_SECRET} - controller: - hostNetwork: ${USE_HOST} - dnsPolicy: ClusterFirstWithHostNet - service: - type: ${SERVICE_TYPE} - nodePorts: - tcp: - 4501: 32451 - extraArgs: - tcp-services-configmap: "${K8S_NAMESPACE}/simplyblock-tcp-services" - nodeSelector: - simplyblock.io/role: mgmt-plane diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 994e9a21f..0b70f321e 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -28,9 +28,6 @@ ports: storageclass: allowedTopologyZones: [] -openebs: - enabled: true - foundationdb: multiAZ: false @@ -63,7 +60,7 @@ opensearch: antiAffinity: "hard" persistence: enabled: true - storageClass: openebs-local-hostpath + storageClass: local-hostpath size: 10Gi resources: @@ -129,7 +126,7 @@ prometheus: persistentVolume: enabled: true size: 5Gi - storageClass: openebs-local-hostpath + storageClass: local-hostpath extraArgs: storage.tsdb.min-block-duration: 2h storage.tsdb.max-block-duration: 2h From ce6ae0f17ff1acc55a6078cc79f9762a53db124d Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 17 Nov 2025 14:58:26 +0100 Subject: [PATCH 051/192] updated to distributed provisioning (#748) * updated to distributed provisioning * remove host storage capacity check --- .../templates/csi-hostpath-driverinfo.yaml | 2 +- .../charts/templates/csi-hostpath-node.yaml | 163 ------------------ ...ntroller.yaml => csi-hostpath-plugin.yaml} | 144 +++++++++------- 3 files 
changed, 81 insertions(+), 228 deletions(-) delete mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml rename simplyblock_core/scripts/charts/templates/{csi-hostpath-controller.yaml => csi-hostpath-plugin.yaml} (83%) diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml index c02431500..2a9d7d044 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -17,7 +17,7 @@ spec: podInfoOnMount: true # No attacher needed. attachRequired: false - storageCapacity: true + storageCapacity: false # Kubernetes may use fsGroup to change permissions and ownership # of the volume to match user requested fsGroup in the pod's SecurityPolicy fsGroupPolicy: File diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml deleted file mode 100644 index 07e08f36e..000000000 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml +++ /dev/null @@ -1,163 +0,0 @@ - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: csi-hostpath-node-sa - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: csi-hostpath-node -rules: - - apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["volumeattachments"] - verbs: ["get", "list", "watch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: csi-hostpath-node -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: csi-hostpath-node -subjects: - - kind: ServiceAccount - name: csi-hostpath-node-sa - namespace: {{ .Release.Namespace }} ---- -kind: DaemonSet -apiVersion: apps/v1 -metadata: - name: csi-hostpathplugin - 
labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin -spec: - selector: - matchLabels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - template: - metadata: - labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - spec: - serviceAccountName: csi-hostpath-node-sa - containers: - - name: node-driver-registrar - image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 - args: - - --v=5 - - --csi-address=/csi/csi.sock - - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock - securityContext: - # This is necessary only for systems with SELinux, where - # non-privileged sidecar containers cannot access unix domain socket - # created by privileged CSI driver container. 
- privileged: true - env: - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /registration - name: registration-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - - name: hostpath - image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 - args: - - --drivername=hostpath.csi.k8s.io - - --v=5 - - --endpoint=$(CSI_ENDPOINT) - - --nodeid=$(KUBE_NODE_NAME) - - --capacity=slow=10Gi - - --capacity=fast=100Gi - env: - - name: CSI_ENDPOINT - value: unix:///csi/csi.sock - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - securityContext: - privileged: true - ports: - - containerPort: 9898 - name: healthz - protocol: TCP - livenessProbe: - failureThreshold: 5 - httpGet: - path: /healthz - port: healthz - initialDelaySeconds: 10 - timeoutSeconds: 3 - periodSeconds: 2 - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /var/lib/kubelet/pods - mountPropagation: Bidirectional - name: mountpoint-dir - - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - name: plugins-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - mountPath: /dev - name: dev-dir - - name: liveness-probe - volumeMounts: - - mountPath: /csi - name: socket-dir - image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 - args: - - --csi-address=/csi/csi.sock - - --health-port=9898 - - volumes: - - hostPath: - path: /var/lib/kubelet/plugins/csi-hostpath - type: DirectoryOrCreate - name: socket-dir - - hostPath: - path: /var/lib/kubelet/pods - type: DirectoryOrCreate - name: mountpoint-dir - - hostPath: - path: /var/lib/kubelet/plugins_registry - type: Directory - name: registration-dir - - hostPath: - path: /var/lib/kubelet/plugins - type: Directory - name: plugins-dir - - hostPath: - # 'path' is where PV data is persisted on host. 
- # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot - path: /var/lib/csi-hostpath-data/ - type: DirectoryOrCreate - name: csi-data-dir - - hostPath: - path: /dev - type: Directory - name: dev-dir diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml similarity index 83% rename from simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml rename to simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml index 153c29bda..8e695e593 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml @@ -52,8 +52,9 @@ subjects: - kind: ServiceAccount name: csi-hostpathplugin-sa namespace: {{ .Release.Namespace }} + --- -kind: StatefulSet +kind: DaemonSet apiVersion: apps/v1 metadata: name: csi-hostpathplugin @@ -63,11 +64,6 @@ metadata: app.kubernetes.io/name: csi-hostpathplugin app.kubernetes.io/component: plugin spec: - serviceName: "csi-hostpathplugin" - # One replica only: - # Host path driver only works when everything runs - # on a single node. 
- replicas: 1 selector: matchLabels: app.kubernetes.io/instance: hostpath.csi.k8s.io @@ -84,67 +80,12 @@ spec: spec: serviceAccountName: csi-hostpathplugin-sa containers: - - name: hostpath - image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0 - args: - - "--drivername=hostpath.csi.k8s.io" - - "--v=5" - - "--endpoint=$(CSI_ENDPOINT)" - - "--nodeid=$(KUBE_NODE_NAME)" - # end hostpath args - env: - - name: CSI_ENDPOINT - value: unix:///csi/csi.sock - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - securityContext: - privileged: true - ports: - - containerPort: 9898 - name: healthz - protocol: TCP - livenessProbe: - failureThreshold: 5 - httpGet: - path: /healthz - port: healthz - initialDelaySeconds: 10 - timeoutSeconds: 3 - periodSeconds: 2 - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /var/lib/kubelet/pods - mountPropagation: Bidirectional - name: mountpoint-dir - - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - name: plugins-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - mountPath: /dev - name: dev-dir - - - name: liveness-probe - volumeMounts: - - mountPath: /csi - name: socket-dir - image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0 - args: - - --csi-address=/csi/csi.sock - - --health-port=9898 - - name: csi-provisioner image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 args: - -v=5 - --csi-address=/csi/csi.sock - --feature-gates=Topology=true - - --enable-capacity - - --capacity-ownerref-level=0 # pod is owner - --node-deployment=true - --strict-topology=true - --immediate-topology=false @@ -163,7 +104,6 @@ spec: valueFrom: fieldRef: fieldPath: metadata.name - # end csi-provisioner args securityContext: # This is necessary only for systems with SELinux, where # non-privileged sidecar containers cannot access unix domain socket @@ -172,7 +112,6 @@ spec: volumeMounts: - mountPath: /csi name: socket-dir - - name: csi-resizer image: 
registry.k8s.io/sig-storage/csi-resizer:v2.0.0 args: @@ -187,6 +126,84 @@ spec: - mountPath: /csi name: socket-dir + - name: node-driver-registrar + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 + args: + - --v=5 + - --csi-address=/csi/csi.sock + - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + env: + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /registration + name: registration-dir + - mountPath: /csi-data-dir + name: csi-data-dir + + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 + args: + - --drivername=hostpath.csi.k8s.io + - --v=5 + - --endpoint=$(CSI_ENDPOINT) + - --nodeid=$(KUBE_NODE_NAME) + - --capacity=slow=10Gi + - --capacity=fast=100Gi + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 + args: + - --csi-address=/csi/csi.sock + - 
--health-port=9898 + volumes: - hostPath: path: /var/lib/kubelet/plugins/csi-hostpath @@ -214,4 +231,3 @@ spec: path: /dev type: Directory name: dev-dir - # end csi volumes From 5596c1179092f036d28911a4f449896b5ae8c1be Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 17 Nov 2025 16:41:17 +0100 Subject: [PATCH 052/192] Update Dockerfile_base (#750) --- docker/Dockerfile_base | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base index 201d92759..735d331b1 100644 --- a/docker/Dockerfile_base +++ b/docker/Dockerfile_base @@ -39,4 +39,3 @@ COPY requirements.txt requirements.txt RUN pip3 install -r requirements.txt -RUN rm -rf /usr/share/terminfo/ From aaa9b420e01d8a1b80f7a79aee45095e69af0af5 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Tue, 18 Nov 2025 12:52:08 +0100 Subject: [PATCH 053/192] sleep after openshift core isolation until reboot (#753) * sleep after openshift core isolation until reboot * increaased timeout to 3mins * check and remove old job if found * check and remove old job if found --- .../api/internal/storage_node/kubernetes.py | 28 ++++++++++++++++++- simplyblock_web/node_utils_k8s.py | 19 ++++++++++++- .../oc_storage_core_isolation.yaml.j2 | 15 +++++++++- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index 56b4ca563..b6ab71b63 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -423,9 +423,35 @@ def spdk_process_start(body: SPDKParams): logger.info(f"Job deleted: '{core_resp.metadata.name}' in namespace '{namespace}") elif core_isolate and openshift: + batch_v1 = core_utils.get_k8s_batch_client() + try: + batch_v1.read_namespaced_job( + name=node_prepration_core_name, + namespace=namespace + ) + logger.info(f"Existing Job '{node_prepration_core_name}' found — deleting it first...") 
+ + batch_v1.delete_namespaced_job( + name=node_prepration_core_name, + namespace=namespace, + body=V1DeleteOptions( + propagation_policy='Foreground', + grace_period_seconds=0 + ) + ) + + node_utils_k8s.wait_for_job_deletion(node_prepration_core_name, namespace) + + logger.info(f"Old Job '{node_prepration_core_name}' fully deleted.") + + except ApiException as e: + if e.status == 404: + logger.info(f"No pre-existing Job '{node_prepration_core_name}' found. Proceeding.") + else: + raise + core_template = env.get_template('oc_storage_core_isolation.yaml.j2') core_yaml = yaml.safe_load(core_template.render(values)) - batch_v1 = core_utils.get_k8s_batch_client() core_resp = batch_v1.create_namespaced_job(namespace=namespace, body=core_yaml) msg = f"Job created: '{core_resp.metadata.name}' in namespace '{namespace}" logger.info(msg) diff --git a/simplyblock_web/node_utils_k8s.py b/simplyblock_web/node_utils_k8s.py index 4626a89c9..b1440744d 100644 --- a/simplyblock_web/node_utils_k8s.py +++ b/simplyblock_web/node_utils_k8s.py @@ -5,6 +5,7 @@ import time from simplyblock_core.utils import get_k8s_batch_client +from kubernetes.client import ApiException node_name = os.environ.get("HOSTNAME") @@ -23,7 +24,7 @@ def get_namespace(): return out return default_namespace -def wait_for_job_completion(job_name, namespace, timeout=60): +def wait_for_job_completion(job_name, namespace, timeout=180): batch_v1 = get_k8s_batch_client() for _ in range(timeout): job = batch_v1.read_namespaced_job(job_name, namespace) @@ -33,3 +34,19 @@ def wait_for_job_completion(job_name, namespace, timeout=60): raise RuntimeError(f"Job '{job_name}' failed") time.sleep(3) raise TimeoutError(f"Timeout waiting for Job '{job_name}' to complete") + +def wait_for_job_deletion(job_name, namespace, timeout=60): + batch_v1 = get_k8s_batch_client() + + for _ in range(timeout): + try: + batch_v1.read_namespaced_job(job_name, namespace) + except ApiException as e: + if e.status == 404: + return True + else: + 
raise + + time.sleep(2) + + raise TimeoutError(f"Timeout waiting for Job '{job_name}' to be deleted") diff --git a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 index 734d9c59e..74f66721d 100644 --- a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 @@ -34,9 +34,18 @@ spec: - | set -e + MARKER="/var/simplyblock/.cpu_isolation_applied" + echo "--- Installing jq ---" apk add --no-cache jq + echo "--- Checking if node was already configured ---" + + if [[ -f "$MARKER" ]]; then + echo "[INFO] Node already configured. Skipping sleep and exiting..." + exit 0 + fi + echo "--- Reading isolated cores from config ---" CONFIG_FILE="/var/simplyblock/sn_config_file" @@ -105,4 +114,8 @@ spec: echo "[INFO] Init setup and CPU isolation complete." - echo "--- Init setup complete ---" + echo "[INFO] Marking node as configured." + touch "$MARKER" + + echo "[INFO] Node is rebooting. Sleeping indefinitely to stop pipeline..." 
+ sleep infinity From b60925d23cc7c159cb72b774d689060f84e2c648 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 19 Nov 2025 12:09:16 +0100 Subject: [PATCH 054/192] added try and except to patch_prometheus_configmap func (#756) --- simplyblock_core/utils/__init__.py | 50 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 0892db54a..96a00ecac 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -2037,17 +2037,47 @@ def patch_prometheus_configmap(username: str, password: str): load_kube_config_with_fallback() v1 = client.CoreV1Api() - cm = v1.read_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE) - prometheus_yml = cm.data.get("prometheus.yml", "") + try: + cm = v1.read_namespaced_config_map( + name="sbcli-simplyblock-prometheus-config", + namespace=constants.K8S_NAMESPACE + ) + except client.exceptions.ApiException as e: + logger.error(f"Failed to read ConfigMap: {e}") + return False + + try: + prometheus_yml = cm.data.get("prometheus.yml", "") + if not prometheus_yml: + logger.error("prometheus.yml key not found in ConfigMap.") + return False - prometheus_yml = re.sub(r"username:*", f"username: '{username}'", prometheus_yml) - prometheus_yml = re.sub(r"password:*", f"password: '{password}'", prometheus_yml) + try: + prometheus_yml = re.sub(r"username:.*", f"username: '{username}'", prometheus_yml) + prometheus_yml = re.sub(r"password:.*", f"password: '{password}'", prometheus_yml) + except re.error as e: + logger.error(f"Regex error while patching Prometheus YAML: {e}") + return False - patch_body = { - "data": { - "prometheus.yml": prometheus_yml + patch_body = { + "data": { + "prometheus.yml": prometheus_yml + } } - } - v1.patch_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE, body=patch_body) - 
logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.") + v1.patch_namespaced_config_map( + name="sbcli-simplyblock-prometheus-config", + namespace=constants.K8S_NAMESPACE, + body=patch_body + ) + + logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.") + return True + + except client.exceptions.ApiException as e: + logger.error(f"Failed to patch ConfigMap: {e}") + return False + + except Exception as e: + logger.error(f"Unexpected error while patching ConfigMap: {e}") + return False From bb90c602bb3852fa6186f54dfdb340acc7dda2d4 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Sun, 23 Nov 2025 09:36:54 +0100 Subject: [PATCH 055/192] added hostNetwork true to simplyblock controlplane services (#771) --- .../scripts/charts/templates/app_k8s.yaml | 67 ++++++++++++++----- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index ec2e5b378..d17ea092a 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -19,6 +19,8 @@ spec: app: simplyblock-admin-control spec: serviceAccountName: simplyblock-control-sa + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: simplyblock-control image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -133,6 +135,8 @@ spec: labels: app: simplyblock-storage-node-monitor spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: storage-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -183,6 +187,8 @@ spec: labels: app: simplyblock-mgmt-node-monitor spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: mgmt-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ 
-235,6 +241,8 @@ spec: labels: app: simplyblock-lvol-stats-collector spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: lvol-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -284,7 +292,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-main-distr-event-collector - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: main-distr-event-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -334,7 +344,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-capacity-and-stats-collector - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: capacity-and-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -385,7 +397,8 @@ spec: labels: app: simplyblock-capacity-monitor spec: - + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: capacity-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -435,7 +448,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-health-check - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: health-check image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -485,7 +500,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-device-monitor - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: device-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -535,7 +552,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: 
simplyblock-lvol-monitor - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: lvol-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -584,7 +603,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-snapshot-monitor - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: snapshot-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -685,7 +706,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-restart - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-restart image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -735,7 +758,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-migration - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -784,7 +809,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-failed-migration - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-failed-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -833,7 +860,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-cluster-status - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-cluster-status image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -882,7 +911,9 @@ spec: 
reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-new-device-migration - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-new-device-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -931,7 +962,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-node-add-runner - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-node-addrunner image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -983,7 +1016,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-port-allow - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-port-allow image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -1032,7 +1067,9 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-tasks-runner-jc-comp-resume - spec: + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet containers: - name: tasks-runner-jc-comp-resume image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" From 7655dbdb624dee54978cefad637cf4966aded3e1 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 25 Nov 2025 01:22:45 +0300 Subject: [PATCH 056/192] Set cluster_id optional on SNodeAPI docker version --- simplyblock_core/snode_client.py | 2 +- simplyblock_web/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index fc97f5cbf..c091127cb 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -73,7 +73,7 @@ def _request(self, method, path, payload=None): return None, None def 
is_live(self): - return self._request("GET", "/check") + return self._request("GET", "check") def info(self): return self._request("GET", "info") diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py index 27ff2ce18..a610cd177 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -149,7 +149,7 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) - cluster_id: str + cluster_id: Optional[str] class DeviceParams(BaseModel): From 880630f79328d222a83d935cbc7a468aa9c5c875 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 25 Nov 2025 03:26:16 +0300 Subject: [PATCH 057/192] fix type checker --- simplyblock_web/api/internal/storage_node/kubernetes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index b6ab71b63..905ec411a 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -492,6 +492,9 @@ def spdk_process_kill(query: utils.RPCPortParams): k8s_core_v1 = core_utils.get_k8s_core_client() try: namespace = node_utils_k8s.get_namespace() + if not query.cluster_id: + return utils.get_response(False, f"param required: cluster_id") + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace) From 91443ed3a8fad4cad6eabbcccef6d0fcaba47d26 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 25 Nov 2025 03:26:16 +0300 Subject: [PATCH 058/192] fix type checker --- simplyblock_web/api/internal/storage_node/kubernetes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index 905ec411a..d5e98eb1d 
100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -493,7 +493,7 @@ def spdk_process_kill(query: utils.RPCPortParams): try: namespace = node_utils_k8s.get_namespace() if not query.cluster_id: - return utils.get_response(False, f"param required: cluster_id") + return utils.get_response(False, "param required: cluster_id") first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" @@ -558,6 +558,9 @@ def _is_pod_present(rpc_port, cluster_id): })}}}, }) def spdk_process_is_up(query: utils.RPCPortParams): + if not query.cluster_id: + return utils.get_response(False, "param required: cluster_id") + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) if _is_pod_up(query.rpc_port, first_six_cluster_id): return utils.get_response(True) From 43c97a52f08af1a10fe58bbc7e5c08405f91afdd Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Tue, 25 Nov 2025 14:00:05 +0300 Subject: [PATCH 059/192] Set cluster_id optional on SNodeAPI docker version (#777) * Set cluster_id optional on SNodeAPI docker version * fix type checker * fix type checker --- simplyblock_core/snode_client.py | 2 +- simplyblock_web/api/internal/storage_node/kubernetes.py | 6 ++++++ simplyblock_web/utils.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 5e5f66f60..c9b40e878 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -73,7 +73,7 @@ def _request(self, method, path, payload=None): return None, None def is_live(self): - return self._request("GET", "/check") + return self._request("GET", "check") def info(self): return self._request("GET", "info") diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index b6ab71b63..d5e98eb1d 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -492,6 +492,9 @@ def spdk_process_kill(query: utils.RPCPortParams): k8s_core_v1 = core_utils.get_k8s_core_client() try: namespace = node_utils_k8s.get_namespace() + if not query.cluster_id: + return utils.get_response(False, "param required: cluster_id") + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}" resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace) @@ -555,6 +558,9 @@ def _is_pod_present(rpc_port, cluster_id): })}}}, }) def spdk_process_is_up(query: utils.RPCPortParams): + if not query.cluster_id: + return utils.get_response(False, "param required: cluster_id") + first_six_cluster_id = core_utils.first_six_chars(query.cluster_id) if _is_pod_up(query.rpc_port, first_six_cluster_id): return utils.get_response(True) diff --git a/simplyblock_web/utils.py 
b/simplyblock_web/utils.py index 27ff2ce18..a610cd177 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -149,7 +149,7 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) - cluster_id: str + cluster_id: Optional[str] class DeviceParams(BaseModel): From 33ee3e4288fbe4edde57945860091dcef39ca77c Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 26 Nov 2025 12:40:05 +0100 Subject: [PATCH 060/192] add cluster_id param for spdk_process_is_up (#779) * add cluster_id param for spdk_process_is_up * update image tag * update image tag * update env image tag to main --- simplyblock_core/controllers/health_controller.py | 4 ++-- simplyblock_core/env_var | 4 ++-- simplyblock_core/services/storage_node_monitor.py | 2 +- simplyblock_core/snode_client.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py index c013e2d58..94855f111 100644 --- a/simplyblock_core/controllers/health_controller.py +++ b/simplyblock_core/controllers/health_controller.py @@ -128,11 +128,11 @@ def _check_node_api(ip): return False -def _check_spdk_process_up(ip, rpc_port): +def _check_spdk_process_up(ip, rpc_port, cluster_id): try: snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2) logger.debug(f"Node API={ip}:5000") - is_up, _ = snode_api.spdk_process_is_up(rpc_port) + is_up, _ = snode_api.spdk_process_is_up(rpc_port, cluster_id) logger.debug(f"SPDK is {is_up}") return is_up except Exception as e: diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 468ba7a02..f34a430a9 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.25 +SIMPLY_BLOCK_VERSION=19.2.27 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-lvol-sync-delete 
+SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index 17a7d0369..bfb92c11b 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ b/simplyblock_core/services/storage_node_monitor.py @@ -289,7 +289,7 @@ def node_rpc_timeout_check_and_report(node): spdk_process = False if node_api_check: # 3- check spdk_process - spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port) + spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port, snode.cluster_id) logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... {spdk_process}") # 4- check rpc diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index c9b40e878..6f1bee0db 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -154,8 +154,8 @@ def bind_device_to_spdk(self, device_pci): params = {"device_pci": device_pci} return self._request("POST", "bind_device_to_spdk", params) - def spdk_process_is_up(self, rpc_port): - params = {"rpc_port": rpc_port} + def spdk_process_is_up(self, rpc_port, cluster_id): + params = {"rpc_port": rpc_port, "cluster_id": cluster_id} return self._request("GET", "spdk_process_is_up", params) def get_file_content(self, file_name): From 2531483bf224ddb6fbec886de3164f667d21e7fd Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 27 Nov 2025 09:43:34 +0100 Subject: [PATCH 061/192] updated images for openshift preflight check (#741) * updated images for openshift preflight check * added Lincense * updated maintainer * fixed cyclic terminfo symlink * check that the directory exist * create rm directory * remove rm directory --- docker/Dockerfile | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile 
b/docker/Dockerfile index ce1a83ae1..1e1f8c3bd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,12 +1,33 @@ # syntax=docker/dockerfile:1 FROM simplyblock/simplyblock:base_image +LABEL name="simplyblock" +LABEL vendor="Simplyblock" +LABEL version="1.0.0" +LABEL release="1" +LABEL summary="Simplyblock controlplane plane component" +LABEL description="Simplyblock controlplane plane container" +LABEL maintainer="developers@simplyblock.io" + +COPY LICENSE /licenses/LICENSE + WORKDIR /app COPY requirements.txt . -RUN pip3 install -r requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt + COPY . /app RUN python setup.py install + +RUN if [ -d /usr/share/terminfo ]; then \ + find /usr/share/terminfo -lname '*ncr260vt300wpp*' -exec rm -f {} + ; \ + rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \ + fi + +RUN useradd -u 1001 -r -g 0 -d /app -s /sbin/nologin simplyblock && \ + chown -R 1001:0 /app + +USER 1001 From 6ff60fd11283b65adeba95c7141b13bf0045ad81 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 18:52:55 +0300 Subject: [PATCH 062/192] Fix snapshot replications tickets sfam-2497: add snapshot check sfam-2495: snapshot list --cluster-id sfam-2498: clone fail --- simplyblock_cli/cli-reference.yaml | 12 ++++++++++++ simplyblock_cli/cli.py | 8 ++++++++ simplyblock_cli/clibase.py | 5 ++++- simplyblock_core/controllers/health_controller.py | 13 ++++++++----- simplyblock_core/controllers/snapshot_controller.py | 4 ++-- simplyblock_core/db_controller.py | 8 +++++--- simplyblock_core/services/snapshot_monitor.py | 6 ------ simplyblock_core/services/snapshot_replication.py | 2 +- 8 files changed, 40 insertions(+), 18 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index e9e276b41..d2e6c3a9d 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1875,6 +1875,11 @@ commands: dest: all type: bool action: store_true + - name: "--cluster-id" + 
help: "Filter snapshots by cluster UUID" + dest: cluster_id + type: str + required: false - name: delete help: "Deletes a snapshot" arguments: @@ -1887,6 +1892,13 @@ commands: dest: force type: bool action: store_true + - name: check + help: "Check a snapshot health" + arguments: + - name: "snapshot_id" + help: "Snapshot id" + dest: snapshot_id + type: str - name: clone help: "Provisions a new logical volume from an existing snapshot" arguments: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 0c9677096..a09b5d893 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -762,6 +762,7 @@ def init_snapshot(self): self.init_snapshot__add(subparser) self.init_snapshot__list(subparser) self.init_snapshot__delete(subparser) + self.init_snapshot__check(subparser) self.init_snapshot__clone(subparser) self.init_snapshot__replication_status(subparser) self.init_snapshot__delete_replication_only(subparser) @@ -775,12 +776,17 @@ def init_snapshot__add(self, subparser): def init_snapshot__list(self, subparser): subcommand = self.add_sub_command(subparser, 'list', 'Lists all snapshots') argument = subcommand.add_argument('--all', help='List soft deleted snapshots', dest='all', action='store_true') + argument = subcommand.add_argument('--cluster-id', help='Filter snapshots by cluster UUID', type=str, dest='cluster_id', required=False) def init_snapshot__delete(self, subparser): subcommand = self.add_sub_command(subparser, 'delete', 'Deletes a snapshot') subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) argument = subcommand.add_argument('--force', help='Force remove', dest='force', action='store_true') + def init_snapshot__check(self, subparser): + subcommand = self.add_sub_command(subparser, 'check', 'Check a snapshot health') + subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) + def init_snapshot__clone(self, subparser): subcommand = self.add_sub_command(subparser, 'clone', 'Provisions a new logical volume from 
an existing snapshot') subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) @@ -1142,6 +1148,8 @@ def run(self): ret = self.snapshot__list(sub_command, args) elif sub_command in ['delete']: ret = self.snapshot__delete(sub_command, args) + elif sub_command in ['check']: + ret = self.snapshot__check(sub_command, args) elif sub_command in ['clone']: ret = self.snapshot__clone(sub_command, args) elif sub_command in ['replication-status']: diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 540b4e91b..648e02595 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -636,11 +636,14 @@ def snapshot__add(self, sub_command, args): return snapshot_id if not error else error def snapshot__list(self, sub_command, args): - return snapshot_controller.list(args.all) + return snapshot_controller.list(args.all, args.cluster_id) def snapshot__delete(self, sub_command, args): return snapshot_controller.delete(args.snapshot_id, args.force) + def snapshot__check(self, sub_command, args): + return health_controller.check_snap(args.snapshot_id) + def snapshot__clone(self, sub_command, args): new_size = args.resize diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py index c013e2d58..9ff6c6203 100644 --- a/simplyblock_core/controllers/health_controller.py +++ b/simplyblock_core/controllers/health_controller.py @@ -785,12 +785,15 @@ def check_snap(snap_id): return False snode = db_controller.get_storage_node_by_id(snap.lvol.node_id) - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=5, retry=1) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snap.lvol.node_id}") + check_primary = snode.rpc_client().get_bdevs(snap.snap_bdev) + if snode.secondary_node_id: + secondary_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: 
{snap.lvol.node_id}") + check_secondary = secondary_node.rpc_client().get_bdevs(snap.snap_bdev) + return check_primary and check_secondary - ret = rpc_client.get_bdevs(snap.snap_bdev) - return ret + return False def check_jm_device(device_id): diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 7b77e4f40..a76cdec06 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -226,8 +226,8 @@ def add(lvol_id, snapshot_name): return snap.uuid, False -def list(all=False): - snaps = db_controller.get_snapshots() +def list(all=False, cluster_id=None): + snaps = db_controller.get_snapshots(cluster_id) data = [] for snap in snaps: logger.debug(snap) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index ba832e655..d7f24d0aa 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -159,9 +159,11 @@ def get_hostnames_by_pool_id(self, pool_id) -> List[str]: hostnames.append(lv.hostname) return hostnames - def get_snapshots(self) -> List[SnapShot]: - ret = SnapShot().read_from_db(self.kv_store) - return ret + def get_snapshots(self, cluster_id=None) -> List[SnapShot]: + snaps = SnapShot().read_from_db(self.kv_store) + if cluster_id: + snaps = [n for n in snaps if n.cluster_id == cluster_id] + return sorted(snaps, key=lambda x: x.create_dt) def get_snapshot_by_id(self, id) -> SnapShot: ret = SnapShot().read_from_db(self.kv_store, id) diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py index a99ed89f3..af2fdc921 100644 --- a/simplyblock_core/services/snapshot_monitor.py +++ b/simplyblock_core/services/snapshot_monitor.py @@ -111,7 +111,6 @@ def set_snap_offline(snap): node_bdev_names = [] node_lvols_nqns = {} sec_node_bdev_names = {} - sec_node_lvols_nqns = {} sec_node = None if snode.status in [StorageNode.STATUS_ONLINE, 
StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: @@ -142,11 +141,6 @@ def set_snap_offline(snap): for bdev in ret: sec_node_bdev_names[bdev['name']] = bdev - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - if snode.lvstore_status == "ready": for snap in db.get_snapshots_by_node_id(snode.get_id()): diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 8100c80f5..28d855773 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -119,7 +119,7 @@ def process_snap_replicate_finish(task, snapshot): remote_snode = db.get_storage_node_by_id(remote_lv.node_id) # chain snaps on primary - snaps = db.get_snapshots() + snaps = db.get_snapshots(remote_snode.cluster_id) for sn in snaps: if sn.lvol.get_id() == snapshot.lvol.get_id(): try: From 98ff76485b731d4d66807b91772bb120afd4a82b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 19:00:48 +0300 Subject: [PATCH 063/192] Fix sfam-2496 --- simplyblock_core/controllers/snapshot_controller.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index a76cdec06..793f12a09 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -220,9 +220,10 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) - task = tasks_controller.add_snapshot_replication_task(snap) - if task: - snapshot_events.replication_task_created(snap) + if lvol.do_replicate: + task = tasks_controller.add_snapshot_replication_task(snap) + if task: + snapshot_events.replication_task_created(snap) return snap.uuid, False From 05b6cd1fea52e5063259c7d342d7570134544134 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 
19:17:42 +0300 Subject: [PATCH 064/192] Follow up 1 --- simplyblock_core/controllers/health_controller.py | 7 +++---- simplyblock_core/services/lvol_monitor.py | 7 ------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py index 9ff6c6203..464bfb4e4 100644 --- a/simplyblock_core/controllers/health_controller.py +++ b/simplyblock_core/controllers/health_controller.py @@ -785,15 +785,14 @@ def check_snap(snap_id): return False snode = db_controller.get_storage_node_by_id(snap.lvol.node_id) - logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snap.lvol.node_id}") check_primary = snode.rpc_client().get_bdevs(snap.snap_bdev) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snap.lvol.node_id} is {bool(check_primary)}") if snode.secondary_node_id: secondary_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) - logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snap.lvol.node_id}") check_secondary = secondary_node.rpc_client().get_bdevs(snap.snap_bdev) + logger.info(f"Checking snap bdev: {snap.snap_bdev} on node: {snode.secondary_node_id} is {bool(check_secondary)}") return check_primary and check_secondary - - return False + return check_primary def check_jm_device(device_id): diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py index 8486f3a32..b3405db6f 100644 --- a/simplyblock_core/services/lvol_monitor.py +++ b/simplyblock_core/services/lvol_monitor.py @@ -342,12 +342,5 @@ def process_lvol_delete_try_again(lvol): if passed: set_lvol_status(lvol, LVol.STATUS_ONLINE) - if snode.lvstore_status == "ready": - - for snap in db.get_snapshots_by_node_id(snode.get_id()): - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) - set_snapshot_health_check(snap, present) - - time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) From 
32d6ad4347c714e13f639ed8fd428b6a9705e68e Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 20:30:52 +0300 Subject: [PATCH 065/192] fix lvol replication_start --- simplyblock_core/controllers/lvol_controller.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 472bb616f..6e8f346b5 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1782,9 +1782,17 @@ def replication_start(lvol_id): logger.error(e) return False - logger.info("Setting LVol do_replicate: True") lvol.do_replicate = True + if not lvol.replication_node_id: + snode = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) + if not cluster.snapshot_replication_target_cluster: + logger.error(f"Cluster: {snode.cluster_id} not replicated") + return False + random_nodes = _get_next_3_nodes(cluster.snapshot_replication_target_cluster, lvol.size) + lvol.replication_node_id = random_nodes[0].get_id() lvol.write_to_db() + logger.info("Setting LVol do_replicate: True") for snap in db_controller.get_snapshots(): if snap.lvol.uuid == lvol.uuid: @@ -1792,6 +1800,8 @@ def replication_start(lvol_id): task = tasks_controller.add_snapshot_replication_task(snap) if task: snapshot_events.replication_task_created(snap) + return True + def replication_stop(lvol_id, delete=False): db_controller = DBController() From 580a519101a3d13ed5daaaf2bc6db2f7f97ca6c1 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 21:00:01 +0300 Subject: [PATCH 066/192] fix rep service --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 28d855773..1ddcc93ac 100644 --- 
a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -159,7 +159,7 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot = SnapShot() new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id - new_snapshot.lvol = snapshot.lvol + new_snapshot.lvol = remote_lv new_snapshot.pool_uuid = remote_lv.pool_uuid new_snapshot.snap_bdev = remote_lv.top_bdev new_snapshot.snap_uuid = remote_lv.lvol_uuid From 254035d341ee9f74d64456679c1aebcfa6bc614c Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 27 Nov 2025 21:21:07 +0300 Subject: [PATCH 067/192] fix snapshot clone return value --- simplyblock_cli/clibase.py | 4 ++-- simplyblock_core/db_controller.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 648e02595..fdf516fcc 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -647,8 +647,8 @@ def snapshot__check(self, sub_command, args): def snapshot__clone(self, sub_command, args): new_size = args.resize - success, details = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) - return details + clone_id, error = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) + return clone_id if not error else error def snapshot__replication_status(self, sub_command, args): return snapshot_controller.list_replication_tasks(args.cluster_id) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index d7f24d0aa..4174039be 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -163,7 +163,7 @@ def get_snapshots(self, cluster_id=None) -> List[SnapShot]: snaps = SnapShot().read_from_db(self.kv_store) if cluster_id: snaps = [n for n in snaps if n.cluster_id == cluster_id] - return sorted(snaps, key=lambda x: x.create_dt) + return sorted(snaps, key=lambda x: x.created_at) def 
get_snapshot_by_id(self, id) -> SnapShot: ret = SnapShot().read_from_db(self.kv_store, id) From 36f45b95d87d0e3c7a496ed6513eab4788bf4dea Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 27 Nov 2025 22:56:10 +0100 Subject: [PATCH 068/192] added graylog env GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE (#782) --- simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml index 9c0f46e1f..1349a33a9 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml @@ -68,6 +68,8 @@ spec: value: "false" - name: GRAYLOG_ELASTICSEARCH_REPLICAS value: "1" + - name: GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE + value: "10gb" ports: - containerPort: 5044 - containerPort: 5140 From f412121e57aba9a4f03a49a2c222fe0199c6ec11 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Tue, 2 Dec 2025 00:36:28 +0300 Subject: [PATCH 069/192] Create partitions and alcemls on node add in parallel (#763) (#785) * Create partitions and alcemls on node add in parallel * fix 1 * connect to remote alcemls in parallel * Create distrib bdevs in parallel * Create distrib bdevs in parallel * prepare for merge * Fix sfam-2485 --- simplyblock_core/storage_node_ops.py | 185 +++++++++++++++------------ 1 file changed, 102 insertions(+), 83 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 162f0dd1a..719284ab4 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -413,8 +413,8 @@ def _create_storage_device_stack(rpc_client, nvme, snode, after_restart): return nvme -def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size=0): - nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev) +def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size, nbd_index): + nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev, f"/dev/nbd{nbd_index}") time.sleep(3) if not nbd_device: logger.error("Failed to start nbd dev") @@ -447,79 +447,84 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j def _prepare_cluster_devices_partitions(snode, devices): db_controller = DBController() - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password) - new_devices = [] - jm_devices = [] - dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id) - bdevs_names = [d['name'] for d in rpc_client.get_bdevs()] + devices_to_partition = [] + thread_list = [] for index, nvme in enumerate(devices): if nvme.status == "not_found": continue - if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_NEW]: logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}") new_devices.append(nvme) 
continue - if nvme.is_partition: - dev_part = f"{nvme.nvme_bdev[:-2]}p1" - if dev_part in bdevs_names: - if dev_part not in jm_devices: - jm_devices.append(dev_part) - - new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False) - if not new_device: - logger.error("failed to create dev stack") - return False - new_devices.append(new_device) - if new_device.status == NVMeDevice.STATUS_ONLINE: - device_events.device_create(new_device) - + t = threading.Thread(target=_create_storage_device_stack, args=(snode.rpc_client(), nvme, snode, False,)) + thread_list.append(t) + new_devices.append(nvme) + t.start() else: - # look for partitions - partitioned_devices = _search_for_partitions(rpc_client, nvme) - logger.debug("partitioned_devices") - logger.debug(partitioned_devices) - if len(partitioned_devices) == (1 + snode.num_partitions_per_dev): - logger.info("Partitioned devices found") - else: + devices_to_partition.append(nvme) + partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme) + if len(partitioned_devices) != (1 + snode.num_partitions_per_dev): logger.info(f"Creating partitions for {nvme.nvme_bdev}") - _create_device_partitions(rpc_client, nvme, snode, snode.num_partitions_per_dev, snode.jm_percent, - snode.partition_size) - partitioned_devices = _search_for_partitions(rpc_client, nvme) - if len(partitioned_devices) == (1 + snode.num_partitions_per_dev): - logger.info("Device partitions created") - else: - logger.error("Failed to create partitions") - return False + t = threading.Thread( + target=_create_device_partitions, + args=(snode.rpc_client(), nvme, snode, snode.num_partitions_per_dev, + snode.jm_percent, snode.partition_size, index+1,)) + thread_list.append(t) + t.start() - jm_devices.append(partitioned_devices.pop(0).nvme_bdev) + for thread in thread_list: + thread.join() + thread_list = [] + for nvme in devices_to_partition: + partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme) + if 
len(partitioned_devices) == (1 + snode.num_partitions_per_dev): + logger.info("Device partitions created") + # remove 1st partition for jm + partitioned_devices.pop(0) for dev in partitioned_devices: - ret = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False) - if not ret: - logger.error("failed to create dev stack") - return False - if dev.status == NVMeDevice.STATUS_ONLINE: - if dev.cluster_device_order < 0: - dev.cluster_device_order = dev_order - dev_order += 1 - device_events.device_create(dev) + t = threading.Thread(target=_create_storage_device_stack, + args=(snode.rpc_client(), dev, snode, False,)) + thread_list.append(t) new_devices.append(dev) + t.start() + else: + logger.error("Failed to create partitions") + return False - snode.nvme_devices = new_devices + for thread in thread_list: + thread.join() + + # assign device order + dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id) + for nvme in new_devices: + if nvme.status == NVMeDevice.STATUS_ONLINE: + if nvme.cluster_device_order < 0: + nvme.cluster_device_order = dev_order + dev_order += 1 + device_events.device_create(nvme) + + # create jm device + jm_devices = [] + bdevs_names = [d['name'] for d in snode.rpc_client().get_bdevs()] + for nvme in new_devices: + if nvme.status == NVMeDevice.STATUS_ONLINE: + dev_part = f"{nvme.nvme_bdev[:-2]}p1" + if dev_part in bdevs_names: + if dev_part not in jm_devices: + jm_devices.append(dev_part) if jm_devices: - jm_device = _create_jm_stack_on_raid(rpc_client, jm_devices, snode, after_restart=False) + jm_device = _create_jm_stack_on_raid(snode.rpc_client(), jm_devices, snode, after_restart=False) if not jm_device: logger.error("Failed to create JM device") return False snode.jm_device = jm_device + snode.nvme_devices = new_devices return True @@ -701,6 +706,8 @@ def _connect_to_remote_devs( allowed_node_statuses.append(StorageNode.STATUS_RESTARTING) allowed_dev_statuses.append(NVMeDevice.STATUS_UNAVAILABLE) + 
devices_to_connect = [] + connect_threads = [] nodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id) # connect to remote devs for node_index, node in enumerate(nodes): @@ -715,12 +722,29 @@ def _connect_to_remote_devs( if not dev.alceml_bdev: raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") + devices_to_connect.append(dev) + t = threading.Thread( + target=connect_device, + args=(f"remote_{dev.alceml_bdev}", dev, this_node, node_bdev_names, reattach,)) + connect_threads.append(t) + t.start() - dev.remote_bdev = connect_device( - f"remote_{dev.alceml_bdev}", dev, this_node, - bdev_names=node_bdev_names, reattach=reattach, - ) - remote_devices.append(dev) + for t in connect_threads: + t.join() + + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + node_bdev_names = [b['name'] for b in node_bdevs] + + for dev in devices_to_connect: + for bdev in node_bdev_names: + if bdev.startswith(f"remote_{dev.alceml_bdev}"): + dev.remote_bdev = bdev + break + if not dev.remote_bdev: + logger.error(f"Failed to connect to remote device {dev.alceml_name}") + continue + remote_devices.append(dev) return remote_devices @@ -1980,7 +2004,7 @@ def restart_storage_node( logger.error('Failed to connect to remote devices') return False node.write_to_db(kv_store) - + logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) @@ -2137,21 +2161,6 @@ def list_storage_devices(node_id, is_json): "Health": snode.jm_device.health_check }) - for jm_id in snode.jm_ids: - try: - jm_device = db_controller.get_jm_device_by_id(jm_id) - except KeyError: - continue - - jm_devices.append({ - "UUID": jm_device.uuid, - "Name": jm_device.device_name, - "Size": utils.humanbytes(jm_device.size), - "Status": jm_device.status, - "IO Err": jm_device.io_error, - "Health": jm_device.health_check - }) - for device in snode.remote_devices: logger.debug(device) logger.debug("*" * 20) @@ -3604,6 +3613,15 @@ def create_lvstore(snode, 
ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): + def _create_distr(snode, name, params): + try: + rpc_client.bdev_distrib_create(**params) + except Exception: + logger.error("Failed to create bdev distrib") + ret = distr_controller.send_cluster_map_to_distr(snode, name) + if not ret: + logger.error("Failed to send cluster map") + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) db_controller = DBController() cluster = db_controller.get_cluster_by_id(snode.cluster_id) @@ -3620,11 +3638,11 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): else: node_bdev_names = [] + thread_list = [] for bdev in stack: type = bdev['type'] name = bdev['name'] params = bdev['params'] - if name in node_bdev_names: continue @@ -3640,23 +3658,21 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): snode.distrib_cpu_index = (snode.distrib_cpu_index + 1) % len(snode.distrib_cpu_cores) params['full_page_unmap'] = cluster.full_page_unmap - ret = rpc_client.bdev_distrib_create(**params) - if ret: - ret = distr_controller.send_cluster_map_to_distr(snode, name) - if not ret: - return False, "Failed to send cluster map" - # time.sleep(1) + t = threading.Thread(target=_create_distr, args=(snode, name, params,)) + thread_list.append(t) + t.start() + ret = True elif type == "bdev_lvstore" and lvstore_stack and not primary_node: - ret = rpc_client.create_lvstore(**params) - # if ret and snode.jm_vuid > 0: - # rpc_client.bdev_lvol_set_lvs_ops(snode.lvstore, snode.jm_vuid, snode.lvol_subsys_port) + ret = rpc_client.create_lvstore(**params) elif type == "bdev_ptnonexcl": ret = rpc_client.bdev_PT_NoExcl_create(**params) elif type == "bdev_raid": - + if thread_list: + for t in thread_list: + t.join() distribs_list = bdev["distribs_list"] strip_size_kb = params["strip_size_kb"] ret = rpc_client.bdev_raid_create(name, distribs_list, 
strip_size_kb=strip_size_kb) @@ -3674,6 +3690,9 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): _remove_bdev_stack(created_bdevs[::-1], rpc_client) return False, f"Failed to create BDev: {name}" + if thread_list: + for t in thread_list: + t.join() return True, None From 3c60a2c0f5561680047db0f28aa0b0b09d758409 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Tue, 2 Dec 2025 00:36:40 +0300 Subject: [PATCH 070/192] Remove stats from fdb and get it from Prometheus (#762) (#786) * expose prometheus port 9090 using HAProxy * wip 3 * wip 4 * wip 5 * wip 6 * keep only 10 records for device,node,cluster stats in fdb * wip7 * update env_var image * fix 1 * remove connected_clients from lvol stats * Fix pool api * Fix linter issues and type checker * prepare for merge * Fix history param parser * fix prom get_metrics param end_time --- requirements.txt | 1 + simplyblock_core/cluster_ops.py | 36 +++-- .../controllers/device_controller.py | 18 +-- .../controllers/lvol_controller.py | 27 +--- .../controllers/pool_controller.py | 20 +-- simplyblock_core/prom_client.py | 126 ++++++++++++++++++ .../services/capacity_and_stats_collector.py | 15 +++ .../services/lvol_stat_collector.py | 11 ++ simplyblock_core/storage_node_ops.py | 27 +--- simplyblock_web/api/v1/pool.py | 35 ++--- simplyblock_web/api/v2/pool.py | 4 +- 11 files changed, 211 insertions(+), 109 deletions(-) create mode 100644 simplyblock_core/prom_client.py diff --git a/requirements.txt b/requirements.txt index 030cca8e0..9ee458f00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ flask-openapi3 jsonschema fastapi uvicorn +prometheus_api_client \ No newline at end of file diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index dc429b8f9..24be657d7 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -25,6 +25,7 @@ from simplyblock_core.models.stats import LVolStatObject, ClusterStatObject, NodeStatObject, 
DeviceStatObject from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.utils import pull_docker_image_with_retry logger = utils.get_logger(__name__) @@ -1001,16 +1002,11 @@ def list_all_info(cluster_id) -> str: def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]: - cluster = db_controller.get_cluster_by_id(cluster_id) - - if history: - records_number = utils.parse_history_param(history) - if not records_number: - raise ValueError(f"Error parsing history string: {history}") - else: - records_number = 20 - - records = db_controller.get_cluster_capacity(cluster, records_number) + try: + _ = db_controller.get_cluster_by_id(cluster_id) + except KeyError: + logger.error(f"Cluster not found: {cluster_id}") + return [] cap_stats_keys = [ "date", @@ -1021,20 +1017,17 @@ def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]: "size_util", "size_prov_util", ] + prom_client = PromClient(cluster_id) + records = prom_client.get_cluster_metrics(cluster_id, cap_stats_keys, history) return utils.process_records(records, records_count, keys=cap_stats_keys) def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]: - cluster = db_controller.get_cluster_by_id(cluster_id) - - if history_string: - records_number = utils.parse_history_param(history_string) - if not records_number: - raise ValueError(f"Error parsing history string: {history_string}") - else: - records_number = 20 - - records = db_controller.get_cluster_stats(cluster, records_number) + try: + _ = db_controller.get_cluster_by_id(cluster_id) + except KeyError: + logger.error(f"Cluster not found: {cluster_id}") + return [] io_stats_keys = [ "date", @@ -1072,6 +1065,9 @@ def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes "write_latency_ticks", ] ) + + prom_client = 
PromClient(cluster_id) + records = prom_client.get_cluster_metrics(cluster_id, io_stats_keys, history_string) # combine records return utils.process_records(records, records_count, keys=io_stats_keys) diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py index 8e684c942..6f7a0d9f5 100644 --- a/simplyblock_core/controllers/device_controller.py +++ b/simplyblock_core/controllers/device_controller.py @@ -6,6 +6,7 @@ from simplyblock_core.db_controller import DBController from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient @@ -440,7 +441,7 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True): else: records_number = 20 - records = db_controller.get_device_capacity(device, records_number) + # records = db_controller.get_device_capacity(device, records_number) cap_stats_keys = [ "date", "size_total", @@ -448,6 +449,8 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True): "size_free", "size_util", ] + prom_client = PromClient(device.cluster_id) + records = prom_client.get_device_metrics(device_id, cap_stats_keys, history) records_list = utils.process_records(records, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -474,15 +477,6 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True): logger.error("device not found") return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_device_stats(device, records_number) io_stats_keys = [ "date", "read_bytes", @@ -496,8 +490,10 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True): 
"write_io_ps", "write_latency_ps", ] + prom_client = PromClient(device.cluster_id) + records = prom_client.get_device_metrics(device_id, io_stats_keys, history) # combine records - new_records = utils.process_records(records_list, records_count, keys=io_stats_keys) + new_records = utils.process_records(records, records_count, keys=io_stats_keys) if not parse_sizes: return new_records diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 4d7a5aad3..be8c4fc55 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -15,6 +15,7 @@ from simplyblock_core.models.pool import Pool from simplyblock_core.models.lvol_model import LVol from simplyblock_core.models.storage_node import StorageNode +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient logger = lg.getLogger() @@ -1521,19 +1522,11 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True): db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_uuid) + pool = db_controller.get_pool_by_id(lvol.pool_uuid) except KeyError as e: logger.error(e) return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_lvol_stats(lvol, limit=records_number) cap_stats_keys = [ "date", "size_total", @@ -1543,6 +1536,8 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True): "size_prov", "size_prov_util" ] + prom_client = PromClient(pool.cluster_id) + records_list = prom_client.get_lvol_metrics(lvol_uuid, cap_stats_keys, history) new_records = utils.process_records(records_list, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -1564,19 +1559,11 @@ def get_io_stats(lvol_uuid, history, records_count=20, 
parse_sizes=True, with_si db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_uuid) + pool = db_controller.get_pool_by_id(lvol.pool_uuid) except KeyError as e: logger.error(e) return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records_list = db_controller.get_lvol_stats(lvol, limit=records_number) io_stats_keys = [ "date", "read_bytes", @@ -1587,7 +1574,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "write_bytes_ps", "write_io_ps", "write_latency_ps", - "connected_clients", ] if with_sizes: io_stats_keys.extend( @@ -1612,6 +1598,8 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "write_latency_ticks", ] ) + prom_client = PromClient(pool.cluster_id) + records_list = prom_client.get_lvol_metrics(lvol_uuid, io_stats_keys, history) # combine records new_records = utils.process_records(records_list, records_count, keys=io_stats_keys) @@ -1630,7 +1618,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si "Write speed": utils.humanbytes(record['write_bytes_ps']), "Write IOPS": record['write_io_ps'], "Write lat": record['write_latency_ps'], - "Con": record['connected_clients'], }) return out diff --git a/simplyblock_core/controllers/pool_controller.py b/simplyblock_core/controllers/pool_controller.py index db7016d7d..2440a6bd7 100644 --- a/simplyblock_core/controllers/pool_controller.py +++ b/simplyblock_core/controllers/pool_controller.py @@ -12,6 +12,7 @@ from simplyblock_core.controllers import pool_events, lvol_controller from simplyblock_core.db_controller import DBController from simplyblock_core.models.pool import Pool +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient logger = lg.getLogger() @@ -321,15 +322,18 @@ def 
get_io_stats(pool_id, history, records_count=20): logger.error(f"Pool not found {pool_id}") return False - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 + io_stats_keys = [ + "date", + "read_bytes_ps", + "read_io_ps", + "read_latency_ps", + "write_bytes_ps", + "write_io_ps", + "write_latency_ps", + ] - out = db_controller.get_pool_stats(pool, records_number) + prom_client = PromClient(pool.cluster_id) + out = prom_client.get_pool_metrics(pool_id, io_stats_keys, history) new_records = utils.process_records(out, records_count) return utils.print_table([ diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py new file mode 100644 index 000000000..82756161b --- /dev/null +++ b/simplyblock_core/prom_client.py @@ -0,0 +1,126 @@ +import logging +import re +from datetime import datetime, timedelta + +from simplyblock_core.db_controller import DBController +from simplyblock_core.models.mgmt_node import MgmtNode + +from prometheus_api_client import PrometheusConnect + +logger = logging.getLogger() + + +class PromClientException(Exception): + def __init__(self, message): + self.message = message + + +class PromClient: + + def __init__(self, cluster_id): + db_controller = DBController() + cluster_ip = None + for node in db_controller.get_mgmt_nodes(): + if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE: + cluster_ip = node.mgmt_ip + break + if cluster_ip is None: + raise PromClientException("Cluster has no online mgmt nodes") + + self.ip_address = f"{cluster_ip}:9090" + self.url = 'http://%s/' % self.ip_address + self.client = PrometheusConnect(url=self.url, disable_ssl=True) + + def parse_history_param(self, history_string): + if not history_string: + logger.error("Invalid history value") + return False + + # process history + results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', 
history_string.lower()) + if not results: + logger.error(f"Error parsing history string: {history_string}") + logger.info("History format: xxdyyh , e.g: 1d12h, 1d, 2h, 1m") + return False + + history_in_days = 0 + history_in_hours = 0 + history_in_minutes = 0 + for s in results.groups(): + if not s: + continue + ind = s[-1] + v = int(s[:-1]) + if ind == 'd': + history_in_days = v + if ind == 'h': + history_in_hours = v + if ind == 'm': + history_in_minutes = v + + history_in_hours += int(history_in_minutes/60) + history_in_minutes = history_in_minutes % 60 + history_in_days += int(history_in_hours/24) + history_in_hours = history_in_hours % 24 + return history_in_days, history_in_hours, history_in_minutes + + def get_metrics(self, key_prefix, metrics_lst, params, history=None): + start_time = datetime.now() - timedelta(minutes=10) + if history: + try: + days,hours,minutes = self.parse_history_param(history) + start_time = datetime.now() - timedelta(days=days, hours=hours, minutes=minutes) + except Exception: + raise PromClientException(f"Error parsing history string: {history}") + end_time = datetime.now() + data_out: list[dict] = [] + for key in metrics_lst: + metrics = self.client.get_metric_range_data( + f"{key_prefix}_{key}", label_config=params, start_time=start_time, end_time=end_time) + for m in metrics: + mt_name = key + mt_values = m["values"] + for i, v in enumerate(mt_values): + value = v[1] + try: + value = int(value) + except Exception: + pass + if len(data_out) <= i: + data_out.append({mt_name: value}) + else: + d = data_out[i] + if mt_name not in d: + d[mt_name] = value + + return data_out + + def get_cluster_metrics(self, cluster_uuid, metrics_lst, history=None): + params = { + "cluster": cluster_uuid + } + return self.get_metrics("cluster", metrics_lst, params, history) + + def get_node_metrics(self, snode_uuid, metrics_lst, history=None): + params = { + "snode": snode_uuid + } + return self.get_metrics("snode", metrics_lst, params, history) + + 
def get_device_metrics(self, device_uuid, metrics_lst, history=None): + params = { + "device": device_uuid + } + return self.get_metrics("device", metrics_lst, params, history) + + def get_lvol_metrics(self, lvol_uuid, metrics_lst, history=None): + params = { + "lvol": lvol_uuid + } + return self.get_metrics("lvol", metrics_lst, params, history) + + def get_pool_metrics(self, pool_uuid, metrics_lst, history=None): + params = { + "pool": pool_uuid + } + return self.get_metrics("pool", metrics_lst, params, history) diff --git a/simplyblock_core/services/capacity_and_stats_collector.py b/simplyblock_core/services/capacity_and_stats_collector.py index 6f702d051..022dd84b5 100644 --- a/simplyblock_core/services/capacity_and_stats_collector.py +++ b/simplyblock_core/services/capacity_and_stats_collector.py @@ -83,6 +83,11 @@ def add_device_stats(cl, device, capacity_dict, stats_dict): stat_obj.write_to_db(db.kv_store) last_object_record[device.get_id()] = stat_obj + all_stats = db.get_device_stats(device, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -117,6 +122,11 @@ def add_node_stats(node, records): stat_obj = NodeStatObject(data=data) stat_obj.write_to_db(db.kv_store) + all_stats = db.get_node_stats(node, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -146,6 +156,11 @@ def add_cluster_stats(cl, records): stat_obj = ClusterStatObject(data=data) stat_obj.write_to_db(db.kv_store) + all_stats = db.get_cluster_stats(cl, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj diff --git a/simplyblock_core/services/lvol_stat_collector.py b/simplyblock_core/services/lvol_stat_collector.py index 09aa7d571..1933b6703 100644 --- a/simplyblock_core/services/lvol_stat_collector.py +++ b/simplyblock_core/services/lvol_stat_collector.py @@ -154,6 +154,11 @@ def add_lvol_stats(cluster, lvol, stats_list, 
capacity_dict=None): stat_obj.write_to_db(db.kv_store) last_object_record[lvol.get_id()] = stat_obj + all_stats = db.get_lvol_stats(lvol, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj @@ -173,6 +178,12 @@ def add_pool_stats(pool, records): stat_obj = PoolStatObject(data=data) stat_obj.write_to_db(db.kv_store) + + all_stats = db.get_pool_stats(pool, limit=0) + if len(all_stats) > 10: + for st in all_stats[10:]: + st.remove(db.kv_store) + return stat_obj diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 719284ab4..9b6630680 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -31,6 +31,7 @@ from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.models.cluster import Cluster +from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient, RPCException from simplyblock_core.snode_client import SNodeClient, SNodeClientException from simplyblock_web import node_utils @@ -2474,20 +2475,11 @@ def resume_storage_node(node_id): def get_node_capacity(node_id, history, records_count=20, parse_sizes=True): db_controller = DBController() try: - this_node = db_controller.get_storage_node_by_id(node_id) + node = db_controller.get_storage_node_by_id(node_id) except KeyError: logger.error("Storage node Not found") return - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records = db_controller.get_node_capacity(this_node, records_number) cap_stats_keys = [ "date", "size_total", @@ -2497,6 +2489,8 @@ def get_node_capacity(node_id, history, records_count=20, parse_sizes=True): "size_util", "size_prov_util", ] + prom_client = PromClient(node.cluster_id) + records = 
prom_client.get_node_metrics(node_id, cap_stats_keys, history) new_records = utils.process_records(records, records_count, keys=cap_stats_keys) if not parse_sizes: @@ -2523,17 +2517,6 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru except KeyError: logger.error("node not found") return False - - if history: - records_number = utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - records = db_controller.get_node_stats(node, records_number) - io_stats_keys = [ "date", "read_bytes", @@ -2571,6 +2554,8 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru "write_latency_ticks", ] ) + prom_client = PromClient(node.cluster_id) + records = prom_client.get_node_metrics(node_id, io_stats_keys, history) # combine records new_records = utils.process_records(records, records_count, keys=io_stats_keys) diff --git a/simplyblock_web/api/v1/pool.py b/simplyblock_web/api/v1/pool.py index a24a9e9b7..3b4fe5f72 100644 --- a/simplyblock_web/api/v1/pool.py +++ b/simplyblock_web/api/v1/pool.py @@ -184,21 +184,10 @@ def pool_iostats(uuid, history): except KeyError: return utils.get_response_error(f"Pool not found: {uuid}", 404) - if history: - records_number = core_utils.parse_history_param(history) - if not records_number: - logger.error(f"Error parsing history string: {history}") - return False - else: - records_number = 20 - - out = db.get_pool_stats(pool, records_number) - records_count = 20 - new_records = core_utils.process_records(out, records_count) - + data = pool_controller.get_io_stats(uuid, history) ret = { "object_data": pool.get_clean_dict(), - "stats": new_records or [] + "stats": data or [] } return utils.get_response(ret) @@ -207,21 +196,13 @@ def pool_iostats(uuid, history): @bp.route('/pool/iostats-all-lvols/', methods=['GET']) def lvol_iostats(pool_uuid): try: - db.get_pool_by_id(pool_uuid) + 
pool = db.get_pool_by_id(pool_uuid) except KeyError: return utils.get_response_error(f"Pool not found: {pool_uuid}", 404) - ret = [] - for lvol in db.get_lvols_by_pool_id(pool_uuid): - - records_list = db.get_lvol_stats(lvol, limit=1) - - if records_list: - data = records_list[0].get_clean_dict() - else: - data = {} - ret.append({ - "object_data": lvol.get_clean_dict(), - "stats": data - }) + data = pool_controller.get_capacity(pool_uuid) + ret = { + "object_data": pool.get_clean_dict(), + "stats": data or [] + } return utils.get_response(ret) diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py index c779f70ca..d34ce0b2a 100644 --- a/simplyblock_web/api/v2/pool.py +++ b/simplyblock_web/api/v2/pool.py @@ -122,5 +122,5 @@ def update(cluster: Cluster, pool: StoragePool, parameters: UpdatableStoragePool @instance_api.get('/iostats', name='clusters:storage-pools:iostats') def iostats(cluster: Cluster, pool: StoragePool, limit: int = 20): - records = db.get_pool_stats(pool, limit) - return core_utils.process_records(records, 20) + data = pool_controller.get_io_stats(pool.get_id(), history="") + return core_utils.process_records(data, 20) From 6ddfd0b1d6b9912ba6f017412d2b3c246556945d Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Tue, 2 Dec 2025 01:55:14 +0300 Subject: [PATCH 071/192] Increase jc comp resume retry on node not online (#690) --- simplyblock_core/services/tasks_runner_jc_comp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplyblock_core/services/tasks_runner_jc_comp.py b/simplyblock_core/services/tasks_runner_jc_comp.py index 676156af3..6caf85b19 100644 --- a/simplyblock_core/services/tasks_runner_jc_comp.py +++ b/simplyblock_core/services/tasks_runner_jc_comp.py @@ -57,6 +57,7 @@ if node.status != StorageNode.STATUS_ONLINE: msg = f"Node is {node.status}, retry task" logger.info(msg) + task.retry += 1 task.function_result = msg task.status = JobSchedule.STATUS_SUSPENDED task.write_to_db(db.kv_store) @@ -79,6 +80,7 @@ logger.info(msg) task.function_result = msg task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 task.write_to_db(db.kv_store) continue From 8e3fe701bb3a6dad609ddaea9876b56a1b21421c Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Tue, 2 Dec 2025 16:22:08 +0300 Subject: [PATCH 072/192] Adds missing services to k8s mgmt (#788) * Adds missing services to k8s mgmt * added function create_k8s_service logic (#789) * Add labels and missing service on upgrade * fix linter --------- Co-authored-by: Geoffrey Israel --- simplyblock_core/cluster_ops.py | 67 ++++++++----- .../scripts/charts/templates/app_k8s.yaml | 51 ++++++++++ simplyblock_core/utils/__init__.py | 94 ++++++++++++++++++- 3 files changed, 185 insertions(+), 27 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 24be657d7..5cc9cb3ba 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1180,44 +1180,43 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, service_names.append(service.attrs['Spec']['Name']) if "app_SnapshotMonitor" not in service_names: - logger.info("Creating snapshot monitor service") - cluster_docker.services.create( - image=service_image, - 
command="python simplyblock_core/services/snapshot_monitor.py", - name="app_SnapshotMonitor", - mounts=["/etc/foundationdb:/etc/foundationdb"], - env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], - networks=["host"], - constraints=["node.role == manager"] - ) + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_SnapshotMonitor", + service_file="python simplyblock_core/services/snapshot_monitor.py", + service_image=service_image) if "app_TasksRunnerLVolSyncDelete" not in service_names: - logger.info("Creating lvol sync delete service") - cluster_docker.services.create( - image=service_image, - command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py", - name="app_TasksRunnerLVolSyncDelete", - mounts=["/etc/foundationdb:/etc/foundationdb"], - env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], - networks=["host"], - constraints=["node.role == manager"] - ) + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_TasksRunnerLVolSyncDelete", + service_file="python simplyblock_core/services/tasks_runner_sync_lvol_del.py", + service_image=service_image) + + if "app_TasksRunnerJCCompResume" not in service_names: + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_TasksRunnerJCCompResume", + service_file="python simplyblock_core/services/tasks_runner_jc_comp.py", + service_image=service_image) + logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": utils.load_kube_config_with_fallback() apps_v1 = k8s_client.AppsV1Api() - + namespace = constants.K8S_NAMESPACE image_without_tag = constants.SIMPLY_BLOCK_DOCKER_IMAGE.split(":")[0] image_parts = "/".join(image_without_tag.split("/")[-2:]) service_image = mgmt_image or constants.SIMPLY_BLOCK_DOCKER_IMAGE - + deployment_names = [] # Update Deployments - deployments = apps_v1.list_namespaced_deployment(namespace=constants.K8S_NAMESPACE) + deployments = apps_v1.list_namespaced_deployment(namespace=namespace) for deploy in 
deployments.items: if deploy.metadata.name == constants.ADMIN_DEPLOY_NAME: logger.info(f"Skipping deployment {deploy.metadata.name}") continue + deployment_names.append(deploy.metadata.name) for c in deploy.spec.template.spec.containers: if image_parts in c.image: logger.info(f"Updating deployment {deploy.metadata.name} image to {service_image}") @@ -1227,12 +1226,28 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, deploy.spec.template.metadata.annotations = annotations apps_v1.patch_namespaced_deployment( name=deploy.metadata.name, - namespace=constants.K8S_NAMESPACE, + namespace=namespace, body={"spec": {"template": deploy.spec.template}} ) + if "simplyblock-tasks-runner-sync-lvol-del" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, + deployment_name="simplyblock-tasks-runner-sync-lvol-del", + container_name="tasks-runner-sync-lvol-del", + service_file="simplyblock_core/services/tasks_runner_sync_lvol_del.py", + container_image=service_image) + + if "simplyblock-snapshot-monitor" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, + deployment_name="simplyblock-snapshot-monitor", + container_name="snapshot-monitor", + service_file="simplyblock_core/services/snapshot_monitor.py", + container_image=service_image) + # Update DaemonSets - daemonsets = apps_v1.list_namespaced_daemon_set(namespace=constants.K8S_NAMESPACE) + daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace) for ds in daemonsets.items: for c in ds.spec.template.spec.containers: if image_parts in c.image: @@ -1243,7 +1258,7 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, ds.spec.template.metadata.annotations = annotations apps_v1.patch_namespaced_daemon_set( name=ds.metadata.name, - namespace=constants.K8S_NAMESPACE, + namespace=namespace, body={"spec": {"template": ds.spec.template}} ) diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml 
b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index d17ea092a..49c7490b7 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -1100,6 +1100,57 @@ spec: - key: cluster-file path: fdb.cluster --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simplyblock-tasks-runner-sync-lvol-del + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + app: simplyblock-tasks-runner-sync-lvol-del + template: + metadata: + annotations: + log-collector/enabled: "true" + reloader.stakater.com/auto: "true" + reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" + labels: + app: simplyblock-tasks-runner-sync-lvol-del + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: tasks-runner-sync-lvol-del + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" + command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"] + env: + - name: SIMPLYBLOCK_LOG_LEVEL + valueFrom: + configMapKeyRef: + name: simplyblock-config + key: LOG_LEVEL + volumeMounts: + - name: fdb-cluster-file + mountPath: /etc/foundationdb/fdb.cluster + subPath: fdb.cluster + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "400m" + memory: "1Gi" + volumes: + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster +--- apiVersion: apps/v1 kind: DaemonSet diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 96a00ecac..7bc2fa112 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -12,8 +12,12 @@ import time import socket from typing import Union, Any, Optional, Tuple +from docker import DockerClient from kubernetes import client, config -from kubernetes.client import 
ApiException +from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \ + V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \ + V1LabelSelector, V1ResourceRequirements + import docker from prettytable import PrettyTable from docker.errors import APIError, DockerException, ImageNotFound, NotFound @@ -2081,3 +2085,91 @@ def patch_prometheus_configmap(username: str, password: str): except Exception as e: logger.error(f"Unexpected error while patching ConfigMap: {e}") return False + + +def create_docker_service(cluster_docker: DockerClient, service_name: str, service_file: str, service_image: str): + logger.info(f"Creating service: {service_name}") + cluster_docker.services.create( + image=service_image, + command=service_file, + name=service_name, + mounts=["/etc/foundationdb:/etc/foundationdb"], + env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"], + networks=["host"], + constraints=["node.role == manager"], + labels={ + "com.docker.stack.image": service_image, + "com.docker.stack.namespace": "app"} + ) + +def create_k8s_service(namespace: str, deployment_name: str, + container_name: str, service_file: str, container_image: str): + + logger.info(f"Creating deployment: {deployment_name} in namespace {namespace}") + load_kube_config_with_fallback() + apps_v1 = client.AppsV1Api() + + env_list = [ + V1EnvVar( + name="SIMPLYBLOCK_LOG_LEVEL", + value_from={"config_map_key_ref": {"name": "simplyblock-config", "key": "LOG_LEVEL"}} + ) + ] + + volume_mounts = [ + V1VolumeMount( + name="fdb-cluster-file", + mount_path="/etc/foundationdb/fdb.cluster", + sub_path="fdb.cluster" + ) + ] + + volumes = [ + V1Volume( + name="fdb-cluster-file", + config_map=V1ConfigMapVolumeSource( + name="simplyblock-fdb-cluster-config", + items=[{"key": "cluster-file", "path": "fdb.cluster"}] + ) + ) + ] + + container = V1Container( + name=container_name, + image=container_image, + command=["python", service_file], + 
env=env_list, + volume_mounts=volume_mounts, + resources=V1ResourceRequirements( + requests={"cpu": "200m", "memory": "256Mi"}, + limits={"cpu": "400m", "memory": "1Gi"} + ) + ) + + pod_spec = V1PodSpec( + containers=[container], + volumes=volumes, + host_network=True, + dns_policy="ClusterFirstWithHostNet" + ) + + pod_template = V1PodTemplateSpec( + metadata=V1ObjectMeta(labels={"app": deployment_name}), + spec=pod_spec + ) + + deployment_spec = V1DeploymentSpec( + replicas=1, + selector=V1LabelSelector(match_labels={"app": deployment_name}), + template=pod_template + ) + + deployment = V1Deployment( + api_version="apps/v1", + kind="Deployment", + metadata=V1ObjectMeta(name=deployment_name, namespace=namespace), + spec=deployment_spec + ) + + apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment) + logger.info(f"Deployment {deployment_name} created successfully.") From a77c5e430cccfafff899c589017fa5eafea62534 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 3 Dec 2025 11:49:23 +0100 Subject: [PATCH 073/192] fix sfam-2507 (#791) * fix sfam-2507 * refactored code to k8s standard * fixed failing type check --------- Co-authored-by: hamdykhader --- simplyblock_core/cluster_ops.py | 41 +++++++++++----- .../scripts/charts/templates/app_k8s.yaml | 11 +++++ simplyblock_web/api/v1/cluster.py | 48 +++++++++++++++++++ simplyblock_web/api/v2/cluster.py | 6 ++- simplyblock_web/auth_middleware.py | 2 + 5 files changed, 96 insertions(+), 12 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 5cc9cb3ba..fb43e8022 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -436,18 +436,23 @@ def _run_fio(mount_point) -> None: def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, - max_queue_size, inflight_io_threshold, strict_node_anti_affinity, 
is_single_node, name, fabric="tcp") -> str: + max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp", + cluster_ip=None, grafana_secret=None) -> str: + + default_cluster = None + monitoring_secret = os.environ.get("MONITORING_SECRET", "") clusters = db_controller.get_clusters() - if not clusters: - raise ValueError("No previous clusters found!") + if clusters: + default_cluster = clusters[0] + else: + logger.info("No previous clusters found") if distr_ndcs == 0 and distr_npcs == 0: raise ValueError("both distr_ndcs and distr_npcs cannot be 0") - monitoring_secret = os.environ.get("MONITORING_SECRET", "") - logger.info("Adding new cluster") + cluster = Cluster() cluster.uuid = str(uuid.uuid4()) cluster.cluster_name = name @@ -456,12 +461,27 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.nqn = f"{constants.CLUSTER_NQN}:{cluster.uuid}" cluster.secret = utils.generate_string(20) cluster.strict_node_anti_affinity = strict_node_anti_affinity + if default_cluster: + cluster.mode = default_cluster.mode + cluster.db_connection = default_cluster.db_connection + cluster.grafana_secret = grafana_secret if grafana_secret else default_cluster.grafana_secret + cluster.grafana_endpoint = default_cluster.grafana_endpoint + else: + # creating first cluster on k8s + cluster.mode = "kubernetes" + logger.info("Retrieving foundationdb connection string...") + fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE) + cluster.db_connection = fdb_cluster_string + if monitoring_secret: + cluster.grafana_secret = monitoring_secret + else: + raise Exception("monitoring_secret is required") + cluster.grafana_endpoint = "http://simplyblock-grafana" + if not cluster_ip: + cluster_ip = "0.0.0.0" - default_cluster = clusters[0] - cluster.mode = default_cluster.mode - cluster.db_connection = default_cluster.db_connection - cluster.grafana_secret = 
monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret - cluster.grafana_endpoint = default_cluster.grafana_endpoint + # add mgmt node object + mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid) _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) @@ -491,7 +511,6 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.create_dt = str(datetime.datetime.now()) cluster.write_to_db(db_controller.kv_store) cluster_events.cluster_create(cluster) - qos_controller.add_class("Default", 100, cluster.get_id()) return cluster.get_id() diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index 49c7490b7..988955a4f 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -93,6 +93,17 @@ spec: configMapKeyRef: name: simplyblock-config key: LOG_LEVEL + - name: LVOL_NVMF_PORT_START + value: "{{ .Values.ports.lvolNvmfPortStart }}" + - name: K8S_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MONITORING_SECRET + valueFrom: + secretKeyRef: + name: simplyblock-grafana-secrets + key: MONITORING_SECRET - name: FLASK_DEBUG value: "False" - name: FLASK_ENV diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index 698d9582d..5c5567694 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -60,6 +60,54 @@ def add_cluster(): )) +@bp.route('/cluster/create_first', methods=['POST']) +def create_first_cluster(): + cl_data = request.get_json() + + if db.get_clusters(): + return utils.get_response_error("Cluster found!", 400) + + blk_size = 512 + if 'blk_size' in cl_data: + if cl_data['blk_size'] not in [512, 4096]: + return utils.get_response_error("blk_size can be 512 or 4096", 400) + else: + blk_size = 
cl_data['blk_size'] + page_size_in_blocks = cl_data.get('distr_ndcs', 2097152) + distr_ndcs = cl_data.get('distr_ndcs', 1) + distr_npcs = cl_data.get('distr_npcs', 1) + distr_bs = cl_data.get('distr_bs', 4096) + distr_chunk_bs = cl_data.get('distr_chunk_bs', 4096) + ha_type = cl_data.get('ha_type', 'single') + enable_node_affinity = cl_data.get('enable_node_affinity', False) + qpair_count = cl_data.get('qpair_count', 256) + name = cl_data.get('name', None) + fabric = cl_data.get('fabric', "tcp") + cap_warn = cl_data.get('cap_warn', 0) + cap_crit = cl_data.get('cap_crit', 0) + prov_cap_warn = cl_data.get('prov_cap_warn', 0) + prov_cap_crit = cl_data.get('prov_cap_crit', 0) + max_queue_size = cl_data.get('max_queue_size', 128) + inflight_io_threshold = cl_data.get('inflight_io_threshold', 4) + strict_node_anti_affinity = cl_data.get('strict_node_anti_affinity', False) + is_single_node = cl_data.get('is_single_node', False) + cluster_ip = cl_data.get('cluster_ip', None) + grafana_secret = cl_data.get('grafana_secret', None) + + try: + cluster_id = cluster_ops.add_cluster( + blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, + distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, + qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric, + cluster_ip=cluster_ip, grafana_secret=grafana_secret) + if cluster_id: + return utils.get_response(db.get_cluster_by_id(cluster_id)) + else: + return utils.get_response(False, "Failed to create cluster", 400) + except Exception as e: + return utils.get_response(False, str(e), 404) + + @bp.route('/cluster', methods=['GET'], defaults={'uuid': None}) @bp.route('/cluster/', methods=['GET']) def list_clusters(uuid): diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 422766246..8b203bb4a 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -24,7 +24,7 @@ 
class _UpdateParams(BaseModel): class ClusterParams(BaseModel): - name: Optional[str] = None + name: str | None = None blk_size: Literal[512, 4096] = 512 page_size_in_blocks: int = Field(2097152, gt=0) cap_warn: util.Percent = 0 @@ -41,6 +41,10 @@ class ClusterParams(BaseModel): inflight_io_threshold: int = 4 enable_node_affinity: bool = False strict_node_anti_affinity: bool = False + is_single_node: bool = False + fabric: str = "tcp" + cluster_ip: str | None = None + grafana_secret: str | None = None @api.get('/', name='clusters:list') diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py index 8a1a9e83a..70755b46a 100644 --- a/simplyblock_web/auth_middleware.py +++ b/simplyblock_web/auth_middleware.py @@ -34,6 +34,8 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType: # Skip authentication for Swagger UI if request.method == "GET" and request.path.startswith("/swagger"): return cast(ResponseType, f(*args, **kwargs)) + if request.method == "POST" and request.path.startswith("/cluster/create_first"): + return cast(ResponseType, f(*args, **kwargs)) cluster_id: str = "" cluster_secret: str = "" From db2ca62ac91f858217800f0aa58c6a97bff47e7d Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 3 Dec 2025 12:58:37 +0100 Subject: [PATCH 074/192] Update cluster.py (#793) --- simplyblock_web/api/v2/cluster.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 8b203bb4a..7834e3f06 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -24,7 +24,7 @@ class _UpdateParams(BaseModel): class ClusterParams(BaseModel): - name: str | None = None + name: str = "" blk_size: Literal[512, 4096] = 512 page_size_in_blocks: int = Field(2097152, gt=0) cap_warn: util.Percent = 0 @@ -43,9 +43,8 @@ class ClusterParams(BaseModel): strict_node_anti_affinity: bool = False is_single_node: bool = False fabric: str = "tcp" - 
cluster_ip: str | None = None - grafana_secret: str | None = None - + cluster_ip: str = "" + grafana_secret: str = "" @api.get('/', name='clusters:list') def list() -> List[ClusterDTO]: From 67455edc4f54d758e7658d34528baec09468185d Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 3 Dec 2025 15:10:01 +0100 Subject: [PATCH 075/192] Update mgmt_node_ops.py (#795) --- simplyblock_core/mgmt_node_ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py index 84375d819..a867e4cbf 100644 --- a/simplyblock_core/mgmt_node_ops.py +++ b/simplyblock_core/mgmt_node_ops.py @@ -106,8 +106,6 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo logger.info(f"Node IP: {dev_ip}") - hostname = utils.get_node_name_by_ip(dev_ip) - utils.label_node_as_mgmt_plane(hostname) db_connection = cluster_data['db_connection'] db_controller = DBController() nodes = db_controller.get_mgmt_nodes() @@ -225,10 +223,9 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo def add_mgmt_node(mgmt_ip, mode, cluster_id=None): db_controller = DBController() + hostname = "" if mode == "docker": hostname = utils.get_hostname() - elif mode == "kubernetes": - hostname = utils.get_node_name_by_ip(mgmt_ip) try: node = db_controller.get_mgmt_node_by_hostname(hostname) if node: From 20068bf71648854826dfa6356555f1dad52796d0 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 3 Dec 2025 15:10:36 +0100 Subject: [PATCH 076/192] remove function get_node_name_by_ip (#794) --- simplyblock_core/cluster_ops.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index fb43e8022..33a3b8aab 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -282,9 +282,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, if not dev_ip: raise ValueError("Error getting ip: 
For Kubernetes-based deployments, please supply --mgmt-ip.") - current_node = utils.get_node_name_by_ip(dev_ip) - utils.label_node_as_mgmt_plane(current_node) - if not cli_pass: cli_pass = utils.generate_string(10) From e91dbc07518e4cf71d95f028489bc5b7afadde36 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Wed, 3 Dec 2025 19:25:07 +0300 Subject: [PATCH 077/192] Fix /cluster/create_first response (#798) --- simplyblock_web/api/v1/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index 5c5567694..f4eb2e690 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -101,7 +101,7 @@ def create_first_cluster(): qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric, cluster_ip=cluster_ip, grafana_secret=grafana_secret) if cluster_id: - return utils.get_response(db.get_cluster_by_id(cluster_id)) + return utils.get_response(db.get_cluster_by_id(cluster_id).to_dict()) else: return utils.get_response(False, "Failed to create cluster", 400) except Exception as e: From cd2b26187d0bbfedbfdd1fde107e45bb8d36d26e Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Wed, 3 Dec 2025 21:09:41 +0300 Subject: [PATCH 078/192] replicate snapshot back to src _1 (#790) * replicate snapshot back to src _1 * fix linter --- .../controllers/lvol_controller.py | 2 +- .../controllers/snapshot_controller.py | 11 ++++- .../controllers/tasks_controller.py | 6 +-- .../services/snapshot_replication.py | 47 ++++++++++++------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 6e8f346b5..feb6e01ae 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1797,7 +1797,7 @@ def replication_start(lvol_id): for snap in db_controller.get_snapshots(): if snap.lvol.uuid == lvol.uuid: if not snap.target_replicated_snap_uuid: - task = tasks_controller.add_snapshot_replication_task(snap) + task = tasks_controller.add_snapshot_replication_task(snap.cluster_id, snap.lvol.node_id, snap.get_id()) if task: snapshot_events.replication_task_created(snap) return True diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 793f12a09..744c36d90 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -221,9 +221,18 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) if lvol.do_replicate: - task = tasks_controller.add_snapshot_replication_task(snap) + task = tasks_controller.add_snapshot_replication_task(snap.cluster_id, snap.lvol.node_id, snap.get_id()) if task: snapshot_events.replication_task_created(snap) + if lvol.cloned_from_snap: + lvol_snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap) + if lvol_snap.source_replicated_snap_uuid: + org_snap = db_controller.get_snapshot_by_id(lvol_snap.source_replicated_snap_uuid) + if org_snap and org_snap.status == SnapShot.STATUS_ONLINE: + task = 
tasks_controller.add_snapshot_replication_task( + snap.cluster_id, org_snap.lvol.node_id, snap.get_id(), replicate_to_source=True) + if task: + logger.info("Created snapshot replication task on original node") return snap.uuid, False diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 8f1e769e0..34e717ce0 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -425,6 +425,6 @@ def get_snapshot_replication_task(cluster_id, snapshot_id): return False -def add_snapshot_replication_task(snapshot): - return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, snapshot.cluster_id, snapshot.lvol.node_id, "", - function_params={"snapshot_id": snapshot.get_id()}) +def add_snapshot_replication_task(cluster_id, node_id, snapshot_id, replicate_to_source=False): + return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, cluster_id, node_id, "", + function_params={"snapshot_id": snapshot_id, "replicate_to_source": replicate_to_source}) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 1ddcc93ac..704c6351b 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -19,23 +19,29 @@ def process_snap_replicate_start(task, snapshot): # 1 create lvol on remote node logger.info("Starting snapshot replication task") snode = db.get_storage_node_by_id(snapshot.lvol.node_id) - if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"] : - remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) - cluster = db.get_cluster_by_id(remote_node_uuid.cluster_id) - remote_pool_uuid = None - if cluster.snapshot_replication_target_pool: - remote_pool_uuid = cluster.snapshot_replication_target_pool - else: - for bool in db.get_pools(remote_node_uuid.cluster_id): - if bool.status == 
Pool.STATUS_ACTIVE: - remote_pool_uuid = bool.uuid - break - if not remote_pool_uuid: - logger.error(f"Unable to find pool on remote cluster: {remote_node_uuid.cluster_id}") - return + replicate_to_source = task.function_params["replicate_to_source"] + if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"]: + if replicate_to_source: + org_snap = db.get_snapshot_by_id(snapshot.lvol.source_replicated_snap_uuid) + remote_node_uuid = db.get_storage_node_by_id(org_snap.lvol.node_id) + remote_pool_uuid = org_snap.lvol.pool_uuid + else: # replicate to target + remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) + cluster = db.get_cluster_by_id(remote_node_uuid.cluster_id) + remote_pool_uuid = None + if cluster.snapshot_replication_target_pool: + remote_pool_uuid = cluster.snapshot_replication_target_pool + else: + for bool in db.get_pools(remote_node_uuid.cluster_id): + if bool.status == Pool.STATUS_ACTIVE: + remote_pool_uuid = bool.uuid + break + if not remote_pool_uuid: + logger.error(f"Unable to find pool on remote cluster: {remote_node_uuid.cluster_id}") + return lv_id, err = lvol_controller.add_lvol_ha( - f"REP_{snapshot.snap_name}", snapshot.size, snapshot.lvol.replication_node_id, snapshot.lvol.ha_type, + f"REP_{snapshot.snap_name}", snapshot.size, remote_node_uuid.get_id(), snapshot.lvol.ha_type, remote_pool_uuid) if lv_id: task.function_params["remote_lvol_id"] = lv_id @@ -118,10 +124,17 @@ def process_snap_replicate_finish(task, snapshot): snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) remote_snode = db.get_storage_node_by_id(remote_lv.node_id) + replicate_to_source = task.function_params["replicate_to_source"] + if replicate_to_source: + org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) + snapshot_lvol_id = org_snap.lvol.get_id() + else: + snapshot_lvol_id = snapshot.lvol.get_id() + # chain snaps on primary snaps = 
db.get_snapshots(remote_snode.cluster_id) for sn in snaps: - if sn.lvol.get_id() == snapshot.lvol.get_id(): + if sn.lvol.get_id() == snapshot_lvol_id: try: target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") @@ -137,7 +150,7 @@ def process_snap_replicate_finish(task, snapshot): # chain snaps on secondary if sec_node.status == StorageNode.STATUS_ONLINE: for sn in snaps: - if sn.lvol.get_id() == snapshot.lvol.get_id(): + if sn.lvol.get_id() == snapshot_lvol_id: try: target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") From bc79957f919344161a204a47a4216f515e11b68c Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Thu, 4 Dec 2025 13:46:31 +0300 Subject: [PATCH 079/192] Remove user creation and switch (#799) --- docker/Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1e1f8c3bd..c8999b47d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -27,7 +27,3 @@ RUN if [ -d /usr/share/terminfo ]; then \ rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \ fi -RUN useradd -u 1001 -r -g 0 -d /app -s /sbin/nologin simplyblock && \ - chown -R 1001:0 /app - -USER 1001 From 43a4caeb6683ebdcf973c9d47768063a063c2bcd Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 4 Dec 2025 16:11:58 +0300 Subject: [PATCH 080/192] Fix apiv2 pool add response to return pool dict (#800) --- simplyblock_web/api/v2/pool.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py index d34ce0b2a..4ef2c897b 100644 --- a/simplyblock_web/api/v2/pool.py +++ b/simplyblock_web/api/v2/pool.py @@ -54,9 +54,8 @@ def add(request: Request, cluster: Cluster, parameters: StoragePoolParams) -> Re if not id_or_false: raise ValueError('Failed to create pool') - - entity_url = request.app.url_path_for('clusters:storage-pools:detail', cluster_id=cluster.get_id(), pool_id=id_or_false) - return Response(status_code=201, headers={'Location': entity_url}) + pool = db.get_pool_by_id(id_or_false) + return pool.to_dict() instance_api = APIRouter(prefix='/{pool_id}') From ce479ebaae71c4f87c02da23cbf85d856389ce23 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 4 Dec 2025 14:12:43 +0100 Subject: [PATCH 081/192] Update mgmt_node_ops.py (#796) --- simplyblock_core/mgmt_node_ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py index a867e4cbf..6d752a86c 100644 --- a/simplyblock_core/mgmt_node_ops.py +++ b/simplyblock_core/mgmt_node_ops.py @@ -112,10 +112,7 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo if not nodes: logger.error("No mgmt nodes was found in the cluster!") return False - for node in nodes: - if node.hostname == hostname: - logger.error("Node already exists in the cluster") - return False + logger.info("Adding management node object") node_id = add_mgmt_node(dev_ip, mode, cluster_id) From ec075725a8ea439d3ef091bce6fedcd15dee049c Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 4 Dec 2025 16:14:16 +0300 Subject: [PATCH 082/192] Fix add-node apiv2 to remove unused param "full_page_unmap" (#801) --- simplyblock_web/api/v2/storage_node.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index f93fa5250..b59fadce3 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -37,7 +37,6 @@ class StorageNodeParams(BaseModel): test_device: bool = Field(False) spdk_image: Optional[str] spdk_debug: bool = Field(False) - full_page_unmap: bool = Field(False) data_nics: List[str] = Field([]) namespace: str = Field('default') jm_percent: util.Percent = Field(3) @@ -65,7 +64,6 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re 'enable_test_device': parameters.test_device, 'namespace': parameters.namespace, 'enable_ha_jm': parameters.ha_jm, - 'full_page_unmap': parameters.full_page_unmap, } ) if not task_id_or_false: From f48c839aba5c454eedf4a491cc3a347b54423d51 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 4 Dec 2025 16:18:52 +0300 Subject: [PATCH 083/192] Main fix add node apiv2 (#802) * Fix add-node apiv2 to remove unused param "full_page_unmap" * Fix optional param initial value for node-add apiv2 "spdk_image" --- simplyblock_web/api/v2/storage_node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index b59fadce3..a7a9da7f8 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -35,7 +35,7 @@ class StorageNodeParams(BaseModel): max_snapshots: int = Field(500) ha_jm: bool = Field(True) test_device: bool = Field(False) - spdk_image: Optional[str] + spdk_image: Optional[str] = Field("") spdk_debug: bool = Field(False) data_nics: List[str] = Field([]) namespace: str = Field('default') From f673f0ad30e209641674a0745e9bcc09ae9194fd Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 4 Dec 2025 15:37:50 +0100 Subject: [PATCH 084/192] Update cluster_ops.py (#797) * Update cluster_ops.py * Update cluster_ops.py * add grafana port 3000 to url * removed debug log message --- simplyblock_core/cluster_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 33a3b8aab..5e6352cc0 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -473,15 +473,15 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.grafana_secret = monitoring_secret else: raise Exception("monitoring_secret is required") - cluster.grafana_endpoint = "http://simplyblock-grafana" + cluster.grafana_endpoint = "http://simplyblock-grafana:3000" if not cluster_ip: cluster_ip = "0.0.0.0" # add mgmt node object mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid) - + _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) - + 
cluster.distr_ndcs = distr_ndcs cluster.distr_npcs = distr_npcs cluster.distr_bs = distr_bs From d2ad9737a2d17d62f79ba4c31809fd9a72b2d9fe Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Thu, 4 Dec 2025 18:35:50 +0300 Subject: [PATCH 085/192] Fix node-add apiv2 response (#803) --- simplyblock_web/api/v2/storage_node.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index a7a9da7f8..ab0a3f827 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -68,9 +68,7 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re ) if not task_id_or_false: raise ValueError('Failed to create add-node task') - - task_url = request.app.url_path_for('clusters:storage-nodes:detail', cluster_id=cluster.get_id(), task_id=task_id_or_false) - return Response(status_code=201, headers={'Location': task_url}) + return task_id_or_false instance_api = APIRouter(prefix='/{storage_node_id}') From 1b0d7aa4a40d6a7190f3ac9413577cb115c97a5d Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 4 Dec 2025 18:41:09 +0300 Subject: [PATCH 086/192] Main fix node list apiv2 response (#804) * Fix node-add apiv2 response * Fix sn list apiv2 response --- simplyblock_web/api/v2/storage_node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index ab0a3f827..d1aec59be 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -21,9 +21,9 @@ @api.get('/', name='clusters:storage-nodes:list') -def list(cluster: Cluster) -> List[StorageNodeDTO]: +def list(cluster: Cluster) -> List[dict]: return [ - StorageNodeDTO.from_model(storage_node) + storage_node.to_dict() for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()) ] @@ -46,7 +46,7 @@ class StorageNodeParams(BaseModel): @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}}) -def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Response: +def add(request: Request, cluster: Cluster, parameters: StorageNodeParams): task_id_or_false = tasks_controller.add_node_add_task( cluster.get_id(), { From e0cd5abcd9a99500bd88ee50bfae426e8218a8c7 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 4 Dec 2025 17:11:27 +0100 Subject: [PATCH 087/192] Update storage_deploy_spdk.yaml.j2 (#805) --- simplyblock_web/templates/storage_deploy_spdk.yaml.j2 | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index e49aca2e2..81f1e1eda 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -87,16 +87,6 @@ spec: value: "{{ TOTAL_HP }}" - name: RPC_PORT value: "{{ RPC_PORT }}" - - name: SPDKCSI_SECRET - valueFrom: - secretKeyRef: - name: simplyblock-csi-secret - key: secret.json - - name: 
CLUSTER_CONFIG - valueFrom: - configMapKeyRef: - name: simplyblock-csi-cm - key: config.json lifecycle: postStart: exec: From 7d8bb865e1905a6dea9ddf82bf855c94998c101f Mon Sep 17 00:00:00 2001 From: Raunak Jalan <41023976+RaunakJalan@users.noreply.github.com> Date: Fri, 5 Dec 2025 13:53:14 +0530 Subject: [PATCH 088/192] Adding quick outage case, changes to ssh utils (#806) Co-authored-by: root --- e2e/__init__.py | 8 +- e2e/continuous_log_collector.py | 4 +- e2e/e2e_tests/cluster_test_base.py | 10 +- e2e/e2e_tests/single_node_multi_fio_perf.py | 9 +- .../continuous_failover_ha_multi_client.py | 131 +- ...s_failover_ha_multi_client_quick_outage.py | 535 +++++++ .../continuous_failover_ha_multi_outage.py | 398 ++++- e2e/utils/ssh_utils.py | 1389 ++++++++++++----- 8 files changed, 1974 insertions(+), 510 deletions(-) create mode 100644 e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py diff --git a/e2e/__init__.py b/e2e/__init__.py index e8cae33f7..31164238e 100644 --- a/e2e/__init__.py +++ b/e2e/__init__.py @@ -55,6 +55,7 @@ from stress_test.continuous_failover_ha_geomtery import RandomMultiGeometryFailoverTest from stress_test.continuous_failover_ha_2node import RandomMultiClient2NodeFailoverTest from stress_test.continuous_failover_ha_rdma import RandomRDMAFailoverTest +from stress_test.continuous_failover_ha_multi_client_quick_outage import RandomRapidFailoverNoGap from e2e_tests.upgrade_tests.major_upgrade import TestMajorUpgrade @@ -96,8 +97,8 @@ def get_all_tests(custom=True, ha_test=False): TestLvolFioNpcs0, TestLvolFioNpcs1, TestLvolFioNpcs2, - TestLvolFioQOSBW, - TestLvolFioQOSIOPS, + # TestLvolFioQOSBW, + # TestLvolFioQOSIOPS, TestSingleNodeOutage, # TestSingleNodeReboot, # TestHASingleNodeReboot, @@ -147,6 +148,7 @@ def get_stress_tests(): RandomMultiGeometryFailoverTest, RandomMultiClient2NodeFailoverTest, RandomRDMAFailoverTest, + RandomRapidFailoverNoGap, ] return tests @@ -161,4 +163,4 @@ def get_load_tests(): tests = [ 
TestLvolOutageLoadTest ] - return tests \ No newline at end of file + return tests diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py index 48f06fd80..96b157760 100644 --- a/e2e/continuous_log_collector.py +++ b/e2e/continuous_log_collector.py @@ -22,7 +22,7 @@ def __init__(self,docker_logs_path=None): def get_log_directory(self): timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - return os.path.join(Path.home(), "container-logs", f"manual-logs-{timestamp}") + return os.path.join('/mnt/nfs_share/', f"snapshot-repliction-from-replicated-clone-{timestamp}") def collect_logs(self, test_name): all_nodes = set() @@ -75,4 +75,4 @@ def collect_logs(self, test_name): if __name__ == "__main__": collector = ContinuousLogCollector() - collector.collect_logs(test_name="Manual") + collector.collect_logs(test_name="snapshot-repliction-from-replicated-clone") diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py index 5077544b0..15743725b 100644 --- a/e2e/e2e_tests/cluster_test_base.py +++ b/e2e/e2e_tests/cluster_test_base.py @@ -401,13 +401,17 @@ def collect_management_details(self, post_teardown=False): cmd = f"{self.base_cmd} sn check {result['uuid']} >& {base_path}/node{node}_check{suffix}.txt" self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) + cmd = f"{self.base_cmd} sn get {result['uuid']} >& {base_path}/node{node}_get{suffix}.txt" + self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) + node+=1 - for node in self.fio_node: + all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines: + for node in all_nodes: base_path = os.path.join(self.docker_logs_path, node) - cmd = f"journalctl -k >& {base_path}/jounalctl_{node}.txt" + cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt" self.ssh_obj.exec_command(node, cmd) - cmd = f"dmesg -T >& {base_path}/dmesg_{node}.txt" + cmd = f"dmesg -T >& {base_path}/dmesg_{node}-final.txt" self.ssh_obj.exec_command(node, cmd) def teardown(self, 
delete_lvols=True, close_ssh=True): diff --git a/e2e/e2e_tests/single_node_multi_fio_perf.py b/e2e/e2e_tests/single_node_multi_fio_perf.py index 86a75c4d5..681cc1742 100644 --- a/e2e/e2e_tests/single_node_multi_fio_perf.py +++ b/e2e/e2e_tests/single_node_multi_fio_perf.py @@ -187,10 +187,11 @@ def cleanup_lvols(self, lvol_configs): self.logger.info("Starting cleanup of LVOLs") for config in lvol_configs: lvol_name = config['lvol_name'] - self.ssh_obj.unmount_path(node=self.client_machines[0], - device=self.lvol_devices[lvol_name]['MountPath']) - self.ssh_obj.remove_dir(node=self.client_machines[0], - dir_path=self.lvol_devices[lvol_name]['MountPath']) + if config['mount']: + self.ssh_obj.unmount_path(node=self.client_machines[0], + device=self.lvol_devices[lvol_name]['MountPath']) + self.ssh_obj.remove_dir(node=self.client_machines[0], + dir_path=self.lvol_devices[lvol_name]['MountPath']) lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name) subsystems = self.ssh_obj.get_nvme_subsystems(node=self.client_machines[0], nqn_filter=lvol_id) diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py index a2869482d..a97c42676 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client.py @@ -42,6 +42,7 @@ def __init__(self, **kwargs): self.sn_nodes = [] self.current_outage_node = None self.snapshot_names = [] + self.current_outage_nodes = [] self.disconnect_thread = None self.outage_start_time = None self.outage_end_time = None @@ -60,8 +61,7 @@ def __init__(self, **kwargs): # self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt", # "interface_partial_network_interrupt", # "partial_nw"] - self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt", - "interface_partial_network_interrupt"] + self.outage_types = ["graceful_shutdown", "container_stop", 
"interface_full_network_interrupt"] # self.outage_types = ["partial_nw"] self.blocked_ports = None self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log") @@ -111,7 +111,26 @@ def create_lvols_with_fio(self, count): lvol_name = f"{self.lvol_name}_{i}" if not is_crypto else f"c{self.lvol_name}_{i}" self.logger.info(f"Creating lvol with Name: {lvol_name}, fs type: {fs_type}, crypto: {is_crypto}") try: - if self.current_outage_node: + self.logger.info(f"Current Outage Node: {self.current_outage_nodes}") + if self.current_outage_nodes: + self.logger.info(f"Primary vs secondary: {self.sn_primary_secondary_map}") + skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes] + self.logger.info(f"Skip Nodes: {skip_nodes}") + for node in self.current_outage_nodes: + skip_nodes.append(node) + self.logger.info(f"Skip Nodes: {skip_nodes}") + self.logger.info(f"Storage Nodes with sec: {self.sn_nodes_with_sec}") + host_id = [node for node in self.sn_nodes_with_sec if node not in skip_nodes] + self.sbcli_utils.add_lvol( + lvol_name=lvol_name, + pool_name=self.pool_name, + size=self.lvol_size, + crypto=is_crypto, + key1=self.lvol_crypt_keys[0], + key2=self.lvol_crypt_keys[1], + host_id=host_id[0] + ) + elif self.current_outage_node: skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node] skip_nodes.append(self.current_outage_node) skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node]) @@ -276,7 +295,7 @@ def create_lvols_with_fio(self, count): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.lvol_mount_details[lvol_name]["iolog_base_path"], }, @@ -306,11 +325,11 @@ def perform_random_outage(self): node_ip = node_details[0]["mgmt_ip"] node_rpc_port = node_details[0]["rpc_port"] - sleep_n_sec(120) 
+ sleep_n_sec(5) for node in self.sn_nodes_with_sec: - self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], - storage_node_id=node) - + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=node) + self.logger.info(f"Skipping lvstore dump!!") for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -417,7 +436,7 @@ def perform_random_outage(self): self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600), + args=(node_ip, active_interfaces, 300), ) self.disconnect_thread.start() elif outage_type == "interface_partial_network_interrupt": @@ -430,7 +449,7 @@ def perform_random_outage(self): self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600), + args=(node_ip, active_interfaces, 300), ) self.disconnect_thread.start() elif outage_type == "partial_nw": @@ -478,12 +497,12 @@ def perform_random_outage(self): self.ssh_obj.disconnect_lvol_node_device(node=self.lvol_mount_details[lvol]["Client"], device=self.lvol_mount_details[lvol]["Device"]) if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(120) + sleep_n_sec(10) return outage_type - def restart_nodes_after_failover(self, outage_type): + def restart_nodes_after_failover(self, outage_type, restart=False): """Perform steps for node restart.""" node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) node_ip = node_details[0]["mgmt_ip"] @@ -543,14 +562,48 @@ def restart_nodes_after_failover(self, outage_type): self.ssh_obj.exec_command(node=self.lvol_mount_details[lvol]["Client"], command=connect) elif outage_type == "container_stop": - self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) - # Log the restart event - 
self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=1) + if restart: + max_retries = 10 + retry_delay = 10 # seconds + + # Retry mechanism for restarting the node + for attempt in range(max_retries): + try: + force=False + if attempt == max_retries - 1: + force=True + self.logger.info("[CHECK] Restarting Node via CLI with Force flag as via API Fails.") + else: + self.logger.info("[CHECK] Restarting Node via CLI as via API Fails.") + self.ssh_obj.restart_node(node=self.mgmt_nodes[0], + node_id=self.current_outage_node, + force=force) + # else: + # self.sbcli_utils.restart_node(node_uuid=self.current_outage_node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to restart node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to restart node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. 
Failed to restart node.") + raise # Rethrow the last exception + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + # Log the restart event + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=0) + else: + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) + # Log the restart event + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=2) elif "network_interrupt" in outage_type: self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000) # Log the restart event - self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=11) + self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=6) if not self.k8s_test: for node in self.storage_nodes: @@ -608,9 +661,9 @@ def restart_nodes_after_failover(self, outage_type): # sleep_n_sec(30) for node in self.sn_nodes_with_sec: - self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], - storage_node_id=node) - + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=node) + self.logger.info(f"Skipping lvstore dump!!") def create_snapshots_and_clones(self): """Create snapshots and clones during an outage.""" @@ -777,7 +830,7 @@ def create_snapshots_and_clones(self): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], }, @@ -786,22 +839,23 @@ def create_snapshots_and_clones(self): self.fio_threads.append(fio_thread) self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.") - self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], - new_size=f"{self.int_lvol_size}G") + if self.lvol_mount_details[lvol]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], + 
new_size=f"{self.int_lvol_size}G") sleep_n_sec(10) - self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], - new_size=f"{self.int_lvol_size}G") - + if self.clone_mount_details[clone_name]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], + new_size=f"{self.int_lvol_size}G") + def delete_random_lvols(self, count): """Delete random lvols during an outage.""" skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node] skip_nodes.append(self.current_outage_node) skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node]) - skip_nodes_lvol = [] - self.logger.info(f"Skipping Nodes: {skip_nodes_lvol}") + self.logger.info(f"Skipping Nodes: {skip_nodes}") available_lvols = [ - lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes_lvol for lvol in lvols + lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes for lvol in lvols ] self.logger.info(f"Available Lvols: {available_lvols}") if len(available_lvols) < count: @@ -922,7 +976,7 @@ def perform_failover_during_outage(self): storage_node_id=node, logs_path=self.docker_logs_path ) - self.create_lvols_with_fio(3) + self.create_lvols_with_fio(5) if not self.k8s_test: for node in self.storage_nodes: self.ssh_obj.restart_docker_logging( @@ -1041,7 +1095,7 @@ def restart_fio(self, iteration): "iodepth": 1, "numjobs": 5, "time_based": True, - "runtime": 2000, + "runtime": 3000, "log_avg_msec": 1000, "iolog_file": self.lvol_mount_details[lvol]["iolog_base_path"], }, @@ -1150,7 +1204,7 @@ def run(self): storage_node_id=node, logs_path=self.docker_logs_path ) - self.create_lvols_with_fio(5) + self.create_lvols_with_fio(3) if not self.k8s_test: for node in self.storage_nodes: self.ssh_obj.restart_docker_logging( @@ -1175,7 +1229,7 @@ def run(self): else: self.logger.info(f"Current outage node: {self.current_outage_node} is secondary node. 
Skipping delete and create") if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(280) + sleep_n_sec(100) for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -1195,7 +1249,7 @@ def run(self): ) self.logger.info("Waiting for fallback.") if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(100) + sleep_n_sec(15) time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time @@ -1213,23 +1267,24 @@ def run(self): no_task_ok = outage_type in {"partial_nw", "partial_nw_single_port", "lvol_disconnect_primary"} if not self.sbcli_utils.is_secondary_node(self.current_outage_node): self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok) + # pass for clone, clone_details in self.clone_mount_details.items(): self.common_utils.validate_fio_test(clone_details["Client"], log_file=clone_details["Log"]) - # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) - # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) for lvol, lvol_details in self.lvol_mount_details.items(): self.common_utils.validate_fio_test(lvol_details["Client"], log_file=lvol_details["Log"]) - # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) - # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) # Perform failover 
and manage resources during outage outage_type = self.perform_failover_during_outage() if outage_type != "partial_nw" or outage_type != "partial_nw_single_port": - sleep_n_sec(100) + sleep_n_sec(15) time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py new file mode 100644 index 000000000..afa98b055 --- /dev/null +++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py @@ -0,0 +1,535 @@ +# stress_test/continuous_failover_ha_multi_client_quick_outage.py +# Fast outages with long-running FIO, no churn beyond initial setup. +# - Create lvols, snapshots, clones ONCE at the beginning +# - Start 30min FIO on all mounts (lvols + clones) +# - Run fast outages (as soon as node is ONLINE again) +# - Every 5 outages: wait for all FIO to complete, validate, then (optionally) wait for migration window +# - Graceful shutdown: suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE -> keep offline 5 min -> restart +# - After any restart: 15–30s idle then immediately next outage + +import os +import random +import string +import threading +from datetime import datetime +from utils.common_utils import sleep_n_sec +from exceptions.custom_exception import LvolNotConnectException +from stress_test.lvol_ha_stress_fio import TestLvolHACluster + + +def _rand_id(n=15, first_alpha=True): + letters = string.ascii_uppercase + digits = string.digits + allc = letters + digits + if first_alpha: + return random.choice(letters) + ''.join(random.choices(allc, k=n-1)) + return ''.join(random.choices(allc, k=n)) + + +class RandomRapidFailoverNoGap(TestLvolHACluster): + """ + - Minimal churn (only bootstrap creates) + - Long FIO (30 mins) on every lvol/clone + - Outage pacing: next outage right after ONLINE; add 15–30s buffer post-restart + - Validate FIO and 
pause for migration every 5 outages + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Base knobs + self.total_lvols = 20 + self.lvol_size = "40G" + self.fio_size = "15G" + + # Validation cadence & FIO runtime + self.validate_every = 5 + self._iter = 0 + self._per_wave_fio_runtime = 3600 # 60 minutes + self._fio_wait_timeout = 5000 # wait for all to finish + + # Internal state + self.fio_threads = [] + self.lvol_mount_details = {} + self.clone_mount_details = {} + self.sn_nodes = [] + self.sn_nodes_with_sec = [] + self.sn_primary_secondary_map = {} + self.node_vs_lvol = {} + self.snapshot_names = [] + self.snap_vs_node = {} + self.current_outage_node = None + self.outage_start_time = None + self.outage_end_time = None + self.first_outage_ts = None # track the first outage for migration window + self.test_name = "longfio_nochurn_rapid_outages" + + self.outage_types = [ + "graceful_shutdown", + "container_stop", + # "interface_full_network_interrupt", + ] + + # Names + self.lvol_base = f"lvl{_rand_id(12)}" + self.clone_base = f"cln{_rand_id(12)}" + self.snap_base = f"snap{_rand_id(12)}" + + # Logging file for outages + self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log") + self._init_outage_log() + + # ---------- small utilities ---------- + + def _init_outage_log(self): + os.makedirs(os.path.dirname(self.outage_log_file), exist_ok=True) + with open(self.outage_log_file, "w") as f: + f.write("Timestamp,Node,Outage_Type,Event\n") + + def _log_outage_event(self, node, outage_type, event): + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + with open(self.outage_log_file, "a") as f: + f.write(f"{ts},{node},{outage_type},{event}\n") + + def _short_bs(self): + # return f"{2 ** random.randint(2, 7)}K" # 4K–128K + return f"{2 ** 6}K" + + def _pick_outage(self): + random.shuffle(self.outage_types) + return self.outage_types[0] + + # ---------- cluster bootstrap ---------- + + def 
_wait_cluster_active(self, timeout=900, poll=5): + """ + Poll `sbctl cluster list` until status ACTIVE. + Avoids 400 in_activation when creating lvol/snap/clone during bring-up. + """ + end = datetime.now().timestamp() + timeout + while datetime.now().timestamp() < end: + try: + info = self.ssh_obj.cluster_list(self.mgmt_nodes[0], self.cluster_id) # must wrap "sbctl cluster list" + self.logger.info(info) + # Expect a single row with Status + status = str(info).upper() + if "ACTIVE" in status: + return + except Exception as e: + self.logger.info(f"ERROR: {e}") + sleep_n_sec(poll) + raise RuntimeError("Cluster did not become ACTIVE within timeout") + + def _bootstrap_cluster(self): + # Ensure Cluster is ACTIVE + self._wait_cluster_active() + + # create pool + self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + + # discover storage nodes + storage_nodes = self.sbcli_utils.get_storage_nodes() + for res in storage_nodes['results']: + self.sn_nodes.append(res["uuid"]) + self.sn_nodes_with_sec.append(res["uuid"]) + self.sn_primary_secondary_map[res["uuid"]] = res["secondary_node_id"] + + self.logger.info(f"[LFNG] SN sec map: {self.sn_primary_secondary_map}") + + # initial lvols + mount + then later clone from snapshots + self._create_lvols(count=self.total_lvols) # start_fio=False → we launch after clones + self._seed_snapshots_and_clones() # also mounts clones + + # Start 30 min FIO on all (lvols + clones) + self._kick_fio_for_all(runtime=self._per_wave_fio_runtime) + + # start container logs + if not self.k8s_test: + for node in self.storage_nodes: + self.ssh_obj.restart_docker_logging( + node_ip=node, + containers=self.container_nodes[node], + log_dir=os.path.join(self.docker_logs_path, node), + test_name=self.test_name + ) + else: + self.runner_k8s_log.restart_logging() + + # ---------- lvol / fio helpers ---------- + + def _create_lvols(self, count=1): + for _ in range(count): + fs_type = random.choice(["ext4", "xfs"]) + is_crypto = random.choice([True, 
False]) + name_core = f"{self.lvol_base}_{_rand_id(6, first_alpha=False)}" + lvol_name = name_core if not is_crypto else f"c{name_core}" + + kwargs = dict( + lvol_name=lvol_name, + pool_name=self.pool_name, + size=self.lvol_size, + crypto=is_crypto, + key1=self.lvol_crypt_keys[0], + key2=self.lvol_crypt_keys[1], + ) + + # Avoid outage node & partner during initial placement + if self.current_outage_node: + skip_nodes = [self.current_outage_node, self.sn_primary_secondary_map.get(self.current_outage_node)] + skip_nodes += [p for p, s in self.sn_primary_secondary_map.items() if s == self.current_outage_node] + host_id = [n for n in self.sn_nodes_with_sec if n not in skip_nodes] + if host_id: + kwargs["host_id"] = host_id[0] + + # Ensure cluster ACTIVE before creating + self._wait_cluster_active() + + try: + self.sbcli_utils.add_lvol(**kwargs) + except Exception as e: + self.logger.warning(f"[LFNG] lvol create failed ({lvol_name}) → {e}; retry once after ACTIVE gate") + self._wait_cluster_active() + self.sbcli_utils.add_lvol(**kwargs) + + # record + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name) + self.lvol_mount_details[lvol_name] = { + "ID": lvol_id, + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{lvol_name}.log", + "snapshots": [], + "iolog_base_path": f"{self.log_path}/{lvol_name}_fio_iolog", + } + + # refresh list + self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=f"{self.base_cmd} lvol list", supress_logs=True) + + # track node placement + lvol_node_id = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["node_id"] + self.node_vs_lvol.setdefault(lvol_node_id, []).append(lvol_name) + + # connect + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name) + self.lvol_mount_details[lvol_name]["Command"] = connect_ls + + client_node = random.choice(self.fio_node) + self.lvol_mount_details[lvol_name]["Client"] = client_node + + initial = 
self.ssh_obj.get_devices(node=client_node) + for c in connect_ls: + _, err = self.ssh_obj.exec_command(node=client_node, command=c) + if err: + nqn = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["nqn"] + self.ssh_obj.disconnect_nvme(node=client_node, nqn_grep=nqn) + self.logger.info(f"[LFNG] connect error → clean lvol {lvol_name}") + self.sbcli_utils.delete_lvol(lvol_name=lvol_name, max_attempt=20, skip_error=True) + sleep_n_sec(3) + del self.lvol_mount_details[lvol_name] + self.node_vs_lvol[lvol_node_id].remove(lvol_name) + break + + final = self.ssh_obj.get_devices(node=client_node) + new_dev = None + for d in final: + if d not in initial: + new_dev = f"/dev/{d.strip()}" + break + if not new_dev: + raise LvolNotConnectException("LVOL did not connect") + + self.lvol_mount_details[lvol_name]["Device"] = new_dev + self.ssh_obj.format_disk(node=client_node, device=new_dev, fs_type=fs_type) + + mnt = f"{self.mount_path}/{lvol_name}" + self.ssh_obj.mount_path(node=client_node, device=new_dev, mount_path=mnt) + self.lvol_mount_details[lvol_name]["Mount"] = mnt + + # clean old logs + self.ssh_obj.delete_files(client_node, [ + f"{mnt}/*fio*", + f"{self.log_path}/local-{lvol_name}_fio*", + f"{self.log_path}/{lvol_name}_fio_iolog*" + ]) + + def _seed_snapshots_and_clones(self): + """Create one snapshot and one clone per lvol (best effort). 
Mount clones on same client.""" + for lvol, det in list(self.lvol_mount_details.items()): + # Ensure ACTIVE + self._wait_cluster_active() + + snap_name = f"{self.snap_base}_{_rand_id(8, first_alpha=False)}" + out, err = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], det["ID"], snap_name) + if "(False," in str(out) or "(False," in str(err): + self.logger.warning(f"[LFNG] snapshot create failed for {lvol} → skip clone") + continue + + self.snapshot_names.append(snap_name) + node_id = self.sbcli_utils.get_lvol_details(lvol_id=det["ID"])[0]["node_id"] + self.snap_vs_node[snap_name] = node_id + det["snapshots"].append(snap_name) + + snap_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snap_name) + clone_name = f"{self.clone_base}_{_rand_id(8, first_alpha=False)}" + try: + self.ssh_obj.add_clone(self.mgmt_nodes[0], snap_id, clone_name) + except Exception as e: + self.logger.warning(f"[LFNG] clone create failed for {lvol} → {e}") + continue + + # connect clone + fs_type = det["FS"] + client = det["Client"] + + self.clone_mount_details[clone_name] = { + "ID": self.sbcli_utils.get_lvol_id(clone_name), + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{clone_name}.log", + "snapshot": snap_name, + "Client": client, + "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog", + } + + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name) + self.clone_mount_details[clone_name]["Command"] = connect_ls + + initial = self.ssh_obj.get_devices(node=client) + for c in connect_ls: + _, err = self.ssh_obj.exec_command(node=client, command=c) + if err: + nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"] + self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) + self.logger.info(f"[LFNG] connect clone error → cleanup") + self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(3) + del 
self.clone_mount_details[clone_name] + continue + + final = self.ssh_obj.get_devices(node=client) + new_dev = None + for d in final: + if d not in initial: + new_dev = f"/dev/{d.strip()}" + break + if not new_dev: + raise LvolNotConnectException("Clone did not connect") + + self.clone_mount_details[clone_name]["Device"] = new_dev + if fs_type == "xfs": + self.ssh_obj.clone_mount_gen_uuid(client, new_dev) + mnt = f"{self.mount_path}/{clone_name}" + self.ssh_obj.mount_path(node=client, device=new_dev, mount_path=mnt) + self.clone_mount_details[clone_name]["Mount"] = mnt + + # purge old logs + self.ssh_obj.delete_files(client, [ + f"{self.log_path}/local-{clone_name}_fio*", + f"{self.log_path}/{clone_name}_fio_iolog*", + f"{mnt}/*fio*" + ]) + + def _kick_fio_for_all(self, runtime=None): + """Start verified fio (PID-checked; auto-rerun) for all lvols + clones.""" + # small stagger to avoid SSH bursts + def _launch(name, det): + self.ssh_obj.run_fio_test( + det["Client"], None, det["Mount"], det["Log"], + size=self.fio_size, name=f"{name}_fio", rw="randrw", + bs=self._short_bs(), nrfiles=8, iodepth=1, numjobs=2, + time_based=True, runtime=runtime, log_avg_msec=1000, + iolog_file=det["iolog_base_path"], max_latency="30s", + verify="md5", verify_dump=1, verify_fatal=1, retries=6, + use_latency=False + ) + + for lvol, det in self.lvol_mount_details.items(): + self.ssh_obj.delete_files(det["Client"], [f"/mnt/{lvol}/*"]) + t = threading.Thread(target=_launch, args=(lvol, det)) + t.start() + self.fio_threads.append(t) + sleep_n_sec(0.2) + + for cname, det in self.clone_mount_details.items(): + self.ssh_obj.delete_files(det["Client"], [f"/mnt/{cname}/*"]) + t = threading.Thread(target=_launch, args=(cname, det)) + t.start() + self.fio_threads.append(t) + sleep_n_sec(0.2) + + # ---------- outage flow ---------- + + def _perform_outage(self): + random.shuffle(self.sn_nodes) + self.current_outage_node = self.sn_nodes[0] + outage_type = self._pick_outage() + + if 
self.first_outage_ts is None: + self.first_outage_ts = int(datetime.now().timestamp()) + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=self.current_outage_node, + logs_path=self.docker_logs_path + ) + + # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + # storage_node_id=self.current_outage_node) + + self.outage_start_time = int(datetime.now().timestamp()) + self._log_outage_event(self.current_outage_node, outage_type, "Outage started") + self.logger.info(f"[LFNG] Outage={outage_type} node={self.current_outage_node}") + + node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + node_ip = node_details[0]["mgmt_ip"] + node_rpc_port = node_details[0]["rpc_port"] + + if outage_type == "graceful_shutdown": + # suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE + try: + self.logger.info(f"[LFNG] Suspending node via: sbcli-dev sn suspend {self.current_outage_node}") + self.sbcli_utils.suspend_node(node_uuid=self.current_outage_node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "suspended", timeout=600) + except Exception: + self.logger.warning("[LFNG] Suspend failed from API; ignoring if already suspended") + + try: + self.sbcli_utils.shutdown_node(node_uuid=self.current_outage_node, force=True, expected_error_code=[503]) + except Exception: + self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], node_id=self.current_outage_node, force=True) + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "offline", timeout=900) + + for node in self.sn_nodes_with_sec: + if node != self.current_outage_node: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + 
storage_node_id=node, + logs_path=self.docker_logs_path + ) + # Keep node strictly offline for 5 minutes + sleep_n_sec(500) + + elif outage_type == "container_stop": + self.ssh_obj.stop_spdk_process(node_ip, node_rpc_port) + + elif outage_type == "interface_full_network_interrupt": + # Down all active data interfaces for ~300s (5 minutes) with ping verification + active = self.ssh_obj.get_active_interfaces(node_ip) + self.ssh_obj.disconnect_all_active_interfaces(node_ip, active, 300) + sleep_n_sec(280) + + return outage_type + + def restart_nodes_after_failover(self, outage_type): + node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + + self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}") + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.sn_primary_secondary_map[self.current_outage_node]) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=self.sn_primary_secondary_map[self.current_outage_node], + logs_path=self.docker_logs_path + ) + + # Only wait for ONLINE (skip deep health) + if outage_type == 'graceful_shutdown': + try: + self.ssh_obj.restart_node(self.mgmt_nodes[0], node_id=self.current_outage_node, force=True) + except Exception: + pass + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + elif outage_type == 'container_stop': + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + elif "network_interrupt" in outage_type: + self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900) + + self._log_outage_event(self.current_outage_node, outage_type, "Node online") + self.outage_end_time = int(datetime.now().timestamp()) + + cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + 
self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=self.current_outage_node, + logs_path=self.docker_logs_path + ) + + # keep container log streaming going + if not self.k8s_test: + for node in self.storage_nodes: + self.ssh_obj.restart_docker_logging( + node_ip=node, + containers=self.container_nodes[node], + log_dir=os.path.join(self.docker_logs_path, node), + test_name=self.test_name + ) + else: + self.runner_k8s_log.restart_logging() + + # small cool-down before next outage to reduce SSH churn + # sleep_n_sec(random.randint(1, 5)) + + # ---------- main ---------- + + def run(self): + self.logger.info("[LFNG] Starting RandomRapidFailoverNoGap") + self._bootstrap_cluster() + sleep_n_sec(5) + + iteration = 1 + while True: + outage_type = self._perform_outage() + self.restart_nodes_after_failover(outage_type) + + self._iter += 1 + if self._iter % self.validate_every == 0: + self.logger.info(f"[LFNG] {self._iter} outages → wait & validate all FIO") + # Join launch threads so we know all jobs issued + for t in self.fio_threads: + t.join(timeout=10) + self.fio_threads = [] + + # Wait for all fio jobs to end (they’re 30min jobs) + self.common_utils.manage_fio_threads(self.fio_node, [], timeout=self._fio_wait_timeout) + + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + + self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], + storage_node_id=node) + + # Validate logs + for lvol, det in self.lvol_mount_details.items(): + self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"]) + for cname, det in self.clone_mount_details.items(): + self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"]) + + # Optional: wait for migration window after FIO completes + # (replace with your actual 
migration-check, if any) + self.logger.info("[LFNG] FIO validated; pausing briefly for migration window") + sleep_n_sec(10) + + # Re-kick next 30min wave + self._kick_fio_for_all(runtime=self._per_wave_fio_runtime) + self.logger.info("[LFNG] Next FIO wave started") + + self.logger.info(f"[LFNG] Iter {iteration} complete → starting next outage ASAP") + iteration += 1 \ No newline at end of file diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage.py b/e2e/stress_test/continuous_failover_ha_multi_outage.py index fb5f6d507..e96a0b547 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_outage.py +++ b/e2e/stress_test/continuous_failover_ha_multi_outage.py @@ -1,5 +1,6 @@ from utils.common_utils import sleep_n_sec from datetime import datetime +from collections import defaultdict from stress_test.continuous_failover_ha_multi_client import RandomMultiClientFailoverTest from exceptions.custom_exception import LvolNotConnectException import threading @@ -8,13 +9,20 @@ import os +generated_sequences = set() + def generate_random_sequence(length): letters = string.ascii_uppercase numbers = string.digits all_chars = letters + numbers - first_char = random.choice(letters) - remaining_chars = ''.join(random.choices(all_chars, k=length - 1)) - return first_char + remaining_chars + + while True: + first_char = random.choice(letters) + remaining_chars = ''.join(random.choices(all_chars, k=length-1)) + result = first_char + remaining_chars + if result not in generated_sequences: + generated_sequences.add(result) + return result class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest): @@ -25,7 +33,7 @@ class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest): def __init__(self, **kwargs): super().__init__(**kwargs) - self.total_lvols = 20 + self.total_lvols = 40 self.lvol_name = f"lvl{generate_random_sequence(15)}" self.clone_name = f"cln{generate_random_sequence(15)}" self.snapshot_name = f"snap{generate_random_sequence(15)}" @@ 
-48,9 +56,12 @@ def __init__(self, **kwargs): self.lvols_without_sec_connect = [] self.test_name = "n_plus_k_failover_multi_client_ha" self.outage_types = [ + "graceful_shutdown", + "interface_full_network_interrupt" + ] + self.outage_types2 = [ "container_stop", "graceful_shutdown", - "interface_partial_network_interrupt", "interface_full_network_interrupt" ] self.blocked_ports = None @@ -61,30 +72,101 @@ def _initialize_outage_log(self): with open(self.outage_log_file, 'w') as log: log.write("Timestamp,Node,Outage_Type,Event\n") - def log_outage_event(self, node, outage_type, event): - timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + def log_outage_event(self, node, outage_type, event, outage_time=0): + """Log an outage event to the outage log file. + + Args: + node (str): Node UUID or IP where the event occurred. + outage_type (str): Type of outage (e.g., port_network_interrupt, container_stop, graceful_shutdown). + event (str): Event description (e.g., 'Outage started', 'Node restarted'). + outage_time (int): Minutes to add to self.outage_start_time. If 0/None, use current time. 
+ """ + # Compute timestamp + if outage_time: + # Uses self.outage_start_time (epoch seconds) + outage_time (minutes) + base_epoch = getattr(self, "outage_start_time", None) + if isinstance(base_epoch, (int, float)) and base_epoch > 0: + ts_dt = datetime.fromtimestamp(int(base_epoch) + int(outage_time) * 60) + else: + # Fallback to now if outage_start_time is missing/invalid + ts_dt = datetime.now() + else: + ts_dt = datetime.now() + + timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S') + + # Write the log line with open(self.outage_log_file, 'a') as log: log.write(f"{timestamp},{node},{outage_type},{event}\n") + def _build_reverse_secondary_map(self): + rev = defaultdict(set) # secondary -> {primary,...} + for p, s in self.sn_primary_secondary_map.items(): + if s: + rev[s].add(p) + return rev + + def _pick_outage_nodes(self, primary_candidates, k): + rev = self._build_reverse_secondary_map() + order = primary_candidates[:] + + random.shuffle(order) + + chosen, blocked = [], set() + for node in order: + if node in blocked: + continue + + chosen.append(node) + blocked.add(node) # itself + sec = self.sn_primary_secondary_map.get(node) + if sec: + blocked.add(sec) # its secondary + blocked.update(rev.get(node, ())) # any primary whose secondary == node + + if len(chosen) == k: + break + + if len(chosen) < k: + raise Exception( + f"Cannot pick {k} nodes without primary/secondary conflicts; only {len(chosen)} possible with current topology." + ) + return chosen + def perform_n_plus_k_outages(self): """ - Perform K (self.npcs) parallel outages as part of N+K configuration. - Ensure only primary nodes are selected for outage. + Select K outage nodes such that no two are in a primary/secondary + relationship (in either direction). Candidates = keys of the map. 
""" - primary_nodes = [node for node in self.sn_nodes if not self.sbcli_utils.is_secondary_node(node)] + # Candidates are nodes that are primary *for someone* (map keys) + primary_candidates = list(self.sn_primary_secondary_map.keys()) + self.current_outage_nodes = [] - if len(primary_nodes) < self.npcs: - raise Exception(f"Not enough primary nodes to perform {self.npcs} outages. Found only {len(primary_nodes)}.") + if len(primary_candidates) < self.npcs: + raise Exception( + f"Need {self.npcs} outage nodes, but only {len(primary_candidates)} primary-role nodes exist." + ) - outage_nodes = random.sample(primary_nodes, k=self.npcs) + outage_nodes = self._pick_outage_nodes(primary_candidates, self.npcs) + self.logger.info(f"Selected outage nodes: {outage_nodes}") outage_combinations = [] - + outage_num = 0 for node in outage_nodes: - outage_type = random.choice(self.outage_types) + if outage_num == 0: + outage_type = random.choice(self.outage_types) + outage_num = 1 + else: + outage_type = random.choice(self.outage_types2) node_details = self.sbcli_utils.get_storage_node_details(node) node_ip = node_details[0]["mgmt_ip"] node_rpc_port = node_details[0]["rpc_port"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + self.logger.info(f"Performing {outage_type} on primary node {node}.") self.log_outage_event(node, outage_type, "Outage started") @@ -105,26 +187,74 @@ def perform_n_plus_k_outages(self): def _graceful_shutdown_node(self, node): try: - self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503]) - self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000) - self.sbcli_utils.shutdown_node(node_uuid=node, expected_error_code=[503]) - self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000) + sleep_n_sec(10) + max_retries = 10 + retry_delay = 10 # seconds + # Retry mechanism for suspending the node + for attempt in range(max_retries): + 
try: + if attempt == max_retries - 1: + self.logger.info("[CHECK] Suspending Node via CLI as via API Fails.") + self.ssh_obj.suspend_node(node=self.mgmt_nodes[0], + node_id=node) + else: + self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to suspend node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to suspend node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. Failed to suspend node.") + raise # Rethrow the last exception + + sleep_n_sec(10) # Wait before shutting down + + # Retry mechanism for shutting down the node + for attempt in range(max_retries): + try: + if attempt == max_retries - 1: + self.logger.info("[CHECK] Shutting down Node via CLI as via API Fails.") + self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], + node_id=node, + force=True) + else: + self.sbcli_utils.shutdown_node(node_uuid=node, force=True, + expected_error_code=[503]) + self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000) + break # Exit loop if successful + except Exception as _: + if attempt < max_retries - 2: + self.logger.info(f"Attempt {attempt + 1} failed to shutdown node. Retrying in {retry_delay} seconds...") + sleep_n_sec(retry_delay) + elif attempt < max_retries - 1: + self.logger.info(f"Attempt {attempt + 1} failed to shutdown node via API. Retrying in {retry_delay} seconds via CMD...") + sleep_n_sec(retry_delay) + else: + self.logger.info("Max retries reached. 
Failed to shutdown node.") + raise # Rethrow the last exception except Exception as e: self.logger.error(f"Failed graceful shutdown for node {node}: {str(e)}") def _disconnect_partial_interface(self, node, node_ip): active_interfaces = [nic["if_name"] for nic in self.sbcli_utils.get_storage_node_details(node)[0]["data_nics"]] + active_interfaces = ['eth1'] self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600) + args=(node_ip, active_interfaces, 300) ) self.disconnect_thread.start() def _disconnect_full_interface(self, node, node_ip): + self.logger.info("Handling full interface based network interruption...") active_interfaces = self.ssh_obj.get_active_interfaces(node_ip) self.disconnect_thread = threading.Thread( target=self.ssh_obj.disconnect_all_active_interfaces, - args=(node_ip, active_interfaces, 600) + args=(node_ip, active_interfaces, 300) ) self.disconnect_thread.start() @@ -134,50 +264,81 @@ def delete_random_lvols(self, count): lvol for node, lvols in self.node_vs_lvol.items() if node not in self.current_outage_nodes for lvol in lvols ] + + self.logger.info(f"Available Lvols: {available_lvols}") if len(available_lvols) < count: self.logger.warning("Not enough lvols available to delete the requested count.") count = len(available_lvols) for lvol in random.sample(available_lvols, count): - self.logger.info(f"Deleting lvol {lvol}") + self.logger.info(f"Deleting lvol {lvol}.") snapshots = self.lvol_mount_details[lvol]["snapshots"] to_delete = [] - - # Handle dependent clones for clone_name, clone_details in self.clone_mount_details.items(): if clone_details["snapshot"] in snapshots: - self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"]) + self.common_utils.validate_fio_test(clone_details["Client"], + log_file=clone_details["Log"]) self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False) fio_pids = 
self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True) + sleep_n_sec(10) for pid in fio_pids: self.ssh_obj.kill_processes(clone_details["Client"], pid=pid) + attempt = 1 + while len(fio_pids) > 2: + self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False) + fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True) + if attempt >= 30: + raise Exception("FIO not killed on clone") + attempt += 1 + sleep_n_sec(20) + + sleep_n_sec(10) self.ssh_obj.unmount_path(clone_details["Client"], f"/mnt/{clone_name}") self.ssh_obj.remove_dir(clone_details["Client"], dir_path=f"/mnt/{clone_name}") self.disconnect_lvol(clone_details['ID']) - self.sbcli_utils.delete_lvol(clone_name) + self.sbcli_utils.delete_lvol(clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(30) if clone_name in self.lvols_without_sec_connect: self.lvols_without_sec_connect.remove(clone_name) to_delete.append(clone_name) - + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone_name}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}_fio_iolog*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"/mnt/{clone_name}/*"]) + # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}*.log"]) for del_key in to_delete: del self.clone_mount_details[del_key] - - # Delete snapshots for snapshot in snapshots: snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot) + # snapshot_node = self.snap_vs_node[snapshot] + # if snapshot_node not in skip_nodes: self.ssh_obj.delete_snapshot(self.mgmt_nodes[0], snapshot_id=snapshot_id) self.snapshot_names.remove(snapshot) - # Stop FIO and cleanup lvol - self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], self.lvol_mount_details[lvol]["Log"]) + 
self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], + log_file=self.lvol_mount_details[lvol]["Log"]) self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False) + sleep_n_sec(10) fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True) for pid in fio_pids: self.ssh_obj.kill_processes(self.lvol_mount_details[lvol]["Client"], pid=pid) + attempt = 1 + while len(fio_pids) > 2: + self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False) + fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True) + if attempt >= 30: + raise Exception("FIO not killed on lvols") + attempt += 1 + sleep_n_sec(20) + + sleep_n_sec(10) self.ssh_obj.unmount_path(self.lvol_mount_details[lvol]["Client"], f"/mnt/{lvol}") self.ssh_obj.remove_dir(self.lvol_mount_details[lvol]["Client"], dir_path=f"/mnt/{lvol}") self.disconnect_lvol(self.lvol_mount_details[lvol]['ID']) - self.sbcli_utils.delete_lvol(lvol) + self.sbcli_utils.delete_lvol(lvol, max_attempt=20, skip_error=True) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) + self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"/mnt/{lvol}/*"]) + # self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}*.log"]) if lvol in self.lvols_without_sec_connect: self.lvols_without_sec_connect.remove(lvol) del self.lvol_mount_details[lvol] @@ -190,14 +351,19 @@ def delete_random_lvols(self, count): def create_snapshots_and_clones(self): """Create snapshots and clones during an outage, avoiding lvols on outage nodes.""" self.int_lvol_size += 1 + skip_nodes = [node for node in self.sn_primary_secondary_map if 
self.sn_primary_secondary_map[node] in self.current_outage_nodes] + self.logger.info(f"Skip Nodes: {skip_nodes}") + for node in self.current_outage_nodes: + skip_nodes.append(node) + self.logger.info(f"Skip Nodes: {skip_nodes}") available_lvols = [ lvol for node, lvols in self.node_vs_lvol.items() - if node not in self.current_outage_nodes for lvol in lvols + if node not in skip_nodes for lvol in lvols ] if not available_lvols: self.logger.warning("No available lvols to create snapshots and clones.") return - + self.logger.info(f"Available lvols: {available_lvols}") for _ in range(3): random.shuffle(available_lvols) lvol = available_lvols[0] @@ -205,69 +371,140 @@ def create_snapshots_and_clones(self): temp_name = generate_random_sequence(5) if snapshot_name in self.snapshot_names: snapshot_name = f"{snapshot_name}_{temp_name}" - try: output, error = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name) - if "(False," in output or "(False," in error: - raise Exception(output or error) + if "(False," in output: + raise Exception(output) + if "(False," in error: + raise Exception(error) except Exception as e: - self.logger.warning(f"Snapshot creation failed: {e}") - continue - + self.logger.warning(f"Snap creation fails with {str(e)}. 
Retrying with different name.") + try: + snapshot_name = f"snap_{lvol}" + temp_name = generate_random_sequence(5) + snapshot_name = f"{snapshot_name}_{temp_name}" + self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name) + except Exception as exp: + self.logger.warning(f"Retry Snap creation fails with {str(exp)}.") + continue + self.snapshot_names.append(snapshot_name) + lvol_node_id = self.sbcli_utils.get_lvol_details( + lvol_id=self.lvol_mount_details[lvol]["ID"])[0]["node_id"] + self.snap_vs_node[snapshot_name] = lvol_node_id self.lvol_mount_details[lvol]["snapshots"].append(snapshot_name) - clone_name = f"clone_{generate_random_sequence(15)}" + if clone_name in list(self.clone_mount_details): + clone_name = f"{clone_name}_{temp_name}" sleep_n_sec(30) snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot_name) try: self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name) except Exception as e: - self.logger.warning(f"Clone creation failed: {e}") - continue - + self.logger.warning(f"Clone creation fails with {str(e)}. 
Retrying with different name.") + try: + clone_name = f"clone_{generate_random_sequence(15)}" + temp_name = generate_random_sequence(5) + clone_name = f"{clone_name}_{temp_name}" + self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name) + except Exception as exp: + self.logger.warning(f"Retry Clone creation fails with {str(exp)}.") + continue fs_type = self.lvol_mount_details[lvol]["FS"] client = self.lvol_mount_details[lvol]["Client"] self.clone_mount_details[clone_name] = { - "ID": self.sbcli_utils.get_lvol_id(clone_name), - "Command": None, - "Mount": None, - "Device": None, - "MD5": None, - "FS": fs_type, - "Log": f"{self.log_path}/{clone_name}.log", - "snapshot": snapshot_name, - "Client": client + "ID": self.sbcli_utils.get_lvol_id(clone_name), + "Command": None, + "Mount": None, + "Device": None, + "MD5": None, + "FS": fs_type, + "Log": f"{self.log_path}/{clone_name}.log", + "snapshot": snapshot_name, + "Client": client, + "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog" } + self.logger.info(f"Created clone {clone_name}.") + + sleep_n_sec(3) + + self.ssh_obj.exec_command(node=self.mgmt_nodes[0], + command=f"{self.base_cmd} lvol list") + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name) self.clone_mount_details[clone_name]["Command"] = connect_ls + + # if self.secondary_outage: + # connect_ls = [connect_ls[0]] + # self.lvols_without_sec_connect.append(clone_name) + initial_devices = self.ssh_obj.get_devices(node=client) for connect_str in connect_ls: _, error = self.ssh_obj.exec_command(node=client, command=connect_str) if error: - self.logger.warning(f"Clone connect failed: {error}") + lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"]) + nqn = lvol_details[0]["nqn"] + self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) + self.logger.info(f"Connecting clone {clone_name} has error: {error}. 
Disconnect all connections for that clone!!") + self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) + sleep_n_sec(30) + del self.clone_mount_details[clone_name] continue + sleep_n_sec(3) final_devices = self.ssh_obj.get_devices(node=client) - lvol_device = next((f"/dev/{d.strip()}" for d in final_devices if d not in initial_devices), None) + lvol_device = None + for device in final_devices: + if device not in initial_devices: + lvol_device = f"/dev/{device.strip()}" + break if not lvol_device: - raise LvolNotConnectException("Clone device not found") + raise LvolNotConnectException("LVOL did not connect") self.clone_mount_details[clone_name]["Device"] = lvol_device + # Mount and Run FIO if fs_type == "xfs": self.ssh_obj.clone_mount_gen_uuid(client, lvol_device) - mount_point = f"{self.mount_path}/{clone_name}" self.ssh_obj.mount_path(node=client, device=lvol_device, mount_path=mount_point) self.clone_mount_details[clone_name]["Mount"] = mount_point + # clone_node_id = self.sbcli_utils.get_lvol_details( + # lvol_id=self.lvol_mount_details[clone_name]["ID"])[0]["node_id"] + + # self.node_vs_lvol[clone_node_id].append(clone_name) + + sleep_n_sec(10) + self.ssh_obj.delete_files(client, [f"{mount_point}/*fio*"]) self.ssh_obj.delete_files(client, [f"{self.log_path}/local-{clone_name}_fio*"]) - + self.ssh_obj.delete_files(client, [f"{self.log_path}/{clone_name}_fio_iolog*"]) + + sleep_n_sec(5) + + # Start FIO + # fio_thread = threading.Thread( + # target=self.ssh_obj.run_fio_test, + # args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]), + # kwargs={ + # "size": self.fio_size, + # "name": f"{clone_name}_fio", + # "rw": "randrw", + # "bs": f"{2 ** random.randint(2, 7)}K", + # "nrfiles": 16, + # "iodepth": 1, + # "numjobs": 5, + # "time_based": True, + # "runtime": 2000, + # "log_avg_msec": 1000, + # "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], + # "debug": 
True, + # }, + # ) fio_thread = threading.Thread( target=self.ssh_obj.run_fio_test, - args=(client, None, mount_point, self.clone_mount_details[clone_name]["Log"]), + args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]), kwargs={ "size": self.fio_size, "name": f"{clone_name}_fio", @@ -278,15 +515,21 @@ def create_snapshots_and_clones(self): "numjobs": 5, "time_based": True, "runtime": 2000, + "log_avg_msec": 1000, + "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"], }, ) fio_thread.start() self.fio_threads.append(fio_thread) + self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.") - self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}") - self.sbcli_utils.resize_lvol(self.lvol_mount_details[lvol]["ID"], f"{self.int_lvol_size}G") + if self.lvol_mount_details[lvol]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"], + new_size=f"{self.int_lvol_size}G") sleep_n_sec(10) - self.sbcli_utils.resize_lvol(self.clone_mount_details[clone_name]["ID"], f"{self.int_lvol_size}G") + if self.clone_mount_details[clone_name]["ID"]: + self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"], + new_size=f"{self.int_lvol_size}G") def run(self): @@ -301,6 +544,8 @@ def run(self): for result in storage_nodes['results']: self.sn_nodes.append(result["uuid"]) self.sn_nodes_with_sec.append(result["uuid"]) + self.sn_primary_secondary_map[result["uuid"]] = result["secondary_node_id"] + self.logger.info(f"Secondary node map: {self.sn_primary_secondary_map}") sleep_n_sec(30) @@ -320,11 +565,23 @@ def run(self): for node, outage_type in outage_events: self.current_outage_node = node - self.restart_nodes_after_failover(outage_type) + if outage_type == "container_stop" and self.npcs > 1: + self.restart_nodes_after_failover(outage_type, True) + else: + self.restart_nodes_after_failover(outage_type) self.logger.info("Waiting 
for fallback recovery.") sleep_n_sec(100) + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) + time_duration = self.common_utils.calculate_time_duration( start_timestamp=self.outage_start_time, end_timestamp=self.outage_end_time @@ -343,12 +600,27 @@ def run(self): # for node, outage_type in outage_events: # if not self.sbcli_utils.is_secondary_node(node): self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok) + self.common_utils.manage_fio_threads(self.fio_node, self.fio_threads, timeout=20000) for clone, clone_details in self.clone_mount_details.items(): self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"]) + self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"]) for lvol, lvol_details in self.lvol_mount_details.items(): self.common_utils.validate_fio_test(lvol_details["Client"], lvol_details["Log"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"]) + self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"]) self.logger.info(f"N+K failover iteration {iteration} complete.") + + for node in self.sn_nodes_with_sec: + cur_node_details = self.sbcli_utils.get_storage_node_details(node) + cur_node_ip = cur_node_details[0]["mgmt_ip"] + self.ssh_obj.fetch_distrib_logs( + storage_node_ip=cur_node_ip, + storage_node_id=node, + logs_path=self.docker_logs_path + ) iteration += 1 + diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py index bd06f06f7..ee265d507 100644 --- a/e2e/utils/ssh_utils.py +++ b/e2e/utils/ssh_utils.py @@ -13,6 +13,10 @@ import string import re import 
subprocess +import shlex +import socket +from collections import defaultdict +from typing import Optional, List SSH_KEY_LOCATION = os.path.join(Path.home(), ".ssh", os.environ.get("KEY_NAME")) @@ -47,31 +51,227 @@ def __init__(self, bastion_server): self.log_monitor_threads = {} self.log_monitor_stop_flags = {} self.ssh_semaphore = threading.Semaphore(10) # Max 10 SSH calls in parallel (tune as needed) + self._bastion_client = None + self._reconnect_locks = defaultdict(threading.Lock) + self.ssh_pass = None + + def _candidate_usernames(self, explicit_user) -> List[str]: + if explicit_user: + if isinstance(explicit_user, (list, tuple)): + return list(explicit_user) + return [str(explicit_user)] + return ["ec2-user", "ubuntu", "rocky", "root"] + + def _load_private_keys(self) -> List[paramiko.PKey]: + """ + Try Ed25519 then RSA. If SSH_KEY_LOCATION/env points to a file, use it. + Else try ~/.ssh/id_ed25519 and ~/.ssh/id_rsa. If SSH_KEY_PATH is a dir, load all files from it. + """ + paths = [] + # explicit single file via KEY_NAME → SSH_KEY_LOCATION + if SSH_KEY_LOCATION and os.path.isfile(SSH_KEY_LOCATION): + paths.append(SSH_KEY_LOCATION) + # defaults + home = os.path.join(Path.home(), ".ssh") + paths.extend([os.path.join(home, "id_ed25519"), os.path.join(home, "id_rsa")]) + + keys = [] + seen = set() + for p in paths: + if not os.path.exists(p) or p in seen: + continue + seen.add(p) + try: + keys.append(paramiko.Ed25519Key.from_private_key_file(p)) + continue + except Exception: + pass + try: + keys.append(paramiko.RSAKey.from_private_key_file(p)) + except Exception: + pass + if not keys and not self.ssh_pass: + raise FileNotFoundError("No usable SSH private key found and SSH_PASS not set.") + return keys + + def _try_connect(self, host: str, username: str, pkey: Optional[paramiko.PKey], password: Optional[str], sock=None, timeout=30): + cli = paramiko.SSHClient() + cli.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + cli.connect( + hostname=host, + 
username=username, + pkey=pkey, + password=(password if pkey is None else None), + timeout=timeout, + banner_timeout=timeout, + auth_timeout=timeout, + allow_agent=False, + look_for_keys=False, + sock=sock + ) + return cli + + # def connect(self, address: str, port: int = 22, + # bastion_server_address: str = None, + # username: str = "ec2-user", + # is_bastion_server: bool = False): + # """Connect to cluster nodes""" + # # --- prep usernames list --- + # default_users = ["ec2-user", "ubuntu", "rocky", "root"] + # if getattr(self, "ssh_user", None): + # if isinstance(self.ssh_user, (list, tuple)): + # usernames = list(self.ssh_user) + # else: + # usernames = [str(self.ssh_user)] + # else: + # usernames = default_users + + # # Load key (Ed25519 -> RSA fallback) + # if not os.path.exists(SSH_KEY_LOCATION): + # raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}") + # try: + # private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION) + # except Exception: + # private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION) + + # # Helper to store/replace a connection + # def _store(host, client): + # if self.ssh_connections.get(host): + # try: + # self.ssh_connections[host].close() + # except Exception: + # pass + # self.ssh_connections[host] = client + + # # ---------- direct connection ---------- + # bastion_server_address = bastion_server_address or self.bastion_server + # if not bastion_server_address: + # self.logger.info(f"Connecting directly to {address} on port {port}...") + # last_err = None + # for user in usernames: + # ssh = paramiko.SSHClient() + # ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # try: + # ssh.connect( + # hostname=address, + # username=user, + # port=port, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected directly to {address} as '{user}'.") + # _store(address, ssh) + # return + 
# except Exception as e: + # last_err = e + # self.logger.info(f"Direct login failed for '{user}': {repr(e)}") + # try: + # ssh.close() + # except Exception: + # pass + # raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}") + + # # ---------- connect to bastion ---------- + # self.logger.info(f"Connecting to bastion server {bastion_server_address}...") + # bastion_ssh = paramiko.SSHClient() + # bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # last_err = None + # bastion_user_used = None + # for b_user in usernames: + # try: + # bastion_ssh.connect( + # hostname=bastion_server_address, + # username=b_user, + # port=port, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected to bastion as '{b_user}'.") + # _store(bastion_server_address, bastion_ssh) + # bastion_user_used = b_user + # break + # except Exception as e: + # last_err = e + # self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}") + # if bastion_user_used is None: + # raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}") + # if is_bastion_server: + # return # caller only needed bastion + + # # ---------- tunnel to target through bastion ---------- + # self.logger.info(f"Connecting to target server {address} through bastion server...") + # transport = bastion_ssh.get_transport() + # last_err = None + # for user in usernames: + # # IMPORTANT: open a NEW channel for each username attempt + # try: + # channel = transport.open_channel( + # "direct-tcpip", + # (address, port), + # ("localhost", 0), + # ) + # except paramiko.ssh_exception.ChannelException as ce: + # self.logger.error( + # f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion." 
+ # ) + # raise + # target_ssh = paramiko.SSHClient() + # target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + # try: + # target_ssh.connect( + # address, + # username=user, + # port=port, + # sock=channel, + # pkey=private_key, + # timeout=300, + # banner_timeout=30, + # auth_timeout=30, + # allow_agent=False, + # look_for_keys=False, + # ) + # self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.") + # _store(address, target_ssh) + # return + # except Exception as e: + # last_err = e + # self.logger.info(f"Target login failed for '{user}': {repr(e)}") + # try: + # target_ssh.close() + # except Exception: + # pass + # try: + # channel.close() + # except Exception: + # pass + + # raise Exception( + # f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}" + # ) def connect(self, address: str, port: int = 22, bastion_server_address: str = None, username: str = "ec2-user", is_bastion_server: bool = False): - """Connect to cluster nodes""" - # --- prep usernames list --- - default_users = ["ec2-user", "ubuntu", "rocky", "root"] - if getattr(self, "ssh_user", None): - if isinstance(self.ssh_user, (list, tuple)): - usernames = list(self.ssh_user) - else: - usernames = [str(self.ssh_user)] - else: - usernames = default_users + """ + Connect to a host directly or via bastion, trying multiple usernames and keys, + with optional password fallback. 
+ """ + # Resolve bastion + bastion_server_address = bastion_server_address or self.bastion_server - # Load key (Ed25519 -> RSA fallback) - if not os.path.exists(SSH_KEY_LOCATION): - raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}") - try: - private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION) - except Exception: - private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION) + usernames = self._candidate_usernames(self.ssh_user or username) + keys = self._load_private_keys() + password = self.ssh_pass - # Helper to store/replace a connection def _store(host, client): if self.ssh_connections.get(host): try: @@ -80,230 +280,291 @@ def _store(host, client): pass self.ssh_connections[host] = client - # ---------- direct connection ---------- - bastion_server_address = bastion_server_address or self.bastion_server + # --- NO BASTION: direct connect --- if not bastion_server_address: - self.logger.info(f"Connecting directly to {address} on port {port}...") last_err = None + self.logger.info(f"Connecting directly to {address} on port {port}...") for user in usernames: - ssh = paramiko.SSHClient() - ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - try: - ssh.connect( - hostname=address, - username=user, - port=port, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected directly to {address} as '{user}'.") - _store(address, ssh) - return - except Exception as e: - last_err = e - self.logger.info(f"Direct login failed for '{user}': {repr(e)}") + # try keys + for key in keys: try: - ssh.close() - except Exception: - pass + cli = self._try_connect(address, user, key, None, timeout=30) + self.logger.info(f"Connected directly to {address} as '{user}'.") + _store(address, cli) + return + except Exception as e: + last_err = e + # then password + if password: + try: + cli = self._try_connect(address, user, None, password, 
timeout=30) + self.logger.info(f"Connected directly to {address} as '{user}' (password).") + _store(address, cli) + return + except Exception as e: + last_err = e raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}") - # ---------- connect to bastion ---------- - self.logger.info(f"Connecting to bastion server {bastion_server_address}...") - bastion_ssh = paramiko.SSHClient() - bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - last_err = None - bastion_user_used = None - for b_user in usernames: - try: - bastion_ssh.connect( - hostname=bastion_server_address, - username=b_user, - port=port, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected to bastion as '{b_user}'.") - _store(bastion_server_address, bastion_ssh) - bastion_user_used = b_user + # --- VIA BASTION --- + # ensure bastion client (reuse if alive) + if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()): + last_err = None + self.logger.info(f"Connecting to bastion server {bastion_server_address}...") + for b_user in self._candidate_usernames(self.ssh_user or username): + for key in keys: + try: + cli = self._try_connect(bastion_server_address, b_user, key, None, timeout=30) + self._bastion_client = cli + self.logger.info(f"Connected to bastion as '{b_user}'.") + break + except Exception as e: + last_err = e + else: + if password: + try: + cli = self._try_connect(bastion_server_address, b_user, None, password, timeout=30) + self._bastion_client = cli + self.logger.info(f"Connected to bastion as '{b_user}' (password).") + break + except Exception as e: + last_err = e + continue break - except Exception as e: - last_err = e - self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}") - if bastion_user_used is None: - raise Exception(f"All usernames failed for bastion 
{bastion_server_address}. Last error: {repr(last_err)}") + if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()): + raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}") + if is_bastion_server: - return # caller only needed bastion + # caller only wanted bastion connection open + _store(bastion_server_address, self._bastion_client) + return - # ---------- tunnel to target through bastion ---------- + # open a channel through bastion → target self.logger.info(f"Connecting to target server {address} through bastion server...") - transport = bastion_ssh.get_transport() + bastion_transport = self._bastion_client.get_transport() + last_err = None for user in usernames: - # IMPORTANT: open a NEW channel for each username attempt - try: - channel = transport.open_channel( - "direct-tcpip", - (address, port), - ("localhost", 0), - ) - except paramiko.ssh_exception.ChannelException as ce: - self.logger.error( - f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion." 
- ) - raise - target_ssh = paramiko.SSHClient() - target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - try: - target_ssh.connect( - address, - username=user, - port=port, - sock=channel, - pkey=private_key, - timeout=300, - banner_timeout=30, - auth_timeout=30, - allow_agent=False, - look_for_keys=False, - ) - self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.") - _store(address, target_ssh) - return - except Exception as e: - last_err = e - self.logger.info(f"Target login failed for '{user}': {repr(e)}") + # new channel for each attempt + chan = bastion_transport.open_channel("direct-tcpip", (address, port), ("127.0.0.1", 0)) + # try keys + for key in keys: try: - target_ssh.close() - except Exception: - pass + cli = self._try_connect(address, user, key, None, sock=chan, timeout=30) + self.logger.info(f"Connected to {address} as '{user}' via bastion.") + _store(address, cli) + return + except Exception as e: + last_err = e + # then password + if password: try: - channel.close() - except Exception: - pass - - raise Exception( - f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}" - ) - + cli = self._try_connect(address, user, None, password, sock=chan, timeout=30) + self.logger.info(f"Connected to {address} as '{user}' via bastion (password).") + _store(address, cli) + return + except Exception as e: + last_err = e + try: + chan.close() + except Exception: + pass + + raise Exception(f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}") + + + + # def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False): + # """Executes a command on a given machine with streaming output and retry mechanism. + + # Args: + # node (str): Machine to run command on. + # command (str): Command to run. + # timeout (int): Timeout in seconds. 
+ # max_retries (int): Number of retries in case of failures. + # stream_callback (callable, optional): A callback function for streaming output. Defaults to None. + + # Returns: + # tuple: Final output and error strings after command execution. + # """ + # retry_count = 0 + # while retry_count < max_retries: + # with self.ssh_semaphore: + # ssh_connection = self.ssh_connections.get(node) + # try: + # # Ensure the SSH connection is active, otherwise reconnect + # if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0: + # self.logger.info(f"Reconnecting SSH to node {node}") + # self.connect( + # address=node, + # is_bastion_server=True if node == self.bastion_server else False + # ) + # ssh_connection = self.ssh_connections[node] + + # if not supress_logs: + # self.logger.info(f"Executing command: {command}") + # stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout) + + # output = [] + # error = [] + + # # Read stdout and stderr dynamically if stream_callback is provided + # if stream_callback: + # while not stdout.channel.exit_status_ready(): + # # Process stdout + # if stdout.channel.recv_ready(): + # chunk = stdout.channel.recv(1024).decode() + # output.append(chunk) + # stream_callback(chunk, is_error=False) # Callback for stdout + + # # Process stderr + # if stderr.channel.recv_stderr_ready(): + # chunk = stderr.channel.recv_stderr(1024).decode() + # error.append(chunk) + # stream_callback(chunk, is_error=True) # Callback for stderr + + # time.sleep(0.1) + + # # Finalize any remaining output + # if stdout.channel.recv_ready(): + # chunk = stdout.channel.recv(1024).decode() + # output.append(chunk) + # stream_callback(chunk, is_error=False) + + # if stderr.channel.recv_stderr_ready(): + # chunk = stderr.channel.recv_stderr(1024).decode() + # error.append(chunk) + # stream_callback(chunk, is_error=True) + # else: + # # Default behavior: Read the entire output at once + # output = stdout.read().decode() 
+ # error = stderr.read().decode() + + # # Combine the output into strings + # output = "".join(output) if isinstance(output, list) else output + # error = "".join(error) if isinstance(error, list) else error + + # # Log the results + # if output: + # if not supress_logs: + # self.logger.info(f"Command output: {output}") + # if error: + # if not supress_logs: + # self.logger.error(f"Command error: {error}") + + # if not output and not error: + # if not supress_logs: + # self.logger.warning(f"Command '{command}' executed but returned no output or error.") + + # return output, error + + # except EOFError as e: + # self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except paramiko.SSHException as e: + # self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except paramiko.buffered_pipe.PipeTimeout as e: + # self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # except Exception as e: + # self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...") + # retry_count += 1 + # time.sleep(2) # Short delay before retrying + + # # If we exhaust retries, return failure + # self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.") + # return "", "Command failed after max retries" def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False): - """Executes a command on a given machine with streaming output and retry mechanism. - - Args: - node (str): Machine to run command on. - command (str): Command to run. - timeout (int): Timeout in seconds. 
- max_retries (int): Number of retries in case of failures. - stream_callback (callable, optional): A callback function for streaming output. Defaults to None. - - Returns: - tuple: Final output and error strings after command execution. """ - retry_count = 0 - while retry_count < max_retries: + Execute a command with auto-reconnect (serialized per node), optional streaming, + and proper exit-status capture to reduce “ran but no output” confusion. + """ + retry = 0 + while retry < max_retries: with self.ssh_semaphore: - ssh_connection = self.ssh_connections.get(node) + # serialize reconnect attempts per node + lock = self._reconnect_locks[node] + with lock: + ssh = self.ssh_connections.get(node) + if not ssh or not ssh.get_transport() or not ssh.get_transport().is_active() or retry > 0: + if not supress_logs: + self.logger.info(f"Reconnecting SSH to node {node}") + # if node is the bastion itself + self.connect(node, is_bastion_server=(node == self.bastion_server)) + ssh = self.ssh_connections[node] + try: - # Ensure the SSH connection is active, otherwise reconnect - if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0: - self.logger.info(f"Reconnecting SSH to node {node}") - self.connect( - address=node, - is_bastion_server=True if node == self.bastion_server else False - ) - ssh_connection = self.ssh_connections[node] - if not supress_logs: self.logger.info(f"Executing command: {command}") - stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout) + stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout) + output_chunks, error_chunks = [], [] - output = [] - error = [] - - # Read stdout and stderr dynamically if stream_callback is provided if stream_callback: while not stdout.channel.exit_status_ready(): - # Process stdout if stdout.channel.recv_ready(): - chunk = stdout.channel.recv(1024).decode() - output.append(chunk) - stream_callback(chunk, is_error=False) # Callback for stdout - - # 
Process stderr + chunk = stdout.channel.recv(8192).decode(errors="replace") + output_chunks.append(chunk) + stream_callback(chunk, is_error=False) if stderr.channel.recv_stderr_ready(): - chunk = stderr.channel.recv_stderr(1024).decode() - error.append(chunk) - stream_callback(chunk, is_error=True) # Callback for stderr - - time.sleep(0.1) - - # Finalize any remaining output - if stdout.channel.recv_ready(): - chunk = stdout.channel.recv(1024).decode() - output.append(chunk) + chunk = stderr.channel.recv_stderr(8192).decode(errors="replace") + error_chunks.append(chunk) + stream_callback(chunk, is_error=True) + time.sleep(0.05) + + # flush remaining + while stdout.channel.recv_ready(): + chunk = stdout.channel.recv(8192).decode(errors="replace") + output_chunks.append(chunk) stream_callback(chunk, is_error=False) - - if stderr.channel.recv_stderr_ready(): - chunk = stderr.channel.recv_stderr(1024).decode() - error.append(chunk) + while stderr.channel.recv_stderr_ready(): + chunk = stderr.channel.recv_stderr(8192).decode(errors="replace") + error_chunks.append(chunk) stream_callback(chunk, is_error=True) + + exit_status = stdout.channel.recv_exit_status() + out = "".join(output_chunks) + err = "".join(error_chunks) else: - # Default behavior: Read the entire output at once - output = stdout.read().decode() - error = stderr.read().decode() + out = stdout.read().decode(errors="replace") + err = stderr.read().decode(errors="replace") + exit_status = stdout.channel.recv_exit_status() - # Combine the output into strings - output = "".join(output) if isinstance(output, list) else output - error = "".join(error) if isinstance(error, list) else error + if (not supress_logs) and out: + self.logger.info(f"Command output: {out.strip()[:2000]}") + if (not supress_logs) and err: + self.logger.error(f"Command error: {err.strip()[:2000]}") - # Log the results - if output: - if not supress_logs: - self.logger.info(f"Command output: {output}") - if error: - if not supress_logs: - 
self.logger.error(f"Command error: {error}") + if exit_status != 0 and not err: + # some tools write nothing on stderr but non-zero exit + err = f"Non-zero exit status: {exit_status}" - if not output and not error: + if not out and not err: if not supress_logs: self.logger.warning(f"Command '{command}' executed but returned no output or error.") - return output, error - - except EOFError as e: - self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + return out, err - except paramiko.SSHException as e: - self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying - - except paramiko.buffered_pipe.PipeTimeout as e: - self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + except (EOFError, paramiko.SSHException, paramiko.buffered_pipe.PipeTimeout, socket.error) as e: + retry += 1 + self.logger.error(f"SSH command failed ({type(e).__name__}): {e}. Retrying ({retry}/{max_retries})...") + time.sleep(min(2 * retry, 5)) except Exception as e: - self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...") - retry_count += 1 - time.sleep(2) # Short delay before retrying + retry += 1 + self.logger.error(f"SSH command failed (General): {e}. 
Retrying ({retry}/{max_retries})...") + time.sleep(min(2 * retry, 5)) - # If we exhaust retries, return failure self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.") return "", "Command failed after max retries" - + def format_disk(self, node, device, fs_type="ext4"): """Format disk on the given node @@ -362,14 +623,133 @@ def get_devices(self, node): return output.strip().split() - def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): - """Run FIO Tests with given params and proper logging for MD5 error timestamp tracing. + # def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): + # """ + # Run FIO with optional 'ensure_running' that verifies process presence and retries start up to N times. + + # kwargs: + # - ensure_running: bool (default False) + # - max_start_retries: int (default 3) + # """ + # location = "" + # if device: + # location = f"--filename={device}" + # if directory: + # location = f"--directory={directory}" + + # runtime = kwargs.get("runtime", 3600) + # name = kwargs.get("name", f"fio_{_rid(6)}") + # ioengine = kwargs.get("ioengine", "libaio") + # iodepth = kwargs.get("iodepth", 1) + # time_based = "--time_based" if kwargs.get("time_based", True) else "" + # rw = kwargs.get("rw", "randrw") + # bs = kwargs.get("bs", "4K") + # size = kwargs.get("size", "1G") + # rwmixread = kwargs.get("rwmixread", 70) + # numjobs = kwargs.get("numjobs", 2) + # nrfiles = kwargs.get("nrfiles", 8) + # log_avg_ms = kwargs.get("log_avg_msec", 1000) + # output_fmt = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' + # output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else '' + # iolog_base = kwargs.get("iolog_file") + + # iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" + # log_opt = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else "" + + # command = ( + # f"sudo fio --name={name} 
{location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " + # f"{time_based} --runtime={runtime} --rw={rw} --max_latency=20s --bs={bs} --size={size} --rwmixread={rwmixread} " + # f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " + # f"{log_opt} {iolog_opt} {output_fmt}{output_file}" + # ) + # if kwargs.get("debug"): + # command += " --debug=all" + # if log_file: + # command += f" > {log_file} 2>&1" + + # ensure_running = bool(kwargs.get("ensure_running", False)) + # max_start_retries = int(kwargs.get("max_start_retries", 3)) + + # launch_retries = 3 + # for attempt in range(1, launch_retries + 1): + + # try: + # self.logger.info(f"Starting FIO on {node}: {name} → {location} (attempt {attempt}/{launch_retries})") + # self.exec_command(node=node, command=f"sudo {command}", max_retries=2) + # break + # except Exception as e: + # self.logger.error(f"FIO start failed: {e}") + # if attempt == launch_retries: + # raise + # time.sleep(1.0 * attempt) + + # # Ensure process is up (pgrep name) + # start_retries = 6 + # for i in range(start_retries): + # out, err = self.exec_command( + # node=node, + # command=f"pgrep -fa 'fio.*{name}' || true", + # max_retries=1, + # ) + # if out.strip(): + # self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}") + # return + # # Not running yet → small backoff and try again + # time.sleep(2 + i) + # # If still not, try re-launch quickly + # if i >= 2: + # self.logger.warning(f"FIO still not running for {name}; re-issuing start (try {i-1}/{start_retries-3})") + # try: + # self.exec_command(node=node, command=f"sudo {command}", max_retries=1) + # except Exception as e: + # self.logger.warning(f"Re-start attempt raised: {e}") + + # # If we get here, fio didn’t stick + # raise RuntimeError(f"FIO failed to stay running for job {name} on {node}") + + # def _is_running(): + # # Use pgrep on job name (fio --name=) for a quick check + # # Fall back to ps+grep if pgrep not present. 
+ # try: + # out, _ = self.exec_command(node=node, command=f"pgrep -fl 'fio.*--name={name}'", max_retries=1) + # return bool(out.strip()) + # except Exception: + # out, _ = self.exec_command(node=node, command=f"ps ax | grep -E 'fio.*--name={name}' | grep -v grep || true", max_retries=1) + # return bool(out.strip()) + + # # Try to start; handle EOF/channel close by reconnect+retry + # attempts = 0 + # while True: + # attempts += 1 + # try: + # self.exec_command(node=node, command=command, max_retries=3) + # except Exception as e: + # # Channel/EOF during start is common in churn; retry a few times + # if attempts < max_start_retries: + # self.logger.error(f"FIO start error ({e}); retrying {attempts}/{max_start_retries} in 2s") + # time.sleep(2) + # continue + # else: + # raise + + # if not ensure_running: + # return + + # # Verify started; retry if not + # time.sleep(1.0) + # if _is_running(): + # return + + # if attempts >= max_start_retries: + # raise RuntimeError(f"FIO failed to start after {max_start_retries} attempts for job '{name}'") + + # self.logger.warning(f"FIO not detected running for '{name}'; retrying start {attempts}/{max_start_retries}") + # time.sleep(1.0) - Args: - node (str): Node to perform ssh operation on - device (str): Device path. Defaults to None. - directory (str, optional): Directory to run test on. Defaults to None. - log_file (str, optional): Log file to redirect output to. Defaults to None. + def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs): + """ + Start FIO in a detached tmux session so it survives SSH channel drops during fast outages. + Verifies process presence and re-kicks a few times if missing. 
""" location = "" if device: @@ -377,72 +757,63 @@ def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwarg if directory: location = f"--directory={directory}" - runtime = kwargs.get("runtime", 3600) - rw = kwargs.get("rw", "randrw") - name = kwargs.get("name", "test") - ioengine = kwargs.get("ioengine", "libaio") - iodepth = kwargs.get("iodepth", 1) - bs = kwargs.get("bs", "4k") - rwmixread = kwargs.get("rwmixread", 70) - size = kwargs.get("size", "10MiB") - time_based = "--time_based" if kwargs.get("time_based", True) else "" - numjobs = kwargs.get("numjobs", 1) - nrfiles = kwargs.get("nrfiles", 1) - - output_format = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' + runtime = kwargs.get("runtime", 3600) + name = kwargs.get("name", f"fio_{_rid(6)}") + ioengine = kwargs.get("ioengine", "libaio") + iodepth = kwargs.get("iodepth", 1) + time_based = "--time_based" if kwargs.get("time_based", True) else "" + rw = kwargs.get("rw", "randrw") + bs = kwargs.get("bs", "4K") + size = kwargs.get("size", "1G") + rwmixread = kwargs.get("rwmixread", 70) + numjobs = kwargs.get("numjobs", 2) + nrfiles = kwargs.get("nrfiles", 8) + log_avg_ms = kwargs.get("log_avg_msec", 1000) + max_latency = kwargs.get("max_latency", "20s") + use_latency = kwargs.get("use_latency", True) + output_fmt = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else '' output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else '' + iolog_base = kwargs.get("iolog_file") - log_avg_msec = kwargs.get("log_avg_msec", 1000) - log_avg_msec_opt = f"--log_avg_msec={log_avg_msec}" if log_avg_msec else "" - - iolog_base = kwargs.get("iolog_file", None) - iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" - verify_md5 = "--verify=md5" if iodepth == 1 else "" + iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else "" + log_opt = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else "" + 
latency = f" --max_latency={max_latency}" if use_latency else "" - command = ( - f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " - f"{time_based} --runtime={runtime} --rw={rw} --max_latency=30s --bs={bs} --size={size} --rwmixread={rwmixread} " - f"{verify_md5} --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " - f"{log_avg_msec_opt} {iolog_opt} " - f"{output_format}{output_file}" - ) - # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - # log_file = log_file or f"/tmp/{name}_{timestamp}.log" + # raw fio command + fio_cmd = ( + f"fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} " + f"{time_based} --runtime={runtime} --rw={rw} {latency} --bs={bs} --size={size} --rwmixread={rwmixread} " + f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} " + f"{log_opt} {iolog_opt} {output_fmt}{output_file}" + ).strip() if kwargs.get("debug"): - command += " --debug=all" + fio_cmd += " --debug=all" + # run fio under tmux so HUP/SSH channel drops don't kill it + session = f"fio_{name}" if log_file: - command += f" > {log_file} 2>&1" - - # else: - # command += " --debug=verify" - - # awk_ts = " | awk '{ print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; fflush(); }' | " - # command += awk_ts - # command += f"tee {log_file}" - - self.logger.info(f"Executing FIO command:\n{command}") + fio_cmd = f"{fio_cmd} > {log_file} 2>&1" + + start_cmd = f"sudo tmux new-session -d -s {session} \"{fio_cmd}\" || sudo tmux kill-session -t {session} 2>/dev/null || true; sudo tmux new-session -d -s {session} \"{fio_cmd}\"" + self.logger.info(f"Starting FIO on {node}: {name} in tmux session '{session}'") + self.exec_command(node=node, command=start_cmd, max_retries=2) + + # Ensure process is up: check tmux & pgrep + for i in range(8): + out, _ = self.exec_command(node=node, command=f"pgrep -fa 'fio.*{name}' || true", max_retries=1, supress_logs=True) + tmux_ok, _ = 
self.exec_command(node=node, command=f"sudo tmux has-session -t {session} 2>/dev/null || echo MISSING", max_retries=1, supress_logs=True) + if out.strip() and "MISSING" not in tmux_ok: + self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}") + return + if i >= 2: + self.logger.warning(f"FIO not detected yet for {name}; re-issuing start (try {i-1}/5)") + self.exec_command(node=node, command=start_cmd, max_retries=1, supress_logs=True) + time.sleep(2 + i) - start_time = time.time() - output, error = self.exec_command(node=node, command=command, timeout=runtime * 2) - end_time = time.time() - - total_time = end_time - start_time - self.fio_runtime[name] = start_time - self.logger.info(f"Total time taken to run the command: {total_time:.2f} seconds") - - # Return all generated iolog files (one per job) - iolog_files = [f"{iolog_base}.{i}" for i in range(numjobs)] - return { - "output": output, - "error": error, - "start_time": start_time, - "end_time": end_time, - "iolog_files": iolog_files, - } + raise RuntimeError(f"FIO failed to stay running for job {name} on {node}") - + def find_process_name(self, node, process_name, return_pid=False): if return_pid: command = "ps -ef | grep -i '%s' | awk '{print $2}'" % process_name @@ -700,15 +1071,35 @@ def get_lvol_id(self, node, lvol_name): return output.strip().split() def get_snapshot_id(self, node, snapshot_name): - cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name) - output, error = self.exec_command(node=node, command=cmd) + start = time.time() + deadline = start + 600 # 10 minutes + wait_interval = 10 # seconds between checks + snapshot_id = "" + + while time.time() < deadline: + cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name) + output, error = self.exec_command(node=node, command=cmd) + if output.strip(): + if hasattr(self, "logger"): + self.logger.info(f"Snapshot '{snapshot_name}' is visible with ID: 
{snapshot_id}") + break + time.sleep(wait_interval) + + if not output.strip(): + if hasattr(self, "logger"): + self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.") return output.strip() def add_snapshot(self, node, lvol_id, snapshot_name): cmd = f"{self.base_cmd} -d snapshot add {lvol_id} {snapshot_name}" output, error = self.exec_command(node=node, command=cmd) - return output, error + + snapshot_id = self.get_snapshot_id(node=node, snapshot_name=snapshot_name) + + if not snapshot_id: + if hasattr(self, "logger"): + self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.") def add_clone(self, node, snapshot_id, clone_name): cmd = f"{self.base_cmd} -d snapshot clone {snapshot_id} {clone_name}" @@ -971,30 +1362,81 @@ def get_active_interfaces(self, node_ip): return [] - def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300): - """ - Disconnect all active network interfaces on a node in a single SSH call. + # def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300): + # """ + # Disconnect all active network interfaces on a node in a single SSH call. + + # Args: + # node_ip (str): IP of the target node. + # interfaces (list): List of active network interfaces to disconnect. 
+ # """ + # if not interfaces: + # self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.") + # return + + # # Combine disconnect commands for all interfaces + # disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces]) + # reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces]) + + # cmd = ( + # f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &' + # ) + # self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") + # try: + # self.exec_command(node_ip, cmd) + # except Exception as e: + # self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}") + + def _ping_once(self, ip: str, count: int = 1, wait: int = 1) -> bool: + try: + # Use system ping; True means "ping success" + res = subprocess.run(["ping", "-c", str(count), "-W", str(wait), ip], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return res.returncode == 0 + except Exception: + return False - Args: - node_ip (str): IP of the target node. - interfaces (list): List of active network interfaces to disconnect. + def disconnect_all_active_interfaces( + self, + node_ip: str, + interfaces: list[str], + duration_secs: int = 300, + max_tries: int = 3, + ): + """ + Bring all given interfaces DOWN, verify outage by ping, keep for duration, then bring them UP. + Fire-and-forget style; robust against brief SSH flaps. 
""" if not interfaces: - self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.") + self.logger.info(f"No active interfaces provided for {node_ip}; skipping NIC down.") return - # Combine disconnect commands for all interfaces - disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces]) - reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces]) + down_cmd = " && ".join([f"nmcli connection down {i}" for i in interfaces]) + up_cmd = " && ".join([f"nmcli connection up {i}" for i in interfaces]) + cmd = f'nohup sh -c "{down_cmd} && sleep {duration_secs} && {up_cmd}" &' - cmd = ( - f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &' - ) - self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") try: - self.exec_command(node_ip, cmd) + self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}") + out, err = self.exec_command(node=node_ip, command=cmd, max_retries=1, timeout=20) + if err: + raise Exception(err) except Exception as e: - self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}") + self.logger.info(f"Command: {cmd}, error: {e}! Checking pings!!") + + # Verify outage begins (best-effort). If ping still works, attempt to issue 'down' again. 
+ time.sleep(5) + tries = 0 + attempts = 10 + while self._ping_once(node_ip) and attempts > 0: + tries += 1 + if tries >= max_tries: + self.logger.warning(f"Ping to {node_ip} still responding after NIC down attempts; continuing anyway.") + break + self.logger.info(f"Ping to {node_ip} still alive; retrying NIC down...") + # re-run only the DOWN part (don’t append sleep again to avoid stacking) + self.exec_command(node=node_ip, command=cmd, max_retries=2) + time.sleep(3) + attempts -= 1 def check_tmux_installed(self, node_ip): """Check tmux installation @@ -1420,132 +1862,263 @@ def dump_lvstore(self, node_ip, storage_node_id): self.logger.error(f"Failed to dump lvstore on {node_ip}: {e}") return None - def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): - """ - Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON, - and copy logs from SPDK container. + # def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): + # """ + # Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON, + # and copy logs from SPDK container. 
+ + # Args: + # storage_node_ip (str): IP of the storage node + # storage_node_id (str): ID of the storage node + # """ + # self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}") + + # # Step 1: Find the SPDK container + # find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'" + # container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd) + # container_name = container_name_output.strip() + + # if not container_name: + # self.logger.warning(f"No SPDK container found on {storage_node_ip}") + # return + + # # Step 2: Get bdev_get_bdevs output + # # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" + # # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) + + # # if error: + # # self.logger.error(f"Error running bdev_get_bdevs: {error}") + # # return + + # # # Step 3: Save full output to local file + # # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") + # # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json" + # # with open(raw_output_path, "w") as f: + # # f.write(bdev_output) + # # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}") + + # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") + # base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/" + + # cmd = f"sudo mkdir -p '{base_path}'" + # self.exec_command(storage_node_ip, cmd) + + # remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json" + + # # 1. Run to capture output into a variable (for parsing) + # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs'" + # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) + + # if error: + # self.logger.error(f"Error running bdev_get_bdevs: {error}") + # return + + # # 2. 
Run again to save output on host machine (audit trail) + # bdev_save_cmd = ( + # f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs > {remote_output_path}\"") + + # self.exec_command(storage_node_ip, bdev_save_cmd) + # self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}") + + + # # Step 4: Extract unique distrib names + # try: + # bdevs = json.loads(bdev_output) + # distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')}) + # except json.JSONDecodeError as e: + # self.logger.error(f"JSON parsing failed: {e}") + # return + + # if not distribs: + # self.logger.warning("No distrib names found in bdev_get_bdevs output.") + # return + + # self.logger.info(f"Distributions found: {distribs}") + + # # Step 5: Process each distrib + # for distrib in distribs: + # self.logger.info(f"Processing distrib: {distrib}") + # rpc_json = { + # "subsystems": [ + # { + # "subsystem": "distr", + # "config": [ + # { + # "method": "distr_debug_placement_map_dump", + # "params": {"name": distrib} + # } + # ] + # } + # ] + # } + + # rpc_json_str = json.dumps(rpc_json) + # remote_json_path = "/tmp/stack.json" + + # # Save JSON file remotely + # create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}" + # self.exec_command(storage_node_ip, create_json_command) + + # # Copy into container + # copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}" + # self.exec_command(storage_node_ip, copy_json_command) + + # # Run RPC inside container + # rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path} /mnt/ramdisk/{container_name}/spdk.sock'" + # self.exec_command(storage_node_ip, rpc_command) + + # # Find and copy log + # find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}" + # log_file_name, _ = 
self.exec_command(storage_node_ip, find_log_command) + # log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "") + + # if not log_file_name: + # self.logger.error(f"No log file found for distrib {distrib}.") + # continue + + # log_file_path = f"/tmp/{log_file_name}" + # local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}" + # copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}" + # self.exec_command(storage_node_ip, copy_log_cmd) + + # self.logger.info(f"Fetched log for {distrib}: {local_log_path}") + + # # Clean up + # delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}" + # self.exec_command(storage_node_ip, delete_log_cmd) + + # self.logger.info("All distrib logs retrieved successfully.") - Args: - storage_node_ip (str): IP of the storage node - storage_node_id (str): ID of the storage node - """ + def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path): self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}") - # Step 1: Find the SPDK container - find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'" - container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd) - container_name = container_name_output.strip() - + # 0) Find SPDK container name + find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true" + container_name_out, _ = self.exec_command(storage_node_ip, find_container_cmd) + container_name = (container_name_out or "").strip() if not container_name: self.logger.warning(f"No SPDK container found on {storage_node_ip}") return - # Step 2: Get bdev_get_bdevs output - # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" - # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) - - # if error: - # self.logger.error(f"Error running bdev_get_bdevs: {error}") 
- # return - - # # Step 3: Save full output to local file - # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") - # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json" - # with open(raw_output_path, "w") as f: - # f.write(bdev_output) - # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}") - - timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S") - base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/" - - cmd = f"sudo mkdir -p '{base_path}'" - self.exec_command(storage_node_ip, cmd) - - remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json" - - # 1. Run to capture output into a variable (for parsing) - bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'" - bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd) - - if error: - self.logger.error(f"Error running bdev_get_bdevs: {error}") + # 1) Get bdevs via correct sock + timestamp = datetime.now().strftime("%Y%m%d_%H-%M-%S") + base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs" + self.exec_command(storage_node_ip, f"sudo mkdir -p '{base_path}' && sudo chmod -R 777 '{base_path}'") + bdev_cmd = ( + f"sudo docker exec {container_name} bash -lc " + f"\"python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs\"" + ) + bdev_output, bdev_err = self.exec_command(storage_node_ip, bdev_cmd) + if (bdev_err and bdev_err.strip()) and not bdev_output: + self.logger.error(f"bdev_get_bdevs error on {storage_node_ip}: {bdev_err.strip()}") return - # 2. 
Run again to save output on host machine (audit trail) - bdev_save_cmd = ( - f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py bdev_get_bdevs > {remote_output_path}\"") - - self.exec_command(storage_node_ip, bdev_save_cmd) - self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}") - - - # Step 4: Extract unique distrib names + # Parse distrib names try: bdevs = json.loads(bdev_output) - distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')}) + distribs = sorted({ + b.get("name", "") + for b in bdevs + if isinstance(b, dict) and str(b.get("name","")).startswith("distrib_") + }) except json.JSONDecodeError as e: - self.logger.error(f"JSON parsing failed: {e}") + self.logger.error(f"JSON parsing failed on {storage_node_ip}: {e}") return - if not distribs: - self.logger.warning("No distrib names found in bdev_get_bdevs output.") + self.logger.warning(f"No distrib_* bdevs found on {storage_node_ip}.") + return + self.logger.info(f"[{storage_node_ip}] Distributions: {distribs}") + + # 2) Run multiple docker exec in parallel from ONE SSH exec + distrib_list_str = " ".join(shlex.quote(d) for d in distribs) + remote_tar = f"/tmp/distrib_logs_{timestamp}.tar.gz" + + # IMPORTANT: This script runs on the HOST and spawns many `docker exec ... &` in parallel. + # It throttles with MAXJ, waits, then tars outputs from /tmp inside the container into one tarball on the host. 
+ remote_script = f"""\ +set -euo pipefail +CN={shlex.quote(container_name)} +SOCK="/mnt/ramdisk/$CN/spdk.sock" +TS="{timestamp}" +MAXJ=8 +WORKDIR_HOST="{base_path}" +mkdir -p "$WORKDIR_HOST" + +# Make a temporary host folder to collect per-distrib files copied out of the container +HOST_STAGING="/tmp/distrib_host_collect_$TS" +mkdir -p "$HOST_STAGING" + +pids=() + +for d in {distrib_list_str}; do + ( + # Build JSON on host then copy into container (avoids many ssh execs) + JF="/tmp/stack_${{d}}.json" + cat > "$JF" <<'EOF_JSON' +{{ + "subsystems": [ + {{ + "subsystem": "distr", + "config": [ + {{ + "method": "distr_debug_placement_map_dump", + "params": {{"name": "__DIST__"}} + }} + ] + }} + ] +}} +EOF_JSON + # substitute distrib name + sed -i "s/__DIST__/$d/g" "$JF" + + # Copy JSON into container + sudo docker cp "$JF" "$CN:/tmp/stack_${{d}}.json" + + # Run rpc inside container (socket path respected) + sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py /tmp/stack_${{d}}.json {shlex.quote('/mnt/ramdisk/'+container_name+'/spdk.sock')} > /tmp/rpc_${{d}}.log 2>&1 || true" + + # Copy any files for this distrib out to host staging (rpc log + any matching /tmp/*d*) + sudo docker cp "$CN:/tmp/rpc_${{d}}.log" "$HOST_STAGING/rpc_${{d}}.log" 2>/dev/null || true + # try to pull any distrib-related artifacts + for f in $(sudo docker exec "$CN" bash -lc "ls /tmp/ 2>/dev/null | grep -F \"$d\" || true"); do + sudo docker cp "$CN:/tmp/$f" "$HOST_STAGING/$f" 2>/dev/null || true + done + + # cleanup container temp for this distrib + sudo docker exec "$CN" bash -lc "rm -f /tmp/stack_${{d}}.json /tmp/rpc_${{d}}.log" || true + rm -f "$JF" || true + ) & + + # throttle parallel jobs + while [ "$(jobs -rp | wc -l)" -ge "$MAXJ" ]; do sleep 0.2; done +done + +# Wait for all background jobs +wait + +# Tar once on host +tar -C "$HOST_STAGING" -czf {shlex.quote(remote_tar)} . 
2>/dev/null || true + +# Move artifacts to final location +mv -f {shlex.quote(remote_tar)} "$WORKDIR_HOST/" || true + +# Also copy loose files (for convenience) then clean staging +cp -rf "$HOST_STAGING"/. "$WORKDIR_HOST"/ 2>/dev/null || true +rm -rf "$HOST_STAGING" || true + +echo "$WORKDIR_HOST/{os.path.basename(remote_tar)}" +""" + + run_many_cmd = "bash -lc " + shlex.quote(remote_script) + tar_out, tar_err = self.exec_command(storage_node_ip, run_many_cmd) + if (tar_err and tar_err.strip()) and not tar_out: + self.logger.error(f"[{storage_node_ip}] Parallel docker-exec script error: {tar_err.strip()}") return - self.logger.info(f"Distributions found: {distribs}") - - # Step 5: Process each distrib - for distrib in distribs: - self.logger.info(f"Processing distrib: {distrib}") - rpc_json = { - "subsystems": [ - { - "subsystem": "distr", - "config": [ - { - "method": "distr_debug_placement_map_dump", - "params": {"name": distrib} - } - ] - } - ] - } - - rpc_json_str = json.dumps(rpc_json) - remote_json_path = "/tmp/stack.json" - - # Save JSON file remotely - create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}" - self.exec_command(storage_node_ip, create_json_command) - - # Copy into container - copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}" - self.exec_command(storage_node_ip, copy_json_command) - - # Run RPC inside container - rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path}'" - self.exec_command(storage_node_ip, rpc_command) - - # Find and copy log - find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}" - log_file_name, _ = self.exec_command(storage_node_ip, find_log_command) - log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "") - - if not log_file_name: - self.logger.error(f"No log file found for distrib {distrib}.") - continue - - log_file_path = f"/tmp/{log_file_name}" - local_log_path 
= f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}" - copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}" - self.exec_command(storage_node_ip, copy_log_cmd) - - self.logger.info(f"Fetched log for {distrib}: {local_log_path}") - - # Clean up - delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}" - self.exec_command(storage_node_ip, delete_log_cmd) + final_tar = (tar_out or "").strip().splitlines()[-1] if tar_out else f"{base_path}/{os.path.basename(remote_tar)}" + self.logger.info(f"[{storage_node_ip}] Distrib logs saved: {base_path} (tar: {final_tar})") - self.logger.info("All distrib logs retrieved successfully.") def clone_mount_gen_uuid(self, node, device): """Repair the XFS filesystem and generate a new UUID. @@ -1722,8 +2295,8 @@ def start_netstat_dmesg_logging(self, node_ip, log_dir): self.exec_command(node_ip, f"sudo tmux new-session -d -s netstat_log 'bash -c \"while true; do netstat -s | grep \\\"segments dropped\\\" >> {netstat_log}; sleep 5; done\"'") self.exec_command(node_ip, f"sudo tmux new-session -d -s dmesg_log 'bash -c \"while true; do sudo dmesg | grep -i \\\"tcp\\\" >> {dmesg_log}; sleep 5; done\"'") - self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'") - + self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k --no-tail | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'") + def reset_iptables_in_spdk(self, node_ip): """ Resets iptables rules inside the SPDK container on a given node. 
@@ -1915,6 +2488,7 @@ def start_resource_monitors(self, node_ip, log_dir): root_log = f"{log_dir}/root_partition_usage_{node_ip}_{timestamp}.txt" docker_mem_log = f"{log_dir}/docker_mem_usage_{node_ip}_{timestamp}.txt" system_mem_log = f"{log_dir}/system_memory_usage_{node_ip}_{timestamp}.txt" + docker_stats_logs = f"{log_dir}/docker_stats_usage_{node_ip}_{timestamp}.txt" # Ensure log directory exists and is writable self.exec_command(node_ip, f"sudo mkdir -p {log_dir} && sudo chmod 777 {log_dir}") @@ -1939,14 +2513,29 @@ def start_resource_monitors(self, node_ip, log_dir): 'bash -c "while true; do date >> {system_mem_log}; free -h >> {system_mem_log}; echo >> {system_mem_log}; sleep 10; done"' """ + docker_stats_cmd = f""" + sudo tmux new-session -d -s docker_stats_all \ + 'bash -c "while true; do date >> {docker_stats_logs}; docker stats --no-stream >> {docker_stats_logs}; echo >> {docker_stats_logs}; sleep 10; done"' + """ + self.exec_command(node_ip, df_cmd) self.exec_command(node_ip, docker_cmd) self.exec_command(node_ip, system_cmd) + self.exec_command(node_ip, docker_stats_cmd) - self.logger.info(f"Started root partition, container memory, and system memory logging on {node_ip}") + self.logger.info(f"Started root partition, container memory, docker stats and system memory logging on {node_ip}") + + def cluster_list(self, node_ip, cluster_id): + """Sets cluster in suspended state + Args: + node_ip (str): Mgmt Node IP to run command on + cluster_id (str): Cluster id to put in suspended state + """ + cmd = f"{self.base_cmd} cluster list" + output, _ = self.exec_command(node_ip, cmd) + return output.strip() - def suspend_cluster(self, node_ip, cluster_id): """Sets cluster in suspended state @@ -1995,7 +2584,7 @@ def ensure_nfs_mounted(self, node, nfs_server, nfs_path, mount_point, is_local = """ check_cmd = f"mount | grep -w '{mount_point}'" mount_cmd = f"sudo mkdir -p {mount_point} && sudo mount -t nfs {nfs_server}:{nfs_path} {mount_point}" - install_check_cmd 
= "dnf list installed nfs-util" + install_check_cmd = "dnf list installed nfs-utils" install_cmd = "sudo dnf install -y nfs-utils" try: @@ -2300,3 +2889,9 @@ def stop_log_monitor(self): self._monitor_stop_flag.set() self._monitor_thread.join(timeout=10) print("K8s log monitor thread stopped.") + +def _rid(n=6): + import string, random + letters = string.ascii_uppercase + digits = string.digits + return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1)) From df90efe05d97dca1538609dc8db51b2646de0bfd Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 5 Dec 2025 14:06:50 +0300 Subject: [PATCH 089/192] Fix sn list apiv2 response _2 --- simplyblock_web/api/v2/storage_node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index d1aec59be..e612d7177 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -21,9 +21,9 @@ @api.get('/', name='clusters:storage-nodes:list') -def list(cluster: Cluster) -> List[dict]: +def list(cluster: Cluster) -> List[StorageNodeDTO]: return [ - storage_node.to_dict() + StorageNodeDTO.from_model(storage_node) for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()) ] From 98b6384cb4bec1dc6c4a6ac42e319024214a94ff Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 5 Dec 2025 14:19:50 +0300 Subject: [PATCH 090/192] Fix sn list apiv2 response _3 --- simplyblock_web/api/v2/dtos.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 54c1b5b01..f8ba77ae7 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -151,16 +151,18 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id class StorageNodeDTO(BaseModel): - id: UUID + uuid: UUID status: str - ip: IPv4Address + mgmt_ip: IPv4Address + health_check: bool 
@staticmethod def from_model(model: StorageNode): return StorageNodeDTO( - id=UUID(model.get_id()), + uuid=UUID(model.get_id()), status=model.status, - ip=IPv4Address(model.mgmt_ip), + mgmt_ip=IPv4Address(model.mgmt_ip), + health_check=model.health_check, ) From 24d6ced0c56a8b06b9eb6b60deae90af843cf431 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Fri, 5 Dec 2025 14:44:49 +0300 Subject: [PATCH 091/192] Fix sn list apiv2 response _2 (#807) --- simplyblock_web/api/v2/dtos.py | 10 ++++++---- simplyblock_web/api/v2/storage_node.py | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 54c1b5b01..f8ba77ae7 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -151,16 +151,18 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id class StorageNodeDTO(BaseModel): - id: UUID + uuid: UUID status: str - ip: IPv4Address + mgmt_ip: IPv4Address + health_check: bool @staticmethod def from_model(model: StorageNode): return StorageNodeDTO( - id=UUID(model.get_id()), + uuid=UUID(model.get_id()), status=model.status, - ip=IPv4Address(model.mgmt_ip), + mgmt_ip=IPv4Address(model.mgmt_ip), + health_check=model.health_check, ) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index d1aec59be..e612d7177 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -21,9 +21,9 @@ @api.get('/', name='clusters:storage-nodes:list') -def list(cluster: Cluster) -> List[dict]: +def list(cluster: Cluster) -> List[StorageNodeDTO]: return [ - storage_node.to_dict() + StorageNodeDTO.from_model(storage_node) for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()) ] From d1163e32657f800765fc08c4e09551b1da4daa65 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 8 Dec 2025 10:20:02 +0100 Subject: [PATCH 092/192] Update cluster.py (#808) --- 
simplyblock_web/api/v1/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index f4eb2e690..2447cf958 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -73,7 +73,7 @@ def create_first_cluster(): return utils.get_response_error("blk_size can be 512 or 4096", 400) else: blk_size = cl_data['blk_size'] - page_size_in_blocks = cl_data.get('distr_ndcs', 2097152) + page_size_in_blocks = cl_data.get('page_size_in_blocks', 2097152) distr_ndcs = cl_data.get('distr_ndcs', 1) distr_npcs = cl_data.get('distr_npcs', 1) distr_bs = cl_data.get('distr_bs', 4096) From 7a52b77b46e5a633ee9c12bfb48cf2068ea9f1ee Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 15:08:54 +0300 Subject: [PATCH 093/192] Add stats to spdk_http_proxy_server.py Prints max, avg and last 3 sec avg for read lines from http socket and rpc response receive from spdk --- .../services/spdk_http_proxy_server.py | 63 ++++++++++++++++--- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 06eeee008..6bf6c3748 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -6,6 +6,8 @@ import os import socket import sys +import threading +import time from http.server import HTTPServer from http.server import ThreadingHTTPServer @@ -19,6 +21,33 @@ logger.addHandler(logger_handler) logger.setLevel(logging.INFO) +read_line_time_diff: dict = {} +recv_from_spdk_time_diff: dict = {} +def print_stats(): + time.sleep(3) + t = time.time_ns() + read_line_time_diff_max = max(list(read_line_time_diff.values())) + read_line_time_diff_avg = int(sum(list(read_line_time_diff.values()))/len(read_line_time_diff)) + last_3_sec = [] + for k,v in read_line_time_diff.items(): + if k > t - 3*1000*1000*1000: + 
last_3_sec.append(v) + read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + logger.info(f"Periodic stats: {t}: read_line_time max={read_line_time_diff_max}ns, avg={read_line_time_diff_avg}ns, last 3s avg={read_line_time_diff_avg_last_3_sec}ns") + if len(read_line_time_diff) > 10000: + read_line_time_diff.clear() + + recv_from_spdk_time_max = max(list(recv_from_spdk_time_diff.values())) + recv_from_spdk_time_avg = int(sum(list(recv_from_spdk_time_diff.values()))/len(recv_from_spdk_time_diff)) + last_3_sec = [] + for k,v in recv_from_spdk_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + logger.info(f"Periodic stats: {t}: recv_from_spdk_time max={recv_from_spdk_time_max}ns, avg={recv_from_spdk_time_avg}ns, last 3s avg={recv_from_spdk_time_avg_last_3_sec}ns") + if len(recv_from_spdk_time_diff) > 10000: + recv_from_spdk_time_diff.clear() + def get_env_var(name, default=None, is_required=False): if not name: @@ -29,14 +58,18 @@ def get_env_var(name, default=None, is_required=False): raise Exception("env value is required: %s" % name) return os.environ.get(name, default) - +unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): + logger.info(f"active threads: {threading.active_count()}") + logger.info(f"active unix sockets: {len(unix_sockets)}") req_data = json.loads(req.decode('ascii')) + req_time = time.time_ns() params = "" if "params" in req_data: params = str(req_data['params']) - logger.info(f"Request function: {str(req_data['method'])}, params: {params}") + logger.info(f"Request:{req_time} function: {str(req_data['method'])}, params: {params}") sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + unix_sockets.append(sock) sock.settimeout(TIMEOUT) sock.connect(rpc_sock) sock.sendall(req) @@ -48,7 +81,7 @@ def rpc_call(req): buf = '' closed = False response = None - + recv_from_spdk_time_start = time.time_ns() while 
not closed: newdata = sock.recv(1024*1024*1024) if newdata == b'': @@ -59,21 +92,25 @@ def rpc_call(req): except ValueError: continue # incomplete response; keep buffering break + recv_from_spdk_time_end = time.time_ns() + time_diff = recv_from_spdk_time_end - recv_from_spdk_time_start + logger.info(f"recv_from_spdk_time_diff: {time_diff}") + recv_from_spdk_time_diff[recv_from_spdk_time_start] = time_diff sock.close() + unix_sockets.remove(sock) if not response and len(buf) > 0: raise ValueError('Invalid response') - logger.debug(f"Response data: {buf}") + logger.info(f"Response:{req_time}") return buf class ServerHandler(BaseHTTPRequestHandler): - + server_session: list[int] = [] key = "" - def do_HEAD(self): self.send_response(200) self.send_header('Content-type', 'text/html') @@ -96,6 +133,10 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): + req_time = time.time_ns() + self.server_session.append(req_time) + logger.info(f"incoming request at: {req_time}") + logger.info(f"active server session: {len(self.server_session)}") if self.headers['Authorization'] != 'Basic ' + self.key: self.do_AUTHHEAD() else: @@ -103,6 +144,7 @@ def do_POST(self): data_string = self.rfile.read(int(self.headers['Content-Length'])) elif "chunked" in self.headers.get("Transfer-Encoding", ""): data_string = b'' + read_line_time_start = time.time_ns() while True: line = self.rfile.readline().strip() chunk_length = int(line, 16) @@ -118,7 +160,10 @@ def do_POST(self): # Finally, a chunk size of 0 is an end indication if chunk_length == 0: break - + read_line_time_end = time.time_ns() + time_diff = read_line_time_end - read_line_time_start + logger.info(f"read_line_time_diff: {time_diff}") + read_line_time_diff[read_line_time_start] = time_diff try: response = rpc_call(data_string) if response is not None: @@ -129,12 +174,14 @@ def do_POST(self): except ValueError: self.do_INTERNALERROR() + self.server_session.remove(req_time) def run_server(host, port, user, password, 
is_threading_enabled=False): # encoding user and password key = base64.b64encode((user+':'+password).encode(encoding='ascii')).decode('ascii') - + print_stats_thread = threading.Thread(target=print_stats, ) + print_stats_thread.start() try: ServerHandler.key = key httpd = (ThreadingHTTPServer if is_threading_enabled else HTTPServer)((host, port), ServerHandler) From 9526f8c2dc1d83dc44deb34e1d12b71dbbcdcbd9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 15:25:51 +0300 Subject: [PATCH 094/192] Add stats to spdk_http_proxy_server.py _2 --- .../services/spdk_http_proxy_server.py | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 6bf6c3748..75ffc4f5c 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -24,29 +24,33 @@ read_line_time_diff: dict = {} recv_from_spdk_time_diff: dict = {} def print_stats(): - time.sleep(3) - t = time.time_ns() - read_line_time_diff_max = max(list(read_line_time_diff.values())) - read_line_time_diff_avg = int(sum(list(read_line_time_diff.values()))/len(read_line_time_diff)) - last_3_sec = [] - for k,v in read_line_time_diff.items(): - if k > t - 3*1000*1000*1000: - last_3_sec.append(v) - read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) - logger.info(f"Periodic stats: {t}: read_line_time max={read_line_time_diff_max}ns, avg={read_line_time_diff_avg}ns, last 3s avg={read_line_time_diff_avg_last_3_sec}ns") - if len(read_line_time_diff) > 10000: - read_line_time_diff.clear() - - recv_from_spdk_time_max = max(list(recv_from_spdk_time_diff.values())) - recv_from_spdk_time_avg = int(sum(list(recv_from_spdk_time_diff.values()))/len(recv_from_spdk_time_diff)) - last_3_sec = [] - for k,v in recv_from_spdk_time_diff.items(): - if k > t - 3*1000*1000*1000: - last_3_sec.append(v) - 
recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) - logger.info(f"Periodic stats: {t}: recv_from_spdk_time max={recv_from_spdk_time_max}ns, avg={recv_from_spdk_time_avg}ns, last 3s avg={recv_from_spdk_time_avg_last_3_sec}ns") - if len(recv_from_spdk_time_diff) > 10000: - recv_from_spdk_time_diff.clear() + while True: + try: + time.sleep(3) + t = time.time_ns() + read_line_time_diff_max = max(list(read_line_time_diff.values())) + read_line_time_diff_avg = int(sum(list(read_line_time_diff.values()))/len(read_line_time_diff)) + last_3_sec = [] + for k,v in read_line_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + logger.info(f"Periodic stats: {t}: read_line_time max={read_line_time_diff_max}ns, avg={read_line_time_diff_avg}ns, last 3s avg={read_line_time_diff_avg_last_3_sec}ns") + if len(read_line_time_diff) > 10000: + read_line_time_diff.clear() + + recv_from_spdk_time_max = max(list(recv_from_spdk_time_diff.values())) + recv_from_spdk_time_avg = int(sum(list(recv_from_spdk_time_diff.values()))/len(recv_from_spdk_time_diff)) + last_3_sec = [] + for k,v in recv_from_spdk_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + logger.info(f"Periodic stats: {t}: recv_from_spdk_time max={recv_from_spdk_time_max}ns, avg={recv_from_spdk_time_avg}ns, last 3s avg={recv_from_spdk_time_avg_last_3_sec}ns") + if len(recv_from_spdk_time_diff) > 10000: + recv_from_spdk_time_diff.clear() + except Exception as e: + logger.error(e) def get_env_var(name, default=None, is_required=False): From d1636621037e385a11af6a2d6997d188c80f73ae Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 8 Dec 2025 13:50:22 +0100 Subject: [PATCH 095/192] Fdb health check (#809) * check fdb connection string * check fdb connection string * remove fdb cluster uuid * removed 
simplyblock crd --- simplyblock_web/api/v1/__init__.py | 31 ++++++++++++++++++++++++++++++ simplyblock_web/auth_middleware.py | 2 ++ 2 files changed, 33 insertions(+) diff --git a/simplyblock_web/api/v1/__init__.py b/simplyblock_web/api/v1/__init__.py index 4bcc5ba41..084a737cc 100644 --- a/simplyblock_web/api/v1/__init__.py +++ b/simplyblock_web/api/v1/__init__.py @@ -1,9 +1,12 @@ import logging +import os +from flask import jsonify from flask import Flask from simplyblock_web.auth_middleware import token_required from simplyblock_web import utils +from simplyblock_core import constants from . import cluster from . import mgmt_node @@ -39,3 +42,31 @@ def before_request(): @api.route('/', methods=['GET']) def status(): return utils.get_response("Live") + +@api.route('/health/fdb', methods=['GET']) +def health_fdb(): + fdb_cluster_file = constants.KVD_DB_FILE_PATH + + if not os.path.exists(fdb_cluster_file): + return jsonify({ + "fdb_connected": False, + "message": "FDB cluster file not found" + }), 503 + + try: + with open(fdb_cluster_file, 'r') as f: + cluster_data = f.read().strip() + if not cluster_data: + return jsonify({ + "fdb_connected": False, + "message": "FDB cluster file is empty" + }), 503 + except Exception as e: + return jsonify({ + "fdb_connected": False, + "message": f"Failed to read FDB cluster file: {str(e)}" + }), 503 + + return jsonify({ + "fdb_connected": True, + }), 200 diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py index 70755b46a..87449cb64 100644 --- a/simplyblock_web/auth_middleware.py +++ b/simplyblock_web/auth_middleware.py @@ -36,6 +36,8 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType: return cast(ResponseType, f(*args, **kwargs)) if request.method == "POST" and request.path.startswith("/cluster/create_first"): return cast(ResponseType, f(*args, **kwargs)) + if request.method == "GET" and request.path.startswith("/health/fdb"): + return cast(ResponseType, f(*args, **kwargs)) cluster_id: 
str = "" cluster_secret: str = "" From 3a3e7b54fb9bec9665752c4521b689af75347337 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 16:07:29 +0300 Subject: [PATCH 096/192] Fix 2 --- simplyblock_core/services/spdk_http_proxy_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 75ffc4f5c..96e976ef3 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -24,6 +24,7 @@ read_line_time_diff: dict = {} recv_from_spdk_time_diff: dict = {} def print_stats(): + global read_line_time_diff, recv_from_spdk_time_diff while True: try: time.sleep(3) From 93f7c8cd105a8b74b75da0ff35f858681e20339e Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 16:11:55 +0300 Subject: [PATCH 097/192] Fix 2 --- simplyblock_core/services/spdk_http_proxy_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 96e976ef3..64ad95615 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -65,6 +65,7 @@ def get_env_var(name, default=None, is_required=False): unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): + global recv_from_spdk_time_diff logger.info(f"active threads: {threading.active_count()}") logger.info(f"active unix sockets: {len(unix_sockets)}") req_data = json.loads(req.decode('ascii')) @@ -138,6 +139,7 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): + global read_line_time_diff req_time = time.time_ns() self.server_session.append(req_time) logger.info(f"incoming request at: {req_time}") From db31b6ebacb7875452a4920838ace6040a73767a Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 16:38:13 +0300 Subject: [PATCH 098/192] Fix 3 --- 
.../services/spdk_http_proxy_server.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 64ad95615..b9f0fd790 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -24,7 +24,6 @@ read_line_time_diff: dict = {} recv_from_spdk_time_diff: dict = {} def print_stats(): - global read_line_time_diff, recv_from_spdk_time_diff while True: try: time.sleep(3) @@ -35,8 +34,11 @@ def print_stats(): for k,v in read_line_time_diff.items(): if k > t - 3*1000*1000*1000: last_3_sec.append(v) - read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) - logger.info(f"Periodic stats: {t}: read_line_time max={read_line_time_diff_max}ns, avg={read_line_time_diff_avg}ns, last 3s avg={read_line_time_diff_avg_last_3_sec}ns") + if len(last_3_sec) > 0: + read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + read_line_time_diff_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: read_line_time: max={read_line_time_diff_max} ns, avg={read_line_time_diff_avg} ns, last_3s_avg={read_line_time_diff_avg_last_3_sec} ns") if len(read_line_time_diff) > 10000: read_line_time_diff.clear() @@ -46,8 +48,11 @@ def print_stats(): for k,v in recv_from_spdk_time_diff.items(): if k > t - 3*1000*1000*1000: last_3_sec.append(v) - recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) - logger.info(f"Periodic stats: {t}: recv_from_spdk_time max={recv_from_spdk_time_max}ns, avg={recv_from_spdk_time_avg}ns, last 3s avg={recv_from_spdk_time_avg_last_3_sec}ns") + if len(last_3_sec) > 0: + recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + recv_from_spdk_time_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: recv_from_spdk_time: max={recv_from_spdk_time_max} ns, avg={recv_from_spdk_time_avg} 
ns, last_3s_avg={recv_from_spdk_time_avg_last_3_sec} ns") if len(recv_from_spdk_time_diff) > 10000: recv_from_spdk_time_diff.clear() except Exception as e: @@ -65,7 +70,6 @@ def get_env_var(name, default=None, is_required=False): unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): - global recv_from_spdk_time_diff logger.info(f"active threads: {threading.active_count()}") logger.info(f"active unix sockets: {len(unix_sockets)}") req_data = json.loads(req.decode('ascii')) @@ -139,7 +143,6 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): - global read_line_time_diff req_time = time.time_ns() self.server_session.append(req_time) logger.info(f"incoming request at: {req_time}") @@ -147,11 +150,11 @@ def do_POST(self): if self.headers['Authorization'] != 'Basic ' + self.key: self.do_AUTHHEAD() else: + read_line_time_start = time.time_ns() if "Content-Length" in self.headers: data_string = self.rfile.read(int(self.headers['Content-Length'])) elif "chunked" in self.headers.get("Transfer-Encoding", ""): data_string = b'' - read_line_time_start = time.time_ns() while True: line = self.rfile.readline().strip() chunk_length = int(line, 16) @@ -167,10 +170,10 @@ def do_POST(self): # Finally, a chunk size of 0 is an end indication if chunk_length == 0: break - read_line_time_end = time.time_ns() - time_diff = read_line_time_end - read_line_time_start - logger.info(f"read_line_time_diff: {time_diff}") - read_line_time_diff[read_line_time_start] = time_diff + read_line_time_end = time.time_ns() + time_diff = read_line_time_end - read_line_time_start + logger.info(f"read_line_time_diff: {time_diff}") + read_line_time_diff[read_line_time_start] = time_diff try: response = rpc_call(data_string) if response is not None: From e344c9336d6c7f0ae233d72cb63868c2a2eded66 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 8 Dec 2025 16:02:58 +0300 Subject: [PATCH 099/192] Fix sfam-2515 check JM replication status on sec before dropping 
leadership during node restart and node down>online status change --- .../controllers/storage_events.py | 12 ++++++++++ simplyblock_core/models/storage_node.py | 22 ++++++++++++++++++- .../services/tasks_runner_port_allow.py | 8 +++++++ simplyblock_core/storage_node_ops.py | 6 +++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py index b73890cd8..bd5a9eb8d 100644 --- a/simplyblock_core/controllers/storage_events.py +++ b/simplyblock_core/controllers/storage_events.py @@ -72,3 +72,15 @@ def snode_rpc_timeout(node, timeout_seconds, caused_by=ec.CAUSED_BY_MONITOR): event_level=EventObj.LEVEL_WARN, message=f"Storage node RPC timeout detected after {timeout_seconds} seconds", node_id=node.get_id()) + + +def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR): + ec.log_event_cluster( + cluster_id=node.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=node, + caused_by=caused_by, + event_level=EventObj.LEVEL_WARN, + message=f"JM replication task found for jm {jm_vuid}", + node_id=node.get_id()) diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 81639c556..45abceec9 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -1,5 +1,5 @@ # coding=utf-8 - +import time from typing import List from uuid import uuid4 @@ -302,3 +302,23 @@ def create_alceml(self, name, nvme_bdev, uuid, **kwargs): alceml_worker_cpu_mask=alceml_worker_cpu_mask, **kwargs, ) + + def wait_for_jm_rep_tasks_to_finish(self, jm_vuid): + retry = 10 + while retry > 0: + try: + jm_replication_tasks = False + ret = self.rpc_client().jc_get_jm_status(jm_vuid) + for jm in ret: + if ret[jm] is False: # jm is not ready (has active replication task) + jm_replication_tasks = True + break + if jm_replication_tasks: + logger.warning(f"Replication task found on node: 
{self.get_id()}, jm_vuid: {jm_vuid}, retry...") + retry -= 1 + time.sleep(20) + else: + return True + except Exception: + logger.warning("Failed to get replication task!") + return False diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index e95dbdf94..96ffc4664 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -206,6 +206,14 @@ if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: sec_rpc_client = sec_node.rpc_client() + ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid) + if not ret: + msg = "JM replication task found on secondary" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True) port_number = task.function_params["port_number"] diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 9b6630680..ec4f8e514 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -3177,6 +3177,12 @@ def recreate_lvstore(snode, force=False): time.sleep(0.5) ### 4- set leadership to false + ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid) + if not ret: + msg = f"JM replication task found for jm {snode.jm_vuid}" + logger.error(msg) + storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid) + sec_rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False, bs_nonleadership=True) sec_rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) ### 4-1 check for inflight IO. 
retry every 100ms up to 10 seconds From 82600bad315a8f9b1a08b8aebefe145100e23bc9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 9 Dec 2025 13:41:39 +0300 Subject: [PATCH 100/192] Fix sfam-2524 Do not cancel snapshot replication task on node shutdown --- simplyblock_core/storage_node_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 94a44c4dd..913cbe5d0 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -2253,7 +2253,8 @@ def shutdown_storage_node(node_id, force=False): if force is False: return False for task in tasks: - if task.function_name != JobSchedule.FN_NODE_RESTART: + if task.function_name not in [ + JobSchedule.FN_NODE_RESTART, JobSchedule.FN_SNAPSHOT_REPLICATION, JobSchedule.FN_LVOL_SYNC_DEL]: tasks_controller.cancel_task(task.uuid) logger.info("Shutting down node") From 70d38644bd6706c859031f7929a50ce0043ab2bc Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 9 Dec 2025 14:04:33 +0300 Subject: [PATCH 101/192] Fix sfam-2523 Show task status to be canceled when replication task status is done and cancel flag is true --- simplyblock_core/controllers/snapshot_controller.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 744c36d90..83ad36494 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -631,6 +631,9 @@ def list_replication_tasks(cluster_id): task.function_params["end_time"] - task.function_params["start_time"]) except Exception as e: logger.error(e) + status = task.status + if task.canceled: + status = "cancelled" offset = 0 if "offset" in task.function_params: offset = task.function_params["offset"] @@ -640,7 +643,7 @@ def list_replication_tasks(cluster_id): "Size": 
utils.humanbytes(snap.used_size), "Duration": duration, "Offset": offset, - "Status": task.status, + "Status": status, "Replicated on node": snap.lvol.node_id, }) return utils.print_table(data) From 6a9357ca7bda0009563e5a823fa29942d501371b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 10 Dec 2025 17:58:41 +0300 Subject: [PATCH 102/192] Fix sfam-2502 Fix the check for JM repl tasks to be before blocking the port --- simplyblock_core/storage_node_ops.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index ec4f8e514..ccfdbebe0 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -3172,17 +3172,18 @@ def recreate_lvstore(snode, force=False): port_type = "tcp" if sec_node.active_rdma: port_type = "udp" - fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port) - tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port) - time.sleep(0.5) - ### 4- set leadership to false ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid) if not ret: msg = f"JM replication task found for jm {snode.jm_vuid}" logger.error(msg) storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid) + fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port) + tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port) + + time.sleep(0.5) + ### 4- set leadership to false sec_rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False, bs_nonleadership=True) sec_rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) ### 4-1 check for inflight IO. 
retry every 100ms up to 10 seconds From 3b40dc10afdb8a4730090bdeca27d88df041dc3d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 11 Dec 2025 16:06:36 +0300 Subject: [PATCH 103/192] Fix sfam-2527 Fix snapshot chaining --- .../services/snapshot_replication.py | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 704c6351b..6e172b303 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -123,41 +123,43 @@ def process_snap_replicate_finish(task, snapshot): snode = db.get_storage_node_by_id(snapshot.lvol.node_id) snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) remote_snode = db.get_storage_node_by_id(remote_lv.node_id) - replicate_to_source = task.function_params["replicate_to_source"] + target_prev_snap = None if replicate_to_source: org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) - snapshot_lvol_id = org_snap.lvol.get_id() + snaps = db.get_snapshots(org_snap.cluster_id) + for sn in snaps: + if sn.lvol.get_id() == org_snap.lvol.get_id(): + try: + target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) + break + except KeyError: + logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") + else: - snapshot_lvol_id = snapshot.lvol.get_id() + snaps = db.get_snapshots(snapshot.cluster_id) + for sn in snaps: + if sn.lvol.get_id() == snapshot.lvol.get_id(): + try: + target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) + break + except KeyError: + logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") # chain snaps on primary - snaps = db.get_snapshots(remote_snode.cluster_id) - for sn in snaps: - if sn.lvol.get_id() == snapshot_lvol_id: - try: - target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) - logger.info(f"Chaining replicated lvol: 
{remote_lv.top_bdev} to snap: {sn.snap_bdev}") - remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) - break - except KeyError: - logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") + if target_prev_snap: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {target_prev_snap.snap_bdev}") + remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) # convert to snapshot on primary remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) - sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) # chain snaps on secondary + sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) if sec_node.status == StorageNode.STATUS_ONLINE: - for sn in snaps: - if sn.lvol.get_id() == snapshot_lvol_id: - try: - target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) - logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") - sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) - break - except KeyError: - logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") + if target_prev_snap: + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") + sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) # convert to snapshot on secondary sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) From bc6470475c8db8bfce58c571957615d0271f168f Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Fri, 12 Dec 2025 14:51:24 +0300 Subject: [PATCH 104/192] Add task API to v2 (#818) * Add task API to v2 * Add task API to v2 _2 * Add task API to v2 _3 * Add task API to v2 _4 --- simplyblock_web/api/v2/__init__.py | 3 ++- simplyblock_web/api/v2/task.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/__init__.py b/simplyblock_web/api/v2/__init__.py index ff8511e1c..c3723cce6 100644 --- a/simplyblock_web/api/v2/__init__.py +++ b/simplyblock_web/api/v2/__init__.py @@ -10,6 +10,7 @@ from . import pool from . import snapshot from . import storage_node +from . import task from simplyblock_core.db_controller import DBController @@ -37,7 +38,7 @@ def _verify_api_token( storage_node.api.include_router(storage_node.instance_api) cluster.instance_api.include_router(storage_node.api) - +cluster.instance_api.include_router(task.api) volume.api.include_router(volume.instance_api) pool.instance_api.include_router(volume.api) diff --git a/simplyblock_web/api/v2/task.py b/simplyblock_web/api/v2/task.py index c17bec3b7..83890640f 100644 --- a/simplyblock_web/api/v2/task.py +++ b/simplyblock_web/api/v2/task.py @@ -40,3 +40,5 @@ def _lookup_task(task_id: UUID) -> JobSchedule: @instance_api.get('/', name='clusters:tasks:detail') def get(cluster: Cluster, task: Task) -> TaskDTO: return TaskDTO.from_model(task) + +api.include_router(instance_api) From 196b93c407b7cf68bf5201ed4b82caeb845b0a2d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 12 Dec 2025 17:13:02 +0300 Subject: [PATCH 105/192] Increase snapshot replication task retry on node not online --- simplyblock_core/services/snapshot_replication.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 6e172b303..05e1956f4 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ 
-206,9 +206,9 @@ def task_runner(task: JobSchedule): task.write_to_db(db.kv_store) return True - snode = db.get_storage_node_by_id(snapshot.lvol.node_id) - - if not snode: + try: + snode = db.get_storage_node_by_id(snapshot.lvol.node_id) + except KeyError: task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) @@ -217,6 +217,7 @@ def task_runner(task: JobSchedule): if snode.status != StorageNode.STATUS_ONLINE: task.function_result = "node is not online, retrying" task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 task.write_to_db(db.kv_store) return False From d81835a5f849f5107fb2164419652e8cebd50066 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 00:59:40 +0300 Subject: [PATCH 106/192] fix sfam-2516 _1 --- simplyblock_core/cluster_ops.py | 4 -- .../controllers/tasks_controller.py | 11 ++-- .../services/snapshot_replication.py | 60 +++++++++++-------- 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 7eecde07e..193e3660d 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1171,10 +1171,6 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, logger.info("Updating mgmt cluster") if cluster.mode == "docker": - sbcli=constants.SIMPLY_BLOCK_CLI_NAME - subprocess.check_call(f"pip install {sbcli} --upgrade".split(' ')) - logger.info(f"{sbcli} upgraded") - cluster_docker = utils.get_docker_client(cluster_id) logger.info(f"Pulling image {constants.SIMPLY_BLOCK_DOCKER_IMAGE}") pull_docker_image_with_retry(cluster_docker, constants.SIMPLY_BLOCK_DOCKER_IMAGE) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 34e717ce0..444a67559 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -77,7 +77,8 @@ def 
_add_task(function_name, cluster_id, node_id, device_id, return False elif function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: - task_id = get_snapshot_replication_task(cluster_id, function_params['snapshot_id']) + task_id = get_snapshot_replication_task( + cluster_id, function_params['snapshot_id'], function_params['replicate_to_source']) if task_id: logger.info(f"Task found, skip adding new task: {task_id}") return False @@ -416,15 +417,17 @@ def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): return task.uuid return False -def get_snapshot_replication_task(cluster_id, snapshot_id): +def get_snapshot_replication_task(cluster_id, snapshot_id, replicate_to_source): tasks = db.get_job_tasks(cluster_id) for task in tasks: if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION and task.function_params["snapshot_id"] == snapshot_id: if task.status != JobSchedule.STATUS_DONE and task.canceled is False: - return task.uuid + if task.function_params["replicate_to_source"] == replicate_to_source: + return task.uuid return False def add_snapshot_replication_task(cluster_id, node_id, snapshot_id, replicate_to_source=False): return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, cluster_id, node_id, "", - function_params={"snapshot_id": snapshot_id, "replicate_to_source": replicate_to_source}) + function_params={"snapshot_id": snapshot_id, "replicate_to_source": replicate_to_source}, + send_to_cluster_log=False) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 05e1956f4..cd35a307a 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -22,7 +22,7 @@ def process_snap_replicate_start(task, snapshot): replicate_to_source = task.function_params["replicate_to_source"] if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"]: if replicate_to_source: - org_snap = 
db.get_snapshot_by_id(snapshot.lvol.source_replicated_snap_uuid) + org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) remote_node_uuid = db.get_storage_node_by_id(org_snap.lvol.node_id) remote_pool_uuid = org_snap.lvol.pool_uuid else: # replicate to target @@ -113,11 +113,6 @@ def process_snap_replicate_start(task, snapshot): def process_snap_replicate_finish(task, snapshot): - task.function_result = "Done" - task.status = JobSchedule.STATUS_DONE - task.function_params["end_time"] = int(time.time()) - task.write_to_db() - # detach remote lvol remote_lv = db.get_lvol_by_id(task.function_params["remote_lvol_id"]) snode = db.get_storage_node_by_id(snapshot.lvol.node_id) @@ -126,18 +121,13 @@ def process_snap_replicate_finish(task, snapshot): replicate_to_source = task.function_params["replicate_to_source"] target_prev_snap = None if replicate_to_source: - org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) - snaps = db.get_snapshots(org_snap.cluster_id) - for sn in snaps: - if sn.lvol.get_id() == org_snap.lvol.get_id(): - try: - target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) - break - except KeyError: - logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") - + org_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + try: + target_prev_snap = db.get_snapshot_by_id(org_snap.source_replicated_snap_uuid) + except KeyError: + logger.info(f"Snapshot {org_snap.source_replicated_snap_uuid} not found") else: - snaps = db.get_snapshots(snapshot.cluster_id) + snaps = db.get_snapshots(remote_snode.cluster_id) for sn in snaps: if sn.lvol.get_id() == snapshot.lvol.get_id(): try: @@ -149,20 +139,32 @@ def process_snap_replicate_finish(task, snapshot): # chain snaps on primary if target_prev_snap: logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {target_prev_snap.snap_bdev}") - remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + ret = 
remote_snode.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + if not ret: + logger.error("Failed to chain replicated snapshot on primary node") + return False # convert to snapshot on primary - remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + ret = remote_snode.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + if not ret: + logger.error("Failed to convert to snapshot on primary node") + return False # chain snaps on secondary sec_node = db.get_storage_node_by_id(remote_snode.secondary_node_id) if sec_node.status == StorageNode.STATUS_ONLINE: if target_prev_snap: - logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {sn.snap_bdev}") - sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + logger.info(f"Chaining replicated lvol: {remote_lv.top_bdev} to snap: {target_prev_snap.snap_bdev}") + ret = sec_node.rpc_client().bdev_lvol_add_clone(target_prev_snap.snap_bdev, remote_lv.top_bdev) + if not ret: + logger.error("Failed to chain replicated snapshot on secondary node") + return False # convert to snapshot on secondary - sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + ret = sec_node.rpc_client().bdev_lvol_convert(remote_lv.top_bdev) + if not ret: + logger.error("Failed to convert to snapshot on secondary node") + return False new_snapshot_uuid = str(uuid.uuid4()) @@ -195,7 +197,7 @@ def process_snap_replicate_finish(task, snapshot): remote_lv.remove(db.kv_store) snapshot_events.replication_task_finished(snapshot) - return True + return new_snapshot_uuid def task_runner(task: JobSchedule): @@ -269,7 +271,17 @@ def task_runner(task: JobSchedule): task.write_to_db() return False if status == "Done": - process_snap_replicate_finish(task, snapshot) + new_snapshot_uuid = process_snap_replicate_finish(task, snapshot) + if new_snapshot_uuid: + task.function_result = new_snapshot_uuid + task.status = JobSchedule.STATUS_DONE + 
task.function_params["end_time"] = int(time.time()) + task.write_to_db() + else: + task.function_result = f"complete repl failed, retrying" + task.status = JobSchedule.STATUS_SUSPENDED + task.retry += 1 + task.write_to_db() return True From 5c1dd949fa1298ba057a761de8d1880a942a9cec Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 01:20:48 +0300 Subject: [PATCH 107/192] fix sfam-2516 _2 --- simplyblock_core/services/snapshot_replication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index cd35a307a..fa8ce255a 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -23,7 +23,7 @@ def process_snap_replicate_start(task, snapshot): if "remote_lvol_id" not in task.function_params or not task.function_params["remote_lvol_id"]: if replicate_to_source: org_snap = db.get_snapshot_by_id(snapshot.source_replicated_snap_uuid) - remote_node_uuid = db.get_storage_node_by_id(org_snap.lvol.node_id) + remote_node_uuid = db.get_storage_node_by_id(task.node_id) remote_pool_uuid = org_snap.lvol.pool_uuid else: # replicate to target remote_node_uuid = db.get_storage_node_by_id(snapshot.lvol.replication_node_id) From 0b920c3b934e3f64901faf600b14ca3f167c34d8 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 01:40:56 +0300 Subject: [PATCH 108/192] fix sfam-2516 _3 --- simplyblock_core/controllers/snapshot_controller.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 83ad36494..79e701cda 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -644,7 +644,8 @@ def list_replication_tasks(cluster_id): "Duration": duration, "Offset": offset, "Status": status, - "Replicated on 
node": snap.lvol.node_id, + "Result": task.function_result, + "Cluster ID": task.cluster_id, }) return utils.print_table(data) From a485224d33cab1297a147fa9b2f2cf55aa9c04fa Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 10 Dec 2025 23:39:59 +0300 Subject: [PATCH 109/192] fix linter --- e2e/continuous_log_collector.py | 1 - e2e/e2e_tests/cluster_test_base.py | 2 +- e2e/stress_test/continuous_failover_ha_multi_client.py | 4 ++-- .../continuous_failover_ha_multi_client_quick_outage.py | 3 +-- e2e/utils/ssh_utils.py | 3 ++- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py index 96b157760..d1ea68c38 100644 --- a/e2e/continuous_log_collector.py +++ b/e2e/continuous_log_collector.py @@ -1,6 +1,5 @@ import os from datetime import datetime -from pathlib import Path from utils.ssh_utils import SshUtils, RunnerK8sLog from logger_config import setup_logger diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py index 15743725b..d37222c88 100644 --- a/e2e/e2e_tests/cluster_test_base.py +++ b/e2e/e2e_tests/cluster_test_base.py @@ -405,7 +405,7 @@ def collect_management_details(self, post_teardown=False): self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) node+=1 - all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines: + all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines for node in all_nodes: base_path = os.path.join(self.docker_logs_path, node) cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt" diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py index a97c42676..0f0c9f94e 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client.py @@ -329,7 +329,7 @@ def perform_random_outage(self): for node in self.sn_nodes_with_sec: # 
self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], # storage_node_id=node) - self.logger.info(f"Skipping lvstore dump!!") + self.logger.info("Skipping lvstore dump!!") for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -663,7 +663,7 @@ def restart_nodes_after_failover(self, outage_type, restart=False): for node in self.sn_nodes_with_sec: # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], # storage_node_id=node) - self.logger.info(f"Skipping lvstore dump!!") + self.logger.info("Skipping lvstore dump!!") def create_snapshots_and_clones(self): """Create snapshots and clones during an outage.""" diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py index afa98b055..c2c1051a2 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py @@ -306,7 +306,7 @@ def _seed_snapshots_and_clones(self): if err: nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"] self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) - self.logger.info(f"[LFNG] connect clone error → cleanup") + self.logger.info("[LFNG] connect clone error → cleanup") self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) sleep_n_sec(3) del self.clone_mount_details[clone_name] @@ -431,7 +431,6 @@ def _perform_outage(self): return outage_type def restart_nodes_after_failover(self, outage_type): - node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}") diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py index ee265d507..a50a61726 100644 --- a/e2e/utils/ssh_utils.py +++ b/e2e/utils/ssh_utils.py @@ -2891,7 +2891,8 @@ def 
stop_log_monitor(self): print("K8s log monitor thread stopped.") def _rid(n=6): - import string, random + import string + import random letters = string.ascii_uppercase digits = string.digits return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1)) From 9d145e251ce812f628d34748087a998138107ab0 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 22:02:04 +0300 Subject: [PATCH 110/192] fix sfam-2516 _4 --- simplyblock_cli/cli-reference.yaml | 7 ++++++ simplyblock_cli/cli.py | 7 ++++++ simplyblock_cli/clibase.py | 3 +++ .../controllers/snapshot_controller.py | 25 ++++++++++++++++++- simplyblock_core/models/snapshot.py | 4 ++- .../services/snapshot_replication.py | 24 ++++++++++-------- 6 files changed, 58 insertions(+), 12 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index d2e6c3a9d..b76f1b746 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1929,6 +1929,13 @@ commands: help: "Snapshot UUID" dest: snapshot_id type: str + - name: get + help: "Gets a snapshot information" + arguments: + - name: "snapshot_id" + help: "Snapshot UUID" + dest: snapshot_id + type: str - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index a09b5d893..2b004aaeb 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -766,6 +766,7 @@ def init_snapshot(self): self.init_snapshot__clone(subparser) self.init_snapshot__replication_status(subparser) self.init_snapshot__delete_replication_only(subparser) + self.init_snapshot__get(subparser) def init_snapshot__add(self, subparser): @@ -801,6 +802,10 @@ def init_snapshot__delete_replication_only(self, subparser): subcommand = self.add_sub_command(subparser, 'delete-replication-only', 'Delete replicated version of a snapshot') subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + def init_snapshot__get(self, subparser): + subcommand 
= self.add_sub_command(subparser, 'get', 'Gets a snapshot information') + subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -1156,6 +1161,8 @@ def run(self): ret = self.snapshot__replication_status(sub_command, args) elif sub_command in ['delete-replication-only']: ret = self.snapshot__delete_replication_only(sub_command, args) + elif sub_command in ['get']: + ret = self.snapshot__get(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index fdf516fcc..1617e8f10 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -656,6 +656,9 @@ def snapshot__replication_status(self, sub_command, args): def snapshot__delete_replication_only(self, sub_command, args): return snapshot_controller.delete_replicated(args.snapshot_id) + def snapshot__get(self, sub_command, args): + return snapshot_controller.get(args.snapshot_id) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 79e701cda..de998f033 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -1,4 +1,5 @@ # coding=utf-8 +import json import logging as lg import time import uuid @@ -218,7 +219,14 @@ def add(lvol_id, snapshot_name): snap.snap_ref_id = original_snap.get_id() snap.write_to_db(db_controller.kv_store) - logger.info("Done") + for sn in db_controller.get_snapshots(cluster.get_id()): + if sn.lvol.get_id() == lvol_id: + if not sn.next_snap_uuid: + sn.next_snap_uuid = snap.get_id() + snap.prev_snap_uuid = sn.get_id() + sn.write_to_db() + snap.write_to_db() + snapshot_events.snapshot_create(snap) if lvol.do_replicate: task = 
tasks_controller.add_snapshot_replication_task(snap.cluster_id, snap.lvol.node_id, snap.get_id()) @@ -634,6 +642,10 @@ def list_replication_tasks(cluster_id): status = task.status if task.canceled: status = "cancelled" + replicate_to = "target" + if "replicate_to_source" in task.function_params: + if task.function_params["replicate_to_source"] is True: + replicate_to = "source" offset = 0 if "offset" in task.function_params: offset = task.function_params["offset"] @@ -644,6 +656,7 @@ def list_replication_tasks(cluster_id): "Duration": duration, "Offset": offset, "Status": status, + "Replicate to": replicate_to, "Result": task.function_result, "Cluster ID": task.cluster_id, }) @@ -670,3 +683,13 @@ def delete_replicated(snapshot_id): return False return True + + +def get(snapshot_uuid): + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + return json.dumps(snap.get_clean_dict(), indent=2) diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index fb7056b9c..4be27c328 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -31,4 +31,6 @@ class SnapShot(BaseModel): status: str = "" fabric: str = "tcp" target_replicated_snap_uuid: str = "" - source_replicated_snap_uuid: str = "" \ No newline at end of file + source_replicated_snap_uuid: str = "" + next_snap_uuid: str = "" + prev_snap_uuid: str = "" \ No newline at end of file diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index fa8ce255a..7e9e5daa8 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -124,17 +124,16 @@ def process_snap_replicate_finish(task, snapshot): org_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) try: target_prev_snap = db.get_snapshot_by_id(org_snap.source_replicated_snap_uuid) - except 
KeyError: - logger.info(f"Snapshot {org_snap.source_replicated_snap_uuid} not found") + except KeyError as e: + logger.error(e) else: - snaps = db.get_snapshots(remote_snode.cluster_id) - for sn in snaps: - if sn.lvol.get_id() == snapshot.lvol.get_id(): - try: - target_prev_snap = db.get_snapshot_by_id(sn.target_replicated_snap_uuid) - break - except KeyError: - logger.info(f"Snapshot {sn.target_replicated_snap_uuid} not found") + if snapshot.prev_snap_uuid: + try: + prev_snap = db.get_snapshot_by_id(snapshot.prev_snap_uuid) + if prev_snap.target_replicated_snap_uuid: + target_prev_snap = db.get_snapshot_by_id(prev_snap.target_replicated_snap_uuid) + except KeyError as e: + logger.error(e) # chain snaps on primary if target_prev_snap: @@ -187,6 +186,11 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.created_at = int(time.time()) new_snapshot.source_replicated_snap_uuid = snapshot.uuid new_snapshot.status = SnapShot.STATUS_ONLINE + if target_prev_snap: + new_snapshot.prev_snap_uuid = target_prev_snap.get_id() + target_prev_snap.next_snap_uuid = new_snapshot_uuid + target_prev_snap.write_to_db() + new_snapshot.write_to_db() From 0db9022515d620afd6b675ddbd786d6250d4a321 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 22:04:35 +0300 Subject: [PATCH 111/192] wip --- simplyblock_core/controllers/snapshot_controller.py | 1 + simplyblock_core/services/snapshot_replication.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index de998f033..3991ba81a 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -226,6 +226,7 @@ def add(lvol_id, snapshot_name): snap.prev_snap_uuid = sn.get_id() sn.write_to_db() snap.write_to_db() + break snapshot_events.snapshot_create(snap) if lvol.do_replicate: diff --git 
a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 7e9e5daa8..4af72bdc0 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -193,7 +193,6 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.write_to_db() - # delete lvol object remote_lv.bdev_stack = [] remote_lv.write_to_db() @@ -282,7 +281,7 @@ def task_runner(task: JobSchedule): task.function_params["end_time"] = int(time.time()) task.write_to_db() else: - task.function_result = f"complete repl failed, retrying" + task.function_result = "complete repl failed, retrying" task.status = JobSchedule.STATUS_SUSPENDED task.retry += 1 task.write_to_db() From ff05ec645c2eac4621df42650b69f1e4a0f06670 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 22:34:27 +0300 Subject: [PATCH 112/192] wip --- simplyblock_cli/cli-reference.yaml | 16 +++++++++++++++ simplyblock_cli/cli.py | 14 +++++++++++++ simplyblock_cli/clibase.py | 3 +++ .../controllers/snapshot_controller.py | 20 +++++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index b76f1b746..1efe3085c 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1936,6 +1936,22 @@ commands: help: "Snapshot UUID" dest: snapshot_id type: str + - name: set + help: "set snapshot db value" + private: true + arguments: + - name: "snapshot_id" + help: "snapshot id" + dest: snapshot_id + type: str + - name: "attr_name" + help: "attr_name" + dest: attr_name + type: str + - name: "attr_value" + help: "attr_value" + dest: attr_value + type: str - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 2b004aaeb..2d34c1485 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -767,6 +767,8 @@ def init_snapshot(self): 
self.init_snapshot__replication_status(subparser) self.init_snapshot__delete_replication_only(subparser) self.init_snapshot__get(subparser) + if self.developer_mode: + self.init_snapshot__set(subparser) def init_snapshot__add(self, subparser): @@ -806,6 +808,12 @@ def init_snapshot__get(self, subparser): subcommand = self.add_sub_command(subparser, 'get', 'Gets a snapshot information') subcommand.add_argument('snapshot_id', help='Snapshot UUID', type=str) + def init_snapshot__set(self, subparser): + subcommand = self.add_sub_command(subparser, 'set', 'set snapshot db value') + subcommand.add_argument('snapshot_id', help='snapshot id', type=str) + subcommand.add_argument('attr_name', help='attr_name', type=str) + subcommand.add_argument('attr_value', help='attr_value', type=str) + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -1163,6 +1171,12 @@ def run(self): ret = self.snapshot__delete_replication_only(sub_command, args) elif sub_command in ['get']: ret = self.snapshot__get(sub_command, args) + elif sub_command in ['set']: + if not self.developer_mode: + print("This command is private.") + ret = False + else: + ret = self.snapshot__set(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 1617e8f10..fcdceee36 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -659,6 +659,9 @@ def snapshot__delete_replication_only(self, sub_command, args): def snapshot__get(self, sub_command, args): return snapshot_controller.get(args.snapshot_id) + def snapshot__set(self, sub_command, args): + return snapshot_controller.set(args.snapshot_id, args.attr_name, args.attr_value) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index 3991ba81a..a045f118f 100644 --- 
a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -220,6 +220,8 @@ def add(lvol_id, snapshot_name): snap.write_to_db(db_controller.kv_store) for sn in db_controller.get_snapshots(cluster.get_id()): + if sn.get_id() == snap.get_id(): + continue if sn.lvol.get_id() == lvol_id: if not sn.next_snap_uuid: sn.next_snap_uuid = snap.get_id() @@ -694,3 +696,21 @@ def get(snapshot_uuid): return False return json.dumps(snap.get_clean_dict(), indent=2) + + +def set(snapshot_uuid, attr, value) -> bool: + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + if attr not in snap.get_attrs_map(): + raise KeyError('Attribute not found') + + value = snap.get_attrs_map()[attr]['type'](value) + logger.info(f"Setting {attr} to {value}") + setattr(snap, attr, value) + snap.write_to_db() + return True + From 87861690b6e0fdd08d7d22ea69106dde696834a2 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 23:20:06 +0300 Subject: [PATCH 113/192] wip 2 --- simplyblock_cli/cli-reference.yaml | 5 +++++ simplyblock_cli/clibase.py | 2 +- simplyblock_core/controllers/snapshot_controller.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 1efe3085c..54e66ff25 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1880,6 +1880,11 @@ commands: dest: cluster_id type: str required: false + - name: "--with-details" + help: "List snapshots with replicate and chaining details" + dest: with_details + type: bool + action: store_true - name: delete help: "Deletes a snapshot" arguments: diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index fcdceee36..18d1e9b5f 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -636,7 +636,7 @@ def 
snapshot__add(self, sub_command, args): return snapshot_id if not error else error def snapshot__list(self, sub_command, args): - return snapshot_controller.list(args.all, args.cluster_id) + return snapshot_controller.list(args.all, args.cluster_id, args.with_details) def snapshot__delete(self, sub_command, args): return snapshot_controller.delete(args.snapshot_id, args.force) diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index a045f118f..7494a240e 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -247,14 +247,14 @@ def add(lvol_id, snapshot_name): return snap.uuid, False -def list(all=False, cluster_id=None): +def list(all=False, cluster_id=None, with_details=False): snaps = db_controller.get_snapshots(cluster_id) data = [] for snap in snaps: logger.debug(snap) if snap.deleted is True and all is False: continue - data.append({ + d = { "UUID": snap.uuid, "Name": snap.snap_name, "Size": utils.humanbytes(snap.used_size), @@ -264,7 +264,13 @@ def list(all=False, cluster_id=None): "Created At": time.strftime("%H:%M:%S, %d/%m/%Y", time.gmtime(snap.created_at)), "Health": snap.health_check, "Status": snap.status, - }) + } + if with_details: + d["Replication target snap"] = snap.target_replicated_snap_uuid + d["Replication source snap"] = snap.source_replicated_snap_uuid + d["Rrev snap"] = snap.prev_snap_uuid + d["Next snap"] = snap.next_snap_uuid + data.append(d) return utils.print_table(data) From 4734c2d23a9a04cbdc7c29fef9b1f54e20d72ee5 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sat, 13 Dec 2025 23:21:59 +0300 Subject: [PATCH 114/192] wip 2 --- simplyblock_cli/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 2d34c1485..4c051c86f 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -780,6 +780,7 @@ def init_snapshot__list(self, subparser): 
subcommand = self.add_sub_command(subparser, 'list', 'Lists all snapshots') argument = subcommand.add_argument('--all', help='List soft deleted snapshots', dest='all', action='store_true') argument = subcommand.add_argument('--cluster-id', help='Filter snapshots by cluster UUID', type=str, dest='cluster_id', required=False) + argument = subcommand.add_argument('--with-details', help='List snapshots with replicate and chaining details', dest='with_details', action='store_true') def init_snapshot__delete(self, subparser): subcommand = self.add_sub_command(subparser, 'delete', 'Deletes a snapshot') From b1599511d93d21f76b006d784e51b6a6c9c71a48 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Mon, 15 Dec 2025 13:59:44 +0300 Subject: [PATCH 115/192] Show number of devices on storage node response (#819) show used number of cpus beaed on the spdk_mask --- simplyblock_core/storage_node_ops.py | 3 +-- simplyblock_web/api/v2/dtos.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index ccfdbebe0..582fe918d 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1108,8 +1108,6 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.active_tcp=active_tcp snode.active_rdma=active_rdma - if 'cpu_count' in node_info: - snode.cpu = node_info['cpu_count'] if 'cpu_hz' in node_info: snode.cpu_hz = node_info['cpu_hz'] if 'memory' in node_info: @@ -1117,6 +1115,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, if 'hugepages' in node_info: snode.hugepages = node_info['hugepages'] + snode.cpu = len(utils.hexa_to_cpu_list(spdk_cpu_mask)) snode.l_cores = l_cores or "" snode.spdk_cpu_mask = spdk_cpu_mask or "" snode.spdk_mem = minimum_hp_memory diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index f8ba77ae7..c44333d29 100644 --- a/simplyblock_web/api/v2/dtos.py +++ 
b/simplyblock_web/api/v2/dtos.py @@ -155,6 +155,7 @@ class StorageNodeDTO(BaseModel): status: str mgmt_ip: IPv4Address health_check: bool + online_devices: str @staticmethod def from_model(model: StorageNode): @@ -163,6 +164,7 @@ def from_model(model: StorageNode): status=model.status, mgmt_ip=IPv4Address(model.mgmt_ip), health_check=model.health_check, + online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}", ) From 8e10e29237b33cc5ef19354b2bf7d11026cbb494 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Mon, 15 Dec 2025 22:53:35 +0300 Subject: [PATCH 116/192] Return capacity usage per object in API V2 (cluster, node, device, pool, lvol) (#820) * Return capacity usage per node object in API V2 * Fix type checker * Adds capacity re reponse API v2 to: cluster, node, device, pool, lvol --- simplyblock_web/api/v2/cluster.py | 20 +++++++----- simplyblock_web/api/v2/device.py | 18 ++++++++--- simplyblock_web/api/v2/dtos.py | 42 +++++++++++++++++++++++--- simplyblock_web/api/v2/pool.py | 21 ++++++++----- simplyblock_web/api/v2/storage_node.py | 19 ++++++++---- simplyblock_web/api/v2/volume.py | 19 ++++++++---- 6 files changed, 103 insertions(+), 36 deletions(-) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 7834e3f06..19e9dbbf4 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -48,12 +48,14 @@ class ClusterParams(BaseModel): @api.get('/', name='clusters:list') def list() -> List[ClusterDTO]: - return [ - ClusterDTO.from_model(cluster) - for cluster - in db.get_clusters() - ] - + data = [] + for cluster in db.get_clusters(): + stat_obj = None + ret = db.get_cluster_capacity(cluster, 1) + if ret: + stat_obj = ret[0] + data.append(ClusterDTO.from_model(cluster, stat_obj)) + return data @api.post('/', name='clusters:create', status_code=201, responses={201: {"content": None}}) def add(request: Request, parameters: ClusterParams): @@ -80,7 
+82,11 @@ def _lookup_cluster(cluster_id: UUID): @instance_api.get('/', name='clusters:detail') def get(cluster: Cluster) -> ClusterDTO: - return ClusterDTO.from_model(cluster) + stat_obj = None + ret = db.get_cluster_capacity(cluster, 1) + if ret: + stat_obj = ret[0] + return ClusterDTO.from_model(cluster, stat_obj) class UpdatableClusterParameters(BaseModel): diff --git a/simplyblock_web/api/v2/device.py b/simplyblock_web/api/v2/device.py index 1c7b40d7e..4fa0949fb 100644 --- a/simplyblock_web/api/v2/device.py +++ b/simplyblock_web/api/v2/device.py @@ -18,10 +18,14 @@ @api.get('/', name='clusters:storage_nodes:devices:list') def list(cluster: Cluster, storage_node: StorageNode) -> List[DeviceDTO]: - return [ - DeviceDTO.from_model(device) - for device in storage_node.nvme_devices - ] + data = [] + for device in storage_node.nvme_devices: + stat_obj = None + ret = db.get_device_stats(device, 1) + if ret: + stat_obj = ret[0] + data.append(DeviceDTO.from_model(device, stat_obj)) + return data instance_api = APIRouter(prefix='/{device_id}') @@ -38,7 +42,11 @@ def _lookup_device(storage_node: StorageNode, device_id: UUID) -> NVMeDevice: @instance_api.get('/', name='clusters:storage_nodes:devices:detail') def get(cluster: Cluster, storage_node: StorageNode, device: Device) -> DeviceDTO: - return DeviceDTO.from_model(device) + stat_obj = None + ret = db.get_device_stats(device, 1) + if ret: + stat_obj = ret[0] + return DeviceDTO.from_model(device, stat_obj) @instance_api.delete('/', name='clusters:storage_nodes:devices:delete', status_code=204, responses={204: {"content": None}}) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index c44333d29..62f1a94e1 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -12,11 +12,33 @@ from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.pool import Pool from simplyblock_core.models.snapshot import SnapShot +from 
simplyblock_core.models.stats import StatsObject from simplyblock_core.models.storage_node import StorageNode from . import util +class CapacityStatDTO(BaseModel): + date: int + size_total: int + size_prov: int + size_used: int + size_free: int + size_util: int + + @staticmethod + def from_model(model: StatsObject): + return CapacityStatDTO( + date=model.date, + size_total=model.size_total, + size_prov=model.size_prov, + size_used=model.size_used, + size_free=model.size_free, + size_util=model.size_util, + ) + + + class ClusterDTO(BaseModel): id: UUID name: Optional[str] @@ -33,9 +55,10 @@ class ClusterDTO(BaseModel): node_affinity: bool anti_affinity: bool secret: str + capacity: CapacityStatDTO @staticmethod - def from_model(model: Cluster): + def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None): return ClusterDTO( id=UUID(model.get_id()), name=model.cluster_name, @@ -52,6 +75,7 @@ def from_model(model: Cluster): node_affinity=model.enable_node_affinity, anti_affinity=model.strict_node_anti_affinity, secret=model.secret, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -65,9 +89,10 @@ class DeviceDTO(BaseModel): nvmf_ips: List[IPv4Address] nvmf_nqn: str = "" nvmf_port: int = 0 + capacity: CapacityStatDTO @staticmethod - def from_model(model: NVMeDevice): + def from_model(model: NVMeDevice, stat_obj: Optional[StatsObject]=None): return DeviceDTO( id=UUID(model.get_id()), status=model.status, @@ -78,6 +103,7 @@ def from_model(model: NVMeDevice): nvmf_ips=[IPv4Address(ip) for ip in model.nvmf_ip.split(',')], nvmf_nqn=model.nvmf_nqn, nvmf_port=model.nvmf_port, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -107,9 +133,10 @@ class StoragePoolDTO(BaseModel): max_rw_mbytes: util.Unsigned max_r_mbytes: util.Unsigned max_w_mbytes: util.Unsigned + capacity: CapacityStatDTO @staticmethod - def from_model(model: Pool): + def from_model(model: Pool, stat_obj: Optional[StatsObject]=None): 
return StoragePoolDTO( id=UUID(model.get_id()), name=model.pool_name, @@ -120,6 +147,7 @@ def from_model(model: Pool): max_rw_mbytes=model.max_rw_mbytes_per_sec, max_r_mbytes=model.max_r_mbytes_per_sec, max_w_mbytes=model.max_w_mbytes_per_sec, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -156,15 +184,17 @@ class StorageNodeDTO(BaseModel): mgmt_ip: IPv4Address health_check: bool online_devices: str + capacity: CapacityStatDTO @staticmethod - def from_model(model: StorageNode): + def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None): return StorageNodeDTO( uuid=UUID(model.get_id()), status=model.status, mgmt_ip=IPv4Address(model.mgmt_ip), health_check=model.health_check, online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}", + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) @@ -208,9 +238,10 @@ class VolumeDTO(BaseModel): max_rw_mbytes: util.Unsigned max_r_mbytes: util.Unsigned max_w_mbytes: util.Unsigned + capacity: CapacityStatDTO @staticmethod - def from_model(model: LVol, request: Request, cluster_id: str): + def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None): return VolumeDTO( id=UUID(model.get_id()), name=model.lvol_name, @@ -243,4 +274,5 @@ def from_model(model: LVol, request: Request, cluster_id: str): max_rw_mbytes=model.rw_mbytes_per_sec, max_r_mbytes=model.r_mbytes_per_sec, max_w_mbytes=model.w_mbytes_per_sec, + capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), ) diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py index 4ef2c897b..4ccae01ab 100644 --- a/simplyblock_web/api/v2/pool.py +++ b/simplyblock_web/api/v2/pool.py @@ -20,12 +20,15 @@ @api.get('/', name='clusters:storage-pools:list') def list(cluster: Cluster) -> List[StoragePoolDTO]: - return [ - StoragePoolDTO.from_model(pool) - for pool - in db.get_pools() - 
if pool.cluster_id == cluster.get_id() - ] + data = [] + for pool in db.get_pools(): + if pool.cluster_id == cluster.get_id(): + stat_obj = None + ret = db.get_pool_stats(pool, 1) + if ret: + stat_obj = ret[0] + data.append(StoragePoolDTO.from_model(pool, stat_obj)) + return data class StoragePoolParams(BaseModel): @@ -73,7 +76,11 @@ def _lookup_storage_pool(pool_id: UUID) -> PoolModel: @instance_api.get('/', name='clusters:storage-pools:detail') def get(cluster: Cluster, pool: StoragePool) -> StoragePoolDTO: - return StoragePoolDTO.from_model(pool) + stat_obj = None + ret = db.get_pool_stats(pool, 1) + if ret: + stat_obj = ret[0] + return StoragePoolDTO.from_model(pool, stat_obj) @instance_api.delete('/', name='clusters:storage-pools:delete', status_code=204, responses={204: {"content": None}}) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index e612d7177..aa7923d36 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -22,11 +22,14 @@ @api.get('/', name='clusters:storage-nodes:list') def list(cluster: Cluster) -> List[StorageNodeDTO]: - return [ - StorageNodeDTO.from_model(storage_node) - for storage_node - in db.get_storage_nodes_by_cluster_id(cluster.get_id()) - ] + data = [] + for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + node_stat_obj = None + ret = db.get_node_capacity(storage_node, 1) + if ret: + node_stat_obj = ret[0] + data.append(StorageNodeDTO.from_model(storage_node, node_stat_obj)) + return data class StorageNodeParams(BaseModel): @@ -86,7 +89,11 @@ def _lookup_storage_node(storage_node_id: UUID) -> StorageNodeModel: @instance_api.get('/', name='clusters:storage-nodes:detail') def get(cluster: Cluster, storage_node: StorageNode): - return StorageNodeDTO.from_model(storage_node) + node_stat_obj = None + ret = db.get_node_capacity(storage_node, 1) + if ret: + node_stat_obj = ret[0] + return StorageNodeDTO.from_model(storage_node, 
node_stat_obj) @instance_api.delete('/', name='clusters:storage-nodes:delete') diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 698788718..6755a1149 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -21,11 +21,14 @@ @api.get('/', name='clusters:storage-pools:volumes:list') def list(request: Request, cluster: Cluster, pool: StoragePool) -> List[VolumeDTO]: - return [ - VolumeDTO.from_model(lvol, request, cluster.get_id()) - for lvol - in db.get_lvols_by_pool_id(pool.get_id()) - ] + data = [] + for lvol in db.get_lvols_by_pool_id(pool.get_id()): + stat_obj = None + ret = db.get_lvol_stats(lvol, 1) + if ret: + stat_obj = ret[0] + data.append(VolumeDTO.from_model(lvol, request, cluster.get_id(), stat_obj)) + return data class _CreateParams(BaseModel): @@ -122,7 +125,11 @@ def _lookup_volume(volume_id: UUID) -> LVol: @instance_api.get('/', name='clusters:storage-pools:volumes:detail') def get(request: Request, cluster: Cluster, pool: StoragePool, volume: Volume) -> VolumeDTO: - return VolumeDTO.from_model(volume, request, cluster.get_id()) + stat_obj = None + ret = db.get_lvol_stats(volume, 1) + if ret: + stat_obj = ret[0] + return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj) class UpdatableLVolParams(BaseModel): From 15e0af31ed65baecf4ca71843d54c1a1299b7e0d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 16 Dec 2025 19:09:46 +0300 Subject: [PATCH 117/192] Exclude src snap node id when starting replication on cloned lvol --- simplyblock_core/controllers/lvol_controller.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index e2bc5c8a2..04c8f5f9c 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1771,14 +1771,27 @@ def replication_start(lvol_id): lvol.do_replicate = 
True if not lvol.replication_node_id: + excluded_nodes = [] + if lvol.cloned_from_snap: + lvol_snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap) + if lvol_snap.source_replicated_snap_uuid: + org_snap = db_controller.get_snapshot_by_id(lvol_snap.source_replicated_snap_uuid) + excluded_nodes.append(org_snap.lvol.node_id) snode = db_controller.get_storage_node_by_id(lvol.node_id) cluster = db_controller.get_cluster_by_id(snode.cluster_id) if not cluster.snapshot_replication_target_cluster: logger.error(f"Cluster: {snode.cluster_id} not replicated") return False random_nodes = _get_next_3_nodes(cluster.snapshot_replication_target_cluster, lvol.size) - lvol.replication_node_id = random_nodes[0].get_id() - lvol.write_to_db() + for r_node in random_nodes: + if r_node.get_id() not in excluded_nodes: + logger.info(f"Replicating on node: {r_node.get_id()}") + lvol.replication_node_id = r_node.get_id() + lvol.write_to_db() + break + if not lvol.replication_node_id: + logger.error(f"Replication node not found for lvol: {lvol.get_id()}") + return False logger.info("Setting LVol do_replicate: True") for snap in db_controller.get_snapshots(): From 8dcc73cc38435d53371885339113e25fdb34940e Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 18 Dec 2025 15:21:03 +0300 Subject: [PATCH 118/192] Adds cluster rebalancing event (#823) --- simplyblock_core/controllers/cluster_events.py | 10 ++++++++++ simplyblock_core/services/storage_node_monitor.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/controllers/cluster_events.py b/simplyblock_core/controllers/cluster_events.py index e8e6c406e..059aea976 100644 --- a/simplyblock_core/controllers/cluster_events.py +++ b/simplyblock_core/controllers/cluster_events.py @@ -80,3 +80,13 @@ def cluster_delete(cluster): db_object=cluster, caused_by=ec.CAUSED_BY_CLI, message=f"Cluster deleted {cluster.get_id()}") + + +def cluster_rebalancing_change(cluster, new_state, old_status): + ec.log_event_cluster( + cluster_id=cluster.get_id(), + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=cluster, + caused_by=ec.CAUSED_BY_CLI, + message=f"Cluster rebalancing changed from {old_status} to {new_state}") diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index bfb92c11b..e7f32ad82 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ b/simplyblock_core/services/storage_node_monitor.py @@ -5,7 +5,8 @@ from simplyblock_core import constants, db_controller, cluster_ops, storage_node_ops, utils -from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events +from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events, \ + cluster_events from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice @@ -134,10 +135,13 @@ def update_cluster_status(cluster_id): JobSchedule.FN_DEV_MIG, JobSchedule.FN_NEW_DEV_MIG, JobSchedule.FN_FAILED_DEV_MIG]: if task.retry == 0: first_iter_task_pending += 1 - + 
is_re_balancing = first_iter_task_pending > 0 cluster = db.get_cluster_by_id(cluster_id) - cluster.is_re_balancing = first_iter_task_pending > 0 - cluster.write_to_db() + if cluster.is_re_balancing != is_re_balancing: + old_status = cluster.is_re_balancing + cluster.is_re_balancing = is_re_balancing + cluster.write_to_db() + cluster_events.cluster_rebalancing_change(cluster_id, cluster.is_re_balancing, old_status) current_cluster_status = cluster.status logger.info("cluster_status: %s", current_cluster_status) From 14e3c06f619640130f97995650587da55bc16d7e Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Fri, 19 Dec 2025 20:12:19 +0300 Subject: [PATCH 119/192] Add snode port change event (#824) * Add snode port change event * fix linter --- e2e/continuous_log_collector.py | 1 - e2e/e2e_tests/cluster_test_base.py | 2 +- .../continuous_failover_ha_multi_client.py | 4 ++-- ...ntinuous_failover_ha_multi_client_quick_outage.py | 3 +-- e2e/utils/ssh_utils.py | 3 ++- simplyblock_core/controllers/storage_events.py | 12 ++++++++++++ simplyblock_core/storage_node_ops.py | 1 + 7 files changed, 19 insertions(+), 7 deletions(-) diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py index 96b157760..d1ea68c38 100644 --- a/e2e/continuous_log_collector.py +++ b/e2e/continuous_log_collector.py @@ -1,6 +1,5 @@ import os from datetime import datetime -from pathlib import Path from utils.ssh_utils import SshUtils, RunnerK8sLog from logger_config import setup_logger diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py index 15743725b..d37222c88 100644 --- a/e2e/e2e_tests/cluster_test_base.py +++ b/e2e/e2e_tests/cluster_test_base.py @@ -405,7 +405,7 @@ def collect_management_details(self, post_teardown=False): self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd) node+=1 - all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines: + all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines for node in 
all_nodes: base_path = os.path.join(self.docker_logs_path, node) cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt" diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py index a97c42676..0f0c9f94e 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client.py @@ -329,7 +329,7 @@ def perform_random_outage(self): for node in self.sn_nodes_with_sec: # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], # storage_node_id=node) - self.logger.info(f"Skipping lvstore dump!!") + self.logger.info("Skipping lvstore dump!!") for node in self.sn_nodes_with_sec: cur_node_details = self.sbcli_utils.get_storage_node_details(node) cur_node_ip = cur_node_details[0]["mgmt_ip"] @@ -663,7 +663,7 @@ def restart_nodes_after_failover(self, outage_type, restart=False): for node in self.sn_nodes_with_sec: # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0], # storage_node_id=node) - self.logger.info(f"Skipping lvstore dump!!") + self.logger.info("Skipping lvstore dump!!") def create_snapshots_and_clones(self): """Create snapshots and clones during an outage.""" diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py index afa98b055..c2c1051a2 100644 --- a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py +++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py @@ -306,7 +306,7 @@ def _seed_snapshots_and_clones(self): if err: nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"] self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn) - self.logger.info(f"[LFNG] connect clone error → cleanup") + self.logger.info("[LFNG] connect clone error → cleanup") self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True) sleep_n_sec(3) del 
self.clone_mount_details[clone_name] @@ -431,7 +431,6 @@ def _perform_outage(self): return outage_type def restart_nodes_after_failover(self, outage_type): - node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node) self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}") diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py index ee265d507..a50a61726 100644 --- a/e2e/utils/ssh_utils.py +++ b/e2e/utils/ssh_utils.py @@ -2891,7 +2891,8 @@ def stop_log_monitor(self): print("K8s log monitor thread stopped.") def _rid(n=6): - import string, random + import string + import random letters = string.ascii_uppercase digits = string.digits return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1)) diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py index bd5a9eb8d..027f7dbed 100644 --- a/simplyblock_core/controllers/storage_events.py +++ b/simplyblock_core/controllers/storage_events.py @@ -84,3 +84,15 @@ def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR): event_level=EventObj.LEVEL_WARN, message=f"JM replication task found for jm {jm_vuid}", node_id=node.get_id()) + + +def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR): + ec.log_event_cluster( + cluster_id=node.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=node, + caused_by=caused_by, + event_level=EventObj.LEVEL_WARN, + message=f"Storage node ports set, LVol:{node.lvol_subsys_port} RPC:{node.rpc_port} Internal:{node.nvmf_port}", + node_id=node.get_id()) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 582fe918d..daa99d31a 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -3600,6 +3600,7 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo sec_node.write_to_db() + 
storage_events.node_ports_changed(snode) return True From 1a7d4f48b4238c55fec63d0f6a8e027a04b77245 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Fri, 19 Dec 2025 20:58:24 +0300 Subject: [PATCH 120/192] Fix snode health check cluster logs (#825) --- simplyblock_core/storage_node_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index daa99d31a..a6d89b74d 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1907,7 +1907,6 @@ def restart_storage_node( return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.health_check = True snode.lvstore_status = "" snode.write_to_db(db_controller.kv_store) @@ -2968,7 +2967,6 @@ def set_node_status(node_id, status, reconnect_on_online=True): return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.health_check = True snode.write_to_db(db_controller.kv_store) distr_controller.send_cluster_map_to_node(snode) From 8d9d0a5e45d78f1b8dd9c1a29c2a353aeec076fd Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Mon, 22 Dec 2025 18:10:35 +0300 Subject: [PATCH 121/192] Fix prom client cluster ip in case of k8s (#826) * Fix prom client cluster ip in case of k8s * fix --- simplyblock_core/prom_client.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py index 82756161b..833d42b36 100644 --- a/simplyblock_core/prom_client.py +++ b/simplyblock_core/prom_client.py @@ -2,6 +2,7 @@ import re from datetime import datetime, timedelta +from simplyblock_core import constants from simplyblock_core.db_controller import DBController from simplyblock_core.models.mgmt_node import MgmtNode @@ -20,13 +21,16 @@ class PromClient: def __init__(self, cluster_id): db_controller = DBController() cluster_ip = None - for node in db_controller.get_mgmt_nodes(): - if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE: - cluster_ip = node.mgmt_ip - break - if cluster_ip is None: - raise PromClientException("Cluster has no online mgmt nodes") - + cluster = db_controller.get_cluster_by_id(cluster_id) + if cluster.mode == "docker": + for node in db_controller.get_mgmt_nodes(): + if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE: + cluster_ip = node.mgmt_ip + break + if cluster_ip is None: + raise PromClientException("Cluster has no online mgmt nodes") + else: + cluster_ip = constants.PROMETHEUS_STATEFULSET_NAME self.ip_address = f"{cluster_ip}:9090" self.url = 'http://%s/' % self.ip_address self.client = PrometheusConnect(url=self.url, disable_ssl=True) From da0c05787869592f6e6ce2f87582079f506645a4 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 24 Dec 2025 16:56:35 +0300 Subject: [PATCH 122/192] fix snapshot replication source and target in case of replicate_to_source=True --- simplyblock_core/services/snapshot_replication.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git 
a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 4af72bdc0..d11a5a28f 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -169,7 +169,10 @@ def process_snap_replicate_finish(task, snapshot): if snapshot.status == SnapShot.STATUS_IN_REPLICATION: snapshot.status = SnapShot.STATUS_ONLINE - snapshot.target_replicated_snap_uuid = new_snapshot_uuid + if replicate_to_source: + snapshot.source_replicated_snap_uuid = new_snapshot_uuid + else: + snapshot.target_replicated_snap_uuid = new_snapshot_uuid snapshot.write_to_db() new_snapshot = SnapShot() @@ -184,7 +187,10 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.snap_name = snapshot.snap_name new_snapshot.blobid = remote_lv.blobid new_snapshot.created_at = int(time.time()) - new_snapshot.source_replicated_snap_uuid = snapshot.uuid + if replicate_to_source: + new_snapshot.target_replicated_snap_uuid = snapshot.uuid + else: + new_snapshot.source_replicated_snap_uuid = snapshot.uuid new_snapshot.status = SnapShot.STATUS_ONLINE if target_prev_snap: new_snapshot.prev_snap_uuid = target_prev_snap.get_id() From 340d1aa604342e6140ed9cb4a7a7bf2baaec9ae9 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Tue, 30 Dec 2025 14:42:48 +0300 Subject: [PATCH 123/192] fix cluster add apiv2 (#829) --- simplyblock_web/api/v2/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 19e9dbbf4..49f8a09e8 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -63,8 +63,8 @@ def add(request: Request, parameters: ClusterParams): if not cluster_id_or_false: raise ValueError('Failed to create cluster') - entity_url = request.app.url_path_for('get', cluster_id=cluster_id_or_false) - return Response(status_code=201, headers={'Location': entity_url}) + cluster = db.get_cluster_by_id(cluster_id_or_false) + return ClusterDTO.from_model(cluster) instance_api = APIRouter(prefix='/{cluster_id}') From 7649052c59e32d4995923fb64f8c8db76e625ec2 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Fri, 2 Jan 2026 23:46:00 +0300 Subject: [PATCH 124/192] Fix deps version (#832) --- simplyblock_core/scripts/install_deps.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/scripts/install_deps.sh b/simplyblock_core/scripts/install_deps.sh index 256a55500..56d0bf96e 100644 --- a/simplyblock_core/scripts/install_deps.sh +++ b/simplyblock_core/scripts/install_deps.sh @@ -2,15 +2,15 @@ if [[ "$1" == "docker" ]]; then sudo yum install -y yum-utils - sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.24.0-1.el9.noarch.rpm + sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.26.0-1.el9.noarch.rpm sudo yum install -y yum-utils xorg-x11-xauth nvme-cli fio tuned sudo yum install hostname pkg-config git wget python3-pip yum-utils \ iptables pciutils -y sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo - sudo yum install docker-ce docker-ce-cli \ - containerd.io 
docker-buildx-plugin docker-compose-plugin -y + sudo yum install docker-ce-29.1.3-1.el9 docker-ce-cli-29.1.3-1.el9 \ + containerd.io-2.2.0-2.el9 docker-buildx-plugin-0.30.1-1.el9 docker-compose-plugin-5.0.1-1.el9 -y sudo systemctl enable docker sudo systemctl start docker From 3b74ce12f61804281878934ac119872ae9f7b9bf Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Tue, 6 Jan 2026 16:13:14 +0300 Subject: [PATCH 125/192] fix id_device_by_nqn int+str (#833) --- simplyblock_core/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 7bc2fa112..1f086dc2d 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1103,7 +1103,7 @@ def addNvmeDevices(rpc_client, snode, devs): serial_number = nvme_driver_data['ctrlr_data']['serial_number'] if snode.id_device_by_nqn: if "ns_data" in nvme_driver_data: - serial_number = nvme_driver_data['pci_address'] + nvme_driver_data['ns_data']['id'] + serial_number = nvme_driver_data['pci_address'] + str(nvme_driver_data['ns_data']['id']) else: logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}") From 21cd2dae33bf9871c9a0e3d94d9b0339f000a161 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 6 Jan 2026 19:02:54 +0300 Subject: [PATCH 126/192] Adds openapi.json --- simplyblock_web/static/openapi.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 simplyblock_web/static/openapi.json diff --git a/simplyblock_web/static/openapi.json b/simplyblock_web/static/openapi.json new file mode 100644 index 000000000..3e2a05130 --- /dev/null +++ b/simplyblock_web/static/openapi.json @@ -0,0 +1 @@ 
+{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/api/v2/clusters/":{"get":{"summary":"Clusters:List","operationId":"clusters_list_api_v2_clusters__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ClusterDTO"},"title":"Response Clusters List Api V2 Clusters Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Create","operationId":"clusters_create_api_v2_clusters__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/":{"get":{"summary":"Clusters:Detail","operationId":"clusters_detail_api_v2_clusters__cluster_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterDTO"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Update","operationId":"clusters_update_api_v2_clusters__cluster_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableClusterParameters"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Delete","operationId":"clusters_delete_api_v2_clusters__cluster_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/capacity":{"get":{"summary":"Clusters:Capacity","operationId":"clusters_capacity_api_v2_clusters__cluster_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/iostats":{"get":{"summary":"Clusters:Iostats","operationId":"clusters_iostats_api_v2_clusters__cluster_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/logs":{"get":{"summary":"Clusters:Logs","operationId":"clusters_logs_api_v2_clusters__cluster_id__logs_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":50,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/start":{"post":{"summary":"Clusters:Start","operationId":"clusters_start_api_v2_clusters__cluster_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/shutdown":{"post":{"summary":"Clusters:Shutdown","operationId":"clusters_shutdown_api_v2_clusters__cluster_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/activate":{"post":{"summary":"Clusters:Activate","operationId":"clusters_activate_api_v2_clusters__cluster_id__activate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/update":{"post":{"summary":"Clusters:Upgrade","operationId":"clusters_upgrade_api_v2_clusters__cluster_id__update_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_UpdateParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/":{"get":{"summary":"Clusters:Storage-Nodes:List","operationId":"clusters_storage_nodes_list_api_v2_clusters__cluster_id__storage_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StorageNodeDTO"},"title":"Response Clusters Storage Nodes List Api V2 Clusters Cluster Id Storage Nodes Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Nodes:Create","operationId":"clusters_storage_nodes_create_api_v2_clusters__cluster_id__storage_nodes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StorageNodeParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/":{"get":{"summary":"Clusters:Storage-Nodes:Detail","operationId":"clusters_storage_nodes_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node 
Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Nodes:Delete","operationId":"clusters_storage_nodes_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force_remove","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Remove"}},{"name":"force_migrate","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Migrate"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/capacity":{"get":{"summary":"Clusters:Storage-Nodes:Capacity","operationId":"clusters_storage_nodes_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Iostats","operationId":"clusters_storage_nodes_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics":{"get":{"summary":"Clusters:Storage-Nodes:Nics:List","operationId":"clusters_storage_nodes_nics_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics/{nic_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Nics:Iostats","operationId":"clusters_storage_nodes_nics_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics__nic_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"nic_id","in":"path","required":true,"schema":{"type":"string","title":"Nic Id"}},{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/suspend":{"post":{"summary":"Clusters:Storage-Nodes:Suspend","operationId":"clusters_storage_nodes_suspend_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__suspend_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/resume":{"post":{"summary":"Clusters:Storage-Nodes:Resume","operationId":"clusters_storage_nodes_resume_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__resume_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/shutdown":{"post":{"summary":"Clusters:Storage-Nodes:Shutdown","operationId":"clusters_storage_nodes_shutdown_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/restart":{"post":{"summary":"Clusters:Storage-Nodes:Restart","operationId":"clusters_storage_nodes_restart_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__restart_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/start":{"post":{"summary":"Clusters:Storage-Nodes:Start","operationId":"clusters_storage_nodes_start_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/":{"get":{"summary":"Clusters:Storage 
Nodes:Devices:List","operationId":"clusters_storage_nodes_devices_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/DeviceDTO"},"title":"Response Clusters Storage Nodes Devices List Api V2 Clusters Cluster Id Storage Nodes Storage Node Id Devices Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/":{"get":{"summary":"Clusters:Storage Nodes:Devices:Detail","operationId":"clusters_storage_nodes_devices_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/DeviceDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage 
Nodes:Devices:Delete","operationId":"clusters_storage_nodes_devices_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/capacity":{"get":{"summary":"Clusters:Storage Nodes:Devices:Capacity","operationId":"clusters_storage_nodes_devices_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/iostats":{"get":{"summary":"Clusters:Storage 
Nodes:Devices:Iostats","operationId":"clusters_storage_nodes_devices_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/reset":{"post":{"summary":"Clusters:Storage Nodes:Devices:Reset","operationId":"clusters_storage_nodes_devices_reset_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__reset_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/":{"get":{"summary":"Clusters:Storage-Pools:List","operationId":"clusters_storage_pools_list_api_v2_clusters__cluster_id__storage_pools__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StoragePoolDTO"},"title":"Response Clusters Storage Pools List Api V2 Clusters Cluster Id Storage Pools Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Create","operationId":"clusters_storage_pools_create_api_v2_clusters__cluster_id__storage_pools__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/":{"get":{"summary":"Clusters:Storage-Pools:Detail","operationId":"clusters_storage_pools_detail_api_v2_clusters__cluster_id__storage_pools__pool_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool 
Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Delete","operationId":"clusters_storage_pools_delete_api_v2_clusters__cluster_id__storage_pools__pool_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Update","operationId":"clusters_storage_pools_update_api_v2_clusters__cluster_id__storage_pools__pool_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableStoragePoolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Iostats","operationId":"clusters_storage_pools_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":20,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:List","operationId":"clusters_storage_pools_volumes_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/VolumeDTO"},"title":"Response Clusters Storage Pools Volumes List Api V2 Clusters Cluster Id Storage Pools Pool Id Volumes Get"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Create","operationId":"clusters_storage_pools_volumes_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RootModel_Union__CreateParams___CloneParams__"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Detail","operationId":"clusters_storage_pools_volumes_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VolumeDTO"}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Volumes:Update","operationId":"clusters_storage_pools_volumes_update_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableLVolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Volumes:Delete","operationId":"clusters_storage_pools_volumes_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/inflate":{"post":{"summary":"Clusters:Storage-Pools:Volumes:Inflate","operationId":"clusters_storage_pools_volumes_inflate_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__inflate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/connect":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Connect","operationId":"clusters_storage_pools_volumes_connect_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__connect_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/capacity":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Capacity","operationId":"clusters_storage_pools_volumes_capacity_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Iostats","operationId":"clusters_storage_pools_volumes_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/snapshots":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:List","operationId":"clusters_storage_pools_volumes_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Volumes Snapshots List Api V2 Clusters Cluster Id Storage Pools Pool Id Volumes Volume Id Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:Create","operationId":"clusters_storage_pools_volumes_snapshots_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume 
Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_SnapshotParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:List","operationId":"clusters_storage_pools_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Snapshots List Api V2 Clusters Cluster Id Storage Pools Pool Id Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/{snapshot_id}/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:Detail","operationId":"clusters_storage_pools_snapshots_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"200":{"description":"Successful 
Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SnapshotDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Snapshots:Delete","operationId":"clusters_storage_pools_snapshots_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/":{"get":{"summary":"Management Nodes:List","operationId":"management_nodes_list_api_v2_management_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ManagementNodeDTO"},"title":"Response Management Nodes List Api V2 Management Nodes Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/{management_node_id}/":{"get":{"summary":"Management 
Node:Detail","operationId":"management_node_detail_api_v2_management_nodes__management_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"management_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Management Node Id"}},{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ManagementNodeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"ClusterDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"nqn":{"type":"string","title":"Nqn"},"status":{"type":"string","enum":["active","read_only","inactive","suspended","degraded","unready","in_activation","in_expansion"],"title":"Status"},"rebalancing":{"type":"boolean","title":"Rebalancing"},"block_size":{"type":"integer","minimum":0.0,"title":"Block Size"},"coding":{"prefixItems":[{"type":"integer","minimum":0.0},{"type":"integer","minimum":0.0}],"type":"array","maxItems":2,"minItems":2,"title":"Coding"},"ha":{"type":"boolean","title":"Ha"},"utliziation_critical":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utliziation Critical"},"utilization_warning":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utilization Warning"},"provisioned_cacacity_critical":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Critical"},"provisioned_cacacity_warning":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Warning"},"node_affinity":{"type":"boolean","title":"Node Affinity"},"anti_affinity":{"type":"boolean","title":"Anti 
Affinity"},"secret":{"type":"string","title":"Secret"}},"type":"object","required":["id","name","nqn","status","rebalancing","block_size","coding","ha","utliziation_critical","utilization_warning","provisioned_cacacity_critical","provisioned_cacacity_warning","node_affinity","anti_affinity","secret"],"title":"ClusterDTO"},"ClusterParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"blk_size":{"type":"integer","enum":[512,4096],"title":"Blk Size","default":512},"page_size_in_blocks":{"type":"integer","exclusiveMinimum":0.0,"title":"Page Size In Blocks","default":2097152},"cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Warn","default":0},"cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Crit","default":0},"prov_cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Warn","default":0},"prov_cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Crit","default":0},"distr_ndcs":{"type":"integer","title":"Distr Ndcs","default":1},"distr_npcs":{"type":"integer","title":"Distr Npcs","default":1},"distr_bs":{"type":"integer","title":"Distr Bs","default":4096},"distr_chunk_bs":{"type":"integer","title":"Distr Chunk Bs","default":4096},"ha_type":{"type":"string","enum":["single","ha"],"title":"Ha Type","default":"single"},"qpair_count":{"type":"integer","title":"Qpair Count","default":256},"max_queue_size":{"type":"integer","title":"Max Queue Size","default":128},"inflight_io_threshold":{"type":"integer","title":"Inflight Io Threshold","default":4},"enable_node_affinity":{"type":"boolean","title":"Enable Node Affinity","default":false},"strict_node_anti_affinity":{"type":"boolean","title":"Strict Node Anti Affinity","default":false}},"type":"object","title":"ClusterParams"},"DeviceDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health 
Check"},"size":{"type":"integer","title":"Size"},"io_error":{"type":"boolean","title":"Io Error"},"is_partition":{"type":"boolean","title":"Is Partition"},"nvmf_ips":{"items":{"type":"string","format":"ipv4"},"type":"array","title":"Nvmf Ips"},"nvmf_nqn":{"type":"string","title":"Nvmf Nqn","default":""},"nvmf_port":{"type":"integer","title":"Nvmf Port","default":0}},"type":"object","required":["id","status","health_check","size","io_error","is_partition","nvmf_ips"],"title":"DeviceDTO"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ManagementNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"hostname":{"type":"string","title":"Hostname"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","hostname","ip"],"title":"ManagementNodeDTO"},"RootModel_Union__CreateParams___CloneParams__":{"anyOf":[{"$ref":"#/components/schemas/_CreateParams"},{"$ref":"#/components/schemas/_CloneParams"}],"title":"RootModel[Union[_CreateParams, _CloneParams]]"},"SnapshotDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"used_size":{"type":"integer","minimum":0.0,"title":"Used 
Size"},"lvol":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Lvol"}},"type":"object","required":["id","name","status","health_check","size","used_size","lvol"],"title":"SnapshotDTO"},"StorageNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","ip"],"title":"StorageNodeDTO"},"StorageNodeParams":{"properties":{"node_address":{"type":"string","title":"Node Address","default":"^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$"},"interface_name":{"type":"string","title":"Interface Name"},"max_snapshots":{"type":"integer","title":"Max Snapshots","default":500},"ha_jm":{"type":"boolean","title":"Ha Jm","default":true},"test_device":{"type":"boolean","title":"Test Device","default":false},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"spdk_debug":{"type":"boolean","title":"Spdk Debug","default":false},"full_page_unmap":{"type":"boolean","title":"Full Page Unmap","default":false},"data_nics":{"items":{"type":"string"},"type":"array","title":"Data Nics","default":[]},"namespace":{"type":"string","title":"Namespace","default":"default"},"jm_percent":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Jm Percent","default":3},"partitions":{"type":"integer","title":"Partitions","default":1},"iobuf_small_pool_count":{"type":"integer","title":"Iobuf Small Pool Count","default":0},"iobuf_large_pool_count":{"type":"integer","title":"Iobuf Large Pool Count","default":0},"ha_jm_count":{"type":"integer","title":"Ha Jm Count","default":3}},"type":"object","required":["interface_name","spdk_image"],"title":"StorageNodeParams"},"StoragePoolDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","enum":["active","inactive"],"title":"Status"},"max_size":{"type":"integer","minimum":0.0,"title":"Max 
Size"},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","max_size","volume_max_size","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"StoragePoolDTO"},"StoragePoolParams":{"properties":{"name":{"type":"string","title":"Name"},"pool_max":{"type":"integer","minimum":0.0,"title":"Pool Max","default":0},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size","default":0},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0}},"type":"object","required":["name"],"title":"StoragePoolParams"},"UpdatableClusterParameters":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"}},"type":"object","title":"UpdatableClusterParameters"},"UpdatableLVolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W 
Mbytes","default":0},"size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Size"}},"type":"object","title":"UpdatableLVolParams"},"UpdatableStoragePoolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Size"},"volume_max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Volume Max Size"},"max_rw_iops":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Iops"},"max_rw_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Mbytes"},"max_r_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max R Mbytes"},"max_w_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max W Mbytes"}},"type":"object","title":"UpdatableStoragePoolParams"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VolumeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"nqn":{"type":"string","title":"Nqn"},"nodes":{"items":{"type":"string"},"type":"array","title":"Nodes"},"port":{"type":"integer","exclusiveMaximum":65536.0,"minimum":0.0,"title":"Port"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"cloned_from":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cloned From"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"high_availability":{"type":"boolean","title":"High 
Availability"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","health_check","nqn","nodes","port","size","cloned_from","crypto_key","high_availability","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"VolumeDTO"},"_CloneParams":{"properties":{"name":{"type":"string","title":"Name"},"snapshot_id":{"anyOf":[{"type":"string","pattern":"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"},{"type":"null"}],"title":"Snapshot Id"},"size":{"type":"integer","minimum":0.0,"title":"Size","default":0}},"type":"object","required":["name","snapshot_id"],"title":"_CloneParams"},"_CreateParams":{"properties":{"name":{"type":"string","title":"Name"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"ha_type":{"anyOf":[{"type":"string","enum":["single","ha"]},{"type":"null"}],"title":"Ha Type"},"host_id":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Host Id"},"priority_class":{"type":"integer","enum":[0,1],"title":"Priority Class","default":0},"namespace":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Namespace"},"pvc_name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Pvc 
Name"},"ndcs":{"type":"integer","minimum":0.0,"title":"Ndcs","default":0},"npcs":{"type":"integer","minimum":0.0,"title":"Npcs","default":0}},"type":"object","required":["name","size"],"title":"_CreateParams"},"_RestartParams":{"properties":{"force":{"type":"boolean","title":"Force","default":false},"reattach_volume":{"type":"boolean","title":"Reattach Volume","default":false}},"type":"object","title":"_RestartParams"},"_SnapshotParams":{"properties":{"name":{"type":"string","title":"Name"}},"type":"object","required":["name"],"title":"_SnapshotParams"},"_UpdateParams":{"properties":{"management_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Management Image"},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"restart":{"type":"boolean","title":"Restart","default":false}},"type":"object","required":["management_image","spdk_image"],"title":"_UpdateParams"}},"securitySchemes":{"HTTPBearer":{"type":"http","scheme":"bearer"}}}} \ No newline at end of file From 315ef0150722d5020938d19951f44e945af51a28 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Fri, 9 Jan 2026 23:16:26 +0300 Subject: [PATCH 127/192] Fix remove device response if device was removed (#839) --- simplyblock_core/controllers/device_controller.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py index 6f7a0d9f5..e3a62f4ea 100644 --- a/simplyblock_core/controllers/device_controller.py +++ b/simplyblock_core/controllers/device_controller.py @@ -343,7 +343,9 @@ def device_remove(device_id, force=True): device = dev break - if device.status in [NVMeDevice.STATUS_REMOVED, NVMeDevice.STATUS_FAILED]: + if device.status == NVMeDevice.STATUS_REMOVED: + return True + if device.status == NVMeDevice.STATUS_FAILED: logger.error(f"Unsupported device status: {device.status}") return False From 0ffe2433353b35837fa0b5990caf7d76cc25323f Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 19 Jan 2026 12:25:26 +0100 Subject: [PATCH 128/192] added simplyblock crds and custom resource (#814) * added simplyblock crds and custom resource * make some spec field required in crd * moved simplyblock mgmt deployment into a single deployment * added helm _helpers.tpl * conditionalize ingress creation * conditionalize MongoDBCommunity cr creation * remove fluentbit daemonset * added fluent-bit sidecar container * fixed yaml identation * use dnsPolicy: ClusterFirstWithHostNet for spdk ppd * exclude fluentbit own logs * shrink thanos deployments into single deployment * improved opensearch and graylog route * call function if monitoring is enabled * updated monitoring endpoint format * removed nodeExporter * remove fdb customParameters * specified new port for thanos components * updated monitoring endpoint format * updated monitoring endpoint format * removed hostpath capacity * fixed graylog input endpoint * added cluster task cr and crd * added cluster task cr and crd * fixed task routing * fixed object has no attribute 'cluster_id' * 
set is_json true * return JobSchedule * decode the json first * remove id from the dict before passing it * remove id and uuid from the dict before passing it * remove id, uuid and status from the dict before passing it * remove id, uuid, status and deleted from the dict before passing it * improved list task * use model.uuid instead * made some add snode param optional in api v2 * improved sn apiv2 response data * improved sn apiv2 response data more * set grafana_secret empty if monitoring is disabled * return the mgmt ip without kubelet ping * updated the fdb health check logic * updated crd clusterInfo * added cr_name, cr_namespace and cr_plural to baseModel * removed cr_name, cr_namespace and cr_plural to baseModel * added cluster cr status patch * added cluster cr status patch for rebalancing * added function for patching node cr status * added function for patching node cr device status * added service account across mgmt services * added patch for pool cr * removed mode from snode model * added rbac for simplyblock cr resource * get cr base on node_uuid or node_mgmt_ip * fixed patch_cr_node_status() missing 1 required keyword-only argument: 'node_mgmt_ip' * updated storagenode crd memory status field * update snode cr nvmf and lvol port * updated cr kind * updated the crds * updated service account resource name * updated pool resource name * updated crd * updated crd * removed capacityInfo in crd * added lvols_cr to pool model * added lvol crd * updated lvol api response * updated lvol api response * updated lvol crd * updated lvol crd * added func patch_cr_lvol_status * fixed type check failure * fixed type check failure * added logic for updating lvol cr field * fixed TypeError: unsupported operand type * fixed type check failure * update snode cr health type to bool * updated the lvol cr change condition * added log message * added log message * updated lvol cr name * removed debug logs * fixed linter issue * added node permission * updated ap/v2 
cluster create response data * updated ap/v2 cluster create response data * removed coding from field * updated ap/v2 cluster create response data * added restart on another host support in api v2 * added default value for node_address * removed default value for node_address * updated simplyblockstoragenode crd * updated manager permissions * updated simplyblockpools crd * updated simplyblock manager image tag * added param id_device_by_nqn to snode add * updated simplyblockstoragenode crd * always add new devices when restarting node on new ip * attach fluentbit container to webapi pod * updated simplyblockstoragenode crd * fixed helm template structure * bind root to container host path * bind root to container host path * bind root to container host path * bind root to container host path * updated simplyblockstoragecluster crd * updated sbcli helm values.yaml structure * added helm value for simplyblock CR * support storage node remove and delete in one endpoint * remove node details from CR * added api for cluster expansion * updated simplyblockstoragecluster crd * added api for device remove and restart * added param force * updated simplyblockdevice crd * updated simplyblock manager image tag * make node_address param optional during snode restart via api * remove the worker node from list upon node snode removal * remove the worker node from list upon node snode removal --------- Co-authored-by: hamdykhader --- .gitignore | 6 + simplyblock_core/cluster_ops.py | 41 +- simplyblock_core/constants.py | 9 + .../controllers/cluster_events.py | 18 + simplyblock_core/controllers/device_events.py | 20 + simplyblock_core/controllers/lvol_events.py | 76 +- .../controllers/pool_controller.py | 22 +- simplyblock_core/controllers/pool_events.py | 24 +- .../controllers/storage_events.py | 57 +- simplyblock_core/models/cluster.py | 3 + simplyblock_core/models/pool.py | 6 + simplyblock_core/models/storage_node.py | 3 + simplyblock_core/scripts/charts/Chart.yaml | 5 +- 
...ock.simplyblock.io_simplyblockdevices.yaml | 135 + ...block.simplyblock.io_simplyblocklvols.yaml | 144 + ...block.simplyblock.io_simplyblockpools.yaml | 96 + ...lyblock.io_simplyblockstorageclusters.yaml | 173 + ...implyblock.io_simplyblockstoragenodes.yaml | 204 + ...block.simplyblock.io_simplyblocktasks.yaml | 84 + .../scripts/charts/templates/_helpers.tpl | 21 + .../charts/templates/app_configmap.yaml | 13 +- .../scripts/charts/templates/app_ingress.yaml | 6 +- .../scripts/charts/templates/app_k8s.yaml | 1178 +- .../scripts/charts/templates/app_sa.yaml | 17 +- .../charts/templates/csi-hostpath-plugin.yaml | 2 - .../scripts/charts/templates/dashboards.yaml | 14794 +--------------- .../charts/templates/foundationdb.yaml | 1 - .../scripts/charts/templates/mongodb.yaml | 4 +- .../templates/monitoring_configmap.yaml | 14 +- .../charts/templates/monitoring_ingress.yaml | 7 +- .../charts/templates/monitoring_k8s.yaml | 148 +- .../charts/templates/monitoring_secret.yaml | 12 +- .../charts/templates/monitoring_svc.yaml | 35 +- .../charts/templates/simplyblock-manager.yaml | 196 + .../templates/simplyblock_customresource.yaml | 145 + simplyblock_core/scripts/charts/values.yaml | 73 +- simplyblock_core/storage_node_ops.py | 13 +- simplyblock_core/utils/__init__.py | 264 +- simplyblock_web/api/v1/__init__.py | 31 +- simplyblock_web/api/v1/cluster.py | 15 +- simplyblock_web/api/v2/__init__.py | 3 + simplyblock_web/api/v2/cluster.py | 12 +- simplyblock_web/api/v2/device.py | 12 +- simplyblock_web/api/v2/dtos.py | 54 +- simplyblock_web/api/v2/pool.py | 8 +- simplyblock_web/api/v2/storage_node.py | 28 +- simplyblock_web/api/v2/task.py | 17 +- .../templates/storage_core_isolation.yaml.j2 | 10 +- .../templates/storage_deploy_spdk.yaml.j2 | 21 + 49 files changed, 2249 insertions(+), 16031 deletions(-) create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml create mode 100644 
simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml create mode 100644 simplyblock_core/scripts/charts/templates/_helpers.tpl create mode 100644 simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml create mode 100644 simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml diff --git a/.gitignore b/.gitignore index 6603f927d..ee7e248e6 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,9 @@ dist .ruff_cache .env .tox + +# Ignore charts directory +simplyblock_core/scripts/charts/charts/ + +# Ignore Helm requirements lock file +simplyblock_core/scripts/charts/requirements.lock diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 5e6352cc0..fa11d2f4c 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -80,7 +80,7 @@ def _create_update_user(cluster_id, grafana_url, grafana_secret, user_secret, up def _add_graylog_input(cluster_ip, password): - base_url = f"http://{cluster_ip}/graylog/api" + base_url = f"{cluster_ip}/api" input_url = f"{base_url}/system/inputs" retries = 30 @@ -161,7 +161,7 @@ def _add_graylog_input(cluster_ip, password): def _set_max_result_window(cluster_ip, max_window=100000): - url_existing_indices = f"http://{cluster_ip}/opensearch/_all/_settings" + url_existing_indices = f"{cluster_ip}/_all/_settings" retries = 30 reachable=False @@ -188,7 +188,7 @@ def _set_max_result_window(cluster_ip, max_window=100000): logger.error(f"Failed to update settings for existing indices: {response.text}") 
return False - url_template = f"http://{cluster_ip}/opensearch/_template/all_indices_template" + url_template = f"{cluster_ip}/_template/all_indices_template" payload_template = json.dumps({ "index_patterns": ["*"], "settings": { @@ -317,8 +317,12 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.grafana_endpoint = grafana_endpoint elif ingress_host_source == "hostip": cluster.grafana_endpoint = f"http://{dev_ip}/grafana" + graylog_endpoint = f"http://{dev_ip}/graylog" + os_endpoint = f"http://{dev_ip}/opensearch" else: cluster.grafana_endpoint = f"http://{dns_name}/grafana" + graylog_endpoint = f"http://{dns_name}/graylog" + os_endpoint = f"http://{dns_name}/opensearch" cluster.enable_node_affinity = enable_node_affinity cluster.qpair_count = qpair_count or constants.QPAIR_COUNT cluster.client_qpair_count = client_qpair_count or constants.CLIENT_QPAIR_COUNT @@ -355,9 +359,10 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, if ingress_host_source == "hostip": dns_name = dev_ip - _set_max_result_window(dns_name) + + _set_max_result_window(os_endpoint) - _add_graylog_input(dns_name, monitoring_secret) + _add_graylog_input(graylog_endpoint, monitoring_secret) _create_update_user(cluster.uuid, cluster.grafana_endpoint, monitoring_secret, cluster.secret) if mode == "kubernetes": @@ -433,12 +438,13 @@ def _run_fio(mount_point) -> None: def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, - max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp", - cluster_ip=None, grafana_secret=None) -> str: + max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, + cr_namespace=None, cr_plural=None, fabric="tcp", cluster_ip=None, grafana_secret=None) -> str: default_cluster = None monitoring_secret = 
os.environ.get("MONITORING_SECRET", "") + enable_monitoring = os.environ.get("ENABLE_MONITORING", "") clusters = db_controller.get_clusters() if clusters: default_cluster = clusters[0] @@ -471,16 +477,27 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.db_connection = fdb_cluster_string if monitoring_secret: cluster.grafana_secret = monitoring_secret + elif enable_monitoring != "true": + cluster.grafana_secret = "" else: raise Exception("monitoring_secret is required") - cluster.grafana_endpoint = "http://simplyblock-grafana:3000" + cluster.grafana_endpoint = constants.GRAFANA_K8S_ENDPOINT if not cluster_ip: cluster_ip = "0.0.0.0" # add mgmt node object mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid) - - _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) + if enable_monitoring == "true": + graylog_endpoint = constants.GRAYLOG_K8S_ENDPOINT + os_endpoint = constants.OS_K8S_ENDPOINT + _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) + + _set_max_result_window(os_endpoint) + + _add_graylog_input(graylog_endpoint, monitoring_secret) + + if cluster.mode == "kubernetes": + utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) cluster.distr_ndcs = distr_ndcs cluster.distr_npcs = distr_npcs @@ -492,6 +509,10 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn cluster.qpair_count = qpair_count or constants.QPAIR_COUNT cluster.max_queue_size = max_queue_size cluster.inflight_io_threshold = inflight_io_threshold + cluster.cr_name = cr_name + cluster.cr_namespace = cr_namespace + cluster.cr_plural = cr_plural + if cap_warn and cap_warn > 0: cluster.cap_warn = cap_warn if cap_crit and cap_crit > 0: diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index 36ba14a9e..08b101d0c 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ 
-167,6 +167,15 @@ def get_config_var(name, default=None): LVO_MAX_NAMESPACES_PER_SUBSYS=32 +CR_GROUP = "simplyblock.simplyblock.io" +CR_VERSION = "v1alpha1" + +GRAFANA_K8S_ENDPOINT = "http://simplyblock-grafana:3000" +GRAYLOG_K8S_ENDPOINT = "http://simplyblock-graylog:9000" +OS_K8S_ENDPOINT = "http://opensearch-cluster-master:9200" + +WEBAPI_K8S_ENDPOINT = "http://simplyblock-webappapi:5000/api/v2" + K8S_NAMESPACE = os.getenv('K8S_NAMESPACE', 'simplyblock') OS_STATEFULSET_NAME = "simplyblock-opensearch" MONGODB_STATEFULSET_NAME = "simplyblock-mongo" diff --git a/simplyblock_core/controllers/cluster_events.py b/simplyblock_core/controllers/cluster_events.py index 059aea976..e201c53a9 100644 --- a/simplyblock_core/controllers/cluster_events.py +++ b/simplyblock_core/controllers/cluster_events.py @@ -4,6 +4,7 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController from simplyblock_core.models.events import EventObj +from simplyblock_core import utils, constants logger = logging.getLogger() db_controller = DBController() @@ -39,6 +40,15 @@ def cluster_status_change(cluster, new_state, old_status): caused_by=ec.CAUSED_BY_CLI, message=f"Cluster status changed from {old_status} to {new_state}") + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=cluster.cr_plural, + namespace=cluster.cr_namespace, + name=cluster.cr_name, + status_patch={"status": new_state}) + def _cluster_cap_event(cluster, msg, event_level): return ec.log_event_cluster( @@ -90,3 +100,11 @@ def cluster_rebalancing_change(cluster, new_state, old_status): db_object=cluster, caused_by=ec.CAUSED_BY_CLI, message=f"Cluster rebalancing changed from {old_status} to {new_state}") + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=cluster.cr_plural, + namespace=cluster.cr_namespace, + 
name=cluster.cr_name, + status_patch={"rebalancing": new_state}) diff --git a/simplyblock_core/controllers/device_events.py b/simplyblock_core/controllers/device_events.py index f2e1e959d..1f5ee881a 100644 --- a/simplyblock_core/controllers/device_events.py +++ b/simplyblock_core/controllers/device_events.py @@ -3,6 +3,8 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController +from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -20,6 +22,24 @@ def _device_event(device, message, caused_by, event): node_id=device.get_id(), storage_id=device.cluster_device_order) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) + if cluster.mode == "kubernetes": + total_devices = len(snode.nvme_devices) + online_devices = 0 + for dev in snode.nvme_devices: + if dev.status == NVMeDevice.STATUS_ONLINE: + online_devices += 1 + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=snode.cr_plural, + namespace=snode.cr_namespace, + name=snode.cr_name, + node_uuid=snode.get_id(), + node_mgmt_ip=snode.mgmt_ip, + updates={"devices": f"{total_devices}/{online_devices}"}, + ) + def device_create(device, caused_by=ec.CAUSED_BY_CLI): _device_event(device, f"Device created: {device.get_id()}", caused_by, ec.EVENT_OBJ_CREATED) diff --git a/simplyblock_core/controllers/lvol_events.py b/simplyblock_core/controllers/lvol_events.py index 636c444b3..91b91027b 100644 --- a/simplyblock_core/controllers/lvol_events.py +++ b/simplyblock_core/controllers/lvol_events.py @@ -3,6 +3,7 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -10,6 +11,7 @@ def _lvol_event(lvol, message, caused_by, event): db_controller = DBController() snode = 
db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(snode.cluster_id) ec.log_event_cluster( cluster_id=snode.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -18,7 +20,79 @@ def _lvol_event(lvol, message, caused_by, event): caused_by=caused_by, message=message, node_id=lvol.get_id()) - + if cluster.mode == "kubernetes": + pool = db_controller.get_pool_by_id(lvol.pool_uuid) + + if event == ec.EVENT_OBJ_CREATED: + crypto_key=( + (lvol.crypto_key1, lvol.crypto_key2) + if lvol.crypto_key1 and lvol.crypto_key2 + else None + ) + + node_urls = [ + f"{constants.WEBAPI_K8S_ENDPOINT}/clusters/{snode.cluster_id}/storage-nodes/{node_id}/" + for node_id in lvol.nodes + ] + + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + name=pool.lvols_cr_name, + add={ + "uuid": lvol.get_id(), + "lvolName": lvol.lvol_name, + "status": lvol.status, + "nodeUUID": node_urls, + "size": utils.humanbytes(lvol.size), + "health": lvol.health_check, + "isCrypto": crypto_key is not None, + "nqn": lvol.nqn, + "subsysPort": lvol.subsys_port, + "hostname": lvol.hostname, + "fabric": lvol.fabric, + "ha": lvol.ha_type == 'ha', + "poolUUID": lvol.pool_uuid, + "poolName": lvol.pool_name, + "PvcName": lvol.pvc_name, + "snapName": lvol.snapshot_name, + "clonedFromSnap": lvol.cloned_from_snap, + "stripeWdata": lvol.ndcs, + "stripeWparity": lvol.npcs, + "blobID": lvol.blobid, + "namespaceID": lvol.ns_id, + "qosClass": lvol.lvol_priority_class, + "maxNamespacesPerSubsystem": lvol.max_namespace_per_subsys, + "qosIOPS": lvol.rw_ios_per_sec, + "qosRWTP": lvol.rw_mbytes_per_sec, + "qosRTP": lvol.r_mbytes_per_sec, + "qosWTP": lvol.w_mbytes_per_sec, + }, + ) + + elif event == ec.EVENT_STATUS_CHANGE: + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + 
name=pool.lvols_cr_name, + lvol_uuid=lvol.get_id(), + updates={"status": lvol.status, "health": lvol.health_check}, + ) + elif event == ec.EVENT_OBJ_DELETED: + logger.info("Deleting lvol CR object") + utils.patch_cr_lvol_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.lvols_cr_plural, + namespace=pool.lvols_cr_namespace, + name=pool.lvols_cr_name, + lvol_uuid=lvol.get_id(), + remove=True, + ) def lvol_create(lvol, caused_by=ec.CAUSED_BY_CLI): _lvol_event(lvol, "LVol created", caused_by, ec.EVENT_OBJ_CREATED) diff --git a/simplyblock_core/controllers/pool_controller.py b/simplyblock_core/controllers/pool_controller.py index 2440a6bd7..0d2738e67 100644 --- a/simplyblock_core/controllers/pool_controller.py +++ b/simplyblock_core/controllers/pool_controller.py @@ -23,7 +23,8 @@ def _generate_string(length): string.ascii_letters + string.digits) for _ in range(length)) -def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, cluster_id, qos_host=None): +def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, max_w_mbytes, cluster_id, + cr_name=None, cr_namespace=None, cr_plural=None, qos_host=None): db_controller = DBController() if not name: logger.error("Pool name is empty!") @@ -71,6 +72,9 @@ def add_pool(name, pool_max, lvol_max, max_rw_iops, max_rw_mbytes, max_r_mbytes, pool.max_rw_mbytes_per_sec = max_rw_mbytes pool.max_r_mbytes_per_sec = max_r_mbytes pool.max_w_mbytes_per_sec = max_w_mbytes + pool.cr_name = cr_name + pool.cr_namespace = cr_namespace + pool.cr_plural = cr_plural if pool.has_qos() and not qos_host: next_nodes = lvol_controller._get_next_3_nodes(cluster_id) if next_nodes: @@ -121,7 +125,8 @@ def qos_exists_on_child_lvol(db_controller: DBController, pool_uuid): return False def set_pool(uuid, pool_max=0, lvol_max=0, max_rw_iops=0, - max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, name=""): + max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, name="", + 
lvols_cr_name="", lvols_cr_namespace="", lvols_cr_plural=""): db_controller = DBController() try: pool = db_controller.get_pool_by_id(uuid) @@ -143,6 +148,17 @@ def set_pool(uuid, pool_max=0, lvol_max=0, max_rw_iops=0, return False, msg pool.pool_name = name + if lvols_cr_name and lvols_cr_name != pool.lvols_cr_name: + for p in db_controller.get_pools(): + if p.lvols_cr_name == lvols_cr_name: + msg = f"Pool found with the same lvol cr name: {name}" + logger.error(msg) + return False, msg + pool.lvols_cr_name = lvols_cr_name + pool.lvols_cr_namespace = lvols_cr_namespace + pool.lvols_cr_plural = lvols_cr_plural + + # Normalize inputs max_rw_iops = max_rw_iops or 0 max_rw_mbytes = max_rw_mbytes or 0 @@ -265,8 +281,10 @@ def set_status(pool_id, status): except KeyError: logger.error(f"Pool not found {pool_id}") return False + old_status = pool.status pool.status = status pool.write_to_db(db_controller.kv_store) + pool_events.pool_status_change(pool, pool.status, old_status) logger.info("Done") diff --git a/simplyblock_core/controllers/pool_events.py b/simplyblock_core/controllers/pool_events.py index 2581d59b1..8c4f0ea08 100644 --- a/simplyblock_core/controllers/pool_events.py +++ b/simplyblock_core/controllers/pool_events.py @@ -2,7 +2,8 @@ import logging from simplyblock_core.controllers import events_controller as ec - +from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -29,3 +30,24 @@ def pool_remove(pool): def pool_updated(pool): _add(pool, f"Pool updated {pool.pool_name}", event=ec.EVENT_STATUS_CHANGE) + +def pool_status_change(pool, new_state, old_status): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(pool.cluster_id) + ec.log_event_cluster( + cluster_id=pool.cluster_id, + domain=ec.DOMAIN_CLUSTER, + event=ec.EVENT_STATUS_CHANGE, + db_object=pool, + caused_by=ec.CAUSED_BY_CLI, + message=f"Pool status changed from {old_status} to {new_state}", + 
node_id=pool.cluster_id) + + if cluster.mode == "kubernetes": + utils.patch_cr_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=pool.cr_plural, + namespace=pool.cr_namespace, + name=pool.cr_name, + status_patch={"status": new_state}) diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py index 027f7dbed..6bef257fd 100644 --- a/simplyblock_core/controllers/storage_events.py +++ b/simplyblock_core/controllers/storage_events.py @@ -3,6 +3,8 @@ from simplyblock_core.controllers import events_controller as ec from simplyblock_core.models.events import EventObj +from simplyblock_core.db_controller import DBController +from simplyblock_core import utils, constants logger = logging.getLogger() @@ -19,6 +21,8 @@ def snode_add(node): def snode_delete(node): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -27,9 +31,21 @@ def snode_delete(node): caused_by=ec.CAUSED_BY_CLI, message=f"Storage node deleted {node.get_id()}", node_id=node.get_id()) - + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + remove=True, + ) def snode_status_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -38,9 +54,22 @@ def snode_status_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI) caused_by=caused_by, message=f"Storage node status changed from: {old_status} to: {new_state}", node_id=node.get_id()) + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + 
version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"status": new_state}, + ) def snode_health_check_change(node, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -49,7 +78,17 @@ def snode_health_check_change(node, new_state, old_status, caused_by=ec.CAUSED_B caused_by=caused_by, message=f"Storage node health check changed from: {old_status} to: {new_state}", node_id=node.get_id()) - + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"health": new_state}, + ) def snode_restart_failed(node): ec.log_event_cluster( @@ -87,6 +126,8 @@ def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR): def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR): + db_controller = DBController() + cluster = db_controller.get_cluster_by_id(node.cluster_id) ec.log_event_cluster( cluster_id=node.cluster_id, domain=ec.DOMAIN_CLUSTER, @@ -96,3 +137,15 @@ def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR): event_level=EventObj.LEVEL_WARN, message=f"Storage node ports set, LVol:{node.lvol_subsys_port} RPC:{node.rpc_port} Internal:{node.nvmf_port}", node_id=node.get_id()) + if cluster.mode == "kubernetes": + utils.patch_cr_node_status( + group=constants.CR_GROUP, + version=constants.CR_VERSION, + plural=node.cr_plural, + namespace=node.cr_namespace, + name=node.cr_name, + node_uuid=node.get_id(), + node_mgmt_ip=node.mgmt_ip, + updates={"nvmf_port": node.nvmf_port, "rpc_port": node.rpc_port, "lvol_port": node.lvol_subsys_port}, + ) + \ No newline at end of 
file diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 620309f77..f85be6e06 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -63,6 +63,9 @@ class Cluster(BaseModel): fabric_rdma: bool = False client_qpair_count: int = 3 secret: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" disable_monitoring: bool = False strict_node_anti_affinity: bool = False tls: bool = False diff --git a/simplyblock_core/models/pool.py b/simplyblock_core/models/pool.py index 27b2a23e5..683eafe1e 100644 --- a/simplyblock_core/models/pool.py +++ b/simplyblock_core/models/pool.py @@ -29,6 +29,12 @@ class Pool(BaseModel): secret: str = "" # unused users: List[str] = [] qos_host: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" + lvols_cr_name: str = "" + lvols_cr_namespace: str = "" + lvols_cr_plural: str = "" def has_qos(self): diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 45abceec9..ab7b31b09 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -97,6 +97,9 @@ class StorageNode(BaseNodeObject): subsystem: str = "" system_uuid: str = "" lvstore_status: str = "" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" nvmf_port: int = 4420 physical_label: int = 0 hublvol: HubLVol = None # type: ignore[assignment] diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 380f67bcd..671f39cfa 100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -17,15 +17,14 @@ dependencies: version: 1.4.0 repository: https://mongodb.github.io/helm-charts alias: mongodb - condition: monitoring.enabled + condition: observability.enabled - name: opensearch version: 2.9.0 repository: https://opensearch-project.github.io/helm-charts - condition: monitoring.enabled + 
condition: observability.enabled - name: prometheus version: "25.18.0" repository: "https://prometheus-community.github.io/helm-charts" - condition: monitoring.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml new file mode 100644 index 000000000..272030736 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockdevices.yaml @@ -0,0 +1,135 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockdevices.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockDevice + listKind: SimplyBlockDeviceList + plural: simplyblockdevices + singular: simplyblockdevice + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockDevice is the Schema for the simplyblockdevices API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockDevice + properties: + action: + enum: + - remove + - restart + type: string + clusterName: + type: string + deviceID: + type: string + nodeUUID: + type: string + required: + - clusterName + type: object + status: + description: status defines the observed state of SimplyBlockDevice + properties: + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + triggered: + type: boolean + updatedAt: + format: date-time + type: string + type: object + nodes: + items: + properties: + devices: + items: + properties: + health: + type: string + model: + type: string + size: + type: string + stats: + items: + properties: + capacityUtil: + format: int64 + type: integer + riops: + format: int64 + type: integer + rtp: + format: int64 + type: integer + wiops: + format: int64 + type: integer + wtp: + format: int64 + type: integer + type: object + type: array + status: + type: string + utilization: + format: int64 + type: integer + uuid: + type: string + type: object + type: array + nodeUUID: + type: string + type: object + type: array + type: object + required: + - spec + type: object + x-kubernetes-validations: + - message: nodeUUID and deviceID are required when action is specified + rule: '!(has(self.spec.action) && self.spec.action != "" && ((!has(self.spec.nodeUUID) + || self.spec.nodeUUID == "") || (!has(self.spec.deviceID) || self.spec.deviceID + == "")))' + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml new file mode 100644 index 
000000000..8e44a687d --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocklvols.yaml @@ -0,0 +1,144 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblocklvols.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockLvol + listKind: SimplyBlockLvolList + plural: simplyblocklvols + singular: simplyblocklvol + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.lvols.length() + name: LVOLs + type: integer + name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockLvol is the Schema for the simplyblocklvols API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockLvol + properties: + clusterName: + type: string + poolName: + type: string + required: + - clusterName + - poolName + type: object + status: + description: status defines the observed state of SimplyBlockLvol + properties: + configured: + type: boolean + lvols: + items: + properties: + blobID: + format: int64 + type: integer + clonedFromSnap: + type: string + createDt: + format: date-time + type: string + fabric: + type: string + ha: + type: boolean + health: + type: boolean + hostname: + type: string + isCrypto: + type: boolean + lvolName: + type: string + maxNamespacesPerSubsystem: + format: int64 + type: integer + namespaceID: + format: int64 + type: integer + nodeUUID: + items: + type: string + type: array + nqn: + type: string + poolName: + type: string + poolUUID: + type: string + pvcName: + type: string + qosClass: + format: int64 + type: integer + qosIOPS: + format: int64 + type: integer + qosRTP: + format: int64 + type: integer + qosRWTP: + format: int64 + type: integer + qosWTP: + format: int64 + type: integer + size: + type: string + snapName: + type: string + status: + type: string + stripeWdata: + format: int64 + type: integer + stripeWparity: + format: int64 + type: integer + subsysPort: + format: int64 + type: integer + updateDt: + format: date-time + type: string + uuid: + type: string + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml new file mode 100644 index 000000000..693322dc3 --- /dev/null +++ 
b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockpools.yaml @@ -0,0 +1,96 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockpools.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockPool + listKind: SimplyBlockPoolList + plural: simplyblockpools + singular: simplyblockpool + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockPool is the Schema for the pools API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of Pool + properties: + action: + type: string + capacityLimit: + type: string + clusterName: + type: string + name: + type: string + qosIOPSLimit: + format: int32 + type: integer + rLimit: + format: int32 + type: integer + rwLimit: + format: int32 + type: integer + status: + type: string + wLimit: + format: int32 + type: integer + required: + - clusterName + - name + type: object + status: + description: status defines the observed state of Pool + properties: + qosHost: + type: string + qosIOPSLimit: + format: int32 + type: integer + rLimit: + format: int32 + type: integer + rwLimit: + format: int32 + type: integer + status: + type: string + uuid: + type: string + wLimit: + format: int32 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml new file mode 100644 index 000000000..cfd99fdee --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstorageclusters.yaml @@ -0,0 +1,173 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockstorageclusters.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockStorageCluster + listKind: SimplyBlockStorageClusterList + plural: simplyblockstorageclusters + singular: simplyblockstoragecluster + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockStorageCluster is the Schema for the 
simplyblockstorageclusters + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockStorageCluster + properties: + action: + enum: + - activate + - expand + type: string + blkSize: + format: int32 + type: integer + capCrit: + format: int32 + type: integer + capWarn: + format: int32 + type: integer + clientQpairCount: + format: int32 + type: integer + clusterName: + type: string + distrBs: + format: int32 + type: integer + distrChunkBs: + format: int32 + type: integer + enableNodeAffinity: + type: boolean + eventLogEntries: + format: int32 + type: integer + fabric: + type: string + haType: + type: string + includeEventLog: + type: boolean + inflightIOThreshold: + format: int32 + type: integer + isSingleNode: + type: boolean + maxQueueSize: + format: int32 + type: integer + mgmtIfc: + description: Create-only + type: string + pageSizeInBlocks: + format: int32 + type: integer + provCapCrit: + format: int32 + type: integer + provCapWarn: + format: int32 + type: integer + qosClasses: + description: Updatable + type: string + qpairCount: + format: int32 + type: integer + strictNodeAntiAffinity: + type: boolean + stripeWdata: + format: int32 + type: integer + stripeWparity: + format: int32 + type: integer + required: + - clusterName + type: 
object + status: + description: status defines the observed state of SimplyBlockStorageCluster + properties: + MOD: + type: string + NQN: + type: string + UUID: + type: string + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + triggered: + type: boolean + updatedAt: + format: date-time + type: string + type: object + clusterName: + type: string + configured: + type: boolean + created: + format: date-time + type: string + lastUpdated: + format: date-time + type: string + mgmtNodes: + format: int32 + type: integer + rebalancing: + type: boolean + secretName: + type: string + status: + type: string + storageNodes: + format: int32 + type: integer + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml new file mode 100644 index 000000000..1e6af7724 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblockstoragenodes.yaml @@ -0,0 +1,204 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblockstoragenodes.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockStorageNode + listKind: SimplyBlockStorageNodeList + plural: simplyblockstoragenodes + singular: simplyblockstoragenode + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockStorageNode is the Schema for the storagenodes API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. 
+ Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of StorageNode + properties: + action: + enum: + - shutdown + - restart + - suspend + - resume + - remove + type: string + addPcieToAllowList: + description: restart params + items: + type: string + type: array + clusterImage: + type: string + clusterName: + type: string + coreIsolation: + type: boolean + coreMask: + type: string + corePercentage: + format: int32 + type: integer + dataNIC: + items: + type: string + type: array + driveSizeRange: + type: string + force: + type: boolean + haJM: + type: boolean + haJmCount: + format: int32 + type: integer + idDeviceByNQN: + type: boolean + jmPercent: + format: int32 + type: integer + maxLVol: + format: int32 + type: integer + maxSize: + type: string + mgmtIfc: + type: string + nodeAddr: + type: string + nodeUUID: + description: NodeUUID is required when action is specified + type: string + nodesPerSocket: + format: int32 + type: integer + openShiftCluster: + type: boolean + partitions: + format: int32 + type: integer + pcieAllowList: + items: + type: string + type: array + pcieDenyList: + items: + type: string + type: array + pcieModel: + type: string + socketsToUse: + format: int32 + type: integer + spdkDebug: + type: boolean + spdkImage: + type: string + useSeparateJournalDevice: + type: boolean + workerNode: + type: string + workerNodes: + items: + type: string + type: 
array + required: + - clusterName + type: object + status: + description: status defines the observed state of StorageNode + properties: + actionStatus: + properties: + action: + type: string + message: + type: string + nodeUUID: + type: string + observedGeneration: + format: int64 + type: integer + state: + type: string + updatedAt: + format: date-time + type: string + type: object + nodes: + items: + properties: + cpu: + format: int32 + type: integer + devices: + type: string + health: + type: boolean + hostname: + type: string + lvol_port: + format: int32 + type: integer + memory: + type: string + mgmtIp: + type: string + nvmf_port: + format: int32 + type: integer + rpc_port: + format: int32 + type: integer + status: + type: string + uptime: + type: string + uuid: + type: string + volumes: + format: int32 + type: integer + type: object + type: array + type: object + required: + - spec + type: object + x-kubernetes-validations: + - message: nodeUUID is required when action is specified + rule: '!(has(self.spec.action) && self.spec.action != "" && (!has(self.spec.nodeUUID) + || self.spec.nodeUUID == ""))' + - message: clusterImage, maxLVol, and workerNodes are required when action + is not specified + rule: (has(self.spec.action) && self.spec.action != "") || (has(self.spec.clusterImage) + && self.spec.clusterImage != "" && has(self.spec.maxLVol) && has(self.spec.workerNodes) + && size(self.spec.workerNodes) > 0) + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml new file mode 100644 index 000000000..2d25e21e1 --- /dev/null +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocktasks.yaml @@ -0,0 +1,84 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: 
v0.19.0 + name: simplyblocktasks.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockTask + listKind: SimplyBlockTaskList + plural: simplyblocktasks + singular: simplyblocktask + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockTask is the Schema for the simplyblocktasks API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockTask + properties: + clusterName: + type: string + subtasks: + type: boolean + taskID: + type: string + required: + - clusterName + type: object + status: + description: status defines the observed state of SimplyBlockTask + properties: + tasks: + items: + properties: + canceled: + type: boolean + parentTask: + type: string + retried: + format: int32 + type: integer + startedAt: + format: date-time + type: string + taskResult: + type: string + taskStatus: + type: string + taskType: + type: string + uuid: + type: string + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/templates/_helpers.tpl b/simplyblock_core/scripts/charts/templates/_helpers.tpl new file mode 100644 index 
000000000..710260fdc --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/_helpers.tpl @@ -0,0 +1,21 @@ +{{- define "simplyblock.commonContainer" }} +env: + - name: SIMPLYBLOCK_LOG_LEVEL + valueFrom: + configMapKeyRef: + name: simplyblock-config + key: LOG_LEVEL + +volumeMounts: + - name: fdb-cluster-file + mountPath: /etc/foundationdb/fdb.cluster + subPath: fdb.cluster + +resources: + requests: + cpu: "50m" + memory: "100Mi" + limits: + cpu: "300m" + memory: "1Gi" +{{- end }} diff --git a/simplyblock_core/scripts/charts/templates/app_configmap.yaml b/simplyblock_core/scripts/charts/templates/app_configmap.yaml index de0a4da08..a4d1d57dd 100644 --- a/simplyblock_core/scripts/charts/templates/app_configmap.yaml +++ b/simplyblock_core/scripts/charts/templates/app_configmap.yaml @@ -6,8 +6,8 @@ metadata: namespace: {{ .Release.Namespace }} data: - LOG_LEVEL: {{ .Values.log.level }} - LOG_DELETION_INTERVAL: {{ .Values.log.deletionInterval }} + LOG_LEVEL: {{ .Values.observability.level }} + LOG_DELETION_INTERVAL: {{ .Values.observability.deletionInterval }} --- @@ -29,6 +29,7 @@ data: Path /var/log/containers/*.log Parser docker Tag kube.* + Exclude_Path /var/log/containers/*fluent-bit*.log Refresh_Interval 5 Mem_Buf_Limit 5MB Skip_Long_Lines On @@ -69,9 +70,11 @@ data: filter.lua: | function filter_tagged_pods(tag, timestamp, record) - annotations = record["kubernetes"]["annotations"] - if annotations ~= nil and annotations["log-collector/enabled"] == "true" then - return 1, record + if record["kubernetes"] ~= nil then + local annotations = record["kubernetes"]["annotations"] + if annotations ~= nil and annotations["log-collector/enabled"] == "true" then + return 1, record + end end return -1, record end diff --git a/simplyblock_core/scripts/charts/templates/app_ingress.yaml b/simplyblock_core/scripts/charts/templates/app_ingress.yaml index 67e7b0912..b49b0c396 100644 --- a/simplyblock_core/scripts/charts/templates/app_ingress.yaml +++ 
b/simplyblock_core/scripts/charts/templates/app_ingress.yaml @@ -1,4 +1,5 @@ -{{- if (not .Values.ingress.useDNS) }} +{{- if .Values.ingress.enabled }} + {{- if not .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -17,7 +18,7 @@ spec: port: number: 5000 --- -{{- else if .Values.ingress.useDNS }} + {{- else if .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -45,4 +46,5 @@ spec: name: simplyblock-webappapi port: number: 5000 + {{- end }} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index 988955a4f..1626292e2 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -18,7 +18,7 @@ spec: labels: app: simplyblock-admin-control spec: - serviceAccountName: simplyblock-control-sa + serviceAccountName: simplyblock-sa hostNetwork: true dnsPolicy: ClusterFirstWithHostNet containers: @@ -33,11 +33,13 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace +{{- if .Values.observability.enabled }} - name: MONITORING_SECRET valueFrom: secretKeyRef: name: simplyblock-grafana-secrets key: MONITORING_SECRET +{{- end }} - name: SIMPLYBLOCK_LOG_LEVEL valueFrom: configMapKeyRef: @@ -63,11 +65,12 @@ spec: path: fdb.cluster --- apiVersion: apps/v1 -kind: DaemonSet +kind: Deployment metadata: name: simplyblock-webappapi namespace: {{ .Release.Namespace }} spec: + replicas: 2 selector: matchLabels: app: simplyblock-webappapi @@ -79,7 +82,15 @@ spec: reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: app: simplyblock-webappapi - spec: + spec: + serviceAccountName: simplyblock-sa + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: simplyblock-admin-control + topologyKey: kubernetes.io/hostname containers: - name: webappapi image: "{{ 
.Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" @@ -95,15 +106,19 @@ spec: key: LOG_LEVEL - name: LVOL_NVMF_PORT_START value: "{{ .Values.ports.lvolNvmfPortStart }}" + - name: ENABLE_MONITORING + value: "{{ .Values.observability.enabled }}" - name: K8S_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace +{{- if .Values.observability.enabled }} - name: MONITORING_SECRET valueFrom: secretKeyRef: name: simplyblock-grafana-secrets key: MONITORING_SECRET +{{- end }} - name: FLASK_DEBUG value: "False" - name: FLASK_ENV @@ -119,57 +134,20 @@ spec: limits: cpu: "500m" memory: "2Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-storage-node-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-storage-node-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-storage-node-monitor - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: storage-node-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/storage_node_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ resources: requests: - cpu: "200m" - memory: "256Mi" + cpu: "100m" + memory: "200Mi" 
limits: - cpu: "400m" - memory: "1Gi" + cpu: "200m" + memory: "400Mi" volumes: - name: fdb-cluster-file configMap: @@ -177,18 +155,23 @@ spec: items: - key: cluster-file path: fdb.cluster - + - name: varlog + hostPath: + path: /var/log + - name: config + configMap: + name: simplyblock-fluent-bit-config --- apiVersion: apps/v1 kind: Deployment metadata: - name: simplyblock-mgmt-node-monitor + name: simplyblock-monitoring namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-mgmt-node-monitor + app: simplyblock-monitoring template: metadata: annotations: @@ -196,261 +179,172 @@ spec: reloader.stakater.com/auto: "true" reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: - app: simplyblock-mgmt-node-monitor + app: simplyblock-monitoring spec: + serviceAccountName: simplyblock-sa hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet + dnsPolicy: ClusterFirstWithHostNet containers: + - name: storage-node-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/storage_node_monitor.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: mgmt-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/mgmt_node_monitor.py"] env: - - name: BACKEND_TYPE - value: "k8s" - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL + - name: BACKEND_TYPE + value: "k8s" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-lvol-stats-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-lvol-stats-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-lvol-stats-collector - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - name: lvol-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/lvol_stat_collector.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-main-distr-event-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-main-distr-event-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-main-distr-event-collector - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - name: main-distr-event-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/main_distr_event_collector.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: capacity-and-stats-collector + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/capacity_and_stats_collector.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: capacity-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/cap_monitor.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: health-check + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/health_check_service.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: device-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/device_monitor.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: lvol-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/lvol_monitor.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: snapshot-monitor + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/snapshot_monitor.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ resources: requests: - cpu: "200m" - memory: "256Mi" + cpu: "100m" + memory: "200Mi" limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster + cpu: "200m" + memory: "400Mi" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-capacity-and-stats-collector - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-capacity-and-stats-collector - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-capacity-and-stats-collector - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: capacity-and-stats-collector - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: 
["python", "simplyblock_core/services/capacity_and_stats_collector.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-capacity-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-capacity-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-capacity-monitor - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: capacity-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/cap_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster + - name: varlog + hostPath: + path: /var/log + - name: config + configMap: + name: simplyblock-fluent-bit-config --- apiVersion: apps/v1 
kind: Deployment metadata: - name: simplyblock-health-check + name: simplyblock-tasks namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-health-check + app: simplyblock-tasks template: metadata: annotations: @@ -458,754 +352,146 @@ spec: reloader.stakater.com/auto: "true" reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" labels: - app: simplyblock-health-check + app: simplyblock-tasks spec: + serviceAccountName: simplyblock-sa hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet + dnsPolicy: ClusterFirstWithHostNet containers: - - name: health-check - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/health_check_service.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-device-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-device-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-device-monitor - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: device-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" 
- command: ["python", "simplyblock_core/services/device_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-lvol-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-lvol-monitor - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-lvol-monitor - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: lvol-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/lvol_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-snapshot-monitor - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-snapshot-monitor - template: - metadata: - annotations: - 
log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-snapshot-monitor - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: snapshot-monitor - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/snapshot_monitor.py"] - env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-cleanupfdb - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-cleanupfdb - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-cleanupfdb - spec: - containers: - - name: cleanupfdb + - name: tasks-node-add-runner image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/workers/cleanup_foundationdb.py"] + command: ["python", "simplyblock_core/services/tasks_runner_node_add.py"] env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - - name: LOG_DELETION_INTERVAL - value: "${LOG_DELETION_INTERVAL}" + - name: LVOL_NVMF_PORT_START + value: "{{ 
.Values.ports.lvolNvmfPortStart }}" +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-restart - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-restart - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-restart - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - name: tasks-runner-restart image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_restart.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-migration - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - name: tasks-runner-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_migration.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-failed-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-failed-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-failed-migration - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-failed-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_failed_migration.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-cluster-status - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-cluster-status - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-cluster-status - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-cluster-status image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_cluster_status.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-new-device-migration - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-new-device-migration - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-new-device-migration - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-new-device-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_new_dev_migration.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL - volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster - resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-node-add-runner - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-node-add-runner - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-node-add-runner - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - - name: tasks-node-addrunner - image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" - command: ["python", "simplyblock_core/services/tasks_runner_node_add.py"] - env: - - name: LVOL_NVMF_PORT_START - value: "{{ .Values.ports.lvolNvmfPortStart }}" - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster +{{ toYaml .resources | nindent 12 }} +{{- end }} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: 
simplyblock-tasks-runner-port-allow - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-port-allow - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-port-allow - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: - name: tasks-runner-port-allow image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_port_allow.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-jc-comp-resume - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-jc-comp-resume - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-jc-comp-resume - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-jc-comp-resume image: "{{ 
.Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_jc_comp.py"] +{{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-tasks-runner-sync-lvol-del - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-tasks-runner-sync-lvol-del - template: - metadata: - annotations: - log-collector/enabled: "true" - reloader.stakater.com/auto: "true" - reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" - labels: - app: simplyblock-tasks-runner-sync-lvol-del - spec: - hostNetwork: true - dnsPolicy: ClusterFirstWithHostNet - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} + - name: tasks-runner-sync-lvol-del image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" - imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"] +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: - - name: SIMPLYBLOCK_LOG_LEVEL - valueFrom: - configMapKeyRef: - name: simplyblock-config - key: LOG_LEVEL +{{ toYaml .env | nindent 12 }} volumeMounts: - - name: fdb-cluster-file - mountPath: /etc/foundationdb/fdb.cluster - subPath: fdb.cluster +{{ toYaml .volumeMounts | nindent 12 }} resources: - requests: - cpu: "200m" - memory: "256Mi" - limits: - cpu: "400m" - memory: "1Gi" - volumes: - - name: fdb-cluster-file - configMap: - name: simplyblock-fdb-cluster-config - items: - - key: cluster-file - path: fdb.cluster ---- - -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: simplyblock-fluent-bit - namespace: {{ .Release.Namespace }} - labels: - app: simplyblock-fluent-bit -spec: - selector: - matchLabels: - app: simplyblock-fluent-bit - template: - metadata: - labels: - app: simplyblock-fluent-bit - spec: - containers: +{{ toYaml .resources | nindent 12 }} +{{- end }} - name: fluent-bit image: fluent/fluent-bit:1.8.11 - securityContext: - privileged: true volumeMounts: - name: varlog mountPath: /var/log - - name: varlibdockercontainers - mountPath: /var/lib/docker/containers - readOnly: true - name: config mountPath: /fluent-bit/etc/ resources: requests: + cpu: "100m" + memory: "200Mi" + limits: cpu: "200m" memory: "400Mi" - limits: - cpu: "400m" - memory: "1Gi" + volumes: + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster - name: varlog hostPath: path: /var/log - - name: varlibdockercontainers - hostPath: - path: /var/lib/docker/containers - name: config configMap: name: simplyblock-fluent-bit-config diff --git a/simplyblock_core/scripts/charts/templates/app_sa.yaml b/simplyblock_core/scripts/charts/templates/app_sa.yaml index a5dee735b..f04fc14b3 100644 --- a/simplyblock_core/scripts/charts/templates/app_sa.yaml +++ b/simplyblock_core/scripts/charts/templates/app_sa.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: 
simplyblock-control-sa + name: simplyblock-sa namespace: {{ .Release.Namespace }} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: simplyblock-control-role + name: simplyblock-role rules: - apiGroups: [""] resources: ["configmaps"] @@ -21,16 +21,23 @@ rules: - apiGroups: ["mongodbcommunity.mongodb.com"] resources: ["mongodbcommunity"] verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: ["simplyblock.simplyblock.io"] + resources: ["simplyblockpools/status", "simplyblocklvols/status", "simplyblockstorageclusters/status", "simplyblockstoragenodes/status", "simplyblockdevices/status", "simplyblocktasks/status"] + verbs: ["get", "patch", "update"] + - apiGroups: ["simplyblock.simplyblock.io"] + resources: ["simplyblockpools", "simplyblocklvols", "simplyblockstorageclusters", "simplyblockstoragenodes", "simplyblockdevices", "simplyblocktasks"] + verbs: ["get","list" ,"patch", "update", "watch"] + --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: simplyblock-control-binding + name: simplyblock-binding subjects: - kind: ServiceAccount - name: simplyblock-control-sa + name: simplyblock-sa namespace: {{ .Release.Namespace }} roleRef: kind: ClusterRole - name: simplyblock-control-role + name: simplyblock-role apiGroup: rbac.authorization.k8s.io diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml index 8e695e593..721815fa5 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml @@ -158,8 +158,6 @@ spec: - --v=5 - --endpoint=$(CSI_ENDPOINT) - --nodeid=$(KUBE_NODE_NAME) - - --capacity=slow=10Gi - - --capacity=fast=100Gi env: - name: CSI_ENDPOINT value: unix:///csi/csi.sock diff --git a/simplyblock_core/scripts/charts/templates/dashboards.yaml b/simplyblock_core/scripts/charts/templates/dashboards.yaml 
index 981e961d0..165bad130 100644 --- a/simplyblock_core/scripts/charts/templates/dashboards.yaml +++ b/simplyblock_core/scripts/charts/templates/dashboards.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} apiVersion: v1 kind: ConfigMap metadata: @@ -12512,14796 +12512,4 @@ data: "weekStart": "" } ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: simplyblock-grafana-dashboard-node-exporter - namespace: {{ .Release.Namespace }} - labels: - grafana_dashboard: "1" -data: - node-exporter.json: | - { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "gnetId": 1860, - "graphTooltip": 1, - "id": null, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 261, - "panels": [], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Quick CPU / Mem / Disk", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Resource pressure via PSI", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "links": [], - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "dark-yellow", - "value": 70 - }, - { - "color": "dark-red", - "value": 90 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - 
"x": 0, - "y": 1 - }, - "id": 323, - "links": [], - "options": { - "displayMode": "basic", - "minVizHeight": 10, - "minVizWidth": 0, - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "legendFormat": "CPU", - "range": false, - "refId": "CPU some", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "Mem", - "range": false, - "refId": "Memory some", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "I/O", - "range": false, - "refId": "I/O some", - "step": 240 - } - ], - "title": "Pressure", - "type": "bargauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Busy state of all CPU cores together", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": 
{ - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 85 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 3, - "y": 1 - }, - "id": 20, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", - "hide": false, - "instant": true, - "intervalFactor": 1, - "legendFormat": "", - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "CPU Busy", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "System load over all CPU cores together", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 85 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 6, - "y": 1 - }, - "id": 155, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - 
"pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Sys Load", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Non available RAM memory", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 80 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 9, - "y": 1 - }, - "hideTimeOverride": false, - "id": 16, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "((node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\", job=\"$job\"}) / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"}) * 100", - "format": "time_series", - "hide": true, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", 
- "exemplar": false, - "expr": "(1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} / node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "B", - "step": 240 - } - ], - "title": "RAM Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Used Swap", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 10 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 25 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 12, - "y": 1 - }, - "id": 21, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "SWAP Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Used Root FS", - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 80 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 3, - "x": 15, - "y": 1 - }, - "id": 154, - "links": [], - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", - "format": "time_series", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Root FS Used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total number of CPU cores", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 1 - }, - "id": 14, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": 
"none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - } - ], - "title": "CPU Cores", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "System uptime", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 1, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 4, - "x": 20, - "y": 1 - }, - "hideTimeOverride": true, - "id": 15, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "Uptime", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "description": "Total RootFS", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "rgba(50, 172, 45, 0.97)", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 70 - }, - { - "color": "rgba(245, 54, 54, 0.9)", - "value": 90 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 18, - "y": 3 - }, - "id": 23, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", - "format": "time_series", - "hide": false, - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "RootFS Total", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total RAM", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 20, - "y": 3 - }, - "id": 75, - "links": [], - 
"maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "RAM Total", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Total SWAP", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "decimals": 0, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 2, - "w": 2, - "x": 22, - "y": 3 - }, - "id": 18, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "none", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.4.3", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", - "instant": true, - "intervalFactor": 1, - "range": false, - "refId": "A", - "step": 240 - } - ], - "title": "SWAP Total", - "type": "stat" - }, - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - 
}, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 5 - }, - "id": 263, - "panels": [], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Basic CPU / Mem / Net / Disk", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic CPU info", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "percent" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Busy Iowait" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy Iowait" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Idle" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - 
"id": "byName", - "options": "Busy System" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy User" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Busy Other" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 6 - }, - "id": 77, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true, - "width": 250 - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "exemplar": false, - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "instant": false, - "intervalFactor": 1, - "legendFormat": "Busy System", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Busy User", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", 
mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy Iowait", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy IRQs", - "range": true, - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Busy Other", - "range": true, - "refId": "E", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Idle", - "range": true, - "refId": "F", - "step": 240 - } - ], - "title": "CPU Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic memory usage", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - 
"axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "SWAP Used" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap Used" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } 
- ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Cache + Buffer" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Available" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#DEDAF7", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - }, - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 6 - }, - "id": 78, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "RAM Total", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - 
node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "RAM Used", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "RAM Cache + Buffer", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "RAM Free", - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SWAP Used", - "refId": "E", - "step": 240 - } - ], - "title": "Memory Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Basic network info per interface", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - 
"pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Recv_bytes_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_drop_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_errs_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Recv_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CCA300", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_bytes_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_drop_eth2" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_errs_eth2" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Trans_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CCA300", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_drop_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#967302", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_errs_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "recv_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_bytes_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_bytes_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": 
"byName", - "options": "trans_drop_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_drop_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#967302", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_errs_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "trans_errs_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 74, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "recv {{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "trans {{ "{{" }}device{{ "}}" }} ", - "refId": "B", - "step": 240 - } - ], - "title": "Network Traffic Basic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Disk space 
used of all filesystems mounted", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 152, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Disk Space Used Basic", - "type": "timeseries" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "id": 265, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": 
{ - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "percentage", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 70, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "percent" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Idle - Waiting for something to happen" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Iowait - Waiting for I/O to complete" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Irq - Servicing interrupts" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Nice - Niced processes executing in user mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Softirq - Servicing softirqs" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Steal - Time spent in other operating systems when running in a virtualized 
environment" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCE2DE", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "System - Processes executing in kernel mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "User - Normal processes executing in user mode" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#5195CE", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 21 - }, - "id": 3, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 250 - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "System - Processes executing in kernel mode", - "range": true, - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "User - Normal processes executing in user mode", - "range": true, - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - 
"expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Nice - Niced processes executing in user mode", - "range": true, - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum by(instance) (irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Iowait - Waiting for I/O to complete", - "range": true, - "refId": "E", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Irq - Servicing interrupts", - "range": true, - "refId": "F", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"softirq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Softirq - Servicing softirqs", - "range": true, - "refId": "G", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"steal\"}[$__rate_interval])) / 
scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", - "range": true, - "refId": "H", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Idle - Waiting for something to happen", - "range": true, - "refId": "J", - "step": 240 - } - ], - "title": "CPU", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": 
"byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap - Swap memory usage" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused - Free memory unassigned" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Hardware Corrupted - *./" - }, - "properties": [ - { - "id": "custom.stacking", - "value": { - "group": false, - "mode": "normal" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 21 - }, - "id": 24, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": 
"node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Apps - Memory used by user-space applications", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", - "refId": "D", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Cache - Parked file data (file content) cache", - "refId": 
"E", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Buffers - Block device (e.g. harddisk) cache", - "refId": "F", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Unused - Free memory unassigned", - "refId": "G", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Swap - Swap space used", - "refId": "H", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", - "refId": "I", - "step": 240 - } - ], - "title": "Memory Stack", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bits out (-) / in (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - 
"scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "receive_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "receive_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_eth0" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "transmit_packets_lo" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*Trans.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 33 - }, - "id": 84, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Receive", - "refId": "A", - "step": 240 - }, - { - 
"datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Transmit", - "refId": "B", - "step": 240 - } - ], - "title": "Network Traffic", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 33 - }, - "id": 156, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": 
"{{ "{{" }}mountpoint{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Disk Space Used", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IO read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { 
- "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - 
] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 45 - }, - "id": 229, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Reads completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - 
"intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Writes completed", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "io time" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*read*./" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byType", - "options": "time" - }, - "properties": [ - { - "id": "custom.axisPlacement", - "value": "hidden" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 45 - }, - "id": 42, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Successfully read bytes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Successfully written bytes", - "refId": "B", - "step": 240 - } - ], - "title": "I/O Usage Read / Write", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "%util", - 
"axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 40, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "io time" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byType", - "options": "time" - }, - "properties": [ - { - "id": "custom.axisPlacement", - "value": "hidden" - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 0, - "y": 57 - }, - "id": 127, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "I/O Utilization", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - 
"axisColorMode": "text", - "axisLabel": "percentage", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "bars", - "fillOpacity": 70, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "smooth", - "lineWidth": 2, - "pointSize": 3, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "max": 1, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/^Guest - /" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#5195ce", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/^GuestNice - /" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#c15c17", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 12, - "x": 12, - "y": 57 - }, - "id": 319, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", - "hide": false, - "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": 
"code", - "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\", mode=\"nice\"}[1m])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[1m])))", - "hide": false, - "legendFormat": "GuestNice - Time spent running a niced guest (virtual CPU for guest operating system)", - "range": true, - "refId": "B" - } - ], - "title": "CPU spent seconds in guests (VMs)", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "CPU / Memory / Net / Disk", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 266, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": 
"byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 54 - }, - "id": 136, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Inactive - Memory which has been less recently used. 
It is more eligible to be reclaimed for other purposes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Active / Inactive", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - 
"fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*CommitLimit - *./" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - }, - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 54 - }, - "id": 135, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Committed", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": 
"Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - 
} - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 64 - }, - "id": 191, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Active_file - File-backed memory on active LRU list", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", - "refId": "D", - "step": 240 - } - ], - "title": "Memory Active / Inactive Detail", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 64 - }, - "id": 130, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Writeback - Memory which is actively being written back to disk", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", - "refId": "C", - "step": 240 - } - ], - "title": "Memory Writeback and Dirty", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - 
"hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { 
- "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ShmemHugePages - Memory used by shared 
memory (shmem) and tmpfs allocated with huge pages" - }, - "properties": [ - { - "id": "custom.fillOpacity", - "value": 0 - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 74 - }, - "id": 138, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Mapped - Used memory in mapped pages files which have been mapped, such as libraries", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", - "refId": "C", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages", - "refId": "D", - "step": 240 - } - ], - "title": "Memory Shared and Mapped", - "type": "timeseries" - }, - { - 
"datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { 
- "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { 
- "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 74 - }, - "id": 131, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Slab", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": 
"never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - 
"options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 84 - }, - "id": 70, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": 
"PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocChunk - Largest contiguous block of vmalloc area which is free", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocTotal - Total size of vmalloc memory area", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", - "refId": "C", - "step": 240 - } - ], - "title": "Memory Vmalloc", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": 
"Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - 
}, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 84 - }, - "id": 159, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Bounce - Memory used for block device bounce buffers", - "refId": "A", - "step": 240 - } - ], - "title": "Memory Bounce", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - 
"axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, 
- { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byRegexp", - "options": "/.*Inactive *./" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 94 - }, - "id": 129, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "AnonHugePages - Memory in anonymous huge pages", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "AnonPages - Memory in user pages not backed by files", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Anonymous", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ 
- { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 94 - }, - "id": 160, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "KernelStack - Kernel memory stack. 
This is not reclaimable", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Kernel / CPU", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "pages", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - 
"id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 104 - }, - "id": 140, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", - 
"refId": "C", - "step": 240 - } - ], - "title": "Memory HugePages Counter", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": 
[ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 104 - }, - "id": 71, - "links": [], - "options": { - "legend": { - "calcs": [ - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "HugePages - Total size of the pool of huge pages", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Hugepagesize - Huge Page size", - "refId": "B", - "step": 240 - } - ], - "title": "Memory HugePages Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, 
- "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - 
}, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 114 - }, - "id": 128, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - 
"legendFormat": "DirectMap1G - Amount of pages mapped as this size", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "DirectMap2M - Amount of pages mapped as this size", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "DirectMap4K - Amount of pages mapped as this size", - "refId": "C", - "step": 240 - } - ], - "title": "Memory DirectMap", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Apps" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A437C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#CFFAFF", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "RAM_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#806EB7", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#2F575E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unused" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 114 - }, - "id": 137, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true, - "width": 350 - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", - "refId": "B", - "step": 240 - } - ], - "title": "Memory Unevictable and MLocked", - "type": 
"timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Active" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#99440A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Buffers" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#58140C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6D1F62", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Cached" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Committed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#508642", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Dirty" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": 
"fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Free" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#B7DBAB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Inactive" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Mapped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "PageTables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Page_Tables" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Slab_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Swap_Cache" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#C15C17", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#511749", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total RAM + Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#052B51", - "mode": "fixed" - } - 
} - ] - }, - { - "matcher": { - "id": "byName", - "options": "Total Swap" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "VmallocUsed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 124 - }, - "id": 132, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet committed to the storage", - "refId": "A", - "step": 240 - } - ], - "title": "Memory NFS", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Memory Meminfo", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 270, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number (after merges) of I/O requests completed per second for the device", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IO read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - 
"legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": 
"byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - 
{ - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 47 - }, - "id": 9, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Reads completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Writes completed", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps Completed", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of bytes read from or written to the device per second", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - 
"pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": 
"#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 47 - }, - "id": 33, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read bytes", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Written bytes", - "refId": "B", - "step": 240 - } - ], - "title": "Disk R/W Data", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "time. 
read (-) / write (+)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": 
"/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - 
"id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 57 - }, - "id": 37, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "hide": false, - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read wait time avg", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Write wait time avg", - "refId": "B", - "step": 240 - } - ], - "title": "Disk Average Wait Time", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The average queue length of the requests that were issued to the device", - 
"fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "aqu-sz", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - 
"matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 57 - }, - "id": 35, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }}", - "refId": "A", - "step": 240 - } - ], - "title": "Average Queue Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of read and write requests merged per second that were queued to the device", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "I/Os", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - 
"stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*Read.*/" - }, - "properties": [ - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - 
}, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 67 - }, - "id": 133, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Read merged", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Write merged", - "refId": "B", - "step": 240 - } - ], - "title": "Disk R/W Merged", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially. 
But for devices serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "%util", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 30, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - 
"options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 67 - }, - "id": 36, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - IO", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - discard", - "refId": "B", - "step": 240 - } - ], - "title": "Time Spent Doing I/Os", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "The number of outstanding requests at the instant the sample was taken. 
Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Outstanding req.", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - 
"value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - 
"properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 77 - }, - "id": 34, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - IO now", - "refId": "A", - "step": 240 - } - ], - "title": "Instantaneous Queue Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "IOs", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": 
{ - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "iops" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#7EB26D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EAB839", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#6ED0E0", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EF843C", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E24D42", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#584477", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda2_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sda3_.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F4D598", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#0A50A1", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": 
"/.*sdb2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BF1B00", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdb3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0752D", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#962D82", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#614D93", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdc3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#9AC48A", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#65C5DB", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9934E", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#EA6460", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde1.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#E0F9D7", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sdd2.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#FCEACA", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/.*sde3.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F9E2D2", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - 
"h": 10, - "w": 12, - "x": 12, - "y": 77 - }, - "id": 301, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Discards completed", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}device{{ "}}" }} - Discards merged", - "refId": "B", - "step": 240 - } - ], - "title": "Disk IOps Discards completed / merged", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Storage Disk", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 - }, - "id": 271, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "bytes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": 
"linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 62 - }, - "id": 43, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Available", - "metric": "", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Free", - "refId": "B", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": true, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Size", - "refId": "C", - "step": 240 - } - ], - "title": "Filesystem space available", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { 
- "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "file nodes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 62 - }, - "id": 41, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Free file nodes", - "refId": "A", - "step": 240 - } - ], - "title": "File Nodes Free", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "files", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": 
"linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 72 - }, - "id": 28, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 4, - "legendFormat": "Max open files", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Open files", - "refId": "B", - "step": 240 - } - ], - "title": "File Descriptor", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "file Nodes", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - 
"showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 72 - }, - "id": 219, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - File nodes total", - "refId": "A", - "step": 240 - } - ], - "title": "File Nodes Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "max": 1, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": 
"red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "/ ReadOnly" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 82 - }, - "id": 44, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - ReadOnly", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}mountpoint{{ "}}" }} - Device error", - "refId": "B", - "step": 240 - } - ], - "title": "Filesystem in ReadOnly / Error", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Storage Filesystem", - "type": "row" - }, - { - "collapsed": true, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 279, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "seconds", - 
"axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 66 - }, - "id": 40, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape duration", - "refId": "A", - "step": 240 - } - ], - "title": "Node Exporter Scrape Time", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "counter", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, - "lineWidth": 1, - "pointSize": 
5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green" - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*error.*/" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#F2495C", - "mode": "fixed" - } - }, - { - "id": "custom.transform", - "value": "negative-Y" - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 66 - }, - "id": 157, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max", - "min" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape success", - "refId": "A", - "step": 240 - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ "{{" }}collector{{ "}}" }} - Scrape textfile error (1 = true)", - "refId": "B", - "step": 240 - } - ], - "title": "Node Exporter Scrape", - "type": "timeseries" - } - ], - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "Node Exporter", - "type": "row" - } - ], - "refresh": "1m", - "revision": 1, - "schemaVersion": 38, - 
"style": "dark", - "tags": [], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, - "hide": 0, - "includeAll": false, - "label": "Datasource", - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "queryValue": "", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "", - "hide": 0, - "includeAll": false, - "label": "Job", - "multi": false, - "name": "job", - "options": [], - "query": { - "query": "label_values(node_uname_info, job)", - "refId": "Prometheus-job-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "label_values(node_uname_info{job=\"$job\"}, instance)", - "hide": 0, - "includeAll": false, - "label": "Host", - "multi": false, - "name": "node", - "options": [], - "query": { - "query": "label_values(node_uname_info{job=\"$job\"}, instance)", - "refId": "Prometheus-node-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "selected": false, - "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" - }, - "hide": 2, - "includeAll": false, - "multi": false, - "name": "diskdevices", - "options": [ - { - "selected": true, - "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" - } - ], - "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", - "skipUrlSync": false, - "type": "custom" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, 
- "timezone": "", - "title": "NodeExporter", - "uid": "d56e0ae7-48d5-481d-a2ea-3192da4d9e42", - "version": 5, - "weekStart": "" - } {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index 4eb7f1410..5020c2fea 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -246,7 +246,6 @@ spec: general: customParameters: - knob_disable_posix_kernel_aio=1 - - listen_address=0.0.0.0:4501 podTemplate: spec: containers: diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml index 815df6505..6c004f314 100644 --- a/simplyblock_core/scripts/charts/templates/mongodb.yaml +++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml @@ -1,3 +1,4 @@ +{{- if .Values.observability.enabled }} apiVersion: mongodbcommunity.mongodb.com/v1 kind: MongoDBCommunity metadata: @@ -51,4 +52,5 @@ metadata: name: admin-password type: Opaque stringData: - password: {{ .Values.monitoring.secret }} + password: {{ .Values.observability.secret }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml index cb4243493..497760180 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml @@ -1,4 +1,3 @@ -{{- if .Values.monitoring.enabled }} apiVersion: v1 kind: ConfigMap @@ -25,14 +24,6 @@ data: username: password: - - job_name: 'node' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_endpoints_name] - action: keep - regex: 'simplyblock-node-exporter' - --- apiVersion: v1 kind: ConfigMap @@ -46,6 +37,7 @@ data: type: FILESYSTEM config: directory: /mnt/thanos +{{- if .Values.observability.enabled }} --- 
apiVersion: v1 kind: ConfigMap @@ -60,7 +52,7 @@ data: datasources: - name: Thanos type: prometheus - url: http://simplyblock-thanos-query:9091 + url: http://simplyblock-thanos:9091 isDefault: true access: proxy uid: PBFA97CFB590B2093 @@ -829,7 +821,7 @@ data: type: slack settings: username: grafana_bot - url: '{{ .Values.grafana.contactPoint }}' + url: '{{ .Values.observability.grafana.contactPoint }}' title: | '{{ "{{" }} template "slack.title" . {{ "}}" }}' text: | diff --git a/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml b/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml index ec0e1ab80..bcccf4a35 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_ingress.yaml @@ -1,4 +1,5 @@ -{{- if (not .Values.ingress.useDNS) }} +{{- if .Values.ingress.enabled }} + {{- if not .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -32,9 +33,8 @@ spec: name: simplyblock-graylog port: number: 9000 - --- -{{- else if .Values.ingress.useDNS }} + {{- else if .Values.ingress.useDNS }} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -77,4 +77,5 @@ spec: name: simplyblock-graylog port: number: 9000 + {{- end }} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml index 1349a33a9..f54a9c2f5 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} --- apiVersion: apps/v1 kind: Deployment @@ -46,7 +46,7 @@ spec: - name: GRAYLOG_ELASTICSEARCH_HOSTS value: "http://opensearch-cluster-master:9200" - name: GRAYLOG_MONGODB_URI - value: "mongodb://admin:{{ .Values.monitoring.secret }}@simplyblock-mongo-svc:27017/graylog" + value: "mongodb://admin:{{ 
.Values.observability.secret }}@simplyblock-mongo-svc:27017/graylog" - name: GRAYLOG_SKIP_PREFLIGHT_CHECKS value: "true" - name: GRAYLOG_ROTATION_STRATEGY @@ -105,30 +105,37 @@ spec: apiVersion: apps/v1 kind: Deployment metadata: - name: simplyblock-thanos-store + name: simplyblock-thanos namespace: {{ .Release.Namespace }} spec: replicas: 1 selector: matchLabels: - app: simplyblock-thanos-store + app: simplyblock-thanos template: metadata: labels: - app: simplyblock-thanos-store - spec: + app: simplyblock-thanos + spec: containers: - name: thanos-store image: thanosio/thanos:v0.31.0 args: - store + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 - --objstore.config-file=/etc/thanos/objstore.yml - --index-cache-size=500MB - --chunk-pool-size=500MB + ports: + - name: grpc + containerPort: 10901 + - name: http + containerPort: 10902 volumeMounts: - name: objstore-config mountPath: /etc/thanos - - name: thanos-data + - name: data mountPath: /data resources: requests: @@ -137,37 +144,20 @@ spec: limits: cpu: "250m" memory: "1Gi" - volumes: - - name: objstore-config - configMap: - name: simplyblock-objstore-config - - name: thanos-data - emptyDir: {} - ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-thanos-query - namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-thanos-query - template: - metadata: - labels: - app: simplyblock-thanos-query - spec: - containers: + - name: thanos-query image: thanosio/thanos:v0.31.0 args: - query + - --grpc-address=0.0.0.0:10911 - --http-address=0.0.0.0:9091 - - --store=simplyblock-thanos-store:10901 + - --store=simplyblock-thanos:10901 - --store=simplyblock-prometheus:10901 + ports: + - containerPort: 9091 + name: http + - containerPort: 10911 + name: grpc resources: requests: cpu: "100m" @@ -176,28 +166,11 @@ spec: cpu: "250m" memory: "1Gi" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simplyblock-thanos-compactor - 
namespace: {{ .Release.Namespace }} -spec: - replicas: 1 - selector: - matchLabels: - app: simplyblock-thanos-compactor - template: - metadata: - labels: - app: simplyblock-thanos-compactor - spec: - - containers: - name: thanos-compactor image: thanosio/thanos:v0.31.0 args: - compact + - --http-address=0.0.0.0:10922 - --data-dir=/data - --objstore.config-file=/etc/thanos/objstore.yml - --retention.resolution-raw=30d @@ -205,10 +178,13 @@ spec: - --retention.resolution-1h=90d - --compact.concurrency=1 - --wait + ports: + - containerPort: 10922 + name: http volumeMounts: - name: objstore-config mountPath: /etc/thanos - - name: compactor-data + - name: data mountPath: /data resources: requests: @@ -217,72 +193,14 @@ spec: limits: cpu: "250m" memory: "1Gi" + volumes: - name: objstore-config configMap: name: simplyblock-objstore-config - - name: compactor-data - emptyDir: {} ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: simplyblock-node-exporter - namespace: {{ .Release.Namespace }} -spec: - selector: - matchLabels: - app: simplyblock-node-exporter - template: - metadata: - labels: - app: simplyblock-node-exporter - spec: - containers: - - name: node-exporter - image: prom/node-exporter:v1.7.0 - args: - - '--path.procfs=/host/proc' - - '--path.sysfs=/host/sys' - - '--path.rootfs=/host/root' - - '--collector.filesystem.ignored-mount-points=^(/rootfs|/host|)/(sys|proc|dev|host|etc|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)' - - '--collector.filesystem.ignored-fs-types=^(sys|proc|auto|cgroup|devpts|ns|au|fuse.lxc|mqueue)(fs|)$' - - '--no-collector.ipvs' - - '--web.listen-address=:9200' - ports: - - containerPort: 9200 - protocol: TCP - volumeMounts: - - name: proc - mountPath: /host/proc - readOnly: true - mountPropagation: HostToContainer - - name: sys - mountPath: /host/sys - mountPropagation: HostToContainer - readOnly: true - - name: root - mountPath: /host/root - mountPropagation: HostToContainer - readOnly: true - resources: - requests: - cpu: 
"100m" - memory: "256Mi" - limits: - cpu: "250m" - memory: "1Gi" - volumes: - - name: proc - hostPath: - path: /proc - - name: sys - hostPath: - path: /sys - - name: root - hostPath: - path: / + - name: data + emptyDir: {} --- apiVersion: apps/v1 @@ -345,9 +263,6 @@ spec: - name: dashboard-pools mountPath: /var/lib/grafana/dashboards/pools.json subPath: pools.json - - name: dashboard-node-exporter - mountPath: /var/lib/grafana/dashboards/node-exporter.json - subPath: node-exporter.json - name: grafana-data mountPath: /var/lib/grafana volumes: @@ -375,9 +290,6 @@ spec: - name: dashboard-pools configMap: name: simplyblock-grafana-dashboard-pools - - name: dashboard-node-exporter - configMap: - name: simplyblock-grafana-dashboard-node-exporter - name: grafana-data emptyDir: {} {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml b/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml index c39735159..df741f026 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_secret.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} apiVersion: v1 kind: Secret metadata: @@ -6,8 +6,8 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque stringData: - MONITORING_SECRET: "{{ .Values.monitoring.secret }}" - GRAFANA_ENDPOINT: "{{ .Values.grafana.endpoint }}" + MONITORING_SECRET: "{{ .Values.observability.secret }}" + GRAFANA_ENDPOINT: "{{ .Values.observability.grafana.endpoint }}" --- apiVersion: v1 @@ -17,7 +17,7 @@ metadata: namespace: {{ .Release.Namespace }} type: Opaque stringData: - GRAYLOG_PASSWORD_SECRET: "{{ .Values.graylog.passwordSecret }}" - GRAYLOG_ROOT_PASSWORD_SHA2: "{{ .Values.graylog.rootPasswordSha2 }}" - MAX_NUMBER_OF_INDICES: "{{ .Values.log.maxNumberIndex }}" + GRAYLOG_PASSWORD_SECRET: "{{ .Values.observability.graylog.passwordSecret }}" + GRAYLOG_ROOT_PASSWORD_SHA2: "{{ 
.Values.observability.graylog.rootPasswordSha2 }}" + MAX_NUMBER_OF_INDICES: "{{ .Values.observability.graylog.maxNumberIndex }}" {{- end }} diff --git a/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml b/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml index 55b15dccc..5a0936434 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_svc.yaml @@ -1,4 +1,4 @@ -{{- if .Values.monitoring.enabled }} +{{- if .Values.observability.enabled }} --- apiVersion: v1 kind: Service @@ -25,44 +25,19 @@ spec: apiVersion: v1 kind: Service metadata: - name: simplyblock-thanos-store + name: simplyblock-thanos namespace: {{ .Release.Namespace }} spec: selector: - app: simplyblock-thanos-store + app: simplyblock-thanos ports: - - name: thanos-store + - name: store port: 10901 targetPort: 10901 ---- -apiVersion: v1 -kind: Service -metadata: - name: simplyblock-thanos-query - namespace: {{ .Release.Namespace }} -spec: - selector: - app: simplyblock-thanos-query - ports: - - name: thanos-query + - name: query port: 9091 targetPort: 9091 ---- -apiVersion: v1 -kind: Service -metadata: - name: simplyblock-node-exporter - namespace: {{ .Release.Namespace }} -spec: - selector: - app: simplyblock-node-exporter - ports: - - name: simplyblock-node-exporter - protocol: TCP - port: 9200 - targetPort: 9200 - --- apiVersion: v1 kind: Service diff --git a/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml new file mode 100644 index 000000000..257be5ae1 --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml @@ -0,0 +1,196 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simplyblock-manager + labels: + control-plane: simplyblock-manager + app: simplyblock-manager +spec: + selector: + matchLabels: + app: simplyblock-manager + replicas: 1 + template: + metadata: + labels: 
+ control-plane: simplyblock-manager + app: simplyblock-manager + spec: + securityContext: + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + serviceAccountName: simplyblock-manager + containers: + - image: simplyblock/simplyblock-manager:main + imagePullPolicy: Always + name: manager + env: + - name: WATCH_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 500m + memory: 256Mi + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + privileged: false + terminationGracePeriodSeconds: 10 + +################# ROLE AND ROLE BINDING ############################## +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: simplyblock-manager + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: simplyblock-manager-clusterrole +rules: +- apiGroups: + - "" + resources: + - configmaps + - events + - persistentvolumeclaims + - pods + - pods/exec + - namespaces + - secrets + - services + - serviceaccounts + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - apps + resources: + - deployments + - daemonsets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update + - patch +- apiGroups: + - "rbac.authorization.k8s.io" + resources: + - roles + - clusterroles + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - "rbac.authorization.k8s.io" + resources: + - rolebindings + - clusterrolebindings + verbs: + - create + - get + - list + - watch + - update + - patch +- apiGroups: + - simplyblock.simplyblock.io + resources: + - simplyblockpools + - simplyblocklvols + - simplyblockstorageclusters + - simplyblockstoragenodes 
+ - simplyblockdevices + - simplyblocktasks + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - simplyblock.simplyblock.io + resources: + - simplyblockpools/finalizers + - simplyblocklvols/finalizers + - simplyblockstorageclusters/finalizers + - simplyblockstoragenodes/finalizers + - simplyblockdevices/finalizers + - simplyblocktasks/finalizers + verbs: + - update + - delete +- apiGroups: + - simplyblock.simplyblock.io + resources: + - simplyblockpools/status + - simplyblocklvols/status + - simplyblockstorageclusters/status + - simplyblockstoragenodes/status + - simplyblockdevices/status + - simplyblocktasks/status + verbs: + - get + - patch + - update + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + creationTimestamp: null + name: simplyblock-manager-clusterrolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: simplyblock-manager-clusterrole +subjects: +- kind: ServiceAccount + name: simplyblock-manager + namespace: {{ .Release.Namespace }} + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml b/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml new file mode 100644 index 000000000..eb360b60a --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/simplyblock_customresource.yaml @@ -0,0 +1,145 @@ +{{- if .Values.simplyblock.cluster }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockStorageCluster +metadata: + name: {{ .Values.simplyblock.cluster.clusterName }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + + {{- if .Values.simplyblock.cluster.mgmtIfc }} + mgmtIfc: {{ .Values.simplyblock.cluster.mgmtIfc }} + {{- end }} + + {{- if .Values.simplyblock.cluster.fabric }} + fabric: {{ .Values.simplyblock.cluster.fabric }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster 
"isSingleNode" }} + isSingleNode: {{ .Values.simplyblock.cluster.isSingleNode }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster "enableNodeAffinity" }} + enableNodeAffinity: {{ .Values.simplyblock.cluster.enableNodeAffinity }} + {{- end }} + + {{- if hasKey .Values.simplyblock.cluster "strictNodeAntiAffinity" }} + strictNodeAntiAffinity: {{ .Values.simplyblock.cluster.strictNodeAntiAffinity }} + {{- end }} + + {{- if .Values.simplyblock.cluster.capWarn }} + capWarn: {{ .Values.simplyblock.cluster.capWarn }} + {{- end }} + + {{- if .Values.simplyblock.cluster.capCrit }} + capCrit: {{ .Values.simplyblock.cluster.capCrit }} + {{- end }} + + {{- if .Values.simplyblock.cluster.provCapWarn }} + provCapWarn: {{ .Values.simplyblock.cluster.provCapWarn }} + {{- end }} + + {{- if .Values.simplyblock.cluster.provCapCrit }} + provCapCrit: {{ .Values.simplyblock.cluster.provCapCrit }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.pool }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockPool +metadata: + name: {{ .Values.simplyblock.pool.name }} + namespace: {{ .Release.Namespace }} +spec: + name: {{ .Values.simplyblock.pool.name }} + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + + {{- if .Values.simplyblock.pool.capacityLimit }} + capacityLimit: {{ .Values.simplyblock.pool.capacityLimit | quote }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.lvol }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockLvol +metadata: + name: {{ .Values.simplyblock.lvol.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} + poolName: {{ .Values.simplyblock.pool.name }} +{{- end }} + +--- +{{- if .Values.simplyblock.storageNodes }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockStorageNode +metadata: + name: {{ .Values.simplyblock.storageNodes.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ 
.Values.simplyblock.cluster.clusterName }} + + {{- if .Values.simplyblock.storageNodes.clusterImage }} + clusterImage: {{ .Values.simplyblock.storageNodes.clusterImage }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.mgmtIfc }} + mgmtIfc: {{ .Values.simplyblock.storageNodes.mgmtIfc }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.maxLVol }} + maxLVol: {{ .Values.simplyblock.storageNodes.maxLVol }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.maxSize }} + maxSize: {{ .Values.simplyblock.storageNodes.maxSize | quote }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "partitions" }} + partitions: {{ .Values.simplyblock.storageNodes.partitions }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.corePercentage }} + corePercentage: {{ .Values.simplyblock.storageNodes.corePercentage }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "spdkDebug" }} + spdkDebug: {{ .Values.simplyblock.storageNodes.spdkDebug }} + {{- end }} + + {{- if hasKey .Values.simplyblock.storageNodes "coreIsolation" }} + coreIsolation: {{ .Values.simplyblock.storageNodes.coreIsolation }} + {{- end }} + + {{- if .Values.simplyblock.storageNodes.workerNodes }} + workerNodes: + {{- range .Values.simplyblock.storageNodes.workerNodes }} + - {{ . 
}} + {{- end }} + {{- end }} +{{- end }} + +--- +{{- if .Values.simplyblock.devices }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockDevice +metadata: + name: {{ .Values.simplyblock.devices.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} +{{- end }} + +--- +{{- if .Values.simplyblock.tasks }} +apiVersion: simplyblock.simplyblock.io/v1alpha1 +kind: SimplyBlockTask +metadata: + name: {{ .Values.simplyblock.tasks.name }} + namespace: {{ .Release.Namespace }} +spec: + clusterName: {{ .Values.simplyblock.cluster.clusterName }} +{{- end }} diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 0b70f321e..7272846d7 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -1,29 +1,26 @@ -graylog: - rootPasswordSha2: "b87c15a8ae4736d771ca60a7cc2014baaeab19b11c31f5fedef9421958a403c9" - passwordSecret: "is6SP2EdWg0NdmVGv6CEp5hRHNL7BKVMFem4t9pouMqDQnHwXMSomas1qcbKSt5yISr8eBHv4Y7Dbswhyz84Ut0TW6kqsiPs" -monitoring: - enabled: true +observability: + enabled: false secret: "sWbpOgba1bKnCfcPkVQi" - -log: deletionInterval: "3d" - retentionPeriod: "7d" level: "DEBUG" - maxNumberIndex: "3" - -grafana: - endpoint: "" - contactPoint: "https://hooks.slack.com/services/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" + graylog: + rootPasswordSha2: "b87c15a8ae4736d771ca60a7cc2014baaeab19b11c31f5fedef9421958a403c9" + passwordSecret: "is6SP2EdWg0NdmVGv6CEp5hRHNL7BKVMFem4t9pouMqDQnHwXMSomas1qcbKSt5yISr8eBHv4Y7Dbswhyz84Ut0TW6kqsiPs" + maxNumberIndex: "3" + retentionPeriod: "7d" + grafana: + endpoint: "" + contactPoint: "https://hooks.slack.com/services/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" image: simplyblock: repository: "public.ecr.aws/simply-block/simplyblock" - tag: "main" + tag: "improved_user_experience" pullPolicy: "Always" ports: - lvolNvmfPortStart: + lvolNvmfPortStart: 9100 
storageclass: allowedTopologyZones: [] @@ -61,7 +58,7 @@ opensearch: persistence: enabled: true storageClass: local-hostpath - size: 10Gi + size: 20Gi resources: requests: @@ -180,7 +177,7 @@ prometheus: enabled: false ingress: - enabled: true + enabled: false ingressClassName: nginx useDNS: false host: "" @@ -193,3 +190,45 @@ ingress: extraArgs: tcp-services-configmap: "simplyblock/simplyblock-tcp-services" nodeSelector: {} + + +simplyblock: + cluster: + clusterName: demo-cluster + mgmtIfc: eth0 + fabric: tcp + isSingleNode: false + enableNodeAffinity: false + strictNodeAntiAffinity: false + capWarn: 80 + capCrit: 90 + provCapWarn: 120 + provCapCrit: 150 + + pool: + name: demo-pool + capacityLimit: 100Gi + + lvol: + name: demo-lvol + + storageNodes: + name: demo-node + clusterImage: public.ecr.aws/simply-block/simplyblock:improved_user_experience + mgmtIfc: eth0 + maxLVol: 10 + maxSize: 150G + partitions: 0 + corePercentage: 65 + spdkDebug: false + coreIsolation: false + workerNodes: + - vm02.simplyblock3.localdomain + - vm03.simplyblock3.localdomain + - vm04.simplyblock3.localdomain + + devices: + name: demo-devices + + tasks: + name: demo-task diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index a6d89b74d..1e14d7a80 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -835,8 +835,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, max_snap, spdk_image=None, spdk_debug=False, small_bufsize=0, large_bufsize=0, num_partitions_per_dev=0, jm_percent=0, enable_test_device=False, - namespace=None, enable_ha_jm=False, id_device_by_nqn=False, - partition_size="", ha_jm_count=3): + namespace=None, enable_ha_jm=False, cr_name=None, cr_namespace=None, cr_plural=None, + id_device_by_nqn=False, partition_size="", ha_jm_count=3): snode_api = SNodeClient(node_addr) node_info, _ = snode_api.info() if node_info.get("nodes_config") and node_info["nodes_config"].get("nodes"): @@ 
-1086,6 +1086,9 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.cloud_name = cloud_instance['cloud'] or "" snode.namespace = namespace + snode.cr_name = cr_name + snode.cr_namespace = cr_namespace + snode.cr_plural = cr_plural snode.ssd_pcie = ssd_pcie snode.hostname = hostname snode.host_nqn = subsystem_nqn @@ -1148,7 +1151,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.physical_label = 0 else: snode.physical_label = get_next_physical_device_order(snode) - + snode.num_partitions_per_dev = num_partitions_per_dev snode.jm_percent = jm_percent snode.id_device_by_nqn = id_device_by_nqn @@ -1683,7 +1686,7 @@ def restart_storage_node( cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"] else: - cluster_ip = utils.get_k8s_node_ip() + cluster_ip = utils.get_k8s_node_ip() total_mem = 0 for n in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id): @@ -1864,7 +1867,7 @@ def restart_storage_node( snode.nvme_devices.append(dev) snode.write_to_db(db_controller.kv_store) - if node_ip and len(new_devices)>0: + if node_ip: # prepare devices on new node if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0: diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 1f086dc2d..6711dc857 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -10,8 +10,8 @@ import sys import uuid import time -import socket -from typing import Union, Any, Optional, Tuple +from typing import Union, Any, Optional, Tuple, Dict +from datetime import datetime, timezone from docker import DockerClient from kubernetes import client, config from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \ @@ -198,16 +198,8 @@ def get_k8s_node_ip(): logger.error("No mgmt nodes was found in the cluster!") return False - mgmt_ips = [node.mgmt_ip for node in nodes] - - for ip in mgmt_ips: - try: - with socket.create_connection((ip, 10250), 
timeout=2): - return ip - except Exception as e: - print(e) - raise e - return False + for node in nodes: + return node.mgmt_ip def dict_agg(data, mean=False, keys=None): @@ -1937,6 +1929,254 @@ def load_kube_config_with_fallback(): except Exception: config.load_kube_config() +def patch_cr_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + status_patch: dict, +): + """ + Patch the status subresource of a Custom Resource. + + status_patch example: + {"": "": } + """ + + load_kube_config_with_fallback() + + api = client.CustomObjectsApi() + + body = { + "status": status_patch + } + + try: + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body=body, + ) + except ApiException as e: + raise RuntimeError( + f"Failed to patch status for {name}: {e.reason} {e.body}" + ) + +def patch_cr_node_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + node_uuid: str, + node_mgmt_ip: str, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, +): + """ + Patch status.nodes[*] fields for a specific node identified by UUID. 
+ + Operations: + - Update a node (by uuid or mgmtIp) + - Remove a node (by uuid or mgmtIp) + + updates example: + {"health": "true"} + {"status": "offline"} + {"capacity": {"sizeUsed": 1234}} + """ + load_kube_config_with_fallback() + api = client.CustomObjectsApi() + + try: + cr = api.get_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + ) + + status_nodes = cr.get("status", {}).get("nodes", []) + if not status_nodes: + raise RuntimeError("CR has no status.nodes") + + spec_worker_nodes = cr.get("spec", {}).get("workerNodes", []) + + found = False + new_status_nodes = [] + removed_hostname = None + + for node in status_nodes: + match = ( + node.get("uuid") == node_uuid or + node.get("mgmtIp") == node_mgmt_ip + ) + + if match: + found = True + removed_hostname = node.get("hostname") + + if remove: + continue + + if updates: + node.update(updates) + + new_status_nodes.append(node) + + if not found: + raise RuntimeError( + f"Node not found (uuid={node_uuid}, mgmtIp={node_mgmt_ip})" + ) + + if remove and removed_hostname: + new_worker_nodes = [ + n for n in spec_worker_nodes if n != removed_hostname + ] + + api.patch_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body={ + "spec": { + "workerNodes": new_worker_nodes + } + }, + ) + + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body={ + "status": { + "nodes": new_status_nodes + } + }, + ) + + except ApiException as e: + raise RuntimeError( + f"Failed to patch node for {name}: {e.reason} {e.body}" + ) + +def patch_cr_lvol_status( + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + lvol_uuid: Optional[str] = None, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, + add: Optional[Dict[str, Any]] = None, +): + """ + Patch status.lvols[*] for an LVOL 
CustomResource. + + Operations: + - Update an existing LVOL (by uuid) + - Remove an LVOL (by uuid) + - Add a new LVOL entry + + Parameters: + lvol_uuid: + UUID of the lvol entry to update or remove + + updates: + Dict of fields to update on the matched lvol + Example: + {"status": "offline", "health": False} + + remove: + If True, remove the lvol identified by lvol_uuid + + add: + Full lvol dict to append to status.lvols + """ + + load_kube_config_with_fallback() + api = client.CustomObjectsApi() + + now = datetime.now(timezone.utc).isoformat() + + try: + cr = api.get_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + ) + + status = cr.get("status", {}) + lvols = status.get("lvols", []) + + # Ensure list exists + if lvols is None: + lvols = [] + + # ---- ADD ---- + if add is not None: + add.setdefault("createDt", now) + add["updateDt"] = now + lvols.append(add) + + # ---- UPDATE / REMOVE ---- + if lvol_uuid: + found = False + new_lvols = [] + + for lvol in lvols: + if lvol.get("uuid") == lvol_uuid: + found = True + + if remove: + continue + + if updates: + lvol.update(updates) + lvol["updateDt"] = now + + new_lvols.append(lvol) + + if not found: + raise RuntimeError(f"LVOL not found (uuid={lvol_uuid})") + + lvols = new_lvols + + body = { + "status": { + "lvols": lvols + } + } + + api.patch_namespaced_custom_object_status( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=name, + body=body, + ) + + except ApiException as e: + raise RuntimeError( + f"Failed to patch lvol status for {name}: {e.reason} {e.body}" + ) def get_node_name_by_ip(target_ip: str) -> str: load_kube_config_with_fallback() diff --git a/simplyblock_web/api/v1/__init__.py b/simplyblock_web/api/v1/__init__.py index 084a737cc..6df2a2db5 100644 --- a/simplyblock_web/api/v1/__init__.py +++ b/simplyblock_web/api/v1/__init__.py @@ -1,5 +1,5 @@ import logging -import os +import fdb from flask import jsonify from 
flask import Flask @@ -45,28 +45,21 @@ def status(): @api.route('/health/fdb', methods=['GET']) def health_fdb(): - fdb_cluster_file = constants.KVD_DB_FILE_PATH + try: + fdb.api_version(constants.KVD_DB_VERSION) + + db = fdb.open(constants.KVD_DB_FILE_PATH) + tr = db.create_transaction() + + tr.get(b"\x00") + tr.commit().wait() - if not os.path.exists(fdb_cluster_file): return jsonify({ - "fdb_connected": False, - "message": "FDB cluster file not found" - }), 503 + "fdb_connected": True + }), 200 - try: - with open(fdb_cluster_file, 'r') as f: - cluster_data = f.read().strip() - if not cluster_data: - return jsonify({ - "fdb_connected": False, - "message": "FDB cluster file is empty" - }), 503 except Exception as e: return jsonify({ "fdb_connected": False, - "message": f"Failed to read FDB cluster file: {str(e)}" + "error": str(e) }), 503 - - return jsonify({ - "fdb_connected": True, - }), 200 diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index 2447cf958..9bb191602 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -47,6 +47,9 @@ def add_cluster(): qpair_count = cl_data.get('qpair_count', 256) name = cl_data.get('name', None) fabric = cl_data.get('fabric', "tcp") + cr_name = cl_data.get('cr_name', None) + cr_namespace = cl_data.get('cr_namespace', None) + cr_plural = cl_data.get('cr_plural', None) max_queue_size = cl_data.get('max_queue_size', 128) inflight_io_threshold = cl_data.get('inflight_io_threshold', 4) @@ -56,7 +59,8 @@ def add_cluster(): return utils.get_response(cluster_ops.add_cluster( blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, - qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric + qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, + cr_name, cr_namespace, 
cr_plural, fabric )) @@ -78,7 +82,7 @@ def create_first_cluster(): distr_npcs = cl_data.get('distr_npcs', 1) distr_bs = cl_data.get('distr_bs', 4096) distr_chunk_bs = cl_data.get('distr_chunk_bs', 4096) - ha_type = cl_data.get('ha_type', 'single') + ha_type = cl_data.get('ha_type', 'ha') enable_node_affinity = cl_data.get('enable_node_affinity', False) qpair_count = cl_data.get('qpair_count', 256) name = cl_data.get('name', None) @@ -91,6 +95,9 @@ def create_first_cluster(): inflight_io_threshold = cl_data.get('inflight_io_threshold', 4) strict_node_anti_affinity = cl_data.get('strict_node_anti_affinity', False) is_single_node = cl_data.get('is_single_node', False) + cr_name = cl_data.get('cr_name', None) + cr_namespace = cl_data.get('cr_namespace', None) + cr_plural = cl_data.get('cr_plural', None) cluster_ip = cl_data.get('cluster_ip', None) grafana_secret = cl_data.get('grafana_secret', None) @@ -98,8 +105,8 @@ def create_first_cluster(): cluster_id = cluster_ops.add_cluster( blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, - qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric, - cluster_ip=cluster_ip, grafana_secret=grafana_secret) + qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, + cr_name, cr_namespace, cr_plural, fabric, cluster_ip=cluster_ip, grafana_secret=grafana_secret) if cluster_id: return utils.get_response(db.get_cluster_by_id(cluster_id).to_dict()) else: diff --git a/simplyblock_web/api/v2/__init__.py b/simplyblock_web/api/v2/__init__.py index c3723cce6..c4c0168c7 100644 --- a/simplyblock_web/api/v2/__init__.py +++ b/simplyblock_web/api/v2/__init__.py @@ -38,6 +38,9 @@ def _verify_api_token( storage_node.api.include_router(storage_node.instance_api) cluster.instance_api.include_router(storage_node.api) + 
+task.api.include_router(task.instance_api) + cluster.instance_api.include_router(task.api) volume.api.include_router(volume.instance_api) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 49f8a09e8..009e4cd96 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -35,7 +35,7 @@ class ClusterParams(BaseModel): distr_npcs: int = 1 distr_bs: int = 4096 distr_chunk_bs: int = 4096 - ha_type: Literal['single', 'ha'] = 'single' + ha_type: Literal['single', 'ha'] = 'ha' qpair_count: int = 256 max_queue_size: int = 128 inflight_io_threshold: int = 4 @@ -43,6 +43,9 @@ class ClusterParams(BaseModel): strict_node_anti_affinity: bool = False is_single_node: bool = False fabric: str = "tcp" + cr_name: str = "" + cr_namespace: str = "" + cr_plural: str = "" cluster_ip: str = "" grafana_secret: str = "" @@ -163,6 +166,13 @@ def activate(cluster: Cluster) -> Response: ).start() return Response(status_code=202) # FIXME: Provide URL for checking task status +@instance_api.post('/expand', name='clusters:expand', status_code=202, responses={202: {"content": None}}) +def expand(cluster: Cluster) -> Response: + Thread( + target=cluster_ops.cluster_expand, + args=(cluster.get_id(),), + ).start() + return Response(status_code=202) # FIXME: Provide URL for checking task status @instance_api.post('/update', name='clusters:upgrade', status_code=204, responses={204: {"content": None}}) def update_cluster( cluster: Cluster, parameters: _UpdateParams) -> Response: diff --git a/simplyblock_web/api/v2/device.py b/simplyblock_web/api/v2/device.py index 4fa0949fb..b0015b69b 100644 --- a/simplyblock_web/api/v2/device.py +++ b/simplyblock_web/api/v2/device.py @@ -49,13 +49,19 @@ def get(cluster: Cluster, storage_node: StorageNode, device: Device) -> DeviceDT return DeviceDTO.from_model(device, stat_obj) -@instance_api.delete('/', name='clusters:storage_nodes:devices:delete', status_code=204, responses={204: {"content": None}}) 
-def delete(cluster: Cluster, storage_node: StorageNode, device: Device) -> Response: - if not device_controller.device_remove(device.get_id()): +@instance_api.post('/remove', name='clusters:storage_nodes:devices:remove', status_code=204, responses={204: {"content": None}}) +def remove(cluster: Cluster, storage_node: StorageNode, device: Device, force: bool = False) -> Response: + if not device_controller.device_remove(device.get_id(), force): raise ValueError('Failed to remove device') return Response(status_code=204) +@instance_api.post('/restart', name='clusters:storage_nodes:devices:restart', status_code=204, responses={204: {"content": None}}) +def restart(cluster: Cluster, storage_node: StorageNode, device: Device, force: bool = False) -> Response: + if not device_controller.restart_device(device.get_id(), force): + raise ValueError('Failed to restart device') + + return Response(status_code=204) @instance_api.get('/capacity', name='clusters:storage_nodes:devices:capacity') def capacity( diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 62f1a94e1..b30e3ee26 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -40,13 +40,12 @@ def from_model(model: StatsObject): class ClusterDTO(BaseModel): - id: UUID + uuid: UUID name: Optional[str] nqn: str status: Literal['active', 'read_only', 'inactive', 'suspended', 'degraded', 'unready', 'in_activation', 'in_expansion'] - rebalancing: bool + is_re_balancing: bool block_size: util.Unsigned - coding: Tuple[util.Unsigned, util.Unsigned] ha: bool utliziation_critical: util.Percent utilization_warning: util.Percent @@ -55,18 +54,21 @@ class ClusterDTO(BaseModel): node_affinity: bool anti_affinity: bool secret: str + distr_ndcs: int + distr_npcs: int capacity: CapacityStatDTO @staticmethod def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None): return ClusterDTO( - id=UUID(model.get_id()), + uuid=UUID(model.get_id()), name=model.cluster_name, 
nqn=model.nqn, status=model.status, # type: ignore - rebalancing=model.is_re_balancing, + is_re_balancing=model.is_re_balancing, block_size=model.blk_size, - coding=(model.distr_ndcs, model.distr_npcs), + distr_ndcs=model.distr_ndcs, + distr_npcs=model.distr_npcs, ha=model.ha_type == 'ha', utilization_warning=model.cap_warn, utliziation_critical=model.cap_crit, @@ -181,6 +183,13 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id class StorageNodeDTO(BaseModel): uuid: UUID status: str + hostname: str + cpu: int + spdk_mem: int + lvols: int + rpc_port: int + lvol_subsys_port: int + nvmf_port: int mgmt_ip: IPv4Address health_check: bool online_devices: str @@ -191,6 +200,13 @@ def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None): return StorageNodeDTO( uuid=UUID(model.get_id()), status=model.status, + hostname=model.hostname, + cpu=model.cpu, + spdk_mem=model.spdk_mem, + lvols=model.lvols, + rpc_port=model.rpc_port, + lvol_subsys_port=model.lvol_subsys_port, + nvmf_port=model.nvmf_port, mgmt_ip=IPv4Address(model.mgmt_ip), health_check=model.health_check, online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}", @@ -211,7 +227,7 @@ class TaskDTO(BaseModel): @staticmethod def from_model(model: JobSchedule): return TaskDTO( - id=UUID(model.get_id()), + id=UUID(model.uuid), status=model.status, canceled=model.canceled, function_name=model.function_name, @@ -228,12 +244,24 @@ class VolumeDTO(BaseModel): status: str health_check: bool nqn: str + hostname: str + fabric: str nodes: List[util.UrlPath] port: util.Port size: util.Unsigned + ndcs: int + npcs: int + pool_uuid: str + pool_name: str + pvc_name: str = "" + snapshot_name: str = "" + blobid: int + ns_id: int cloned_from: Optional[util.UrlPath] crypto_key: Optional[Tuple[str, str]] high_availability: bool + lvol_priority_class: util.Unsigned + max_namespace_per_subsys: int max_rw_iops: util.Unsigned max_rw_mbytes: 
util.Unsigned max_r_mbytes: util.Unsigned @@ -248,6 +276,8 @@ def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optiona status=model.status, health_check=model.health_check, nqn=model.nqn, + hostname=model.hostname, + fabric=model.fabric, nodes=[ str(request.url_for( 'clusters:storage-nodes:detail', @@ -270,6 +300,16 @@ def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optiona else None ), high_availability=model.ha_type == 'ha', + pool_uuid=model.pool_uuid, + pool_name=model.pool_name, + pvc_name=model.pvc_name, + snapshot_name=model.snapshot_name, + ndcs=model.ndcs, + npcs=model.npcs, + blobid=model.blobid, + ns_id=model.ns_id, + lvol_priority_class=model.lvol_priority_class, + max_namespace_per_subsys=model.max_namespace_per_subsys, max_rw_iops=model.rw_ios_per_sec, max_rw_mbytes=model.rw_mbytes_per_sec, max_r_mbytes=model.r_mbytes_per_sec, diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py index 4ccae01ab..4bc201a28 100644 --- a/simplyblock_web/api/v2/pool.py +++ b/simplyblock_web/api/v2/pool.py @@ -39,6 +39,9 @@ class StoragePoolParams(BaseModel): max_rw_mbytes: util.Unsigned = 0 max_r_mbytes: util.Unsigned = 0 max_w_mbytes: util.Unsigned = 0 + cr_name: str + cr_namespace: str + cr_plural: str @api.post('/', name='clusters:storage-pools:create', status_code=201, responses={201: {"content": None}}) @@ -52,7 +55,7 @@ def add(request: Request, cluster: Cluster, parameters: StoragePoolParams) -> Re id_or_false = pool_controller.add_pool( parameters.name, parameters.pool_max, parameters.volume_max_size, parameters.max_rw_iops, parameters.max_rw_mbytes, - parameters.max_r_mbytes, parameters.max_w_mbytes, cluster.get_id() + parameters.max_r_mbytes, parameters.max_w_mbytes, cluster.get_id(), parameters.cr_name, parameters.cr_namespace, parameters.cr_plural ) if not id_or_false: @@ -102,6 +105,9 @@ class UpdatableStoragePoolParams(BaseModel): max_rw_mbytes: Optional[util.Unsigned] = None 
max_r_mbytes: Optional[util.Unsigned] = None max_w_mbytes: Optional[util.Unsigned] = None + lvols_cr_name: Optional[str] = None + lvols_cr_namespace: Optional[str] = None + lvols_cr_plural: Optional[str] = None @instance_api.put('/', name='clusters:storage-pools:update', status_code=204, responses={204: {"content": None}}) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index aa7923d36..5159425f1 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -35,17 +35,21 @@ def list(cluster: Cluster) -> List[StorageNodeDTO]: class StorageNodeParams(BaseModel): node_address: Annotated[str, Field(web_utils.IP_PATTERN)] interface_name: str - max_snapshots: int = Field(500) - ha_jm: bool = Field(True) - test_device: bool = Field(False) + max_snapshots: Optional[int] = Field(500) + ha_jm: Optional[bool] = Field(True) + test_device: Optional[bool] = Field(False) spdk_image: Optional[str] = Field("") spdk_debug: bool = Field(False) data_nics: List[str] = Field([]) namespace: str = Field('default') + id_device_by_nqn: Optional[bool] = Field(False) jm_percent: util.Percent = Field(3) partitions: int = Field(1) iobuf_small_pool_count: int = Field(0) iobuf_large_pool_count: int = Field(0) + cr_name: str + cr_namespace: str + cr_plural: str @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}}) @@ -67,6 +71,10 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams): 'enable_test_device': parameters.test_device, 'namespace': parameters.namespace, 'enable_ha_jm': parameters.ha_jm, + 'id_device_by_nqn': parameters.id_device_by_nqn, + 'cr_name': parameters.cr_name, + 'cr_namespace': parameters.cr_namespace, + 'cr_plural': parameters.cr_plural, } ) if not task_id_or_false: @@ -98,12 +106,19 @@ def get(cluster: Cluster, storage_node: StorageNode): @instance_api.delete('/', name='clusters:storage-nodes:delete') def delete( - 
cluster: Cluster, storage_node: StorageNode, force_remove: bool = False, force_migrate: bool = False) -> Response: + cluster: Cluster, storage_node: StorageNode, force_remove: bool = False, force_migrate: bool = False, force_delete: bool = False ) -> Response: none_or_false = storage_node_ops.remove_storage_node( storage_node.get_id(), force_remove=force_remove, force_migrate=force_migrate ) if none_or_false == False: # noqa raise ValueError('Failed to remove storage node') + + if force_delete: + none_or_false = storage_node_ops.delete_storage_node( + storage_node.get_id(), force=force_delete + ) + if none_or_false == False: # noqa + raise ValueError('Failed to delete storage node') return Response(status_code=204) @@ -201,17 +216,20 @@ def shutdown(cluster: Cluster, storage_node: StorageNode, force: bool = False) - class _RestartParams(BaseModel): force: bool = False reattach_volume: bool = False + node_address: Optional[Annotated[str, Field(pattern=web_utils.IP_PATTERN)]] = None + @instance_api.post('/start', name='clusters:storage-nodes:start', status_code=202, responses={202: {"content": None}}) # Same as restart for now @instance_api.post('/restart', name='clusters:storage-nodes:restart', status_code=202, responses={202: {"content": None}}) -def restart(cluster: Cluster, storage_node: StorageNode, parameters: _RestartParams = _RestartParams()) -> Response: +def restart(cluster: Cluster, storage_node: StorageNode, parameters: _RestartParams) -> Response: storage_node = storage_node Thread( target=storage_node_ops.restart_storage_node, kwargs={ "node_id": storage_node.get_id(), "force": parameters.force, + "node_ip": parameters.node_address, "reattach_volume": parameters.reattach_volume, } ).start() diff --git a/simplyblock_web/api/v2/task.py b/simplyblock_web/api/v2/task.py index 83890640f..94ecccce3 100644 --- a/simplyblock_web/api/v2/task.py +++ b/simplyblock_web/api/v2/task.py @@ -5,7 +5,6 @@ from simplyblock_core.db_controller import DBController from 
simplyblock_core.models.job_schedule import JobSchedule -from simplyblock_core.controllers import tasks_controller from .cluster import Cluster from .dtos import TaskDTO @@ -16,13 +15,13 @@ @api.get('/', name='clusters:tasks:list') def list(cluster: Cluster) -> List[TaskDTO]: - return [ - TaskDTO.from_model(task) - for task - in tasks_controller.list_tasks(cluster.get_id()) - if task.cluster_id == cluster.get_id() - ] - + cluster_tasks = db.get_job_tasks(cluster.get_id(), limit=0) + data=[] + for t in cluster_tasks: + if t.function_name == JobSchedule.FN_DEV_MIG: + continue + data.append(t) + return [TaskDTO.from_model(task) for task in data] instance_api = APIRouter(prefix='/{task_id}') @@ -40,5 +39,3 @@ def _lookup_task(task_id: UUID) -> JobSchedule: @instance_api.get('/', name='clusters:tasks:detail') def get(cluster: Cluster, task: Task) -> TaskDTO: return TaskDTO.from_model(task) - -api.include_router(instance_api) diff --git a/simplyblock_web/templates/storage_core_isolation.yaml.j2 b/simplyblock_web/templates/storage_core_isolation.yaml.j2 index b6fafe2ee..9ae4ba8e8 100644 --- a/simplyblock_web/templates/storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/storage_core_isolation.yaml.j2 @@ -91,7 +91,7 @@ spec: - name: etc mountPath: /etc - name: rootfs - mountPath: / + mountPath: /host - name: var-simplyblock mountPath: /var/simplyblock command: ["/bin/sh", "-c"] @@ -113,13 +113,13 @@ spec: apt update && apt install -y grep jq nvme-cli tuned ;; ubuntu) - apt update && apt install -y grep jq nvme-cli tuned - apt-get install -y linux-modules-extra-$(uname -r) + chroot /host apt update && chroot /host apt install -y grep jq nvme-cli tuned + chroot /host apt-get install -y linux-modules-extra-$(uname -r) ;; centos|rhel|rocky|almalinux) export YUM_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) export DNF_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) - dnf install -y grep jq 
nvme-cli kernel-modules-extra tuned \ + chroot /host dnf install -y grep jq nvme-cli kernel-modules-extra tuned \ --setopt=tsflags=nocontexts,noscripts --setopt=install_weak_deps=False ;; *) @@ -204,7 +204,7 @@ spec: tuned-adm profile "$TUNED_PROFILE" case "$OS_ID" in centos|rhel|rocky|almalinux) - grubby --update-kernel=ALL --args="isolcpus=$ISOLATED_CORES nohz_full=$ISOLATED_CORES rcu_nocbs=$ISOLATED_CORES" + chroot /host grubby --update-kernel=ALL --args="isolcpus=$ISOLATED_CORES nohz_full=$ISOLATED_CORES rcu_nocbs=$ISOLATED_CORES" ;; *) echo "" diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index 81f1e1eda..d118927cd 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -16,6 +16,7 @@ spec: nodeSelector: kubernetes.io/hostname: {{ HOSTNAME }} hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet tolerations: - effect: NoSchedule operator: Exists @@ -57,6 +58,11 @@ spec: - name: dockercontainerlogdirectory hostPath: path: /var/log/pods + {% if MODE == "kubernetes" %} + - name: config + configMap: + name: simplyblock-fluent-bit-config + {% endif %} initContainers: - name: copy-script @@ -160,4 +166,19 @@ spec: - name: dockercontainerlogdirectory mountPath: /var/log/pods readOnly: true + {% elif MODE == "kubernetes" %} + - name: fluent-bit + image: fluent/fluent-bit:1.8.11 + volumeMounts: + - name: varlog + mountPath: /var/log + - name: config + mountPath: /fluent-bit/etc/ + resources: + requests: + cpu: "100m" + memory: "200Mi" + limits: + cpu: "400m" + memory: "2Gi" {% endif %} From 4fbf425eec65f8877e8344976dbfc98e24cc12c1 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 19 Jan 2026 15:09:33 +0100 Subject: [PATCH 129/192] Main sfam 2359 api (#844) * added api for snapshot replication * removed helm chart dep * fixed Remove assignment to unused variable --- simplyblock_web/api/v1/cluster.py | 17 
+++++++++++++++++ simplyblock_web/api/v1/lvol.py | 25 ++++++++++++++++++++++++- simplyblock_web/api/v2/cluster.py | 16 +++++++++++++++- simplyblock_web/api/v2/volume.py | 7 +++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py index 2447cf958..645155fda 100644 --- a/simplyblock_web/api/v1/cluster.py +++ b/simplyblock_web/api/v1/cluster.py @@ -275,6 +275,23 @@ def cluster_activate(uuid): # FIXME: Any failure within the thread are not handled return utils.get_response(True), 202 +@bp.route('/cluster/addreplication/', methods=['PUT']) +def cluster_add_replication(uuid): + req_data = request.get_json() + target_cluster_uuid = req_data.get("target_cluster_uuid", None) + replication_timeout = req_data.get("replication_timeout", 0) + target_pool_uuid = req_data.get("target_pool_uuid", None) + + try: + db.get_cluster_by_id(uuid) + except KeyError: + return utils.get_response_error(f"Cluster not found: {uuid}", 404) + + cluster_ops.add_replication(source_cl_id=uuid, target_cl_id=target_cluster_uuid, + timeout=replication_timeout, target_pool=target_pool_uuid) + return utils.get_response(True), 202 + + @bp.route('/cluster/allstats//history/', methods=['GET']) @bp.route('/cluster/allstats/', methods=['GET'], defaults={'history': None}) diff --git a/simplyblock_web/api/v1/lvol.py b/simplyblock_web/api/v1/lvol.py index dbb77f6a2..78d58b024 100644 --- a/simplyblock_web/api/v1/lvol.py +++ b/simplyblock_web/api/v1/lvol.py @@ -158,6 +158,7 @@ def add_lvol(): ndcs = utils.get_value_or_default(cl_data, "ndcs", 0) npcs = utils.get_value_or_default(cl_data, "npcs", 0) fabric = utils.get_value_or_default(cl_data, "fabric", "tcp") + do_replicate = utils.get_value_or_default(cl_data, "do_replicate", False) ret, error = lvol_controller.add_lvol_ha( name=name, @@ -186,7 +187,8 @@ def add_lvol(): max_namespace_per_subsys=max_namespace_per_subsys, ndcs=ndcs, npcs=npcs, - fabric=fabric + fabric=fabric, + 
do_replicate=do_replicate ) return utils.get_response(ret, error, http_code=400) @@ -306,3 +308,24 @@ def inflate_lvol(uuid): ret = lvol_controller.inflate_lvol(uuid) return utils.get_response(ret) + +@bp.route('/lvol/replication_start/', methods=['PUT']) +def replication_start(uuid): + try: + db.get_lvol_by_id(uuid) + except KeyError as e: + return utils.get_response_error(str(e), 404) + + ret = lvol_controller.replication_start(uuid) + return utils.get_response(ret) + +@bp.route('/lvol/replication_stop/', methods=['PUT']) +def replication_stop(uuid): + try: + db.get_lvol_by_id(uuid) + except KeyError as e: + return utils.get_response_error(str(e), 404) + + ret = lvol_controller.replication_stop(uuid) + return utils.get_response(ret) + \ No newline at end of file diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index 7834e3f06..6e3da0eef 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -17,6 +17,11 @@ db = DBController() +class _ReplicationParams(BaseModel): + snapshot_replication_target_cluster: Optional[str] + snapshot_replication_timeout: Optional[str] + target_pool: Optional[str] + class _UpdateParams(BaseModel): management_image: Optional[str] spdk_image: Optional[str] @@ -157,7 +162,16 @@ def activate(cluster: Cluster) -> Response: ).start() return Response(status_code=202) # FIXME: Provide URL for checking task status - +@instance_api.post('/addreplication', name='clusters:addreplication', status_code=202, responses={202: {"content": None}}) +def cluster_add_replication(cluster: Cluster, parameters: _ReplicationParams) -> Response: + cluster_ops.add_replication( + source_cl_id=cluster.get_id(), + target_cl_id=parameters.snapshot_replication_target_cluster, + timeout=parameters.snapshot_replication_timeout, + target_pool=parameters.target_pool + ) + return Response(status_code=202) + @instance_api.post('/update', name='clusters:upgrade', status_code=204, responses={204: {"content": 
None}}) def update_cluster( cluster: Cluster, parameters: _UpdateParams) -> Response: cluster_ops.update_cluster( diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 698788718..4c798943c 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -43,6 +43,9 @@ class _CreateParams(BaseModel): pvc_name: Optional[str] = None ndcs: util.Unsigned = 0 npcs: util.Unsigned = 0 + fabric: str = "tcp" + max_namespace_per_subsys: int = 1 + do_replicate: bool = False class _CloneParams(BaseModel): @@ -85,6 +88,10 @@ def add( pvc_name=data.pvc_name, ndcs=data.ndcs, npcs=data.npcs, + fabric=data.fabric, + max_namespace_per_subsys=data.max_namespace_per_subsys, + do_replicate=data.do_replicate, + ) elif isinstance(data, _CloneParams): volume_id_or_false, error = snapshot_controller.clone( From 575816b782e15391bf427040d78126af7c86fde1 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 19 Jan 2026 15:50:24 +0100 Subject: [PATCH 130/192] added replication_start and stop to api v2 (#845) --- simplyblock_web/api/v2/volume.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 4c798943c..3aef34f56 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -178,6 +178,19 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: return Response(status_code=204) +@instance_api.post('/replication_start', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) +def replication_start(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_start(volume.get_id()): + raise ValueError('Failed to start volume snapshot replication') + + return Response(status_code=204) + +@instance_api.post('/replication_stop', name='clusters:storage-pools:volumes:replication_stop', status_code=204, 
responses={204: {"content": None}}) +def replication_stop(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_stop(volume.get_id()): + raise ValueError('Failed to stop volume snapshot replication') + + return Response(status_code=204) @instance_api.get('/connect', name='clusters:storage-pools:volumes:connect') def connect(cluster: Cluster, pool: StoragePool, volume: Volume): From 852a04e0b422dda18c7b469dc26e68d4719a9f8b Mon Sep 17 00:00:00 2001 From: Waleed Mousa <32266980+wmousa@users.noreply.github.com> Date: Wed, 21 Jan 2026 20:12:59 +0100 Subject: [PATCH 131/192] Waleed Hotfix changes to main (#846) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * Update socket parameter in docker.py * Fix mandatory argument handling in storage_node__configure * Add socket field to storage node configuration * Format nvme devices when run sbcli sn configure with --force (#760) * Format nvme devices when run sbcli configure with --force Adding a new filter for specifying the nvme devices --device-model and --size-range * Add sbctl sn clean-devices, to free nvme devices and wipe partitions * fix python linter * fix checks * Add small fix * When cleaning the nvme devices, do that only for nvme devices in sn_config_file * Fix RPC socket path * Pass the socket when adding sn node * Add the mount of /mnt/ramdisk to docker deployment * Fix calculate total_mem for multi sn nodes on same numa (#767) * Fix heredoc syntax in systemd unit creation * R25.10 hotfix isolate (#792) * redistribute isoalted cores allocation * Second version * 
redistribute isoalted cores allocation * Second version * fix type checker * prepare for merge * fix type checker --------- Co-authored-by: hamdykhader * Add --cores-percentage to sbctl sn configure and support oracle OS for cores isolation (#813) * remove the ramdisk systemd for talOS * delay nbd stop and controller detach if the device not ready Also set the max number of distribs to 8 and when more cores applied set them for secondary distribs * Fix the number of pooler cores * Return back max number of distribs to be 12 distribs (#827) * Return back max number of distribs to be 12 distribs * fix unit testing --------- Co-authored-by: hamdykhader * Increase total sys HP memory with a buffer .5G for each sn node and and container limit * Change hugepages memory variable from MEM_GEGA to MEM_MEGA * Change hugepages memory variable from MEM_GEGA to MEM_MEGA2 * Move my hotfix changes to main * updated socker_dir * Update docker.py to mount ramdisk for proxy * Fix PR checks --------- Co-authored-by: schmidt-scaled Co-authored-by: hamdykhader Co-authored-by: geoffrey1330 Co-authored-by: Raunak Jalan <41023976+RaunakJalan@users.noreply.github.com> --- simplyblock_cli/cli-reference.yaml | 46 +- simplyblock_cli/cli.py | 13 +- simplyblock_cli/clibase.py | 28 +- simplyblock_core/env_var | 4 +- simplyblock_core/models/storage_node.py | 2 + simplyblock_core/rpc_client.py | 6 + simplyblock_core/scripts/config_docker.sh | 2 +- .../services/spdk_http_proxy_server.py | 4 +- simplyblock_core/snode_client.py | 5 +- simplyblock_core/storage_node_ops.py | 300 +++++----- simplyblock_core/test/test_utils.py | 55 +- simplyblock_core/utils/__init__.py | 527 +++++++++++++++--- .../api/internal/storage_node/docker.py | 16 +- .../api/internal/storage_node/kubernetes.py | 11 +- simplyblock_web/node_configure.py | 156 +++--- .../templates/storage_core_isolation.yaml.j2 | 2 +- .../templates/storage_deploy_spdk.yaml.j2 | 12 +- .../templates/storage_init_job.yaml.j2 | 31 +- 18 files changed, 
897 insertions(+), 323 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 59357ce89..d5cad51c0 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -47,7 +47,7 @@ commands: _150 TiB / 3 * 2 = 100TiB_ would be a safe choice. dest: max_prov type: str - required: true + required: false - name: "--nodes-per-socket" help: "number of each node to be added per each socket." dest: nodes_per_socket @@ -60,6 +60,16 @@ commands: dest: sockets_to_use type: str default: "0" + - name: "--cores-percentage" + help: "The percentage of cores to be used for spdk (0-99)" + description: > + The percentage of cores to be used for spdk (0-99) + dest: cores_percentage + type: + range: + min: 0 + max: 99 + default: 0 - name: "--pci-allowed" help: "Comma separated list of PCI addresses of Nvme devices to use for storage devices." description: > @@ -76,6 +86,27 @@ commands: required: false type: str default: "" + - name: "--device-model" + help: "NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together" + description: > + NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together + dest: device_model + required: false + type: str + default: "" + - name: "--size-range" + help: "NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together" + description: > + NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together + dest: size_range + required: false + type: str + default: "" + - name: "--force" + help: "Force format detected or passed nvme pci address to 4K and clean partitions" + dest: force + type: bool + action: store_true - name: 
configure-upgrade help: "Upgrade the automated configuration file with new changes of cpu mask or storage devices" description: > @@ -86,6 +117,19 @@ commands: description: > Run locally on storage nodes and control plane hosts. Remove a previous deployment to support a fresh scratch-deployment of cluster software. + + - name: clean-devices + help: "clean devices stored in /etc/simplyblock/sn_config_file (local run)" + description: > + Run locally on storage nodes to clean nvme devices and free them. + arguments: + - name: "--config-path" + help: "Config path to read stored nvme devices from" + dest: config_path + required: false + type: str + default: "/etc/simplyblock/sn_config_file" + - name: add-node help: "Adds a storage node by its IP address" arguments: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index e70f72339..d55b8317f 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -36,6 +36,7 @@ def init_storage_node(self): self.init_storage_node__configure(subparser) self.init_storage_node__configure_upgrade(subparser) self.init_storage_node__deploy_cleaner(subparser) + self.init_storage_node__clean_devices(subparser) self.init_storage_node__add_node(subparser) self.init_storage_node__delete(subparser) self.init_storage_node__remove(subparser) @@ -87,11 +88,15 @@ def init_storage_node__deploy(self, subparser): def init_storage_node__configure(self, subparser): subcommand = self.add_sub_command(subparser, 'configure', 'Prepare a configuration file to be used when adding the storage node') argument = subcommand.add_argument('--max-lvol', help='Max logical volume per storage node', type=int, dest='max_lvol', required=True) - argument = subcommand.add_argument('--max-size', help='Maximum amount of GB to be utilized on this storage node', type=str, dest='max_prov', required=True) + argument = subcommand.add_argument('--max-size', help='Maximum amount of GB to be utilized on this storage node', type=str, dest='max_prov', required=False) 
argument = subcommand.add_argument('--nodes-per-socket', help='number of each node to be added per each socket.', type=int, default=1, dest='nodes_per_socket') argument = subcommand.add_argument('--sockets-to-use', help='The system socket to use when adding the storage nodes', type=str, default='0', dest='sockets_to_use') + argument = subcommand.add_argument('--cores-percentage', help='The percentage of cores to be used for spdk (0-99)', type=range_type(0, 99), default=0, dest='cores_percentage') argument = subcommand.add_argument('--pci-allowed', help='Comma separated list of PCI addresses of Nvme devices to use for storage devices.', type=str, default='', dest='pci_allowed', required=False) argument = subcommand.add_argument('--pci-blocked', help='Comma separated list of PCI addresses of Nvme devices to not use for storage devices', type=str, default='', dest='pci_blocked', required=False) + argument = subcommand.add_argument('--device-model', help='NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together', type=str, default='', dest='device_model', required=False) + argument = subcommand.add_argument('--size-range', help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together', type=str, default='', dest='size_range', required=False) + argument = subcommand.add_argument('--force', help='Force format detected or passed nvme pci address to 4K and clean partitions', dest='force', action='store_true') def init_storage_node__configure_upgrade(self, subparser): subcommand = self.add_sub_command(subparser, 'configure-upgrade', 'Upgrade the automated configuration file with new changes of cpu mask or storage devices') @@ -99,6 +104,10 @@ def init_storage_node__configure_upgrade(self, subparser): def init_storage_node__deploy_cleaner(self, subparser): subcommand = 
self.add_sub_command(subparser, 'deploy-cleaner', 'Cleans a previous simplyblock deploy (local run)') + def init_storage_node__clean_devices(self, subparser): + subcommand = self.add_sub_command(subparser, 'clean-devices', 'clean devices stored in /etc/simplyblock/sn_config_file (local run)') + argument = subcommand.add_argument('--config-path', help='Config path to read stored nvme devices from', type=str, default='/etc/simplyblock/sn_config_file', dest='config_path', required=False) + def init_storage_node__add_node(self, subparser): subcommand = self.add_sub_command(subparser, 'add-node', 'Adds a storage node by its IP address') subcommand.add_argument('cluster_id', help='Cluster id', type=str) @@ -809,6 +818,8 @@ def run(self): ret = self.storage_node__configure_upgrade(sub_command, args) elif sub_command in ['deploy-cleaner']: ret = self.storage_node__deploy_cleaner(sub_command, args) + elif sub_command in ['clean-devices']: + ret = self.storage_node__clean_devices(sub_command, args) elif sub_command in ['add-node']: if not self.developer_mode: args.jm_percent = 3 diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 834dd2bab..5e76e89fd 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -87,8 +87,7 @@ def storage_node__configure_upgrade(self, sub_command, args): def storage_node__configure(self, sub_command, args): if not args.max_lvol: self.parser.error(f"Mandatory argument '--max-lvol' not provided for {sub_command}") - if not args.max_prov: - self.parser.error(f"Mandatory argument '--max-size' not provided for {sub_command}") + max_size = getattr(args, "max_prov") or 0 sockets_to_use = [0] if args.sockets_to_use: try: @@ -101,21 +100,38 @@ def storage_node__configure(self, sub_command, args): self.parser.error(f"nodes_per_socket {args.nodes_per_socket}must be either 1 or 2") if args.pci_allowed and args.pci_blocked: self.parser.error("pci-allowed and pci-blocked cannot be both specified") - max_prov = 
utils.parse_size(args.max_prov, assume_unit='G') + max_prov = utils.parse_size(max_size, assume_unit='G') pci_allowed = [] pci_blocked = [] if args.pci_allowed: pci_allowed = [str(x) for x in args.pci_allowed.split(',')] if args.pci_blocked: pci_blocked = [str(x) for x in args.pci_blocked.split(',')] - - return storage_ops.generate_automated_deployment_config(args.max_lvol, max_prov, sockets_to_use, - args.nodes_per_socket, pci_allowed, pci_blocked) + if (args.device_model and not args.size_range) or (not args.device_model and args.size_range): + self.parser.error("device_model and size_range must be set together") + use_pci_allowed = bool(args.pci_allowed) + use_pci_blocked = bool(args.pci_blocked) + use_model_range = bool(args.device_model and args.size_range) + if sum([use_pci_allowed, use_pci_blocked, use_model_range]) > 1: + self.parser.error( + "Options --pci-allowed, --pci-blocked, and " + "(--device-model with --size-range) are mutually exclusive; choose only one." + ) + cores_percentage = int(args.cores_percentage) + + return storage_ops.generate_automated_deployment_config( + args.max_lvol, max_prov, sockets_to_use,args.nodes_per_socket, + pci_allowed, pci_blocked, force=args.force, device_model=args.device_model, + size_range=args.size_range, cores_percentage=cores_percentage) def storage_node__deploy_cleaner(self, sub_command, args): storage_ops.deploy_cleaner() return True # remove once CLI changed to exceptions + def storage_node__clean_devices(self, sub_command, args): + storage_ops.clean_devices(args.config_path) + return True # remove once CLI changed to exceptions + def storage_node__add_node(self, sub_command, args): cluster_id = args.cluster_id node_addr = args.node_addr diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index f34a430a9..30b2e6563 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.27 
-SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:hotfix-to-main +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=simplyblock/spdk:hotfix-to-main-latest diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index ab7b31b09..967f041a5 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -105,6 +105,8 @@ class StorageNode(BaseNodeObject): hublvol: HubLVol = None # type: ignore[assignment] active_tcp: bool = True active_rdma: bool = False + lvol_sync_del_queue: List[str] = [] + socket: int = 0 def rpc_client(self, **kwargs): """Return rpc client to this node diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index ce48e1796..8be2c9d6a 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -886,6 +886,12 @@ def nbd_stop_disk(self, nbd_device): } return self._request("nbd_stop_disk", params) + def nbd_get_disks(self, nbd_device): + params = { + "nbd_device": nbd_device + } + return self._request("nbd_get_disks", params) + def bdev_jm_unmap_vuid(self, name, vuid): params = {"name": name, "vuid": vuid} return self._request("bdev_jm_unmap_vuid", params) diff --git a/simplyblock_core/scripts/config_docker.sh b/simplyblock_core/scripts/config_docker.sh index 9f75cdde3..590664ca7 100644 --- a/simplyblock_core/scripts/config_docker.sh +++ b/simplyblock_core/scripts/config_docker.sh @@ -38,7 +38,7 @@ create_override ${DEV_IP} sudo systemctl daemon-reload sudo systemctl restart docker -activate-global-python-argcomplete --user +activate-global-python-argcomplete --user -y if [ ! 
-s "$HOME/.bashrc" ] || [ -z "$(grep "source $HOME/.bash_completion" $HOME/.bashrc)" ] then echo -e "\nsource $HOME/.bash_completion\n" >> $HOME/.bashrc diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index 06eeee008..e0bb05bff 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -12,7 +12,6 @@ from http.server import BaseHTTPRequestHandler -rpc_sock = '/var/tmp/spdk.sock' logger_handler = logging.StreamHandler(stream=sys.stdout) logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s')) logger = logging.getLogger() @@ -30,6 +29,7 @@ def get_env_var(name, default=None, is_required=False): return os.environ.get(name, default) +unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): req_data = json.loads(req.decode('ascii')) params = "" @@ -72,6 +72,7 @@ def rpc_call(req): class ServerHandler(BaseHTTPRequestHandler): + server_session: list[int] = [] key = "" def do_HEAD(self): @@ -157,6 +158,7 @@ def run_server(host, port, user, password, is_threading_enabled=False): rpc_port = int(rpc_port) except Exception: rpc_port = 8080 +rpc_sock = f"/mnt/ramdisk/spdk_{rpc_port}/spdk.sock" is_threading_enabled = bool(is_threading_enabled) run_server(server_ip, rpc_port, rpc_username, rpc_password, is_threading_enabled=is_threading_enabled) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 6f1bee0db..23504ab0b 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -81,7 +81,8 @@ def info(self): def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None, fdb_connection=None, namespace=None, server_ip=None, rpc_port=None, rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None, - total_mem=None, system_mem=None, cluster_mode=None, 
cluster_id=None): + total_mem=None, system_mem=None, cluster_mode=None, socket=0, cluster_id=None): + params = { "cluster_ip": cluster_ip, "server_ip": server_ip, @@ -113,6 +114,8 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None params["system_mem"] = system_mem if cluster_mode: params["cluster_mode"] = cluster_mode + params["socket"] = socket + if cluster_id: params["cluster_id"] = cluster_id return self._request("POST", "spdk_process_start", params) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 1e14d7a80..a7d7c9d74 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -1,7 +1,6 @@ # coding=utf- 8 import datetime import json -import os import platform import socket @@ -37,6 +36,7 @@ from simplyblock_web import node_utils from simplyblock_core.utils import addNvmeDevices from simplyblock_core.utils import pull_docker_image_with_retry +import os logger = utils.get_logger(__name__) @@ -79,22 +79,22 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names: bdev_name = None - db_ctrl=DBController() - node=db_ctrl.get_storage_node_by_id(device.node_id) + db_ctrl = DBController() + node = db_ctrl.get_storage_node_by_id(device.node_id) if node.active_rdma: - tr_type="RDMA" + tr_type = "RDMA" else: if node.active_tcp: - tr_type="TCP" + tr_type = "TCP" else: - msg="target node to connect has no active fabric." + msg = "target node to connect has no active fabric." 
logger.error(msg) raise RuntimeError(msg) for ip in device.nvmf_ip.split(","): ret = rpc_client.bdev_nvme_attach_controller( - name, device.nvmf_nqn, ip, device.nvmf_port,tr_type, - multipath=device.nvmf_multipath) + name, device.nvmf_nqn, ip, device.nvmf_port, tr_type, + multipath=device.nvmf_multipath) if not bdev_name and ret and isinstance(ret, list): bdev_name = ret[0] @@ -225,9 +225,9 @@ def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): return False for iface in snode.data_nics: - logger.info(f"adding {iface.trtype} listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address)) - ret = rpc_client.listeners_create(subsystem_nqn, iface.trtype, iface.ip4_address, snode.nvmf_port) - ip_list.append(iface.ip4_address) + logger.info(f"adding {iface.trtype} listener for %s on IP %s" % (subsystem_nqn, iface.ip4_address)) + ret = rpc_client.listeners_create(subsystem_nqn, iface.trtype, iface.ip4_address, snode.nvmf_port) + ip_list.append(iface.ip4_address) if len(ip_list) > 1: IP = ",".join(ip_list) @@ -432,9 +432,15 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j return False time.sleep(3) rpc_client.nbd_stop_disk(nbd_device) - time.sleep(1) + for i in range(10): + if not rpc_client.nbd_get_disks(nbd_device): + break + time.sleep(1) rpc_client.bdev_nvme_detach_controller(nvme.nvme_controller) - time.sleep(1) + for i in range(10): + if not rpc_client.bdev_nvme_controller_list(nvme.nvme_controller): + break + time.sleep(1) try: rpc_client.bdev_nvme_controller_attach(nvme.nvme_controller, nvme.pcie_address) except RPCException as e: @@ -471,7 +477,7 @@ def _prepare_cluster_devices_partitions(snode, devices): t = threading.Thread( target=_create_device_partitions, args=(snode.rpc_client(), nvme, snode, snode.num_partitions_per_dev, - snode.jm_percent, snode.partition_size, index+1,)) + snode.jm_percent, snode.partition_size, index + 1,)) thread_list.append(t) t.start() @@ -802,8 +808,9 @@ def 
_connect_to_remote_jm_devs(this_node, jm_ids=None): try: org_dev.remote_bdev = connect_device( - f"remote_{org_dev.jm_bdev}", org_dev, this_node, - bdev_names=node_bdev_names, reattach=True, + f"remote_{org_dev.jm_bdev}", org_dev, this_node, + bdev_names=node_bdev_names, reattach=True, + ) except RuntimeError: logger.error(f'Failed to connect to {org_dev.get_id()}') @@ -811,6 +818,7 @@ def _connect_to_remote_jm_devs(this_node, jm_ids=None): return new_devs + def ifc_is_tcp(nic): addrs = psutil.net_if_addrs().get(nic, []) for addr in addrs: @@ -818,6 +826,7 @@ def ifc_is_tcp(nic): return True return False + def ifc_is_roce(nic): rdma_path = "/sys/class/infiniband/" if not os.path.exists(rdma_path): @@ -831,7 +840,8 @@ def ifc_is_roce(nic): return True return False -def add_node(cluster_id, node_addr, iface_name,data_nics_list, + +def add_node(cluster_id, node_addr, iface_name, data_nics_list, max_snap, spdk_image=None, spdk_debug=False, small_bufsize=0, large_bufsize=0, num_partitions_per_dev=0, jm_percent=0, enable_test_device=False, @@ -921,9 +931,10 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, jm_cpu_mask = utils.generate_mask(jm_cpu_core) # Calculate pool count - max_prov = int(utils.parse_size(node_config.get("max_size"))) - - if max_prov <= 0: + max_prov = 0 + if node_config.get("max_size"): + max_prov = int(utils.parse_size(node_config.get("max_size"))) + if max_prov < 0: logger.error(f"Incorrect max-prov value {max_prov}") return False @@ -935,6 +946,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, minimum_hp_memory = node_config.get("huge_page_memory") + minimum_hp_memory = max(minimum_hp_memory, max_prov) + # check for memory if "memory_details" in node_info and node_info['memory_details']: memory_details = node_info['memory_details'] @@ -943,7 +956,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.info(f"Free: {utils.humanbytes(memory_details['free'])}") logger.info(f"huge_total: 
{utils.humanbytes(memory_details['huge_total'])}") logger.info(f"huge_free: {utils.humanbytes(memory_details['huge_free'])}") - logger.info(f"Minimum required huge pages memory is : {utils.humanbytes(minimum_hp_memory)}") + logger.info(f"Set huge pages memory is : {utils.humanbytes(minimum_hp_memory)}") else: logger.error("Cannot get memory info from the instance.. Exiting") return False @@ -951,14 +964,15 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, # Calculate minimum sys memory minimum_sys_memory = node_config.get("sys_memory") - satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, - minimum_sys_memory, - int(memory_details['free']), - int(memory_details['huge_total'])) + # satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, + # minimum_sys_memory, + # int(memory_details['free']), + # int(memory_details['huge_total'])) max_lvol = node_config.get("max_lvol") - if not satisfied: - logger.warning( - f"Not enough memory for the provided max_lvo: {max_lvol}, max_prov: {max_prov}..") + + # if not satisfied: + # logger.warning( + # f"Not enough memory for the provided max_lvo: {max_lvol}, max_prov: {max_prov}..") ssd_pcie = node_config.get("ssd_pcis") if ssd_pcie: @@ -1005,12 +1019,13 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, if log_config_type and log_config_type != LogConfig.types.GELF: logger.info("SNodeAPI container found but not configured with gelf logger") start_storage_node_api_container(mgmt_ip, cluster_ip) + node_socket = node_config.get("socket") total_mem = minimum_hp_memory for n in db_controller.get_storage_nodes_by_cluster_id(cluster_id): - if n.api_endpoint == node_addr: - total_mem += n.spdk_mem - total_mem += utils.parse_size("500m") + if n.api_endpoint == node_addr and n.socket == node_socket: + total_mem += (n.spdk_mem + 500000000) + logger.info("Deploying SPDK") results = None l_cores = node_config.get("l-cores") @@ -1023,7 +1038,9 @@ def add_node(cluster_id, node_addr, 
iface_name,data_nics_list, namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=cluster_id) + ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, + socket=node_socket, cluster_id=cluster_id) + time.sleep(5) except Exception as e: @@ -1036,8 +1053,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, data_nics = [] - active_tcp=False - active_rdma=False + active_tcp = False + active_rdma = False fabric_tcp = cluster.fabric_tcp fabric_rdma = cluster.fabric_rdma names = data_nics_list or [mgmt_iface] @@ -1046,17 +1063,17 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.debug(f"Data nics ports are: {names}") for nic in names: device = node_info['network_interface'][nic] - base_ifc_cfg={ - 'uuid': str(uuid.uuid4()), - 'if_name': nic, - 'ip4_address': device['ip'], - 'status': device['status'], - 'net_type': device['net_type'],} + base_ifc_cfg = { + 'uuid': str(uuid.uuid4()), + 'if_name': nic, + 'ip4_address': device['ip'], + 'status': device['status'], + 'net_type': device['net_type'], } if fabric_rdma and snode_api.ifc_is_roce(nic): cfg = base_ifc_cfg.copy() cfg['trtype'] = "RDMA" data_nics.append(IFace(cfg)) - active_rdma=True + active_rdma = True if fabric_tcp and snode_api.ifc_is_tcp(nic): active_tcp = True elif fabric_tcp and snode_api.ifc_is_tcp(nic): @@ -1108,8 +1125,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.enable_ha_jm = enable_ha_jm snode.ha_jm_count = ha_jm_count snode.minimum_sys_memory = minimum_sys_memory - snode.active_tcp=active_tcp - snode.active_rdma=active_rdma + snode.active_tcp = active_tcp + snode.active_rdma = active_rdma if 'cpu_hz' in node_info: snode.cpu_hz = node_info['cpu_hz'] @@ -1141,6 +1158,8 @@ def 
add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.nvmf_port = utils.get_next_dev_port(cluster_id) snode.poller_cpu_cores = poller_cpu_cores or [] + snode.socket = node_socket + snode.iobuf_small_pool_count = small_pool_count or 0 snode.iobuf_large_pool_count = large_pool_count or 0 snode.iobuf_small_bufsize = small_bufsize or 0 @@ -1151,7 +1170,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode.physical_label = 0 else: snode.physical_label = get_next_physical_device_order(snode) - + snode.num_partitions_per_dev = num_partitions_per_dev snode.jm_percent = jm_percent snode.id_device_by_nqn = id_device_by_nqn @@ -1231,15 +1250,15 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, return False if cluster.fabric_tcp: - ret = rpc_client.transport_create("TCP", qpair,512*(req_cpu_count+1)) - if not ret: - logger.error(f"Failed to create transport TCP with qpair: {qpair}") - return False + ret = rpc_client.transport_create("TCP", qpair, 512 * (req_cpu_count + 1)) + if not ret: + logger.error(f"Failed to create transport TCP with qpair: {qpair}") + return False if cluster.fabric_rdma: - ret = rpc_client.transport_create("RDMA", qpair,512*(req_cpu_count+1)) - if not ret: - logger.error(f"Failed to create transport RDMA with qpair: {qpair}") - return False + ret = rpc_client.transport_create("RDMA", qpair, 512 * (req_cpu_count + 1)) + if not ret: + logger.error(f"Failed to create transport RDMA with qpair: {qpair}") + return False # 7- set jc singleton mask if snode.jc_singleton_mask: @@ -1289,8 +1308,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, logger.info("Setting Alcemls QOS weights") ret = rpc_client.alceml_set_qos_weights(qos_controller.get_qos_weights_list(cluster_id)) if not ret: - logger.error("Failed to set Alcemls QOS") - return False + logger.error("Failed to set Alcemls QOS") + return False logger.info("Connecting to remote devices") remote_devices = _connect_to_remote_devs(snode) @@ 
-1304,7 +1323,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, snode = db_controller.get_storage_node_by_id(snode.get_id()) old_status = snode.status - snode.status = StorageNode.STATUS_ONLINE + snode.status = StorageNode.STATUS_ONLINE snode.updated_at = str(datetime.datetime.now(datetime.timezone.utc)) snode.online_since = str(datetime.datetime.now(datetime.timezone.utc)) snode.write_to_db(db_controller.kv_store) @@ -1324,7 +1343,8 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list, return False node.write_to_db(kv_store) - if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY, Cluster.STATUS_IN_EXPANSION]: + if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY, + Cluster.STATUS_IN_EXPANSION]: logger.warning( f"The cluster status is not active ({cluster.status}), adding the node without distribs and lvstore") continue @@ -1491,7 +1511,6 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False): pci_address.append(dev.pcie_address) except Exception as e: logger.exception(e) - return False set_node_status(node_id, StorageNode.STATUS_REMOVED) @@ -1592,23 +1611,22 @@ def restart_storage_node( snode_api.bind_device_to_spdk(dev['address']) else: node_ip = None - active_tcp=False - active_rdma=False + active_tcp = False + active_rdma = False fabric_tcp = cluster.fabric_tcp fabric_rdma = cluster.fabric_rdma snode_api = SNodeClient(snode.api_endpoint, timeout=5 * 60, retry=3) for nic in snode.data_nics: if fabric_rdma and snode_api.ifc_is_roce(nic["if_name"]): nic.trtype = "RDMA" - active_rdma=True + active_rdma = True if fabric_tcp and snode_api.ifc_is_tcp(nic["if_name"]): active_tcp = True elif fabric_tcp and snode_api.ifc_is_tcp(nic["if_name"]): nic.trtype = "TCP" active_tcp = True - snode.active_tcp=active_tcp - snode.active_rdma=active_rdma - + snode.active_tcp = active_tcp + snode.active_rdma = active_rdma 
logger.info(f"Restarting Storage node: {snode.mgmt_ip}") node_info, _ = snode_api.info() @@ -1629,28 +1647,28 @@ def restart_storage_node( snode.l_cores = node['l-cores'] break - if max_prov: - if not isinstance(max_prov, int): - try: - max_prov = int(max_prov) - max_prov = f"{max_prov}g" - max_prov = int(utils.parse_size(max_prov)) - except Exception: - logger.error(f"Invalid max_prov value: {max_prov}") - return False + if max_prov > 0: + try: + max_prov = int(utils.parse_size(max_prov)) + snode.max_prov = max_prov + except Exception as e: + logger.debug(e) + logger.error(f"Invalid max_prov value: {max_prov}") + return False + else: + max_prov = snode.max_prov - snode.max_prov = max_prov - if snode.max_prov <= 0: - logger.error(f"Incorrect max-prov value {snode.max_prov}") - return False if spdk_image: snode.spdk_image = spdk_image # Calculate minimum huge page memory - minimum_hp_memory = utils.calculate_minimum_hp_memory(snode.iobuf_small_pool_count, snode.iobuf_large_pool_count, snode.max_lvol, - snode.max_prov, + minimum_hp_memory = utils.calculate_minimum_hp_memory(snode.iobuf_small_pool_count, snode.iobuf_large_pool_count, + snode.max_lvol, + max_prov, len(utils.hexa_to_cpu_list(snode.spdk_cpu_mask))) + minimum_hp_memory = max(minimum_hp_memory, max_prov) + # check for memory if "memory_details" in node_info and node_info['memory_details']: memory_details = node_info['memory_details'] @@ -1663,17 +1681,18 @@ def restart_storage_node( return False # Calculate minimum sys memory - #minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov, memory_details['total']) - minimum_sys_memory = snode.minimum_sys_memory - satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, - minimum_sys_memory, - int(memory_details['free']), - int(memory_details['huge_total'])) - if not satisfied: - logger.error( - f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: {utils.humanbytes(snode.max_prov)}.. 
Exiting") - - snode.spdk_mem = spdk_mem + # minimum_sys_memory = utils.calculate_minimum_sys_memory(snode.max_prov, memory_details['total']) + # minimum_sys_memory = snode.minimum_sys_memory + # satisfied, spdk_mem = utils.calculate_spdk_memory(minimum_hp_memory, + # minimum_sys_memory, + # int(memory_details['free']), + # int(memory_details['huge_total'])) + # if not satisfied: + # logger.error( + # f"Not enough memory for the provided max_lvo: {snode.max_lvol}, max_snap: {snode.max_snap}, max_prov: {utils.humanbytes(snode.max_prov)}.. Exiting") + minimum_sys_memory = snode.minimum_sys_memory or 0 + snode.spdk_mem = minimum_hp_memory + spdk_debug = snode.spdk_debug if set_spdk_debug: spdk_debug = True @@ -1686,13 +1705,12 @@ def restart_storage_node( cluster_ip = cluster_docker.info()["Swarm"]["NodeAddr"] else: - cluster_ip = utils.get_k8s_node_ip() + cluster_ip = utils.get_k8s_node_ip() - total_mem = 0 + total_mem = minimum_hp_memory for n in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id): - if n.api_endpoint == snode.api_endpoint: - total_mem += n.spdk_mem - total_mem+= utils.parse_size("500m") + if n.api_endpoint == snode.api_endpoint and n.socket == snode.socket and n.uuid != snode.uuid: + total_mem += (n.spdk_mem + 500000000) results = None try: @@ -1703,7 +1721,9 @@ def restart_storage_node( snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection, snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, - ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=snode.cluster_id) + ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, + socket=snode.socket, + cluster_id=snode.cluster_id) except Exception as e: logger.error(e) @@ -1788,12 +1808,12 @@ def 
restart_storage_node( qpair = cluster.qpair_count req_cpu_count = len(utils.hexa_to_cpu_list(snode.spdk_cpu_mask)) if cluster.fabric_tcp: - ret = rpc_client.transport_create("TCP", qpair, 512*(req_cpu_count+1)) + ret = rpc_client.transport_create("TCP", qpair, 512 * (req_cpu_count + 1)) if not ret: logger.error(f"Failed to create transport TCP with qpair: {qpair}") return False if cluster.fabric_rdma: - ret = rpc_client.transport_create("RDMA", qpair, 512*(req_cpu_count+1)) + ret = rpc_client.transport_create("RDMA", qpair, 512 * (req_cpu_count + 1)) if not ret: logger.error(f"Failed to create transport RDMA with qpair: {qpair}") return False @@ -1826,10 +1846,11 @@ def restart_storage_node( active_devices = [] removed_devices = [] known_devices_sn = [] - devices_sn_dict = {d.serial_number:d for d in nvme_devs} + devices_sn_dict = {d.serial_number: d for d in nvme_devs} for db_dev in snode.nvme_devices: known_devices_sn.append(db_dev.serial_number) - if db_dev.status in [NVMeDevice.STATUS_FAILED_AND_MIGRATED, NVMeDevice.STATUS_FAILED, NVMeDevice.STATUS_REMOVED]: + if db_dev.status in [NVMeDevice.STATUS_FAILED_AND_MIGRATED, NVMeDevice.STATUS_FAILED, + NVMeDevice.STATUS_REMOVED]: removed_devices.append(db_dev) continue if db_dev.serial_number in devices_sn_dict.keys(): @@ -1838,7 +1859,7 @@ def restart_storage_node( if not db_dev.is_partition and not found_dev.is_partition: db_dev.device_name = found_dev.device_name db_dev.nvme_bdev = found_dev.nvme_bdev - db_dev.nvme_controller =found_dev.nvme_controller + db_dev.nvme_controller = found_dev.nvme_controller db_dev.pcie_address = found_dev.pcie_address # if db_dev.status in [ NVMeDevice.STATUS_ONLINE]: @@ -1868,6 +1889,7 @@ def restart_storage_node( snode.write_to_db(db_controller.kv_store) if node_ip: + # prepare devices on new node if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0: @@ -1913,7 +1935,6 @@ def restart_storage_node( snode.lvstore_status = "" snode.write_to_db(db_controller.kv_store) - snode 
= db_controller.get_storage_node_by_id(snode.get_id()) for db_dev in snode.nvme_devices: if db_dev.status in [NVMeDevice.STATUS_UNAVAILABLE, NVMeDevice.STATUS_ONLINE, @@ -2007,7 +2028,6 @@ def restart_storage_node( return False node.write_to_db(kv_store) - logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) for db_dev in snode.nvme_devices: @@ -2030,11 +2050,11 @@ def restart_storage_node( pools = db_controller.get_pools() for pool in pools: ret = rpc_client.bdev_lvol_set_qos_limit(pool.numeric_id, - pool.max_rw_ios_per_sec, - pool.max_rw_mbytes_per_sec, - pool.max_r_mbytes_per_sec, - pool.max_w_mbytes_per_sec, - ) + pool.max_rw_ios_per_sec, + pool.max_rw_mbytes_per_sec, + pool.max_r_mbytes_per_sec, + pool.max_w_mbytes_per_sec, + ) if not ret: logger.error("RPC failed bdev_lvol_set_qos_limit") return False @@ -2369,7 +2389,7 @@ def suspend_storage_node(node_id, force=False): if snode.lvstore_stack_secondary_1: nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: - for node in nodes: + for node in nodes: try: fw_api.firewall_set_port( node.hublvol.nvmf_port, port_type, "block", snode.rpc_port, is_reject=True) @@ -2396,7 +2416,6 @@ def suspend_storage_node(node_id, force=False): rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) time.sleep(1) - logger.info("Done") return True @@ -2448,7 +2467,7 @@ def resume_storage_node(node_id): port_type = "udp" nodes = db_controller.get_primary_storage_nodes_by_secondary_node_id(node_id) if nodes: - for node in nodes: + for node in nodes: try: fw_api.firewall_set_port( node.lvol_subsys_port, port_type, "allow", snode.rpc_port) @@ -2664,8 +2683,8 @@ def upgrade_automated_deployment_config(): return False -def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage=0): - +def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, 
nodes_per_socket, pci_allowed, pci_blocked, + cores_percentage=0, force=False, device_model="", size_range=""): # we need minimum of 6 VPCs. RAM 4GB min. Plus 0.2% of the storage. total_cores = os.cpu_count() or 0 if total_cores < 6: @@ -2676,7 +2695,8 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod utils.load_kernel_module("uio_pci_generic") nodes_config, system_info = utils.generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, - pci_allowed, pci_blocked, cores_percentage) + pci_allowed, pci_blocked, cores_percentage, force=force, + device_model=device_model, size_range=size_range) if not nodes_config or not nodes_config.get("nodes"): return False utils.store_config_file(nodes_config, constants.NODES_CONFIG_FILE, create_read_only_file=True) @@ -2688,11 +2708,13 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod for node_config in nodes_config["nodes"]: numa = node_config["socket"] huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + node_config["huge_page_memory"] - for numa, huge_page_memory in huge_page_memory_dict.items(): - num_pages = huge_page_memory // (2048 * 1024) - utils.set_hugepages_if_needed(numa, num_pages) + utils.create_rpc_socket_mount() + # for numa, huge_page_memory in huge_page_memory_dict.items(): + # num_pages = huge_page_memory // (2048 * 1024) + # utils.set_hugepages_if_needed(numa, num_pages) return True + def deploy(ifname, isolate_cores=False): if not ifname: ifname = "eth0" @@ -2716,7 +2738,8 @@ def deploy(ifname, isolate_cores=False): logger.info("Config Validated successfully.") logger.info("NVMe SSD devices found on node:") - stream = os.popen(f"lspci -Dnn | grep -i '\\[{LINUX_DRV_MASS_STORAGE_ID:02}{LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID:02}\\]'") + stream = os.popen( + f"lspci -Dnn | grep -i '\\[{LINUX_DRV_MASS_STORAGE_ID:02}{LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID:02}\\]'") for line in stream.readlines(): logger.info(line.strip()) @@ 
-2785,6 +2808,17 @@ def deploy_cleaner(): scripts.deploy_cleaner() +def clean_devices(config_path): + with open(config_path) as f: + cfg = json.load(f) + ssd_pcis = [ + pci + for node in cfg.get("nodes", []) + for pci in node.get("ssd_pcis", []) + ] + utils.clean_devices(ssd_pcis) + + def get_host_secret(node_id): db_controller = DBController() try: @@ -2874,12 +2908,12 @@ def health_check(node_id): # subsystem = rpc_client.subsystem_list(dev.nvmf_nqn) - # dev.testing_bdev = test_name - # dev.alceml_bdev = alceml_name - # dev.pt_bdev = pt_name - # # nvme.nvmf_nqn = subsystem_nqn - # # nvme.nvmf_ip = IP - # # nvme.nvmf_port = 4420 + # dev.testing_bdev = test_name + # dev.alceml_bdev = alceml_name + # dev.pt_bdev = pt_name + # # nvme.nvmf_nqn = subsystem_nqn + # # nvme.nvmf_ip = IP + # # nvme.nvmf_port = 4420 except Exception as e: logger.error(f"Failed to connect to node's SPDK: {e}") @@ -3003,7 +3037,6 @@ def set_node_status(node_id, status, reconnect_on_online=True): except Exception as e: logger.error("Error establishing hublvol: %s", e) - return True @@ -3051,7 +3084,6 @@ def recreate_lvstore_on_sec(secondary_node): port_type = "udp" if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_RESTARTING]: - fw_api = FirewallClient(primary_node, timeout=5, retry=2) ### 3- block primary port fw_api.firewall_set_port(primary_node.lvol_subsys_port, port_type, "block", primary_node.rpc_port) @@ -3078,7 +3110,6 @@ def recreate_lvstore_on_sec(secondary_node): logger.error("Error connecting to hublvol: %s", e) # return False - fw_api = FirewallClient(primary_node, timeout=5, retry=2) ### 8- allow port on primary fw_api.firewall_set_port(primary_node.lvol_subsys_port, port_type, "allow", primary_node.rpc_port) @@ -3197,7 +3228,8 @@ def recreate_lvstore(snode, force=False): logger.info("Inflight IO NOT found, continuing") break else: - logger.error(f"Timeout while checking for inflight IO after 10 seconds on node {snode.secondary_node_id}") + logger.error( + 
f"Timeout while checking for inflight IO after 10 seconds on node {snode.secondary_node_id}") if sec_node.status in [StorageNode.STATUS_UNREACHABLE, StorageNode.STATUS_DOWN]: logger.info(f"Secondary node is not online, forcing journal replication on node: {snode.get_id()}") @@ -3335,7 +3367,7 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): logger.info("Add BDev to subsystem") ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id) for iface in snode.data_nics: - if iface.ip4_address and lvol.fabric==iface.trtype.lower(): + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) ret = rpc_client.listeners_create( lvol.nqn, iface.trtype, iface.ip4_address, lvol.subsys_port, ana_state=lvol_ana_state) @@ -3350,9 +3382,9 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): lvol_obj.health_check = True lvol_obj.write_to_db() # set QOS - if lvol.rw_ios_per_sec or lvol.rw_mbytes_per_sec or lvol.r_mbytes_per_sec or lvol.w_mbytes_per_sec : + if lvol.rw_ios_per_sec or lvol.rw_mbytes_per_sec or lvol.r_mbytes_per_sec or lvol.w_mbytes_per_sec: lvol_controller.set_lvol(lvol.uuid, lvol.rw_ios_per_sec, lvol.rw_mbytes_per_sec, - lvol.r_mbytes_per_sec , lvol.w_mbytes_per_sec) + lvol.r_mbytes_per_sec, lvol.w_mbytes_per_sec) return True, None @@ -3389,7 +3421,7 @@ def get_sorted_ha_jms(current_node): continue mgmt_ips.append(jm_dev_to_mgmt_ip[jm_id]) out.append(jm_id) - return out[:constants.HA_JM_COUNT-1] + return out[:constants.HA_JM_COUNT - 1] def get_node_jm_names(current_node, remote_node=None): @@ -3437,8 +3469,8 @@ def get_secondary_nodes(current_node): if node.get_id() == current_node.get_id(): nod_found = True continue - elif node.status == StorageNode.STATUS_ONLINE and node.mgmt_ip != current_node.mgmt_ip : - # elif node.status == StorageNode.STATUS_ONLINE : + elif node.status == StorageNode.STATUS_ONLINE and 
node.mgmt_ip != current_node.mgmt_ip: + # elif node.status == StorageNode.STATUS_ONLINE : if node.is_secondary_node: nodes.append(node.get_id()) @@ -3657,7 +3689,7 @@ def _create_distr(snode, name, params): ret = True elif type == "bdev_lvstore" and lvstore_stack and not primary_node: - ret = rpc_client.create_lvstore(**params) + ret = rpc_client.create_lvstore(**params) elif type == "bdev_ptnonexcl": ret = rpc_client.bdev_PT_NoExcl_create(**params) diff --git a/simplyblock_core/test/test_utils.py b/simplyblock_core/test/test_utils.py index da22a73ba..37b3cb267 100644 --- a/simplyblock_core/test/test_utils.py +++ b/simplyblock_core/test/test_utils.py @@ -1,8 +1,13 @@ +import uuid from typing import ContextManager +from unittest.mock import patch import pytest -from simplyblock_core import utils +from simplyblock_core import utils, storage_node_ops +from simplyblock_core.db_controller import DBController +from simplyblock_core.models.nvme_device import JMDevice, RemoteJMDevice +from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.utils import helpers, parse_thread_siblings_list @@ -146,3 +151,51 @@ def test_parse_thread_siblings_list(input, expected): parse_thread_siblings_list(input) else: assert parse_thread_siblings_list(input) == expected + + +@patch.object(DBController, 'get_jm_device_by_id') +def test_get_node_jm_names(db_controller_get_jm_device_by_id): + + node_1_jm = JMDevice() + node_1_jm.uuid = "node_1_jm_id" + node_1_jm.jm_bdev = "node_1_jm" + + node_2_jm = JMDevice() + node_2_jm.uuid = "node_2_jm_id" + node_2_jm.jm_bdev = "node_2_jm" + + node_3_jm = JMDevice() + node_3_jm.uuid = "node_3_jm_id" + node_3_jm.jm_bdev = "node_3_jm" + + node_4_jm = JMDevice() + node_4_jm.uuid = "node_4_jm_id" + node_4_jm.jm_bdev = "node_4_jm" + + def get_jm_device_by_id(jm_id): + for jm in [node_1_jm, node_2_jm, node_3_jm, node_4_jm]: + if jm.uuid == jm_id: + return jm + + db_controller_get_jm_device_by_id.side_effect = get_jm_device_by_id + + 
node_1 = StorageNode() + node_1.uuid = str(uuid.uuid4()) + node_1.enable_ha_jm = True + node_1.ha_jm_count = 4 + node_1.jm_device = node_1_jm + node_1.jm_ids = ["node_2_jm_id", "node_3_jm_id", "node_4_jm_id"] + + remote_node = StorageNode() + remote_node.uuid = str(uuid.uuid4()) + remote_node.enable_ha_jm = True + remote_node.jm_ids = [] + remote_node.jm_device = node_2_jm + remote_node.remote_jm_devices = [ + RemoteJMDevice({"uuid": node_1_jm.uuid, "remote_bdev": f"rem_{node_1_jm.jm_bdev}"}), + RemoteJMDevice({"uuid": node_3_jm.uuid, "remote_bdev": f"rem_{node_3_jm.jm_bdev}"}), + RemoteJMDevice({"uuid": node_4_jm.uuid, "remote_bdev": f"rem_{node_4_jm.jm_bdev}"})] + + jm_names = storage_node_ops.get_node_jm_names(node_1, remote_node=remote_node) + print(f"jm_names: {len(jm_names)}", jm_names) + diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 6711dc857..6add65d62 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -10,9 +10,9 @@ import sys import uuid import time -from typing import Union, Any, Optional, Tuple, Dict from datetime import datetime, timezone from docker import DockerClient +from typing import Union, Any, Optional, Tuple, List, Dict from kubernetes import client, config from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \ V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \ @@ -149,7 +149,7 @@ def print_table(data: list, title=None): } -def humanbytes(size: int, mode: str = 'iec') -> str: # show size using 1024 base +def humanbytes(size: int, mode: str = 'iec') -> str: # show size using 1024 base """Return the given bytes as a human friendly including the appropriate unit.""" if not size or size < 0: return '0 B' @@ -443,7 +443,7 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - assigned["alceml_worker_cpu_cores"] = 
vcpu + # assigned["alceml_worker_cpu_cores"] = vcpu vcpu = reserve_n(1) assigned["alceml_cpu_cores"] = vcpu elif (len(vcpu_list) < 22): @@ -451,8 +451,8 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpu vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - vcpus = reserve_n(1) - assigned["alceml_worker_cpu_cores"] = vcpus + # vcpus = reserve_n(1) + # assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(2) assigned["alceml_cpu_cores"] = vcpus else: @@ -460,20 +460,33 @@ def reserve_n(count): assigned["jm_cpu_core"] = vcpus vcpu = reserve_n(1) assigned["jc_singleton_core"] = vcpu - vcpus = reserve_n(int(alceml_count / 3) + ((alceml_count % 3) > 0)) - assigned["alceml_worker_cpu_cores"] = vcpus + # vcpus = reserve_n(int(alceml_count / 3) + ((alceml_count % 3) > 0)) + # assigned["alceml_worker_cpu_cores"] = vcpus vcpus = reserve_n(alceml_count) assigned["alceml_cpu_cores"] = vcpus dp = int(len(remaining) / 2) - vcpus = reserve_n(dp) - assigned["distrib_cpu_cores"] = vcpus - vcpus = reserve_n(dp) - assigned["poller_cpu_cores"] = vcpus + if 17 > dp >= 12: + poller_n = len(remaining) - 12 + vcpus = reserve_n(12) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(poller_n) + assigned["poller_cpu_cores"] = vcpus + elif dp >= 17: + poller_n = len(remaining) - 24 + vcpus = reserve_n(24) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(poller_n) + assigned["poller_cpu_cores"] = vcpus + else: + vcpus = reserve_n(dp) + assigned["distrib_cpu_cores"] = vcpus + vcpus = reserve_n(dp) + assigned["poller_cpu_cores"] = vcpus if len(remaining) > 0: if len(assigned["poller_cpu_cores"]) == 0: assigned["distrib_cpu_cores"] = assigned["poller_cpu_cores"] = reserve_n(1) else: - assigned["distrib_cpu_cores"] = assigned["distrib_cpu_cores"] + reserve_n(1) + assigned["poller_cpu_cores"] = assigned["poller_cpu_cores"] + reserve_n(1) # Return the individual threads as separate values return ( assigned.get("app_thread_core", []), @@ -532,7 +545,8 @@ def 
calculate_pool_count(alceml_count, number_of_distribs, cpu_count, poller_cou poller_number = poller_count if poller_count else cpu_count small_pool_count = 384 * (alceml_count + number_of_distribs + 3 + poller_count) + ( - 6 + alceml_count + number_of_distribs) * 256 + poller_number * 127 + 384 + 128 * poller_number + constants.EXTRA_SMALL_POOL_COUNT + + 6 + alceml_count + number_of_distribs) * + poller_number * 127 + 384 + 128 * poller_number + constants.EXTRA_SMALL_POOL_COUNT large_pool_count = 48 * (alceml_count + number_of_distribs + 3 + poller_count) + ( 6 + alceml_count + number_of_distribs) * 32 + poller_number * 15 + 384 + 16 * poller_number + constants.EXTRA_LARGE_POOL_COUNT @@ -547,9 +561,9 @@ def calculate_minimum_hp_memory(small_pool_count, large_pool_count, lvol_count, extra buffer 2GB return: minimum_hp_memory in bytes ''' - pool_consumption = (small_pool_count * 8 + large_pool_count * 128) / 1024 + 1092 - memory_consumption = (4 * cpu_count + 1.0277 * pool_consumption + 25 * lvol_count) * (1024 * 1024) + ( - 250 * 1024 * 1024) * 1.1 * convert_size(max_prov, 'TiB') + constants.EXTRA_HUGE_PAGE_MEMORY + pool_consumption = (small_pool_count * 8 + large_pool_count * 128) / 1024 + memory_consumption = (4 * cpu_count + 1.1 * pool_consumption + 22 * lvol_count) * ( + 1024 * 1024) + constants.EXTRA_HUGE_PAGE_MEMORY return int(1.2 * memory_consumption) @@ -708,6 +722,7 @@ def get_total_cpu_cores(mapping: str) -> int: items = [pair for pair in mapping.split(",") if "@" in pair] return len(items) + def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> int: """Convert the given number of bytes to target unit @@ -721,13 +736,15 @@ def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> in raw = size / (base ** exponent) return math.ceil(raw) if round_up else int(raw) + def first_six_chars(s: str) -> str: """ Returns the first six characters of a given string. 
If the string is shorter than six characters, returns the entire string. """ return s[:6] - + + def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: @@ -1233,7 +1250,7 @@ def get_nvme_pci_devices(): return [], [] -def detect_nvmes(pci_allowed, pci_blocked): +def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range): pci_addresses, blocked_devices = get_nvme_pci_devices() ssd_pci_set = set(pci_addresses) @@ -1251,6 +1268,13 @@ def detect_nvmes(pci_allowed, pci_blocked): return [] pci_addresses = list(user_pci_set) + for pci in pci_addresses: + pci_utils.ensure_driver(pci, 'nvme', override=True) + logger.debug(f"Found nvme devices are {pci_addresses}") + elif device_model and size_range: + pci_addresses = query_nvme_ssd_by_model_and_size(device_model, size_range) + logger.debug(f"Found nvme devices are {pci_addresses}") + pci_allowed = pci_addresses elif pci_blocked: user_pci_set = set( addr if len(addr.split(":")[0]) == 4 else f"0000:{addr}" @@ -1261,19 +1285,14 @@ def detect_nvmes(pci_allowed, pci_blocked): for pci in pci_addresses: pci_utils.ensure_driver(pci, 'nvme') - nvme_base_path = '/sys/class/nvme/' nvme_devices = [dev for dev in os.listdir(nvme_base_path) if dev.startswith('nvme')] nvmes = {} for dev in nvme_devices: - dev_name = os.path.basename(dev) - pattern = re.compile(rf"^{re.escape(dev_name)}n\d+$") - if any(pattern.match(block_device) for block_device in blocked_devices): - logger.debug(f"device {dev_name} is busy.. 
skipping") - continue - device_symlink = os.path.join(nvme_base_path, dev) try: - pci_address = "unknown" + dev_name = os.path.basename(dev) + pattern = re.compile(rf"^{re.escape(dev_name)}n\d+$") + device_symlink = os.path.join(nvme_base_path, dev) # Resolve the real path to get the actual device path real_path = os.path.realpath(device_symlink) @@ -1282,12 +1301,15 @@ def detect_nvmes(pci_allowed, pci_blocked): address_file = os.path.join(real_path, 'address') with open(address_file, 'r') as f: pci_address = f.read().strip() - + if any(pattern.match(block_device) for block_device in blocked_devices): + if pci_address not in pci_allowed: + logger.debug(f"device {dev_name} is busy.. skipping") + continue + logger.warning(f"PCI {pci_address} passed as allowed PCI, even it has partitions.. Formatting it now") # Read the NUMA node information numa_node_file = os.path.join(real_path, 'numa_node') with open(numa_node_file, 'r') as f: numa_node = f.read().strip() - if pci_address not in pci_addresses: continue nvmes[dev_name] = {"pci_address": pci_address, "numa_node": numa_node} @@ -1302,11 +1324,11 @@ def calculate_unisolated_cores(cores, cores_percentage=0): if cores_percentage: return math.ceil(total * (100 - cores_percentage) / 100) if total <= 10: - return 1 - if total <= 20: return 2 - if total <= 28: + if total <= 20: return 3 + if total <= 28: + return 4 return math.ceil(total * 0.15) @@ -1314,6 +1336,103 @@ def get_core_indexes(core_to_index, list_of_cores): return [core_to_index[core] for core in list_of_cores if core in core_to_index] +def build_unisolated_stride( + all_cores: List[int], + num_unisolated: int, + client_qpair_count: int, + pool_stride: int = 2, +) -> List[int]: + """ + Build a list of 'unisolated' CPUs by picking from per-qpair pools. + + Pools are contiguous slices of all_cores: + total=30, q=3 -> [0..9], [10..19], [20..29] + + Selection: + round-robin across pools, and within each pool advance by pool_stride + e.g. stride=2 -> 0,2,4,... 
then 10,12,14,... then 20,22,24,... + + If hyper_thread=True, append sibling right after each core: + sibling = cpu +/- (total//2) + """ + hyper_thread = is_hyperthreading_enabled_via_siblings() + if num_unisolated <= 0: + return [] + if client_qpair_count <= 0: + raise ValueError("client_qpair_count must be > 0") + if pool_stride <= 0: + raise ValueError("pool_stride must be > 0") + + cores = sorted(all_cores) + total = len(cores) + if total == 0: + return [] + + core_set = set(cores) + + half: int = 0 + if hyper_thread: + if total % 2 != 0: + raise ValueError(f"hyper_thread=True but total logical CPUs ({total}) is not even") + half = total // 2 + + # Build pools + pool_size = math.ceil(total / client_qpair_count) + pools = [cores[i * pool_size : min((i + 1) * pool_size, total)] for i in range(client_qpair_count)] + pools = [p for p in pools if p] # drop empties + + # Per-pool index (within each pool) + idx = [0] * len(pools) + + out: List[int] = [] + used = set() + + def add_cpu(cpu: int) -> None: + if cpu in core_set and cpu not in used and len(out) < num_unisolated: + out.append(cpu) + used.add(cpu) + + while len(out) < num_unisolated: + progress = False + + for pi, pool in enumerate(pools): + if len(out) >= num_unisolated: + break + + # find next candidate in this pool using stride + j = idx[pi] + while j < len(pool) and pool[j] in used: + j += pool_stride + if j >= len(pool): + continue + + cpu = pool[j] + idx[pi] = j + pool_stride + + add_cpu(cpu) + progress = True + + if hyper_thread and len(out) < num_unisolated: + sib = cpu + half if cpu < half else cpu - half + add_cpu(sib) + + if progress: + continue + + # Fallback: fill any remaining from whatever is unused (should rarely happen) + for cpu in cores: + if len(out) >= num_unisolated: + break + if cpu not in used: + add_cpu(cpu) + if hyper_thread and len(out) < num_unisolated: + sib = cpu + half if cpu < half else cpu - half + add_cpu(sib) + break + + return out[:num_unisolated] + + def 
generate_core_allocation(cores_by_numa, sockets_to_use, nodes_per_socket, cores_percentage=0): node_distribution: dict = {} # Iterate over each NUMA node @@ -1321,20 +1440,9 @@ def generate_core_allocation(cores_by_numa, sockets_to_use, nodes_per_socket, co if numa_node not in cores_by_numa: continue all_cores = sorted(cores_by_numa[numa_node]) - total_cores = len(all_cores) num_unisolated = calculate_unisolated_cores(all_cores, cores_percentage) + unisolated = build_unisolated_stride(all_cores,num_unisolated,constants.CLIENT_QPAIR_COUNT) - unisolated = [] - half = total_cores // 2 - for i in range(num_unisolated): - if i % 2 == 0: - index = i // 2 - else: - index = (i - 1) // 2 - if i % 2 == 0: - unisolated.append(all_cores[index]) - else: - unisolated.append(all_cores[half + index]) available_cores = [c for c in all_cores if c not in unisolated] q1 = len(available_cores) // 4 @@ -1428,8 +1536,10 @@ def regenerate_config(new_config, old_config, force=False): number_of_distribs = 2 number_of_distribs_cores = len(old_config["nodes"][i]["distribution"]["distrib_cpu_cores"]) number_of_poller_cores = len(old_config["nodes"][i]["distribution"]["poller_cpu_cores"]) - if number_of_distribs_cores > 2: + if 12 >= number_of_distribs_cores > 2: number_of_distribs = number_of_distribs_cores + else: + number_of_distribs = 12 old_config["nodes"][i]["number_of_distribs"] = number_of_distribs old_config["nodes"][i]["ssd_pcis"] = new_config["nodes"][i]["ssd_pcis"] old_config["nodes"][i]["nic_ports"] = new_config["nodes"][i]["nic_ports"] @@ -1459,7 +1569,7 @@ def regenerate_config(new_config, old_config, force=False): all_isolated_cores = set() for node in old_config["nodes"]: if len(node["ssd_pcis"]) == 0: - logger.error(f"There are not enough SSD devices on numa node {node['socket']}") + logger.error(f"There are no enough SSD devices on numa node {node['socket']}") return False total_required_memory += node["huge_page_memory"] + node["sys_memory"] node_cores_set = 
set(node["isolated"]) @@ -1473,7 +1583,7 @@ def regenerate_config(new_config, old_config, force=False): def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0): + cores_percentage=0, force=False, device_model="", size_range=""): system_info = {} nodes_config: dict = {"nodes": []} @@ -1481,7 +1591,25 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a validate_sockets(sockets_to_use, cores_by_numa) logger.debug(f"Cores by numa {cores_by_numa}") nics = detect_nics() - nvmes = detect_nvmes(pci_allowed, pci_blocked) + nvmes = detect_nvmes(pci_allowed, pci_blocked, device_model, size_range) + if not nvmes: + logger.error( + "There are no enough SSD devices on system, you may run 'sbctl sn clean-devices', to clean devices stored in /etc/simplyblock/sn_config_file") + return False, False + if force: + nvme_devices = " ".join([f"/dev/{d}n1" for d in nvmes.keys()]) + logger.warning(f"Formating Nvme devices {nvme_devices}") + answer = input("Type YES/Y to continue: ").strip().lower() + if answer not in ("yes", "y"): + logger.warning("Aborted by user.") + exit(1) + logger.info("OK, continuing formating...") + for nvme_device in nvmes.keys(): + nvme_device_path = f"/dev/{nvme_device}n1" + clean_partitions(nvme_device_path) + nvme_json_string = get_idns(nvme_device_path) + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + format_nvme_device(nvme_device_path, lbaf_id) for nid in sockets_to_use: if nid in cores_by_numa: @@ -1499,7 +1627,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a for nvme, val in nvmes.items(): pci = val["pci_address"] - numa = val["numa_node"] + numa = int(val["numa_node"]) pci_utils.unbind_driver(pci) if numa in sockets_to_use: system_info[numa]["nvmes"].append(pci) @@ -1552,8 +1680,8 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a "jm_cpu_core": 
get_core_indexes(core_group["core_to_index"], core_group["distribution"][1]), "poller_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][2]), "alceml_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][3]), - "alceml_worker_cpu_cores": get_core_indexes(core_group["core_to_index"], - core_group["distribution"][4]), + # "alceml_worker_cpu_cores": get_core_indexes(core_group["core_to_index"], + # core_group["distribution"][4]), "distrib_cpu_cores": get_core_indexes(core_group["core_to_index"], core_group["distribution"][5]), "jc_singleton_core": get_core_indexes(core_group["core_to_index"], core_group["distribution"][6]) }, @@ -1585,7 +1713,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a node_info["large_pool_count"] = large_pool_count node_info["max_lvol"] = max_lvol node_info["max_size"] = max_prov - node_info["huge_page_memory"] = minimum_hp_memory + node_info["huge_page_memory"] = max(minimum_hp_memory, max_prov) minimum_sys_memory = calculate_minimum_sys_memory(max_prov) node_info["sys_memory"] = minimum_sys_memory all_nodes.append(node_info) @@ -1598,7 +1726,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a all_isolated_cores = set() for node in all_nodes: if len(node["ssd_pcis"]) == 0: - logger.error(f"There are not enough SSD devices on numa node {node['socket']}") + logger.error(f"There are no enough SSD devices on numa node {node['socket']}") return False, False total_required_memory += node["huge_page_memory"] + node["sys_memory"] node_cores_set = set(node["isolated"]) @@ -1659,8 +1787,7 @@ def validate_node_config(node): required_distribution_fields = [ "app_thread_core", "jm_cpu_core", "poller_cpu_cores", - "alceml_cpu_cores", "alceml_worker_cpu_cores", - "distrib_cpu_cores", "jc_singleton_core" + "alceml_cpu_cores", "distrib_cpu_cores", "jc_singleton_core" ] # Check top-level fields @@ -1929,14 +2056,15 @@ def 
load_kube_config_with_fallback(): except Exception: config.load_kube_config() + def patch_cr_status( - *, - group: str, - version: str, - plural: str, - namespace: str, - name: str, - status_patch: dict, + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + status_patch: dict, ): """ Patch the status subresource of a Custom Resource. @@ -1967,17 +2095,18 @@ def patch_cr_status( f"Failed to patch status for {name}: {e.reason} {e.body}" ) + def patch_cr_node_status( - *, - group: str, - version: str, - plural: str, - namespace: str, - name: str, - node_uuid: str, - node_mgmt_ip: str, - updates: Optional[Dict[str, Any]] = None, - remove: bool = False, + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + node_uuid: str, + node_mgmt_ip: str, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, ): """ Patch status.nodes[*] fields for a specific node identified by UUID. @@ -2015,8 +2144,8 @@ def patch_cr_node_status( for node in status_nodes: match = ( - node.get("uuid") == node_uuid or - node.get("mgmtIp") == node_mgmt_ip + node.get("uuid") == node_uuid or + node.get("mgmtIp") == node_mgmt_ip ) if match: @@ -2066,23 +2195,24 @@ def patch_cr_node_status( } }, ) - + except ApiException as e: raise RuntimeError( f"Failed to patch node for {name}: {e.reason} {e.body}" ) + def patch_cr_lvol_status( - *, - group: str, - version: str, - plural: str, - namespace: str, - name: str, - lvol_uuid: Optional[str] = None, - updates: Optional[Dict[str, Any]] = None, - remove: bool = False, - add: Optional[Dict[str, Any]] = None, + *, + group: str, + version: str, + plural: str, + namespace: str, + name: str, + lvol_uuid: Optional[str] = None, + updates: Optional[Dict[str, Any]] = None, + remove: bool = False, + add: Optional[Dict[str, Any]] = None, ): """ Patch status.lvols[*] for an LVOL CustomResource. 
@@ -2178,6 +2308,7 @@ def patch_cr_lvol_status( f"Failed to patch lvol status for {name}: {e.reason} {e.body}" ) + def get_node_name_by_ip(target_ip: str) -> str: load_kube_config_with_fallback() v1 = client.CoreV1Api() @@ -2342,9 +2473,9 @@ def create_docker_service(cluster_docker: DockerClient, service_name: str, servi "com.docker.stack.namespace": "app"} ) + def create_k8s_service(namespace: str, deployment_name: str, container_name: str, service_file: str, container_image: str): - logger.info(f"Creating deployment: {deployment_name} in namespace {namespace}") load_kube_config_with_fallback() apps_v1 = client.AppsV1Api() @@ -2377,7 +2508,7 @@ def create_k8s_service(namespace: str, deployment_name: str, container = V1Container( name=container_name, image=container_image, - command=["python", service_file], + command=["python", service_file], env=env_list, volume_mounts=volume_mounts, resources=V1ResourceRequirements( @@ -2413,3 +2544,227 @@ def create_k8s_service(namespace: str, deployment_name: str, apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment) logger.info(f"Deployment {deployment_name} created successfully.") + + +def clean_partitions(nvme_device: str): + command = ['wipefs', '-a', nvme_device] + print(" ".join(command)) + try: + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" 
+ + +def find_lbaf_id(json_data: str, target_ms: int, target_ds: int) -> int: + try: + data = json.loads(json_data) + except json.JSONDecodeError: + print("Error: Invalid JSON format provided.") + return 0 + + lbafs_list: List[Dict[str, int]] = data.get('lbafs', []) + + # LBAF IDs are 1-based, so we use enumerate starting from 1 + for index, lbaf in enumerate(lbafs_list, start=0): + if lbaf.get('ms') == target_ms and lbaf.get('ds') == target_ds: + return index + + return 0 + + +def get_idns(nvme_device: str): + command = ['nvme', 'id-ns', nvme_device, '--output-format', 'json'] + try: + # Run the command + # capture_output=True captures stdout and stderr. + # text=True decodes the output as text (using default encoding, typically UTF-8). + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + # Return the captured standard output + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" 
+ + +def format_nvme_device(nvme_device: str, lbaf_id: int): + command = ['nvme', 'format', nvme_device, f"--lbaf={lbaf_id}", '--force'] + print(" ".join(command)) + try: + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" + + +def get_nvme_list_verbose() -> str: + """ + Executes the 'nvme list -v' command and returns the output. + + Returns: + str: The standard output of the command, or an error message + if the command fails. + """ + command = ['nvme', 'list', '-v', '--output-format', 'json'] + + try: + # Run the command + # capture_output=True captures stdout and stderr. + # text=True decodes the output as text (using default encoding, typically UTF-8). + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True # Raise a CalledProcessError if the exit code is non-zero + ) + + # Return the captured standard output + return result.stdout + + except subprocess.CalledProcessError as e: + # Handle errors (e.g., nvme not found, permission denied, or other command failures) + return (f"Error executing command: {' '.join(command)}\n" + f"Return Code: {e.returncode}\n" + f"Standard Error:\n{e.stderr}") + except FileNotFoundError: + return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" 
+ + +def query_nvme_ssd_by_model_and_size(model: str, size_range: str) -> list: + if not model: + print("No model specified.") + return [] + if not size_range: + print("No size range specified.") + return [] + + size_from = 0 + size_to = 0 + try: + range_split = size_range.split('-') + if len(range_split) == 1: + size_from = parse_size(range_split[0]) + elif len(range_split) == 2: + size_from = parse_size(range_split[0]) + size_to = parse_size(range_split[1]) + else: + raise ValueError("Invalid size range") + except Exception as e: + print(e) + return [] + + json_string = get_nvme_list_verbose() + data = json.loads(json_string) + + pci_lst = [] + for device_entry in data.get('Devices', []): + for subsystem in device_entry.get('Subsystems', []): + for controller in subsystem.get('Controllers', []): + model_number = controller.get("ModelNumber") + if model_number != model: + continue + address = controller.get("Address") + if len(controller.get("Namespaces")) > 0: + size = controller.get("Namespaces")[0].get("PhysicalSize") + if size > size_from: + if size_to > 0 and size < size_to: + pci_lst.append(address) + return pci_lst + + +def clean_devices(nvme_devices_list): + for pci in nvme_devices_list: + pci_utils.ensure_driver(pci, 'nvme') + try: + json_string = get_nvme_list_verbose() + data = json.loads(json_string) + controllers_list = [] + + # The structure is Devices[0] -> Subsystems[] -> Controllers[] + nvme_devices = "" + for device_entry in data.get('Devices', []): + for subsystem in device_entry.get('Subsystems', []): + for controller in subsystem.get('Controllers', []): + # 3. 
Pull out the desired fields + if len(controller.get("Namespaces")) > 0 and controller.get("Address") in nvme_devices_list: + controllers_list.append({ + "NVMe_Controller": controller.get("Controller"), + "PCI_Address": controller.get("Address"), + "NAMESPACE": controller.get("Namespaces")[0].get("NameSpace") + }) + nvme_devices += f"/dev/{controller.get('Namespaces')[0].get('NameSpace')} " + logger.warning(f"Formating Nvme devices {nvme_devices}") + answer = input("Type YES/Y to continue: ").strip().lower() + if answer not in ("yes", "y"): + logger.warning("Aborted by user.") + exit(1) + + for mapping in controllers_list: + if mapping['PCI_Address'] in nvme_devices_list: + nvme_device_path = f"/dev/{mapping['NAMESPACE']}" + clean_partitions(nvme_device_path) + except json.JSONDecodeError as e: + logger.error(f"Error decoding JSON: {e}") + + +def create_rpc_socket_mount(): + try: + + logger.info("create RPC socket mount") + mount_point = "/mnt/ramdisk" + size = "1G" + fstab_entry = f"tmpfs {mount_point} tmpfs size={size},mode=1777,noatime 0 0\n" + + # Create the mount point if it doesn't exist + os.makedirs(mount_point, exist_ok=True) + + # Add to /etc/fstab if not already present + with open("/etc/fstab", "r+") as fstab: + lines = fstab.readlines() + if not any(mount_point in line for line in lines): + fstab.write(fstab_entry) + print(f"Added fstab entry for {mount_point}") + else: + print(f"fstab entry for {mount_point} already exists") + + # Mount the RAM disk immediately + subprocess.run(["mount", mount_point], check=True) + + # Verify + subprocess.run(["df", "-h", mount_point]) + except Exception as e: + logger.error(e) diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index d1ee4f9f0..31b4912a3 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -4,7 +4,6 @@ import math import os from pathlib import Path -import 
subprocess import time from typing import List, Optional, Union @@ -142,6 +141,7 @@ class SPDKParams(BaseModel): spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN) cluster_mode: str + socket: Optional[int] = Field(None, ge=0) cluster_id: str @@ -181,24 +181,28 @@ def spdk_process_start(body: SPDKParams): f'/tmp/shm_{body.rpc_port}/:/dev/shm/', '/lib/modules/:/lib/modules/', '/var/lib/systemd/coredump/:/var/lib/systemd/coredump/', - '/sys:/sys'], + '/sys:/sys', + '/mnt/ramdisk:/mnt/ramdisk', + ], environment=[ f"RPC_PORT={body.rpc_port}", f"ssd_pcie={ssd_pcie_params}", f"PCI_ALLOWED={ssd_pcie_list}", f"TOTAL_HP={total_mem_mib}", + f"NSOCKET={body.socket}", ] # restart_policy={"Name": "on-failure", "MaximumRetryCount": 99} ) node_docker.containers.run( constants.SIMPLY_BLOCK_DOCKER_IMAGE, - "python simplyblock_core/services/spdk_http_proxy_server.py", + "python simplyblock_core/services/spdk_http_proxy_server.py ", name=f"spdk_proxy_{body.rpc_port}", detach=True, network_mode="host", log_config=log_config, volumes=[ f'/var/tmp/spdk_{body.rpc_port}:/var/tmp', + '/mnt/ramdisk:/mnt/ramdisk', ], environment=[ f"SERVER_IP={body.server_ip}", @@ -509,8 +513,10 @@ def bind_device_to_nvme(body: utils.DeviceParams): def delete_gpt_partitions_for_dev(body: utils.DeviceParams): bind_device_to_nvme(body) device_name = pci_utils.nvme_device_name(body.device_pci) - subprocess.check_call(['parted', '-fs', f'/dev/{device_name}', 'mklabel' 'gpt']) - return utils.get_response(True) + cmd = f"parted -fs /dev/{device_name} mklabel gpt" + out, err, ret_code = shell_utils.run_command(cmd) + logger.info(f"out: {out}, err: {err}, ret_code: {ret_code}") + return utils.get_response(ret_code==0, error=err) CPU_INFO = cpuinfo.get_cpu_info() diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index d5e98eb1d..59a8ec607 100644 
--- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -338,8 +338,8 @@ def spdk_process_start(body: SPDKParams): "L_CORES": body.l_cores, "CORES": core_utils.get_total_cpu_cores(body.l_cores), 'SPDK_MEM': core_utils.convert_size(body.spdk_mem, 'MiB'), - 'MEM_GEGA': core_utils.convert_size(body.spdk_mem, 'GiB', round_up=True), - 'MEM2_GEGA': core_utils.convert_size(body.system_mem, 'GiB', round_up=True), + 'MEM_MEGA': (core_utils.convert_size(body.spdk_mem, 'MiB', round_up=True) // 2) * 2 + 512, + 'MEM2_MEGA': (core_utils.convert_size(body.system_mem, 'MiB', round_up=True) // 2) * 2, 'SERVER_IP': body.server_ip, 'RPC_PORT': body.rpc_port, 'RPC_USERNAME': body.rpc_username, @@ -639,10 +639,13 @@ def apply_config(): # Set Huge page memory huge_page_memory_dict: dict = {} for node_config in nodes: + hg_memory = node_config["huge_page_memory"] + if int(node_config["max_size"]) > 0: + hg_memory = max(hg_memory , node_config["max_size"]) numa = node_config["socket"] - huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + node_config["huge_page_memory"] + huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + hg_memory + 1000000000 for numa, huge_page_memory in huge_page_memory_dict.items(): - num_pages = huge_page_memory // (2048 * 1024) + num_pages = huge_page_memory // 2000000 core_utils.set_hugepages_if_needed(numa, num_pages) return utils.get_response(True) diff --git a/simplyblock_web/node_configure.py b/simplyblock_web/node_configure.py index 6b69ee347..52dd02be2 100755 --- a/simplyblock_web/node_configure.py +++ b/simplyblock_web/node_configure.py @@ -1,54 +1,54 @@ -#!/usr/bin/env python -# encoding: utf-8 - -import argparse -import logging -import os -import sys -from typing import List, Optional, cast - -from kubernetes.client import ApiException, CoreV1Api - -from simplyblock_core import constants, utils -from simplyblock_core.storage_node_ops import ( - 
generate_automated_deployment_config, - upgrade_automated_deployment_config, -) -from simplyblock_cli.clibase import range_type -from simplyblock_web import node_utils_k8s - - -logger = logging.getLogger(__name__) -logger.setLevel(constants.LOG_LEVEL) - -POD_PREFIX: str = "snode-spdk-pod" - -def _is_pod_present_for_node() -> bool: - """ - Check if a pod with the specified prefix is already running on the current node. - - Returns: - bool: True if a matching pod is found, False otherwise - - Raises: - RuntimeError: If there's an error communicating with the Kubernetes API - """ - k8s_core_v1: CoreV1Api = cast(CoreV1Api, utils.get_k8s_core_client()) - namespace: str = node_utils_k8s.get_namespace() - node_name: Optional[str] = os.environ.get("HOSTNAME") - - if not node_name: +#!/usr/bin/env python +# encoding: utf-8 + +import argparse +import logging +import os +import sys +from typing import List, Optional, cast + +from kubernetes.client import ApiException, CoreV1Api + +from simplyblock_core import constants, utils +from simplyblock_core.storage_node_ops import ( + generate_automated_deployment_config, + upgrade_automated_deployment_config, +) +from simplyblock_cli.clibase import range_type +from simplyblock_web import node_utils_k8s + +logger = logging.getLogger(__name__) +logger.setLevel(constants.LOG_LEVEL) + +POD_PREFIX: str = "snode-spdk-pod" + + +def _is_pod_present_for_node() -> bool: + """ + Check if a pod with the specified prefix is already running on the current node. 
+ + Returns: + bool: True if a matching pod is found, False otherwise + + Raises: + RuntimeError: If there's an error communicating with the Kubernetes API + """ + k8s_core_v1: CoreV1Api = cast(CoreV1Api, utils.get_k8s_core_client()) + namespace: str = node_utils_k8s.get_namespace() + node_name: Optional[str] = os.environ.get("HOSTNAME") + + if not node_name: raise RuntimeError("HOSTNAME environment variable not set") try: resp = k8s_core_v1.list_namespaced_pod(namespace) for pod in resp.items: if ( - pod.metadata and - pod.metadata.name and - pod.spec and - pod.spec.node_name == node_name and - pod.metadata.name.startswith(POD_PREFIX) + pod.metadata and + pod.metadata.name and + pod.spec and + pod.spec.node_name == node_name and + pod.metadata.name.startswith(POD_PREFIX) ): return True except ApiException as e: @@ -66,7 +66,7 @@ def parse_arguments() -> argparse.Namespace: argparse.Namespace: Parsed command line arguments """ parser = argparse.ArgumentParser(description="Automated Deployment Configuration Script") - + # Define command line arguments parser.add_argument( '--max-lvol', @@ -121,13 +121,36 @@ def parse_arguments() -> argparse.Namespace: ) parser.add_argument( '--cores-percentage', - help='The percentage of cores to be used for spdk (0-100)', - type=range_type(0, 100), + help='The percentage of cores to be used for spdk (0-99)', + type=range_type(0, 99), dest='cores_percentage', required=False, default=0 ) - + parser.add_argument( + '--force', + help='Force format detected or passed nvme pci address to 4K and clean partitions', + action='store_true', + dest='force', + required=False + ) + parser.add_argument( + '--device-model', + help='NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together', + type=str, + default='', + dest='device_model', + required=False + ) + parser.add_argument( + '--size-range', + help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: 
--size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together', + type=str, + default='', + dest='size_range', + required=False + ) + return parser.parse_args() @@ -145,8 +168,8 @@ def validate_arguments(args: argparse.Namespace) -> None: if not args.max_lvol: raise argparse.ArgumentError(None, '--max-lvol is required') if not args.max_prov: - raise argparse.ArgumentError(None, '--max-size is required') - + args.max_prov = 0 + try: max_lvol = int(args.max_lvol) if max_lvol <= 0: @@ -156,15 +179,15 @@ def validate_arguments(args: argparse.Namespace) -> None: None, f"Invalid value for max-lvol '{args.max_lvol}': {str(e)}" ) - + if args.pci_allowed and args.pci_blocked: raise argparse.ArgumentError( None, "pci-allowed and pci-blocked cannot be both specified" ) - + max_prov = utils.parse_size(args.max_prov, assume_unit='G') - if max_prov <= 0: + if max_prov < 0: raise argparse.ArgumentError( None, f"Invalid storage size: {args.max_prov}. Must be a positive value with optional unit (e.g., 100G, 1T)" @@ -175,17 +198,19 @@ def main() -> None: """Main entry point for the node configuration script.""" try: args = parse_arguments() - + if args.upgrade: upgrade_automated_deployment_config() return - + + if not args.max_prov: + args.max_prov = 0 validate_arguments(args) - + if _is_pod_present_for_node(): logger.info("Skipped generating automated deployment configuration — pod already present.") sys.exit(0) - + # Process socket configuration sockets_to_use: List[int] = [0] if args.sockets_to_use: @@ -196,7 +221,7 @@ def main() -> None: None, f"Invalid value for sockets-to-use '{args.sockets_to_use}': {str(e)}" ) - + nodes_per_socket: int = 1 if args.nodes_per_socket: try: @@ -208,16 +233,16 @@ def main() -> None: None, f"Invalid value for nodes-per-socket '{args.nodes_per_socket}': {str(e)}" ) - + # Process PCI device filters pci_allowed: List[str] = [] pci_blocked: List[str] = [] - + if args.pci_allowed: pci_allowed = [pci.strip() 
for pci in args.pci_allowed.split(',') if pci.strip()] if args.pci_blocked: pci_blocked = [pci.strip() for pci in args.pci_blocked.split(',') if pci.strip()] - + # Generate the deployment configuration generate_automated_deployment_config( max_lvol=int(args.max_lvol), @@ -226,9 +251,12 @@ def main() -> None: sockets_to_use=sockets_to_use, pci_allowed=pci_allowed, pci_blocked=pci_blocked, - cores_percentage=args.cores_percentage + cores_percentage=args.cores_percentage, + force=args.force, + device_model=args.device_model, + size_range=args.size_range ) - + except argparse.ArgumentError as e: logger.error(f"Argument error: {e}") sys.exit(1) diff --git a/simplyblock_web/templates/storage_core_isolation.yaml.j2 b/simplyblock_web/templates/storage_core_isolation.yaml.j2 index 9ae4ba8e8..30bbf8809 100644 --- a/simplyblock_web/templates/storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/storage_core_isolation.yaml.j2 @@ -116,7 +116,7 @@ spec: chroot /host apt update && chroot /host apt install -y grep jq nvme-cli tuned chroot /host apt-get install -y linux-modules-extra-$(uname -r) ;; - centos|rhel|rocky|almalinux) + centos|rhel|rocky|almalinux|ol) export YUM_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) export DNF_RELEASEVER=$(awk -F'=' '/^VERSION_ID=/{gsub(/"/,"",$2); print $2}' /etc/os-release) chroot /host dnf install -y grep jq nvme-cli kernel-modules-extra tuned \ diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index d118927cd..7963aa248 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -24,8 +24,8 @@ spec: operator: Exists volumes: - name: socket-dir - emptyDir: - medium: "Memory" + hostPath: + path: /mnt/ramdisk - name: host-sys hostPath: path: /sys @@ -101,7 +101,7 @@ spec: privileged: true volumeMounts: - name: socket-dir - mountPath: /var/tmp + mountPath: 
/mnt/ramdisk - name: host-sys mountPath: /sys - name: host-modules @@ -114,10 +114,10 @@ spec: mountPath: /etc/simplyblock resources: limits: - hugepages-2Mi: {{ MEM_GEGA }}Gi + hugepages-2Mi: {{ MEM_MEGA }}Mi cpu: {{ CORES }} requests: - hugepages-2Mi: {{ MEM_GEGA }}Gi + hugepages-2Mi: {{ MEM_MEGA }}Mi - name: spdk-proxy-container image: {{ SIMPLYBLOCK_DOCKER_IMAGE }} @@ -125,7 +125,7 @@ spec: command: ["python", "simplyblock_core/services/spdk_http_proxy_server.py"] volumeMounts: - name: socket-dir - mountPath: /var/tmp + mountPath: /mnt/ramdisk env: - name: SERVER_IP value: "{{ SERVER_IP }}" diff --git a/simplyblock_web/templates/storage_init_job.yaml.j2 b/simplyblock_web/templates/storage_init_job.yaml.j2 index 6432d4500..074f501b5 100644 --- a/simplyblock_web/templates/storage_init_job.yaml.j2 +++ b/simplyblock_web/templates/storage_init_job.yaml.j2 @@ -17,11 +17,20 @@ spec: operator: Exists - effect: NoExecute operator: Exists + + volumes: + - name: host-proc + hostPath: + path: /proc containers: - name: init-setup image: simplyblock/ubuntu-tools:22.04 securityContext: privileged: true + + volumeMounts: + - name: host-proc + mountPath: /proc command: ["/bin/sh", "-c"] args: - | @@ -44,15 +53,19 @@ spec: OS_ID="$(cat /proc/version | awk '{print $3}' | awk -F'-' '{print $NF}')" if [ "$OS_ID" != "talos" ]; then - echo "--- Restarting kubelet ---" - nsenter --target 1 --mount --uts --ipc --net --pid -- /bin/sh -c ' - if command -v systemctl >/dev/null 2>&1; then - echo "Restarting kubelet..." - systemctl restart kubelet && echo "Kubelet restarted" || echo "Kubelet restart failed" - else - echo "systemctl not found; skipping kubelet restart" - fi - ' + HUGEPAGES_AFTER=$(grep HugePages_Total /proc/meminfo | awk '{print $2}') + echo "[INFO] Hugepages after: $HUGEPAGES_AFTER" + + if [ "$HUGEPAGES_BEFORE" != "$HUGEPAGES_AFTER" ]; then + echo "[INFO] Hugepages changed, restarting kubelet..." 
+ nsenter --target 1 --mount --uts --ipc --net --pid -- /bin/sh -c ' + if command -v systemctl >/dev/null 2>&1; then + systemctl restart kubelet && echo "Kubelet restarted" || echo "Kubelet restart failed" + fi + ' + else + echo "[INFO] Hugepages unchanged, skipping kubelet restart." + fi else echo "Talos detected - skipping nsenter and kubelet restart." echo "Use 'talosctl service kubelet restart -n $NODE_IP' to restart the node kubelet" From dbb360e41c1a2a1c87b3ff03e587864c5e8878d7 Mon Sep 17 00:00:00 2001 From: "Hamdy H. Khader" Date: Thu, 22 Jan 2026 17:28:31 +0300 Subject: [PATCH 132/192] Cherry-pick commits from R25.10-Hotfix that was made by [Hamdy Michael Raunak] to main (#848) * set size of lvstore cluster in constants (as ratio to distrib page size) * R25.10 hotfix multi fix (#738) * Updating Storage node monitor (cherry picked from commit ad546ca5fe667a74a5559109fb0e7c58d3a707b0) * Fix fdb value exceed limit Create two new objects to hold the remote device connections: RemoteDevice RemoteJMDevice Those two objects would hold 5 attributes related to the remote connection only * fix type checker issues * set version * set version * disable lvol geo * fix logger issue * Do not set spdk mem when starting spdk * Fix healthcheck logger _1 * Update storage_node_monitor.py * Revert "Update __init__.py" This reverts commit 5b83b13776ea6d184fa957eb0a74bf081f70f385. 
--------- Co-authored-by: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> * fix connection in case device node is online or down * Fix healthcheck service node statues in_creation * Remove remote object from node when receiving distrib events (#743) * Remove remote object from node when receiving distrib events * Fix linter issues * set size of lvstore cluster in constants (as ratio to distrib page size) * Fix distrib event collector syntax * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * Add req id to RPC logs for spdk_proxy (#752) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * set size of lvstore cluster in constants (as ratio to distrib page size) * Update socket parameter in docker.py * Fix mandatory argument handling in storage_node__configure * Add socket field to storage node configuration * Update docker.py to mount ramdisk for proxy * Update spdk_http_proxy_server.py to take ramdisk folder sock file * Update storage_node_ops.py to fix max prov typo * set size of lvstore cluster in constants (as ratio to distrib page size) * Adding minimum_sys_memory from database * set size of lvstore cluster in constants (as ratio to distrib page size) * fix syntax errors and return values * Main lvol sync delete (#734) (#757) * Add lvol sync delete task runner * fix linter issues * fix (cherry picked from commit 25e3dd29b5fa5b345e73cb905cf169fc021d6eff) * R25.10 hotfix sfam 2471 (#758) * raise RPC exception in case of timeout * fix image * fix storage_node_ops.py * fix node restart cli catch exception * expose prometheus port 
9090 using HAProxy * Fix unhandled exceptions in capacity monitor and lvol stats collector * fix linter and type checker issues (#759) * Fix exception msg not to print the whole traceback * Add param ha_jm_count to snode add API v1 and v2 * remove logging data response * Fix fw connection error handling not to set node status down (#765) * Fix fw connection error handling not to set node status down * fix linter issues * Remove constants.HA_JM_COUNT (#766) * Do not install pip package on cluster update (#749) * print tables in health check service when not passing * fix sfam-2474 * fix sfam-2475 * refactor node add task runner (#768) refactor node add task runner to have 11 retries with double the amount of waiting time on each retry starting from 10 seconds * Update constants.py * Suspend and retry on node add task fail * Fix sfam-2483 (#773) * Fix sfam-2483 add device functions to lock and release on connecting and disconnecting to node * Fix main_distr_event_collector.py to optimize event_process speed * set main_distr_event_collector.py log level * fix logger (#776) * fix logger * fix logger * fix SFAM-2476 (#775) add migration tasks to unfinished master task * Fix SFAM-2482 (#769) * Fix SFAM-2482 Adds bool return type for the check functions Fixes port allow service to check and validate each component individually (distribs, lvstore, hublvol) refactor function health_controller._check_node_lvstore to return on first error * fix type checker * fix port allow to send device status to one node * fix distrib collector * Fix sfam-2484 check jm device status and its nod status on connect * Fix type checker * Fix sfam-2491 remove service "app_CachedLVolStatsCollector" on update from docker services * Fix sfam-2490 Use snode/info as failback api path if snode/check is not found * Update constants.py * Fix sfam-2488 Allow connection to online JM device on node with status RESTARTING * Fix sfam-2489 Allow migrations tasks to run even if there are unavailable devices as 
long as the cluster is active * Fix sfam-2499 * Fix sfam-2500 * Remove deprecated lvol monitor code * Fix cluster get-tasks --limit 0 * Fix cluster get-tasks --limit 0 _2 * Create partitions and alcemls on node add in parallel (#763) * Create partitions and alcemls on node add in parallel * fix 1 * connect to remote alcemls in parallel * Create distrib bdevs in parallel * Create distrib bdevs in parallel * prepare for merge * Fix sfam-2485 * increase dump_lvstore rpc timeout * Adds log number of active threads, unix sockets, server sessions * Fix sfam-2512 Fix spdk_proxy error handling * increase dump_lvstore rpc timeout=120 * Handle node not found in service jc comp resume * Change the FW app port to 50001 * Fix sfam-2515 send device updates to node regardless of node status in db * Handle node runner restart node not found * Fix sfam-2515 check JM replication status on sec before dropping leadership during node restart and node down>online status change * R25.10 hotfix spdk proxy stats (#812) * Add stats to spdk_http_proxy_server.py Prints max, avg and last 3 sec avg for read lines from http socket and rpc response receive from spdk * Add stats to spdk_http_proxy_server.py _2 * Fix 2 * Fix 2 * Fix 3 * Fix sfam-2517 move port ranges start to constants file * Fix sfam-2502 Fix the check for JM repl tasks to be before blocking the port * Fix sfam-2506 do not send device updates from health_controller.py when node in_restart * Use diff fw port per node (#817) * Fix sfam-2531 Use diff fw port per node * update img * prepare for merge * prepare for merge * Ignore cluster activate force option on failing lvstore creation * fix sfam-2528 Exclude nodes with status offline and removed from events updates sending Send device events to distribs in parallel * Check remote jm bdev before adding it to the list Add unit tests for storage_node_ops.get_node_jm_names * Check lvstore before checking jm rep status * validate_connected_JMs before creating distribs The new function 
would select new JM if one is not connected * fix JM names on distrib create to include offline jm name * fix JM names on distrib create to include offline jm name _2 * Fix sfam-2548 Catch the connection error when checking lvol on node * Disable full_page_unmap on new clusters * Fix sfam-2543 remove device from db in case of node restart and device state is new and device sn is not found in spdk. * Change jc compression start after cluster activate we start on all nodes after all migration task is finished on a node then we start jc on that node after lvol delete if deleted lvol used size is more than 10% of total cluster size * fix error response handling * Fix sfam-2555 (#831) * Fix sfam-2555 do not allow lvol operations when async del found * Update storage_node.py * Update snapshot_controller.py * Update storage_node.py * Update lvol_controller.py * Adds mutex lock on storage node level for lvol del operations --------- Co-authored-by: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> * add lvol sync del on sec if it is online-ish * add lvol sync del on sec if it not offline * Fix firewall port number from 50001 to 5001 * Fix sfam-2556 (#835) * wip * fix linter and unit test * wip * wip * Fix node auto restart (#836) * Fix node auto restart Allow auto restart if offline nodes == cluster.npcs do not restart if cluster is suspended * Update tasks_controller.py --------- Co-authored-by: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> * Fix restart-device adds --force param adds device bind and ctrl attach * Allow migration in case of device in state failed * adds jm restart --format * Adds failed device back to cluster as a new one (#841) * Adds failed device back to cluster as a new one Add CLI: sbctl sn new-device-from-failed change serial number for the failed device by adding postfix _failed Create new device object from failed one info * fix device sn * - remove device bdevs and subsys on setting device status to failed and 
migrated - fail new_device_from_failed when device already added before - create new alceml and format it on add_device * fix type checker * fix linter * wip * fix device remove * fix linter * bind dev to nvme driver on creating new device object * send physical_label when adding new device to distrib map * Update rpc_client.py * fix traceback * replace controller.state by controller_state * Fix remove device function (#840) * Fix remove device function - check if bdev found before delete - remove device reset from CLI * when restarting/removing device then handle JM raid base bdev to be removed and added * wip * fix * fix * fix 2 * fix 3 * fix 3 * fix 4 * fix 4 * fix 4 * fix 5 * fix 5 * Revert "fix 5" This reverts commit 10cd27a357ad6f6d24222ffc86367a8740c7502f. * fix has_any * fix force * fix missing raid * examine on dev restart * check jm raid on device restart * Adds wait for examine * fix 1 * fix 2 * fix sfam-2557 (#843) * fix sfam-2557 Skip processing distrib event if fired late and controller found * replace controller.state by controller_state * Reset lvol delete sync lock on task cancellation Reset the lvol delete sync lock for the primary node after marking the task as canceled. 
* Reset lvol delete sync lock on node not found * Fix SFAM-2577 Set rpc_client timeout=3 on device connection * Fix PR checks * fix checks * fix checks 2 * update docker image tag for main-multifix-hamdy-michael * Add configuration files and scripts for SimplyBlock integration * Update SimplyBlock version and Docker image tag --------- Co-authored-by: schmidt-scaled Co-authored-by: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Co-authored-by: Waleed Mousa <32266980+wmousa@users.noreply.github.com> Co-authored-by: Raunak Jalan <41023976+RaunakJalan@users.noreply.github.com> --- simplyblock_cli/cli-reference.yaml | 30 +- simplyblock_cli/cli.py | 18 +- simplyblock_cli/clibase.py | 64 +-- simplyblock_cli/scripts/cli-wrapper.jinja2 | 2 +- simplyblock_core/cluster_ops.py | 46 +- simplyblock_core/constants.py | 18 +- .../controllers/device_controller.py | 341 ++++++++------ .../controllers/health_controller.py | 231 +++++---- .../controllers/lvol_controller.py | 22 +- .../controllers/snapshot_controller.py | 11 +- .../controllers/storage_events.py | 1 - .../controllers/tasks_controller.py | 54 ++- simplyblock_core/db_controller.py | 12 +- simplyblock_core/distr_controller.py | 95 ++-- simplyblock_core/env_var | 6 +- simplyblock_core/fw_api_client.py | 5 +- simplyblock_core/models/nvme_device.py | 43 +- simplyblock_core/models/storage_node.py | 58 ++- simplyblock_core/rpc_client.py | 35 +- .../scripts/docker-compose-swarm.yml | 1 + simplyblock_core/scripts/haproxy.cfg | 10 + .../services/capacity_and_stats_collector.py | 41 +- .../services/health_check_service.py | 444 +++++++++--------- simplyblock_core/services/lvol_monitor.py | 389 +++++++-------- .../services/lvol_stat_collector.py | 107 ++--- .../services/main_distr_event_collector.py | 94 ++-- simplyblock_core/services/snapshot_monitor.py | 287 ++++++----- .../services/spdk_http_proxy_server.py | 70 ++- .../services/storage_node_monitor.py | 397 ++++++++-------- 
.../services/tasks_runner_failed_migration.py | 8 +- .../services/tasks_runner_jc_comp.py | 14 +- .../services/tasks_runner_migration.py | 31 +- .../tasks_runner_new_dev_migration.py | 8 +- .../services/tasks_runner_node_add.py | 87 ++-- .../services/tasks_runner_port_allow.py | 431 +++++++++-------- .../services/tasks_runner_restart.py | 52 +- .../services/tasks_runner_sync_lvol_del.py | 6 + simplyblock_core/snode_client.py | 18 +- simplyblock_core/storage_node_ops.py | 303 ++++++------ simplyblock_core/utils/__init__.py | 26 +- .../api/internal/storage_node/docker.py | 7 +- .../api/internal/storage_node/kubernetes.py | 6 +- simplyblock_web/api/v1/storage_node.py | 5 + simplyblock_web/api/v2/storage_node.py | 6 +- simplyblock_web/node_configure.py | 32 +- .../templates/storage_deploy_spdk.yaml.j2 | 14 + simplyblock_web/utils.py | 2 +- 47 files changed, 2300 insertions(+), 1688 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index d5cad51c0..3ef6d71d8 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -481,16 +481,6 @@ commands: help: "Device id" dest: device_id type: str - - name: reset-device - help: "Resets a storage device" - usage: > - Hardware device reset. Resetting the device can return the device from an unavailable into online state, if - successful. 
- arguments: - - name: "device_id" - help: "Device id" - dest: device_id - type: str - name: restart-device help: "Restarts a storage device" usage: > @@ -502,6 +492,11 @@ commands: help: "Device id" dest: device_id type: str + - name: "--force" + help: "Force remove" + dest: force + type: bool + action: store_true - name: add-device help: "Adds a new storage device" usage: > @@ -647,6 +642,11 @@ commands: dest: force type: bool action: store_true + - name: "--format" + help: "Format the Alceml device used for JM device" + dest: format + type: bool + action: store_true - name: send-cluster-map help: "Sends a new cluster map" private: true @@ -701,6 +701,16 @@ commands: help: "attr_value" dest: attr_value type: str + - name: new-device-from-failed + help: "Adds a new device to from failed device information" + usage: > + A previously failed and migrated device may be added back into the cluster as a new device. The new device + would have the same info as the failed device but would be empty and not contain any data. 
+ arguments: + - name: "device_id" + help: "Device id" + dest: device_id + type: str - name: "cluster" help: "Cluster commands" weight: 200 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index d55b8317f..3f85be70a 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -5,7 +5,7 @@ import sys import traceback -from simplyblock_cli.clibase import CLIWrapperBase, range_type, regex_type, size_type +from simplyblock_cli.clibase import CLIWrapperBase, range_type, size_type from simplyblock_core import utils class CLIWrapper(CLIWrapperBase): @@ -52,7 +52,6 @@ def init_storage_node(self): if self.developer_mode: self.init_storage_node__device_testing_mode(subparser) self.init_storage_node__get_device(subparser) - self.init_storage_node__reset_device(subparser) self.init_storage_node__restart_device(subparser) self.init_storage_node__add_device(subparser) self.init_storage_node__remove_device(subparser) @@ -78,6 +77,7 @@ def init_storage_node(self): self.init_storage_node__dump_lvstore(subparser) if self.developer_mode: self.init_storage_node__set(subparser) + self.init_storage_node__new_device_from_failed(subparser) def init_storage_node__deploy(self, subparser): @@ -219,13 +219,10 @@ def init_storage_node__get_device(self, subparser): subcommand = self.add_sub_command(subparser, 'get-device', 'Gets storage device by its id') subcommand.add_argument('device_id', help='Device id', type=str) - def init_storage_node__reset_device(self, subparser): - subcommand = self.add_sub_command(subparser, 'reset-device', 'Resets a storage device') - subcommand.add_argument('device_id', help='Device id', type=str) - def init_storage_node__restart_device(self, subparser): subcommand = self.add_sub_command(subparser, 'restart-device', 'Restarts a storage device') subcommand.add_argument('device_id', help='Device id', type=str) + argument = subcommand.add_argument('--force', help='Force remove', dest='force', action='store_true') def 
init_storage_node__add_device(self, subparser): subcommand = self.add_sub_command(subparser, 'add-device', 'Adds a new storage device') @@ -285,6 +282,7 @@ def init_storage_node__restart_jm_device(self, subparser): subcommand = self.add_sub_command(subparser, 'restart-jm-device', 'Restarts a journaling device') subcommand.add_argument('jm_device_id', help='Journaling device id', type=str) argument = subcommand.add_argument('--force', help='Force device remove', dest='force', action='store_true') + argument = subcommand.add_argument('--format', help='Format the Alceml device used for JM device', dest='format', action='store_true') def init_storage_node__send_cluster_map(self, subparser): subcommand = self.add_sub_command(subparser, 'send-cluster-map', 'Sends a new cluster map') @@ -308,6 +306,10 @@ def init_storage_node__set(self, subparser): subcommand.add_argument('attr_name', help='attr_name', type=str) subcommand.add_argument('attr_value', help='attr_value', type=str) + def init_storage_node__new_device_from_failed(self, subparser): + subcommand = self.add_sub_command(subparser, 'new-device-from-failed', 'Adds a new device to from failed device information') + subcommand.add_argument('device_id', help='Device id', type=str) + def init_cluster(self): subparser = self.add_command('cluster', 'Cluster commands') @@ -871,8 +873,6 @@ def run(self): ret = self.storage_node__device_testing_mode(sub_command, args) elif sub_command in ['get-device']: ret = self.storage_node__get_device(sub_command, args) - elif sub_command in ['reset-device']: - ret = self.storage_node__reset_device(sub_command, args) elif sub_command in ['restart-device']: ret = self.storage_node__restart_device(sub_command, args) elif sub_command in ['add-device']: @@ -935,6 +935,8 @@ def run(self): ret = False else: ret = self.storage_node__set(sub_command, args) + elif sub_command in ['new-device-from-failed']: + ret = self.storage_node__new_device_from_failed(sub_command, args) else: 
self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 5e76e89fd..240df3381 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -150,26 +150,29 @@ def storage_node__add_node(self, sub_command, args): enable_ha_jm = args.enable_ha_jm namespace = args.namespace ha_jm_count = args.ha_jm_count - - out = storage_ops.add_node( - cluster_id=cluster_id, - node_addr=node_addr, - iface_name=ifname, - data_nics_list=data_nics, - max_snap=max_snap, - spdk_image=spdk_image, - spdk_debug=spdk_debug, - small_bufsize=small_bufsize, - large_bufsize=large_bufsize, - num_partitions_per_dev=num_partitions_per_dev, - jm_percent=jm_percent, - enable_test_device=enable_test_device, - namespace=namespace, - enable_ha_jm=enable_ha_jm, - id_device_by_nqn=args.id_device_by_nqn, - partition_size=args.partition_size, - ha_jm_count=ha_jm_count, - ) + try: + out = storage_ops.add_node( + cluster_id=cluster_id, + node_addr=node_addr, + iface_name=ifname, + data_nics_list=data_nics, + max_snap=max_snap, + spdk_image=spdk_image, + spdk_debug=spdk_debug, + small_bufsize=small_bufsize, + large_bufsize=large_bufsize, + num_partitions_per_dev=num_partitions_per_dev, + jm_percent=jm_percent, + enable_test_device=enable_test_device, + namespace=namespace, + enable_ha_jm=enable_ha_jm, + id_device_by_nqn=args.id_device_by_nqn, + partition_size=args.partition_size, + ha_jm_count=ha_jm_count, + ) + except Exception as e: + print(e) + return False return out @@ -200,11 +203,15 @@ def storage_node__restart(self, sub_command, args): large_bufsize = args.large_bufsize ssd_pcie = args.ssd_pcie - return storage_ops.restart_storage_node( - node_id, max_lvol, max_snap, max_prov, - spdk_image, spdk_debug, - small_bufsize, large_bufsize, node_ip=args.node_ip, reattach_volume=reattach_volume, force=args.force, - new_ssd_pcie=ssd_pcie, force_lvol_recreate=args.force_lvol_recreate) + try: + return storage_ops.restart_storage_node( + node_id, max_lvol, 
max_snap, max_prov, + spdk_image, spdk_debug, + small_bufsize, large_bufsize, node_ip=args.node_ip, reattach_volume=reattach_volume, force=args.force, + new_ssd_pcie=ssd_pcie, force_lvol_recreate=args.force_lvol_recreate) + except Exception as e: + print(e) + return False def storage_node__shutdown(self, sub_command, args): return storage_ops.shutdown_storage_node(args.node_id, args.force) @@ -249,7 +256,7 @@ def storage_node__reset_device(self, sub_command, args): return device_controller.reset_storage_device(args.device_id) def storage_node__restart_device(self, sub_command, args): - return device_controller.restart_device(args.device_id) + return device_controller.restart_device(args.device_id, args.force) def storage_node__add_device(self, sub_command, args): return device_controller.add_device(args.device_id) @@ -308,7 +315,7 @@ def storage_node__remove_jm_device(self, sub_command, args): return device_controller.remove_jm_device(args.jm_device_id, args.force) def storage_node__restart_jm_device(self, sub_command, args): - return device_controller.restart_jm_device(args.jm_device_id, args.force) + return device_controller.restart_jm_device(args.jm_device_id, args.force, args.format) def storage_node__send_cluster_map(self, sub_command, args): node_id = args.node_id @@ -326,6 +333,9 @@ def storage_node__dump_lvstore(self, sub_command, args): node_id = args.node_id return storage_ops.dump_lvstore(node_id) + def storage_node__new_device_from_failed(self, sub_command, args): + return device_controller.new_device_from_failed(args.device_id) + def storage_node__set(self, sub_command, args): return storage_ops.set_value(args.node_id, args.attr_name, args.attr_value) diff --git a/simplyblock_cli/scripts/cli-wrapper.jinja2 b/simplyblock_cli/scripts/cli-wrapper.jinja2 index 423b11992..80932e582 100644 --- a/simplyblock_cli/scripts/cli-wrapper.jinja2 +++ b/simplyblock_cli/scripts/cli-wrapper.jinja2 @@ -5,7 +5,7 @@ import logging import sys import traceback -from 
simplyblock_cli.clibase import CLIWrapperBase, range_type, regex_type, size_type +from simplyblock_cli.clibase import CLIWrapperBase, range_type, size_type from simplyblock_core import utils class CLIWrapper(CLIWrapperBase): diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index fa11d2f4c..78e06ccd7 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -15,7 +15,7 @@ from docker.errors import DockerException from simplyblock_core import utils, scripts, constants, mgmt_node_ops, storage_node_ops -from simplyblock_core.controllers import cluster_events, device_controller, qos_controller +from simplyblock_core.controllers import cluster_events, device_controller, qos_controller, tasks_controller from simplyblock_core.db_controller import DBController from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule @@ -333,6 +333,7 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.contact_point = contact_point cluster.disable_monitoring = disable_monitoring cluster.mode = mode + cluster.full_page_unmap = False if mode == "docker": if not disable_monitoring: @@ -359,7 +360,7 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, if ingress_host_source == "hostip": dns_name = dev_ip - + _set_max_result_window(os_endpoint) _add_graylog_input(graylog_endpoint, monitoring_secret) @@ -438,7 +439,7 @@ def _run_fio(mount_point) -> None: def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit, distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count, - max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, + max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, cr_name=None, cr_namespace=None, cr_plural=None, fabric="tcp", cluster_ip=None, grafana_secret=None) -> str: @@ -489,7 +490,7 @@ 
def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid) if enable_monitoring == "true": graylog_endpoint = constants.GRAYLOG_K8S_ENDPOINT - os_endpoint = constants.OS_K8S_ENDPOINT + os_endpoint = constants.OS_K8S_ENDPOINT _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret) _set_max_result_window(os_endpoint) @@ -498,7 +499,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn if cluster.mode == "kubernetes": utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) - + cluster.distr_ndcs = distr_ndcs cluster.distr_npcs = distr_npcs cluster.distr_bs = distr_bs @@ -524,6 +525,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn protocols = parse_protocols(fabric) cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] + cluster.full_page_unmap = False cluster.status = Cluster.STATUS_UNREADY cluster.create_dt = str(datetime.datetime.now()) @@ -634,9 +636,8 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: snode.lvstore_status = "failed" snode.write_to_db() logger.error(f"Failed to restore lvstore on node {snode.get_id()}") - if not force: - set_cluster_status(cl_id, ols_status) - raise ValueError("Failed to activate cluster") + set_cluster_status(cl_id, ols_status) + raise ValueError("Failed to activate cluster") snodes = db_controller.get_storage_nodes_by_cluster_id(cl_id) for snode in snodes: @@ -658,10 +659,8 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: snode.lvstore_status = "failed" snode.write_to_db() logger.error(f"Failed to restore lvstore on node {snode.get_id()}") - if not force: - logger.error("Failed to activate cluster") - set_cluster_status(cl_id, ols_status) - raise ValueError("Failed to activate cluster") + set_cluster_status(cl_id, ols_status) + raise 
ValueError("Failed to activate cluster") # reorder qos classes ids qos_classes = db_controller.get_qos(cl_id) @@ -682,6 +681,15 @@ def cluster_activate(cl_id, force=False, force_lvstore_create=False) -> None: if not ret: logger.error(f"Failed to set Alcemls QOS on node: {node.get_id()}") + # Start JC compression on each node + if ols_status == Cluster.STATUS_UNREADY: + for node in db_controller.get_storage_nodes_by_cluster_id(cl_id): + if node.status == StorageNode.STATUS_ONLINE: + ret, err = node.rpc_client().jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if not ret: + logger.info("Failed to resume JC compression adding task...") + tasks_controller.add_jc_comp_resume_task(node.cluster_id, node.get_id(), jm_vuid=node.jm_vuid) + if not cluster.cluster_max_size: cluster = db_controller.get_cluster_by_id(cl_id) cluster.cluster_max_size = max_size @@ -1169,6 +1177,7 @@ def get_logs(cluster_id, limit=50, **kwargs) -> t.List[dict]: if record.event in ["device_status", "node_status"]: msg = msg+f" ({record.count})" + logger.debug(record) out.append({ "Date": record.get_date_string(), "NodeId": record.node_id, @@ -1191,10 +1200,6 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, logger.info("Updating mgmt cluster") if cluster.mode == "docker": - sbcli=constants.SIMPLY_BLOCK_CLI_NAME - subprocess.check_call(f"pip install {sbcli} --upgrade".split(' ')) - logger.info(f"{sbcli} upgraded") - cluster_docker = utils.get_docker_client(cluster_id) logger.info(f"Pulling image {constants.SIMPLY_BLOCK_DOCKER_IMAGE}") pull_docker_image_with_retry(cluster_docker, constants.SIMPLY_BLOCK_DOCKER_IMAGE) @@ -1208,7 +1213,7 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, for service in cluster_docker.services.list(): if image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \ "simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']: - if service.name == 
"app_CachingNodeMonitor": + if service.name in ["app_CachingNodeMonitor", "app_CachedLVolStatsCollector"]: logger.info(f"Removing service {service.name}") service.remove() else: @@ -1333,7 +1338,12 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, logger.info(f"Restarting node: {node.get_id()} with SPDK image: {spdk_image}") else: logger.info(f"Restarting node: {node.get_id()}") - storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image) + try: + storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image) + except Exception as e: + logger.debug(e) + logger.error(f"Failed to restart node: {node.get_id()}") + return logger.info("Done") diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index 08b101d0c..ff5bd484f 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -27,7 +27,6 @@ def get_config_var(name, default=None): KVD_DB_FILE_PATH = os.getenv('FDB_CLUSTER_FILE', '/etc/foundationdb/fdb.cluster') KVD_DB_TIMEOUT_MS = 10000 SPK_DIR = '/home/ec2-user/spdk' -RPC_HTTP_PROXY_PORT = 8080 LOG_LEVEL = logging.INFO LOG_WEB_LEVEL = logging.DEBUG LOG_WEB_DEBUG = True if LOG_WEB_LEVEL == logging.DEBUG else False @@ -93,7 +92,7 @@ def get_config_var(name, default=None): MIN_SYS_MEMORY_FOR_LVOL = 524288000 EXTRA_SMALL_POOL_COUNT = 4096 EXTRA_LARGE_POOL_COUNT = 10240 -EXTRA_HUGE_PAGE_MEMORY = 1147483648 +EXTRA_HUGE_PAGE_MEMORY = 3221225472 EXTRA_SYS_MEMORY = 0.10 INSTANCE_STORAGE_DATA = { @@ -133,13 +132,10 @@ def get_config_var(name, default=None): LVOL_NVME_CONNECT_NR_IO_QUEUES=3 LVOL_NVME_KEEP_ALIVE_TO=10 LVOL_NVME_KEEP_ALIVE_TO_TCP=7 -LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "") -LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100 QPAIR_COUNT=32 CLIENT_QPAIR_COUNT=3 NVME_TIMEOUT_US=8000000 NVMF_MAX_SUBSYSTEMS=50000 -HA_JM_COUNT=3 KATO=10000 ACK_TO=11 BDEV_RETRY=0 @@ -158,9 +154,7 @@ def 
get_config_var(name, default=None): LINUX_DRV_MASS_STORAGE_ID = 1 LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID = 8 -NODE_NVMF_PORT_START=9060 -NODE_HUBLVOL_PORT_START=9030 NODES_CONFIG_FILE = "/etc/simplyblock/sn_config_file" SYSTEM_INFO_FILE = "/etc/simplyblock/system_info" @@ -234,4 +228,12 @@ def get_config_var(name, default=None): qos_class_meta_and_migration_weight_percent = 25 -MIG_PARALLEL_JOBS = 64 \ No newline at end of file +MIG_PARALLEL_JOBS = 64 +MIG_JOB_SIZE = 64 + +# ports ranges +RPC_PORT_RANGE_START = 8080 +LVOL_NVMF_PORT_START = 9100 +NODE_NVMF_PORT_START=9060 +NODE_HUBLVOL_PORT_START=9030 +FW_PORT_START = 50001 diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py index e3a62f4ea..b51801302 100644 --- a/simplyblock_core/controllers/device_controller.py +++ b/simplyblock_core/controllers/device_controller.py @@ -1,5 +1,6 @@ import time import logging +import uuid from simplyblock_core import distr_controller, utils, storage_node_ops from simplyblock_core.controllers import device_events, tasks_controller @@ -8,7 +9,7 @@ from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.prom_client import PromClient from simplyblock_core.rpc_client import RPCClient - +from simplyblock_core.snode_client import SNodeClient logger = logging.getLogger() @@ -69,7 +70,9 @@ def device_set_state(device_id, state): for node in snodes: if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: continue - node.remote_devices = storage_node_ops._connect_to_remote_devs(node) + remote_devices = storage_node_ops._connect_to_remote_devs(node) + node = db_controller.get_storage_node_by_id(node.get_id()) + node.remote_devices = remote_devices node.write_to_db() distr_controller.send_dev_status_event(device, device.status) @@ -122,7 +125,7 @@ def get_alceml_name(alceml_id): return f"alceml_{alceml_id}" -def _def_create_device_stack(device_obj, snode, force=False): +def 
_def_create_device_stack(device_obj, snode, force=False, clear_data=False): db_controller = DBController() rpc_client = RPCClient( @@ -156,7 +159,7 @@ def _def_create_device_stack(device_obj, snode, force=False): if alceml_name not in bdev_names: ret = snode.create_alceml( alceml_name, nvme_bdev, alceml_id, - pba_init_mode=2, + pba_init_mode=3 if clear_data else 2, write_protection=cluster.distr_ndcs > 1, pba_page_size=cluster.page_size_in_blocks, full_page_unmap=cluster.full_page_unmap @@ -241,6 +244,10 @@ def restart_device(device_id, force=False): device_obj = dev break + if not device_obj: + logger.error("device not found") + return False + task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: logger.error(f"Restart task found: {task_id}, can not restart device") @@ -251,6 +258,17 @@ def restart_device(device_id, force=False): device_set_retries_exhausted(device_id, True) device_set_unavailable(device_id) + if not snode.rpc_client().bdev_nvme_controller_list(device_obj.nvme_controller): + try: + ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_spdk(device_obj.pcie_address) + logger.debug(ret) + snode.rpc_client().bdev_nvme_controller_attach(device_obj.nvme_controller, device_obj.pcie_address) + snode.rpc_client().bdev_examine(f"{device_obj.nvme_controller}n1") + snode.rpc_client().bdev_wait_for_examine() + except Exception as e: + logger.error(e) + return False + ret = _def_create_device_stack(device_obj, snode, force=force) if not ret: @@ -264,22 +282,33 @@ def restart_device(device_id, force=False): device_set_online(device_id) device_events.device_restarted(device_obj) - # add to jm raid - if snode.jm_device and snode.jm_device.raid_bdev and snode.jm_device.status != JMDevice.STATUS_REMOVED: - # looking for jm partition - rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - jm_dev_part = f"{dev.nvme_bdev[:-1]}1" - ret = 
rpc_client.get_bdevs(jm_dev_part) - if ret: - logger.info(f"JM part found: {jm_dev_part}") + if snode.jm_device and snode.jm_device.status != JMDevice.STATUS_REMOVED: + if not snode.jm_device.raid_bdev: if snode.jm_device.status == JMDevice.STATUS_UNAVAILABLE: - restart_jm_device(snode.jm_device.get_id(), force=True) - - if snode.jm_device.status == JMDevice.STATUS_ONLINE and \ - jm_dev_part not in snode.jm_device.jm_nvme_bdev_list: - remove_jm_device(snode.jm_device.get_id(), force=True) - time.sleep(3) - restart_jm_device(snode.jm_device.get_id(), force=True) + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) + else: + # looking for jm partition + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) + jm_dev_part = f"{dev.nvme_bdev[:-1]}1" + ret = rpc_client.get_bdevs(jm_dev_part) + if ret: + logger.info(f"JM part found: {jm_dev_part}") + if snode.jm_device.status == JMDevice.STATUS_UNAVAILABLE: + if snode.rpc_client().get_bdevs(snode.jm_device.raid_bdev): + logger.info("Raid found, setting jm device online") + ret = snode.rpc_client().bdev_raid_get_bdevs() + has_bdev = any( + bdev["name"] == jm_dev_part + for raid in ret + for bdev in raid.get("base_bdevs_list", []) + ) + if not has_bdev: + logger.info(f"Adding to raid: {jm_dev_part}") + snode.rpc_client().bdev_raid_add_base_bdev(snode.jm_device.raid_bdev, jm_dev_part) + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) + else: + logger.info("Raid not found, restarting jm device") + restart_jm_device(snode.jm_device.get_id(), force=True) return "Done" @@ -338,16 +367,24 @@ def device_remove(device_id, force=True): logger.error(e) return False + device = None for dev in snode.nvme_devices: if dev.get_id() == device_id: device = dev break + if not device: + logger.error("device not found") + return False + if device.status == NVMeDevice.STATUS_REMOVED: return True - if device.status == NVMeDevice.STATUS_FAILED: + + if device.status 
in [NVMeDevice.STATUS_FAILED, NVMeDevice.STATUS_FAILED_AND_MIGRATED, + NVMeDevice.STATUS_NEW]: logger.error(f"Unsupported device status: {device.status}") - return False + if force is False: + return False task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: @@ -362,33 +399,46 @@ def device_remove(device_id, force=True): distr_controller.disconnect_device(device) logger.info("Removing device fabric") - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password) + rpc_client = snode.rpc_client() + node_bdev = {} + ret = rpc_client.get_bdevs() + if ret: + for b in ret: + node_bdev[b['name']] = b + for al in b['aliases']: + node_bdev[al] = b + + if rpc_client.subsystem_list(device.nvmf_nqn): + logger.info("Removing device subsystem") + ret = rpc_client.subsystem_delete(device.nvmf_nqn) + if not ret: + logger.error(f"Failed to remove subsystem: {device.nvmf_nqn}") + if not force: + return False - ret = rpc_client.subsystem_delete(device.nvmf_nqn) - if not ret: - logger.error(f"Failed to remove subsystem: {device.nvmf_nqn}") - if not force: - return False + if f"{device.alceml_bdev}_PT" in node_bdev or force: + logger.info("Removing device PT") + ret = rpc_client.bdev_PT_NoExcl_delete(f"{device.alceml_bdev}_PT") + if not ret: + logger.error(f"Failed to remove bdev: {device.alceml_bdev}_PT") + if not force: + return False - logger.info("Removing device bdevs") - ret = rpc_client.bdev_PT_NoExcl_delete(f"{device.alceml_bdev}_PT") - if not ret: - logger.error(f"Failed to remove bdev: {device.alceml_bdev}_PT") - if not force: - return False - ret = rpc_client.bdev_alceml_delete(device.alceml_bdev) - if not ret: - logger.error(f"Failed to remove bdev: {device.alceml_bdev}") - if not force: - return False - ret = rpc_client.qos_vbdev_delete(device.qos_bdev) - if not ret: - logger.error(f"Failed to remove bdev: {device.qos_bdev}") - if not force: - return False - if snode.enable_test_device: + if 
device.alceml_bdev in node_bdev or force: + ret = rpc_client.bdev_alceml_delete(device.alceml_bdev) + if not ret: + logger.error(f"Failed to remove bdev: {device.alceml_bdev}") + if not force: + return False + + if device.qos_bdev in node_bdev or force: + ret = rpc_client.qos_vbdev_delete(device.qos_bdev) + if not ret: + logger.error(f"Failed to remove bdev: {device.qos_bdev}") + if not force: + return False + + if snode.enable_test_device and device.testing_bdev in node_bdev or force: ret = rpc_client.bdev_passtest_delete(device.testing_bdev) if not ret: logger.error(f"Failed to remove bdev: {device.testing_bdev}") @@ -397,8 +447,9 @@ def device_remove(device_id, force=True): device_set_state(device_id, NVMeDevice.STATUS_REMOVED) - # remove device from jm raid - if snode.jm_device.raid_bdev: + if not snode.jm_device.raid_bdev: + remove_jm_device(snode.jm_device.get_id()) + else: nvme_controller = device.nvme_controller dev_to_remove = None for part in snode.jm_device.jm_nvme_bdev_list: @@ -407,11 +458,49 @@ def device_remove(device_id, force=True): break if dev_to_remove: - if snode.jm_device.status == NVMeDevice.STATUS_ONLINE: - remove_jm_device(snode.jm_device.get_id(), force=True) - time.sleep(3) + raid_found = False + for raid_info in rpc_client.bdev_raid_get_bdevs(): + if raid_info["name"] == snode.jm_device.raid_bdev: + raid_found = True + base_bdevs = raid_info.get("base_bdevs_list", []) + if any(bdev["name"] == dev_to_remove for bdev in base_bdevs): + remove_from_jm_device(snode.jm_device.get_id(), dev_to_remove) + if not raid_found: + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) + + return True + + +def remove_from_jm_device(device_id, jm_bdev): + db_controller = DBController() + + try: + snode = get_storage_node_by_jm_device(db_controller, device_id) + except KeyError as e: + logger.error(e) + return False + + if snode.status == StorageNode.STATUS_ONLINE: + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, 
snode.rpc_username, snode.rpc_password) - restart_jm_device(snode.jm_device.get_id(), force=True) + if snode.jm_device.raid_bdev: + logger.info("device part of raid1: only remove from raid") + try: + has_any = False + for raid_info in rpc_client.bdev_raid_get_bdevs(): + if raid_info["name"] == snode.jm_device.raid_bdev: + base_bdevs = raid_info.get("base_bdevs_list", []) + if any(bdev["name"] and bdev["name"] != jm_bdev for bdev in base_bdevs): + has_any = True + if has_any: + rpc_client.bdev_raid_remove_base_bdev(jm_bdev) + return True + else: + set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) + + except KeyError as e: + logger.error(e) + return False return True @@ -441,7 +530,7 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True): if not records_number: return False else: - records_number = 20 + records_number = records_count # records = db_controller.get_device_capacity(device, records_number) cap_stats_keys = [ @@ -589,14 +678,15 @@ def device_set_failed(device_id): logger.error(e) return False + if dev.status != NVMeDevice.STATUS_REMOVED: + logger.error(f"Device must be in removed status, current status: {dev.status}") + return False + task_id = tasks_controller.get_active_dev_restart_task(snode.cluster_id, device_id) if task_id: logger.error(f"Restart task found: {task_id}, can not fail device") return False - if dev.status == NVMeDevice.STATUS_FAILED: - return True - ret = device_set_state(device_id, NVMeDevice.STATUS_FAILED) if not ret: logger.warning("Failed to set device state to failed") @@ -606,6 +696,7 @@ def device_set_failed(device_id): rpc_client.distr_replace_id_in_map_prob(dev.cluster_device_order, -1) tasks_controller.add_device_failed_mig_task(device_id) + return True def add_device(device_id, add_migration_task=True): @@ -621,14 +712,18 @@ def add_device(device_id, add_migration_task=True): logger.error("Device must be in new state") return False + device_obj = None for dev in 
snode.nvme_devices: if dev.get_id() == device_id: device_obj = dev break + if not device_obj: + logger.error("device not found") + return False + logger.info(f"Adding device {device_id}") - # if snode.num_partitions_per_dev == 0 or device_obj.is_partition: - ret = _def_create_device_stack(device_obj, snode, force=True) + ret = _def_create_device_stack(device_obj, snode, force=True, clear_data=True) if not ret: logger.error("Failed to create device stack") return False @@ -655,81 +750,6 @@ def add_device(device_id, add_migration_task=True): tasks_controller.add_new_device_mig_task(device_id) return device_id - # - # # create partitions - # partitions = snode.num_partitions_per_dev - # rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - # # look for partitions - # partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj) - # logger.debug("partitioned_devices") - # logger.debug(partitioned_devices) - # if len(partitioned_devices) == partitions+1: - # logger.info("Partitioned devices found") - # else: - # logger.info(f"Creating partitions for {device_obj.nvme_bdev}") - # storage_node_ops._create_device_partitions(rpc_client, device_obj, snode, partitions, snode.jm_percent) - # partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj) - # if len(partitioned_devices) == partitions+1: - # logger.info("Device partitions created") - # else: - # logger.error("Failed to create partitions") - # return False - # - # jm_part = partitioned_devices.pop(0) - # new_devices = [] - # dev_order = storage_node_ops.get_next_cluster_device_order(db_controller, snode.cluster_id) - # for dev in partitioned_devices: - # new_device = storage_node_ops._create_storage_device_stack(rpc_client, dev, snode, after_restart=False) - # if not new_device: - # logger.error("failed to create dev stack") - # continue - # - # new_device.cluster_device_order = dev_order - # dev_order += 1 - # 
device_events.device_create(new_device) - # new_devices.append(new_device) - # - # if new_devices: - # snode.nvme_devices.remove(device_obj) - # snode.nvme_devices.extend(new_devices) - # snode.write_to_db(db_controller.kv_store) - # else: - # logger.error("failed to create devices") - # return False - # - # for dev in new_devices: - # distr_controller.send_cluster_map_add_device(dev, snode) - # - # logger.info("Make other nodes connect to the node devices") - # snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id) - # for node in snodes: - # if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: - # continue - # node.remote_devices = storage_node_ops._connect_to_remote_devs(node) - # node.write_to_db() - # for dev in new_devices: - # distr_controller.send_cluster_map_add_device(dev, node) - # - # for dev in new_devices: - # tasks_controller.add_new_device_mig_task(dev.get_id()) - # - # # add to jm raid - # if snode.jm_device and snode.jm_device.raid_bdev and jm_part: - # # looking for jm partition - # jm_dev_part = jm_part.nvme_bdev - # ret = rpc_client.get_bdevs(jm_dev_part) - # if ret: - # logger.info(f"JM part found: {jm_dev_part}") - # if snode.jm_device.status in [JMDevice.STATUS_UNAVAILABLE, JMDevice.STATUS_REMOVED]: - # restart_jm_device(snode.jm_device.get_id(), force=True, format_alceml=True) - # - # if snode.jm_device.status == JMDevice.STATUS_ONLINE and \ - # jm_dev_part not in snode.jm_device.jm_nvme_bdev_list: - # remove_jm_device(snode.jm_device.get_id(), force=True) - # restart_jm_device(snode.jm_device.get_id(), force=True) - # - # return "Done" - def device_set_failed_and_migrated(device_id): db_controller = DBController() @@ -922,3 +942,58 @@ def restart_jm_device(device_id, force=False, format_alceml=False): set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) return True + + +def new_device_from_failed(device_id): + db_controller = DBController() + device = None + device_node = None + 
for node in db_controller.get_storage_nodes(): + for dev in node.nvme_devices: + if dev.get_id() == device_id: + device = dev + device_node = node + break + + if not device: + logger.info(f"Device not found: {device_id}") + return False + + if not device_node: + logger.info("node not found") + return False + + if device.status != NVMeDevice.STATUS_FAILED_AND_MIGRATED: + logger.error(f"Device status: {device.status} but expected status is {NVMeDevice.STATUS_FAILED_AND_MIGRATED}") + return False + + if device.serial_number.endswith("_failed"): + logger.error("Device is already added back from failed") + return False + + if not device_node.rpc_client().bdev_nvme_controller_list(device.nvme_controller): + try: + ret = SNodeClient(device_node.api_endpoint, timeout=30, retry=1).bind_device_to_spdk(device.pcie_address) + logger.debug(ret) + device_node.rpc_client().bdev_nvme_controller_attach(device.nvme_controller, device.pcie_address) + except Exception as e: + logger.error(e) + return False + + if not device_node.rpc_client().bdev_nvme_controller_list(device.nvme_controller): + logger.error(f"Failed to find device nvme controller {device.nvme_controller}") + return False + + new_device = NVMeDevice(device.to_dict()) + new_device.uuid = str(uuid.uuid4()) + new_device.status = NVMeDevice.STATUS_NEW + new_device.cluster_device_order = -1 + new_device.deleted = False + new_device.io_error = False + new_device.retries_exhausted = False + device_node.nvme_devices.append(new_device) + + device.serial_number = f"{device.serial_number}_failed" + device_node.write_to_db(db_controller.kv_store) + logger.info(f"New device created from failed device: {device_id}, new device id: {new_device.get_id()}") + return new_device.get_id() \ No newline at end of file diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py index 94855f111..c8f35265a 100644 --- a/simplyblock_core/controllers/health_controller.py +++ 
b/simplyblock_core/controllers/health_controller.py @@ -9,7 +9,7 @@ from simplyblock_core.db_controller import DBController from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.models.cluster import Cluster -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.rpc_client import RPCClient from simplyblock_core.snode_client import SNodeClient @@ -18,7 +18,7 @@ logger = utils.get_logger(__name__) -def check_bdev(name, *, rpc_client=None, bdev_names=None): +def check_bdev(name, *, rpc_client=None, bdev_names=None) -> bool: present = ( ((bdev_names is not None) and (name in bdev_names)) or (rpc_client is not None and (rpc_client.get_bdevs(name) is not None)) @@ -27,7 +27,7 @@ def check_bdev(name, *, rpc_client=None, bdev_names=None): return present -def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None): +def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None) -> bool: if rpc_client: subsystem = subsystems[0] if (subsystems := rpc_client.subsystem_list(nqn)) is not None else None elif nqns: @@ -59,7 +59,7 @@ def check_subsystem(nqn, *, rpc_client=None, nqns=None, ns_uuid=None): for listener in listeners: logger.info(f"Checking listener {listener['traddr']}:{listener['trsvcid']} ... 
ok") - return bool(listeners) and namespaces + return bool(listeners) and bool(namespaces) def check_cluster(cluster_id): @@ -109,15 +109,17 @@ def _check_node_rpc(rpc_ip, rpc_port, rpc_username, rpc_password, timeout=5, ret ret = rpc_client.get_version() if ret: logger.debug(f"SPDK version: {ret['version']}") - return True + return True, True + else: + return True, False except Exception as e: logger.debug(e) - return False + return False, False def _check_node_api(ip): try: - snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2) + snode_api = SNodeClient(f"{ip}:5000", timeout=90, retry=2) logger.debug(f"Node API={ip}:5000") ret, _ = snode_api.is_live() logger.debug(f"snode is alive: {ret}") @@ -129,42 +131,34 @@ def _check_node_api(ip): def _check_spdk_process_up(ip, rpc_port, cluster_id): - try: - snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2) - logger.debug(f"Node API={ip}:5000") - is_up, _ = snode_api.spdk_process_is_up(rpc_port, cluster_id) - logger.debug(f"SPDK is {is_up}") - return is_up - except Exception as e: - logger.debug(e) - return False - - -def _check_port_on_node(snode, port_id): - try: - fw_api = FirewallClient(snode, timeout=5, retry=2) - iptables_command_output, _ = fw_api.get_firewall(snode.rpc_port) - if type(iptables_command_output) is str: - iptables_command_output = [iptables_command_output] - for rules in iptables_command_output: - result = jc.parse('iptables', rules) - for chain in result: - if chain['chain'] in ["INPUT", "OUTPUT"]: # type: ignore - for rule in chain['rules']: # type: ignore - if str(port_id) in rule['options']: # type: ignore - action = rule['target'] # type: ignore - if action in ["DROP"]: - return False - - # check RDMA port block - if snode.active_rdma: - rdma_fw_port_list = snode.rpc_client().nvmf_get_blocked_ports_rdma() - if port_id in rdma_fw_port_list: - return False + snode_api = SNodeClient(f"{ip}:5000", timeout=90, retry=2) + logger.debug(f"Node API={ip}:5000") + is_up, _ = 
snode_api.spdk_process_is_up(rpc_port, cluster_id) + logger.debug(f"SPDK is {is_up}") + return is_up + + +def check_port_on_node(snode, port_id): + fw_api = FirewallClient(snode, timeout=5, retry=2) + iptables_command_output, _ = fw_api.get_firewall(snode.rpc_port) + if type(iptables_command_output) is str: + iptables_command_output = [iptables_command_output] + for rules in iptables_command_output: + result = jc.parse('iptables', rules) + for chain in result: + if chain['chain'] in ["INPUT", "OUTPUT"]: # type: ignore + for rule in chain['rules']: # type: ignore + if str(port_id) in rule['options']: # type: ignore + action = rule['target'] # type: ignore + if action in ["DROP"]: + return False + + # check RDMA port block + if snode.active_rdma: + rdma_fw_port_list = snode.rpc_client().nvmf_get_blocked_ports_rdma() + if port_id in rdma_fw_port_list: + return False - return True - except Exception as e: - logger.error(e) return True @@ -175,7 +169,7 @@ def _check_node_ping(ip): else: return False -def _check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns=None): +def _check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns=None) -> bool: if not node.hublvol: logger.error(f"Node {node.get_id()} does not have a hublvol") return False @@ -235,15 +229,17 @@ def _check_node_hublvol(node: StorageNode, node_bdev_names=None, node_lvols_nqns passed = False else: lvs_info_dict.append({"Key": k, "Value": v, "expected": " "}) - for line in utils.print_table(lvs_info_dict).splitlines(): - logger.info(line) + if not passed: + for line in utils.print_table(lvs_info_dict).splitlines(): + logger.info(line) except Exception as e: logger.exception(e) + return False return passed -def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=None, auto_fix=False): +def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=None, auto_fix=False) -> bool: db_controller = DBController() try: primary_node = 
db_controller.get_storage_node_by_id(node.lvstore_stack_secondary_1) @@ -294,6 +290,16 @@ def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=N passed = bool(ret) logger.info(f"Checking controller: {primary_node.hublvol.bdev_name} ... {passed}") + node_bdev = {} + ret = rpc_client.get_bdevs() + if ret: + for b in ret: + node_bdev[b['name']] = b + for al in b['aliases']: + node_bdev[al]= b + else: + node_bdev = [] + passed &= check_bdev(primary_node.hublvol.get_remote_bdev_name(), bdev_names=node_bdev) if not passed: return False @@ -331,20 +337,20 @@ def _check_sec_node_hublvol(node: StorageNode, node_bdev=None, node_lvols_nqns=N else: lvs_info_dict.append({"Key": k, "Value": v, "expected": " "}) - for line in utils.print_table(lvs_info_dict).splitlines(): - logger.info(line) + if not passed: + for line in utils.print_table(lvs_info_dict).splitlines(): + logger.info(line) except Exception as e: logger.exception(e) + return False return passed def _check_node_lvstore( - lvstore_stack, node, auto_fix=False, node_bdev_names=None, stack_src_node=None): + lvstore_stack, node, auto_fix=False, node_bdev_names=None, stack_src_node=None) -> bool: db_controller = DBController() - lvstore_check = True logger.info(f"Checking distr stack on node : {node.get_id()}") - rpc_client = RPCClient( - node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=1) + cluster = db_controller.get_cluster_by_id(node.cluster_id) if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: auto_fix = False @@ -367,12 +373,24 @@ def _check_node_lvstore( node_distribs_list = bdev["distribs_list"] if not node_bdev_names: - ret = rpc_client.get_bdevs() + try: + ret = node.rpc_client().get_bdevs() + except Exception as e: + logger.info(e) + return False + if ret: node_bdev_names = [b['name'] for b in ret] else: node_bdev_names = [] + nodes = {} + devices = {} + for n in db_controller.get_storage_nodes(): + 
nodes[n.get_id()] = n + for dev in n.nvme_devices: + devices[dev.get_id()] = dev + for distr in distribs_list: if distr in node_bdev_names: logger.info(f"Checking distr bdev : {distr} ... ok") @@ -386,22 +404,34 @@ def _check_node_lvstore( for jm in jm_names: logger.info(jm) logger.info("Checking Distr map ...") - ret = rpc_client.distr_get_cluster_map(distr) + try: + ret = node.rpc_client().distr_get_cluster_map(distr) + except Exception as e: + logger.info(f"Failed to get cluster map: {e}") + return False if not ret: logger.error("Failed to get cluster map") - lvstore_check = False + return False else: - results, is_passed = distr_controller.parse_distr_cluster_map(ret) + results, is_passed = distr_controller.parse_distr_cluster_map(ret, nodes, devices) if results: - logger.info(utils.print_table(results)) logger.info(f"Checking Distr map ... {is_passed}") - if not is_passed and auto_fix: + if is_passed: + continue + + elif not auto_fix: + return False + + else: # is_passed is False and auto_fix is True + logger.info(utils.print_table(results)) for result in results: if result['Results'] == 'failed': if result['Kind'] == "Device": if result['Found Status']: dev = db_controller.get_storage_device_by_id(result['UUID']) - if dev.status == NVMeDevice.STATUS_ONLINE: + dev_node = db_controller.get_storage_node_by_id(dev.node_id) + if dev.status == NVMeDevice.STATUS_ONLINE and dev_node.status in [ + StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: try: remote_bdev = storage_node_ops.connect_device( f"remote_{dev.alceml_bdev}", dev, node, @@ -413,44 +443,67 @@ def _check_node_lvstore( if dev.get_id() == rem_dev.get_id(): continue new_remote_devices.append(rem_dev) - dev.remote_bdev = remote_bdev - new_remote_devices.append(dev) + + remote_device = RemoteDevice() + remote_device.uuid = dev.uuid + remote_device.alceml_name = dev.alceml_name + remote_device.node_id = dev.node_id + remote_device.size = dev.size + remote_device.status = NVMeDevice.STATUS_ONLINE + 
remote_device.nvmf_multipath = dev.nvmf_multipath + remote_device.remote_bdev = remote_bdev + new_remote_devices.append(remote_device) n.remote_devices = new_remote_devices n.write_to_db() distr_controller.send_dev_status_event(dev, dev.status, node) except Exception as e: logger.error(f"Failed to connect to {dev.get_id()}: {e}") + else: + if dev_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + distr_controller.send_dev_status_event(dev, dev.status, node) + if result['Kind'] == "Node": n = db_controller.get_storage_node_by_id(result['UUID']) distr_controller.send_node_status_event(n, n.status, node) - ret = rpc_client.distr_get_cluster_map(distr) + + try: + ret = node.rpc_client().distr_get_cluster_map(distr) + except Exception as e: + logger.error(e) + return False if not ret: logger.error("Failed to get cluster map") - lvstore_check = False + return False else: - results, is_passed = distr_controller.parse_distr_cluster_map(ret) + results, is_passed = distr_controller.parse_distr_cluster_map(ret, nodes, devices) logger.info(f"Checking Distr map ... {is_passed}") + if not is_passed: + return False else: logger.error("Failed to parse distr cluster map") - lvstore_check &= is_passed + return False else: logger.info(f"Checking distr bdev : {distr} ... not found") - lvstore_check = False + return False if raid: if raid in node_bdev_names: logger.info(f"Checking raid bdev: {raid} ... ok") else: logger.info(f"Checking raid bdev: {raid} ... not found") - lvstore_check = False + return False if bdev_lvstore: - ret = rpc_client.bdev_lvol_get_lvstores(bdev_lvstore) + try: + ret = node.rpc_client().bdev_lvol_get_lvstores(bdev_lvstore) + except Exception as e: + logger.error(e) + return False if ret: logger.info(f"Checking lvstore: {bdev_lvstore} ... ok") else: logger.info(f"Checking lvstore: {bdev_lvstore} ... 
not found") - lvstore_check = False - return lvstore_check + return False + return True def check_node(node_id, with_devices=True): db_controller = DBController() @@ -479,7 +532,7 @@ def check_node(node_id, with_devices=True): logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") # 3- check node RPC - node_rpc_check = _check_node_rpc( + node_rpc_check, _ = _check_node_rpc( snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") @@ -493,13 +546,19 @@ def check_node(node_id, with_devices=True): if snode.lvstore_stack_secondary_1: try: n = db_controller.get_storage_node_by_id(snode.lvstore_stack_secondary_1) - lvol_port_check = _check_port_on_node(snode, n.lvol_subsys_port) + lvol_port_check = check_port_on_node(snode, n.lvol_subsys_port) logger.info(f"Check: node {snode.mgmt_ip}, port: {n.lvol_subsys_port} ... {lvol_port_check}") except KeyError: - pass + logger.error("node not found") + except Exception: + logger.error("Check node port failed, connection error") + if not snode.is_secondary_node: - lvol_port_check = _check_port_on_node(snode, snode.lvol_subsys_port) - logger.info(f"Check: node {snode.mgmt_ip}, port: {snode.lvol_subsys_port} ... {lvol_port_check}") + try: + lvol_port_check = check_port_on_node(snode, snode.lvol_subsys_port) + logger.info(f"Check: node {snode.mgmt_ip}, port: {snode.lvol_subsys_port} ... 
{lvol_port_check}") + except Exception: + logger.error("Check node port failed, connection error") is_node_online = ping_check and node_api_check and node_rpc_check @@ -722,17 +781,23 @@ def check_lvol_on_node(lvol_id, node_id, node_bdev_names=None, node_lvols_nqns=N if not node_bdev_names: node_bdev_names = {} - ret = rpc_client.get_bdevs() - if ret: - for bdev in ret: - node_bdev_names[bdev['name']] = bdev + try: + ret = rpc_client.get_bdevs() + if ret: + for bdev in ret: + node_bdev_names[bdev['name']] = bdev + except Exception as e: + logger.error(f"Failed to connect to node's SPDK: {e}") if not node_lvols_nqns: node_lvols_nqns = {} - ret = rpc_client.subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub + try: + ret = rpc_client.subsystem_list() + if ret: + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + except Exception as e: + logger.error(f"Failed to connect to node's SPDK: {e}") passed = True try: diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index be8c4fc55..6cdbfd476 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -281,6 +281,9 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, host_node = nodes[0] else: return False, f"Can not find storage node: {host_id_or_name}" + if host_node.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {host_node.get_id()}") + return False, f"LVol sync deletion found on node: {host_node.get_id()}" if namespace: try: @@ -456,14 +459,19 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp, lvol.nqn = cl.nqn + ":lvol:" + lvol.uuid lvol.max_namespace_per_subsys = max_namespace_per_subsys - nodes = [] - if host_node: - nodes.insert(0, host_node) - else: + if not host_node: nodes = _get_next_3_nodes(cl.get_id(), lvol.size) if not nodes: return False, "No nodes found with enough resources 
to create the LVol" - host_node = nodes[0] + for n in nodes: + if n.lvol_sync_del(): + logger.warning(f"LVol sync delete task found on node: {n.get_id()}, skipping") + else: + host_node = n + break + if not host_node: + return False, "No nodes found with enough resources to create the LVol" + s_node = db_controller.get_storage_node_by_id(host_node.secondary_node_id) attr_name = f"active_{fabric}" is_active_primary = getattr(host_node, attr_name) @@ -1381,6 +1389,10 @@ def resize_lvol(id, new_size): snode = db_controller.get_storage_node_by_id(lvol.node_id) + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {snode.get_id()}") + return False, f"LVol sync deletion found on node: {snode.get_id()}" + logger.info(f"Resizing LVol: {lvol.get_id()}") logger.info(f"Current size: {utils.humanbytes(lvol.size)}, new size: {utils.humanbytes(new_size)}") diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index d3eca0e00..685f0864b 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -49,9 +49,14 @@ def add(lvol_id, snapshot_name): if sn.snap_name == snapshot_name: return False, f"Snapshot name must be unique: {snapshot_name}" - logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}") snode = db_controller.get_storage_node_by_id(lvol.node_id) + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: {snode.get_id()}") + return False, f"LVol sync deletion found on node: {snode.get_id()}" + + logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}") + rec = db_controller.get_lvol_stats(lvol, 1) if rec: size = rec[0].size_used @@ -381,6 +386,10 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None logger.exception(msg) return False, msg + if snode.lvol_sync_del(): + logger.error(f"LVol sync deletion found on node: 
{snode.get_id()}") + return False, f"LVol sync deletion found on node: {snode.get_id()}" + cluster = db_controller.get_cluster_by_id(pool.cluster_id) if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]: return False, f"Cluster is not active, status: {cluster.status}" diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py index 6bef257fd..486daa8ee 100644 --- a/simplyblock_core/controllers/storage_events.py +++ b/simplyblock_core/controllers/storage_events.py @@ -148,4 +148,3 @@ def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR): node_mgmt_ip=node.mgmt_ip, updates={"nvmf_port": node.nvmf_port, "rpc_port": node.rpc_port, "lvol_port": node.lvol_subsys_port}, ) - \ No newline at end of file diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index b7c434f63..a51425861 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -100,11 +100,13 @@ def add_device_mig_task(device_id_list, cluster_id): device = db.get_storage_device_by_id(device_id_list[0]) tasks = db.get_job_tasks(cluster_id) + master_task = None for task in tasks: if task.function_name == JobSchedule.FN_BALANCING_AFTER_NODE_RESTART : if task.status != JobSchedule.STATUS_DONE and task.canceled is False: - logger.info(f"Task found, skip adding new task: {task.get_id()}") - return False + logger.info("Master task found, skip adding new master task") + master_task = task + break for node in db.get_storage_nodes_by_cluster_id(cluster_id): if node.status == StorageNode.STATUS_REMOVED: @@ -117,16 +119,19 @@ def add_device_mig_task(device_id_list, cluster_id): if task_id: sub_tasks.append(task_id) if sub_tasks: - task_obj = JobSchedule() - task_obj.uuid = str(uuid.uuid4()) - task_obj.cluster_id = cluster_id - task_obj.date = int(time.time()) - task_obj.function_name = 
JobSchedule.FN_BALANCING_AFTER_NODE_RESTART - task_obj.sub_tasks = sub_tasks - task_obj.status = JobSchedule.STATUS_NEW - task_obj.write_to_db(db.kv_store) - tasks_events.task_create(task_obj) - + if master_task: + master_task.sub_tasks.extend(sub_tasks) + master_task.write_to_db() + else: + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster_id + task_obj.date = int(time.time()) + task_obj.function_name = JobSchedule.FN_BALANCING_AFTER_NODE_RESTART + task_obj.sub_tasks = sub_tasks + task_obj.status = JobSchedule.STATUS_NEW + task_obj.write_to_db(db.kv_store) + tasks_events.task_create(task_obj) return True @@ -140,10 +145,13 @@ def add_node_to_auto_restart(node): Cluster.STATUS_READONLY, Cluster.STATUS_UNREADY]: logger.warning(f"Cluster is not active, skip node auto restart, status: {cluster.status}") return False + offline_nodes = 0 for sn in db.get_storage_nodes_by_cluster_id(node.cluster_id): if node.get_id() != sn.get_id() and sn.status != StorageNode.STATUS_ONLINE and node.mgmt_ip != sn.mgmt_ip: - logger.info("Node found that is not online, skip node auto restart") - return False + offline_nodes += 1 + if offline_nodes > cluster.distr_npcs : + logger.info("Node found that is not online, skip node auto restart") + return False return _add_task(JobSchedule.FN_NODE_RESTART, node.cluster_id, node.get_id(), "", max_retry=11) @@ -155,13 +163,15 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): return False data = [] - tasks = db.get_job_tasks(cluster_id, reverse=True, limit=limit) + tasks = db.get_job_tasks(cluster_id, reverse=True) tasks.reverse() if is_json is True: for t in tasks: if t.function_name == JobSchedule.FN_DEV_MIG: continue data.append(t.get_clean_dict()) + if len(data)+1 > limit > 0: + return json.dumps(data, indent=2) return json.dumps(data, indent=2) for task in tasks: @@ -171,7 +181,7 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): retry = f"{task.retry}/{task.max_retry}" 
else: retry = f"{task.retry}" - + logger.debug(task) upd = task.updated_at if upd: try: @@ -197,6 +207,8 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): "Result": task.function_result, "Updated At": upd or "", }) + if len(data)+1 > limit > 0: + return utils.print_table(data) return utils.print_table(data) @@ -239,6 +251,7 @@ def get_subtasks(master_task_id): except Exception as e: logger.error(e) + logger.debug(sub_task) data.append({ "Task ID": sub_task.uuid, "Node ID / Device ID": f"{sub_task.node_id}\n{sub_task.device_id}", @@ -308,7 +321,8 @@ def add_new_device_mig_task(device_id): def add_node_add_task(cluster_id, function_params): - return _add_task(JobSchedule.FN_NODE_ADD, cluster_id, "", "", function_params=function_params) + return _add_task(JobSchedule.FN_NODE_ADD, cluster_id, "", "", + function_params=function_params, max_retry=11) def get_active_node_tasks(cluster_id, node_id): @@ -339,7 +353,7 @@ def get_new_device_mig_task(cluster_id, node_id, distr_name, dev_id=None): def get_device_mig_task(cluster_id, node_id, device_id, distr_name): tasks = db.get_job_tasks(cluster_id) for task in tasks: - if task.function_name == JobSchedule.FN_DEV_MIG and task.node_id == node_id and task.device_id == device_id: + if task.function_name == JobSchedule.FN_DEV_MIG and task.node_id == node_id: if task.status != JobSchedule.STATUS_DONE and task.canceled is False \ and "distr_name" in task.function_params and task.function_params["distr_name"] == distr_name: return task.uuid @@ -393,9 +407,9 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0): return False -def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name): +def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name, primary_node): return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "", - function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10) + function_params={"lvol_bdev_name": lvol_bdev_name, "primary_node": primary_node}, max_retry=10) def 
get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): tasks = db.get_job_tasks(cluster_id) diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py index 277d1b68a..f0939b97c 100644 --- a/simplyblock_core/db_controller.py +++ b/simplyblock_core/db_controller.py @@ -2,7 +2,7 @@ import os.path import fdb -from typing import List +from typing import List, Optional from simplyblock_core import constants from simplyblock_core.models.cluster import Cluster @@ -17,8 +17,7 @@ from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject, LVolStatObject, \ PoolStatObject, CachedLVolStatObject -from simplyblock_core.models.storage_node import StorageNode - +from simplyblock_core.models.storage_node import StorageNode, NodeLVolDelLock class Singleton(type): @@ -309,3 +308,10 @@ def get_qos(self, cluster_id=None) -> List[QOSClass]: else: classes = QOSClass().read_from_db(self.kv_store) return sorted(classes, key=lambda x: x.class_id) + + def get_lvol_del_lock(self, node_id) -> Optional[NodeLVolDelLock]: + ret = NodeLVolDelLock().read_from_db(self.kv_store, id=node_id) + if ret: + return ret[0] + else: + return None diff --git a/simplyblock_core/distr_controller.py b/simplyblock_core/distr_controller.py index e50115f62..420b9e3fe 100644 --- a/simplyblock_core/distr_controller.py +++ b/simplyblock_core/distr_controller.py @@ -2,6 +2,7 @@ import datetime import logging import re +import threading from simplyblock_core import utils from simplyblock_core.models.nvme_device import NVMeDevice @@ -26,6 +27,7 @@ def send_node_status_event(node, node_status, target_node=None): events = {"events": [node_status_event]} logger.debug(node_status_event) skipped_nodes = [] + connect_threads = [] if target_node: snodes = [target_node] else: @@ -45,10 +47,14 @@ def send_node_status_event(node, node_status, target_node=None): if node_found_same_host: continue 
logger.info(f"Sending to: {node.get_id()}") - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1) - ret = rpc_client.distr_status_events_update(events) - if not ret: - logger.warning("Failed to send event update") + t = threading.Thread( + target=_send_event_to_node, + args=(node, events,)) + connect_threads.append(t) + t.start() + + for t in connect_threads: + t.join() def send_dev_status_event(device, status, target_node=None): @@ -57,7 +63,7 @@ def send_dev_status_event(device, status, target_node=None): db_controller = DBController() storage_ID = device.cluster_device_order skipped_nodes = [] - + connect_threads = [] if target_node: snodes = [db_controller.get_storage_node_by_id(target_node.get_id())] else: @@ -67,7 +73,8 @@ def send_dev_status_event(device, status, target_node=None): skipped_nodes.append(node) for node in snodes: - if node.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if node.status in [StorageNode.STATUS_OFFLINE, StorageNode.STATUS_REMOVED]: + logger.info(f"skipping node: {node.get_id()} with status: {node.status}") continue node_found_same_host = False for n in skipped_nodes: @@ -95,10 +102,14 @@ def send_dev_status_event(device, status, target_node=None): "storage_ID": storage_ID, "status": dev_status}]} logger.debug(f"Sending event updates, device: {storage_ID}, status: {dev_status}, node: {node.get_id()}") - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1) - ret = rpc_client.distr_status_events_update(events) - if not ret: - logger.warning("Failed to send event update") + t = threading.Thread( + target=_send_event_to_node, + args=(node,events,)) + connect_threads.append(t) + t.start() + + for t in connect_threads: + t.join() def disconnect_device(device): @@ -192,12 +203,20 @@ def get_distr_cluster_map(snodes, target_node, distr_name=""): return cl_map -def 
parse_distr_cluster_map(map_string): +def parse_distr_cluster_map(map_string, nodes=None, devices=None): db_controller = DBController() node_pattern = re.compile(r".*uuid_node=(.*) status=(.*)$", re.IGNORECASE) device_pattern = re.compile( r".*storage_ID=(.*) status=(.*) uuid_device=(.*) storage_bdev_name=(.*)$", re.IGNORECASE) + if not nodes or not devices: + nodes = {} + devices = {} + for n in db_controller.get_storage_nodes(): + nodes[n.get_id()] = n + for dev in n.nvme_devices: + devices[dev.get_id()] = dev + results = [] passed = True for line in map_string.split('\n'): @@ -213,8 +232,7 @@ def parse_distr_cluster_map(map_string): "Results": "", } try: - nd = db_controller.get_storage_node_by_id(node_id) - node_status = nd.status + node_status = nodes[node_id].status if node_status == StorageNode.STATUS_SCHEDULABLE: node_status = StorageNode.STATUS_UNREACHABLE data["Desired Status"] = node_status @@ -238,7 +256,7 @@ def parse_distr_cluster_map(map_string): "Results": "", } try: - sd = db_controller.get_storage_device_by_id(device_id) + sd = devices[device_id] data["Desired Status"] = sd.status if sd.status == status: data["Results"] = "ok" @@ -252,38 +270,26 @@ def parse_distr_cluster_map(map_string): return results, passed -def send_cluster_map_to_node(node): +def send_cluster_map_to_node(node: StorageNode): db_controller = DBController() snodes = db_controller.get_storage_nodes_by_cluster_id(node.cluster_id) - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=10) - - # if node.lvstore_stack_secondary_1: - # for snode in db_controller.get_primary_storage_nodes_by_secondary_node_id(node.get_id()): - # for bdev in snode.lvstore_stack: - # if bdev['type'] == "bdev_distr": - # cluster_map_data = get_distr_cluster_map(snodes, node, bdev["name"]) - # ret = rpc_client.distr_send_cluster_map(cluster_map_data) - # if not ret: - # logger.error("Failed to send cluster map") - # return False - # return True - # else: 
cluster_map_data = get_distr_cluster_map(snodes, node) - ret = rpc_client.distr_send_cluster_map(cluster_map_data) - if not ret: + try: + node.rpc_client(timeout=10).distr_send_cluster_map(cluster_map_data) + except Exception: logger.error("Failed to send cluster map") logger.info(cluster_map_data) return False return True -def send_cluster_map_to_distr(node, distr_name): +def send_cluster_map_to_distr(node: StorageNode, distr_name: str): db_controller = DBController() snodes = db_controller.get_storage_nodes_by_cluster_id(node.cluster_id) - rpc_client = RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=10) cluster_map_data = get_distr_cluster_map(snodes, node, distr_name) - ret = rpc_client.distr_send_cluster_map(cluster_map_data) - if not ret: + try: + node.rpc_client(timeout=10).distr_send_cluster_map(cluster_map_data) + except Exception: logger.error("Failed to send cluster map") logger.info(cluster_map_data) return False @@ -294,14 +300,13 @@ def send_cluster_map_add_node(snode, target_node): if target_node.status != StorageNode.STATUS_ONLINE: return False logger.info(f"Sending to: {target_node.get_id()}") - rpc_client = RPCClient(target_node.mgmt_ip, target_node.rpc_port, target_node.rpc_username, target_node.rpc_password, timeout=5) - cluster_map_data = get_distr_cluster_map([snode], target_node) cl_map = { "map_cluster": cluster_map_data['map_cluster'], "map_prob": cluster_map_data['map_prob']} - ret = rpc_client.distr_add_nodes(cl_map) - if not ret: + try: + target_node.rpc_client(timeout=10).distr_add_nodes(cl_map) + except Exception: logger.error("Failed to send cluster map") return False return True @@ -353,10 +358,20 @@ def send_cluster_map_add_device(device: NVMeDevice, target_node: StorageNode): "bdev_name": name, "status": device.status, "weight": dev_w_gib, + "physical_label": device.physical_label if device.physical_label > 0 else -1, }} } - ret = rpc_client.distr_add_devices(cl_map) - if not ret: + try: + 
rpc_client.distr_add_devices(cl_map) + except Exception: logger.error("Failed to send cluster map") return False return True + + +def _send_event_to_node(node, events): + try: + node.rpc_client(timeout=1, retry=0).distr_status_events_update(events) + except Exception as e: + logger.warning("Failed to send event update") + logger.error(e) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 30b2e6563..d9502d695 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,6 +1,6 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev -SIMPLY_BLOCK_VERSION=19.2.27 +SIMPLY_BLOCK_VERSION=19.2.30 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:hotfix-to-main -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=simplyblock/spdk:hotfix-to-main-latest +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=simplyblock/spdk:main-latest diff --git a/simplyblock_core/fw_api_client.py b/simplyblock_core/fw_api_client.py index d17255c80..8f089ce5c 100644 --- a/simplyblock_core/fw_api_client.py +++ b/simplyblock_core/fw_api_client.py @@ -6,6 +6,7 @@ from requests.adapters import HTTPAdapter from urllib3 import Retry + logger = logging.getLogger() @@ -18,7 +19,7 @@ class FirewallClient: def __init__(self, node, timeout=300, retry=5): self.node = node - self.ip_address = f"{node.mgmt_ip}:5001" + self.ip_address = f"{node.mgmt_ip}:{node.firewall_port}" self.url = 'http://%s/' % self.ip_address self.timeout = timeout self.session = requests.session() @@ -41,7 +42,7 @@ def _request(self, method, path, payload=None): response = self.session.request(method, self.url+path, data=data, timeout=self.timeout, params=params) except Exception as e: - raise e + raise FirewallClientException(str(e)) logger.debug("Response: status_code: %s, content: %s", response.status_code, response.content) diff --git a/simplyblock_core/models/nvme_device.py b/simplyblock_core/models/nvme_device.py index b86e25c44..82749e30a 100644 --- 
a/simplyblock_core/models/nvme_device.py +++ b/simplyblock_core/models/nvme_device.py @@ -47,25 +47,39 @@ class NVMeDevice(BaseModel): nvmf_nqn: str = "" nvmf_port: int = 0 nvmf_multipath: bool = False - overload_percentage: int = 0 # Unused - partition_jm_bdev: str = "" # Unused - partition_jm_size: int = 0 # Unused - partition_main_bdev: str = "" # Unused - partition_main_size: int = 0 # Unused - partitions_count: int = 0 # Unused pcie_address: str = "" physical_label: int = 0 pt_bdev: str = "" qos_bdev: str = "" remote_bdev: str = "" retries_exhausted: bool = False - sequential_number: int = 0 # Unused serial_number: str = "" size: int = -1 testing_bdev: str = "" connecting_from_node: str = "" previous_status: str = "" + def __change_dev_connection_to(self, connecting_from_node): + from simplyblock_core.db_controller import DBController + db = DBController() + for n in db.get_storage_nodes(): + if n.nvme_devices: + for d in n.nvme_devices: + if d.get_id() == self.get_id(): + d.connecting_from_node = connecting_from_node + n.write_to_db() + break + + def lock_device_connection(self, node_id): + self.__change_dev_connection_to(node_id) + + def release_device_connection(self): + self.__change_dev_connection_to("") + + def is_connection_in_progress_to_node(self, node_id): + if self.connecting_from_node and self.connecting_from_node == node_id: + return True + class JMDevice(NVMeDevice): @@ -73,3 +87,18 @@ class JMDevice(NVMeDevice): jm_bdev: str = "" jm_nvme_bdev_list: List[str] = [] raid_bdev: str = "" + + +class RemoteDevice(BaseModel): + + remote_bdev: str = "" + alceml_name: str = "" + node_id: str = "" + size: int = -1 + nvmf_multipath: bool = False + + +class RemoteJMDevice(RemoteDevice): + + jm_bdev: str = "" + diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py index 967f041a5..b8fdd62f0 100644 --- a/simplyblock_core/models/storage_node.py +++ b/simplyblock_core/models/storage_node.py @@ -4,10 +4,11 @@ from uuid 
import uuid4 from simplyblock_core import utils -from simplyblock_core.models.base_model import BaseNodeObject +from simplyblock_core.models.base_model import BaseNodeObject, BaseModel from simplyblock_core.models.hublvol import HubLVol from simplyblock_core.models.iface import IFace -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice, RemoteJMDevice from simplyblock_core.rpc_client import RPCClient, RPCException logger = utils.get_logger(__name__) @@ -79,8 +80,8 @@ class StorageNode(BaseNodeObject): pollers_mask: str = "" primary_ip: str = "" raid: str = "" - remote_devices: List[NVMeDevice] = [] - remote_jm_devices: List[JMDevice] = [] + remote_devices: List[RemoteDevice] = [] + remote_jm_devices: List[RemoteJMDevice] = [] rpc_password: str = "" rpc_port: int = -1 rpc_username: str = "" @@ -105,8 +106,8 @@ class StorageNode(BaseNodeObject): hublvol: HubLVol = None # type: ignore[assignment] active_tcp: bool = True active_rdma: bool = False - lvol_sync_del_queue: List[str] = [] socket: int = 0 + firewall_port: int = 5001 def rpc_client(self, **kwargs): """Return rpc client to this node @@ -309,6 +310,8 @@ def create_alceml(self, name, nvme_bdev, uuid, **kwargs): ) def wait_for_jm_rep_tasks_to_finish(self, jm_vuid): + if not self.rpc_client().bdev_lvol_get_lvstores(self.lvstore): + return True # no lvstore means no need to wait retry = 10 while retry > 0: try: @@ -327,3 +330,48 @@ def wait_for_jm_rep_tasks_to_finish(self, jm_vuid): except Exception: logger.warning("Failed to get replication task!") return False + + def lvol_sync_del(self) -> bool: + from simplyblock_core.db_controller import DBController + db_controller = DBController() + lock = db_controller.get_lvol_del_lock(self.get_id()) + if lock: + return True + return False + + def lvol_del_sync_lock(self) -> bool: + from 
simplyblock_core.db_controller import DBController + db_controller = DBController() + lock = db_controller.get_lvol_del_lock(self.get_id()) + if not lock: + lock = NodeLVolDelLock({"uuid": self.uuid}) + lock.write_to_db() + logger.info(f"Created lvol_del_sync_lock on node: {self.get_id()}") + return True + + def lvol_del_sync_lock_reset(self) -> bool: + from simplyblock_core.db_controller import DBController + db_controller = DBController() + task_found = False + tasks = db_controller.get_job_tasks(self.cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == self.secondary_node_id: + if task.status != JobSchedule.STATUS_DONE and task.canceled is False: + task_found = True + break + + lock = db_controller.get_lvol_del_lock(self.get_id()) + if task_found: + if not lock: + lock = NodeLVolDelLock({"uuid": self.uuid}) + lock.write_to_db() + logger.info(f"Created lvol_del_sync_lock on node: {self.get_id()}") + else: + if lock: + lock.remove(db_controller.kv_store) + logger.info(f"remove lvol_del_sync_lock from node: {self.get_id()}") + return True + + +class NodeLVolDelLock(BaseModel): + pass \ No newline at end of file diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 8be2c9d6a..e2235b8d9 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -109,11 +109,10 @@ def _request2(self, method, params=None): if params: payload['params'] = params try: - logger.debug("Requesting method: %s, params: %s", method, params) + logger.debug("From: %s, Requesting method: %s, params: %s", self.ip_address, method, params) response = self.session.post(self.url, data=json.dumps(payload), timeout=self.timeout) - except Exception as e: - logger.error(e) - return False, str(e) + except Exception: + raise RPCException("connection error") ret_code = response.status_code ret_content = response.content @@ -581,7 +580,7 @@ def get_lvol_stats(self, uuid=""): params["uuid"] = uuid 
return self._request("bdev_get_iostat", params) - def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4): + def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4, superblock=False): try: ret = self.get_bdevs(name) if ret: @@ -593,7 +592,8 @@ def bdev_raid_create(self, name, bdevs_list, raid_level="0", strip_size_kb=4): "raid_level": raid_level, "strip_size_kb": strip_size_kb, "base_bdevs": bdevs_list, - "io_unmap_limit": 100 + "io_unmap_limit": 100, + "superblock": superblock } if raid_level == "1": params["strip_size_kb"] = 0 @@ -928,7 +928,7 @@ def distr_migration_status(self, name): params = {"name": name} return self._request("distr_migration_status", params) - def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=64, jobs=64): + def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS): params = { "name": name, "storage_ID": storage_ID, @@ -941,7 +941,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals params["jobs"] = jobs return self._request("distr_migration_failure_start", params) - def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=64, jobs=64): + def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS): params = { "name": name, } @@ -960,10 +960,9 @@ def bdev_raid_add_base_bdev(self, raid_bdev, base_bdev): } return self._request("bdev_raid_add_base_bdev", params) - def bdev_raid_remove_base_bdev(self, raid_bdev, base_bdev): + def bdev_raid_remove_base_bdev(self, base_bdev): params = { - "raid_bdev": raid_bdev, - "base_bdev": base_bdev, + "name": base_bdev, } return self._request("bdev_raid_remove_base_bdev", params) @@ -1148,7 +1147,7 @@ def jc_suspend_compression(self, jm_vuid, suspend=False): "jm_vuid": jm_vuid, "suspend": suspend, } 
- return self._request("jc_suspend_compression", params) + return self._request2("jc_suspend_compression", params) def nvmf_subsystem_add_listener(self, nqn, trtype, traddr, trsvcid, ana_state=None): params = { @@ -1235,3 +1234,15 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") + + def bdev_raid_get_bdevs(self): + params = { + "category": "online" + } + return self._request("bdev_raid_get_bdevs", params) + + + + + + diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index fd79f43c1..eb29d68b6 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -130,6 +130,7 @@ services: - 80:80 - 12202:12202 - 9200:9200 + - 9090:9090 networks: - localnet - monitoring-net diff --git a/simplyblock_core/scripts/haproxy.cfg b/simplyblock_core/scripts/haproxy.cfg index d95d3ebec..667989baf 100644 --- a/simplyblock_core/scripts/haproxy.cfg +++ b/simplyblock_core/scripts/haproxy.cfg @@ -65,6 +65,11 @@ backend graylog_input_services balance roundrobin server-template graylog_input- 1 graylog:12201 check resolvers docker init-addr libc,none +backend prometheus_input_services + mode tcp + balance roundrobin + server-template prometheus_input- 1 prometheus:9090 check resolvers docker init-addr libc,none + backend opensearch_services balance roundrobin http-request set-path %[path,regsub(^/opensearch/?,/)] @@ -85,3 +90,8 @@ frontend graylog_input_front bind *:12202 mode tcp default_backend graylog_input_services + +frontend prometheus_input_front + bind *:9090 + mode tcp + default_backend prometheus_input_services diff --git a/simplyblock_core/services/capacity_and_stats_collector.py b/simplyblock_core/services/capacity_and_stats_collector.py index 022dd84b5..07a850edd 100644 --- a/simplyblock_core/services/capacity_and_stats_collector.py +++ 
b/simplyblock_core/services/capacity_and_stats_collector.py @@ -4,7 +4,6 @@ from simplyblock_core import constants, db_controller, utils from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient from simplyblock_core.models.stats import DeviceStatObject, NodeStatObject, ClusterStatObject logger = utils.get_logger(__name__) @@ -62,17 +61,17 @@ def add_device_stats(cl, device, capacity_dict, stats_dict): if last_record: time_diff = (now - last_record.date) if time_diff > 0: - data['read_bytes_ps'] = int((data['read_bytes'] - last_record['read_bytes']) / time_diff) - data['read_io_ps'] = int((data['read_io'] - last_record['read_io']) / time_diff) - data['read_latency_ps'] = int((data['read_latency_ticks'] - last_record['read_latency_ticks']) / time_diff) + data['read_bytes_ps'] = abs(int((data['read_bytes'] - last_record['read_bytes']) / time_diff)) + data['read_io_ps'] = abs(int((data['read_io'] - last_record['read_io']) / time_diff)) + data['read_latency_ps'] = abs(int((data['read_latency_ticks'] - last_record['read_latency_ticks']) / time_diff)) - data['write_bytes_ps'] = int((data['write_bytes'] - last_record['write_bytes']) / time_diff) - data['write_io_ps'] = int((data['write_io'] - last_record['write_io']) / time_diff) - data['write_latency_ps'] = int((data['write_latency_ticks'] - last_record['write_latency_ticks']) / time_diff) + data['write_bytes_ps'] = abs(int((data['write_bytes'] - last_record['write_bytes']) / time_diff)) + data['write_io_ps'] = abs(int((data['write_io'] - last_record['write_io']) / time_diff)) + data['write_latency_ps'] = abs(int((data['write_latency_ticks'] - last_record['write_latency_ticks']) / time_diff)) - data['unmap_bytes_ps'] = int((data['unmap_bytes'] - last_record['unmap_bytes']) / time_diff) - data['unmap_io_ps'] = int((data['unmap_io'] - last_record['unmap_io']) / time_diff) - data['unmap_latency_ps'] = 
int((data['unmap_latency_ticks'] - last_record['unmap_latency_ticks']) / time_diff) + data['unmap_bytes_ps'] = abs(int((data['unmap_bytes'] - last_record['unmap_bytes']) / time_diff)) + data['unmap_io_ps'] = abs(int((data['unmap_io'] - last_record['unmap_io']) / time_diff)) + data['unmap_latency_ps'] = abs(int((data['unmap_latency_ticks'] - last_record['unmap_latency_ticks']) / time_diff)) else: logger.warning("last record not found") @@ -188,15 +187,15 @@ def add_cluster_stats(cl, records): logger.error("No devices found in node: %s", node.get_id()) continue - rpc_client = RPCClient( - node.mgmt_ip, node.rpc_port, - node.rpc_username, node.rpc_password, - timeout=5, retry=2) - + rpc_client = node.rpc_client(timeout=5, retry=2) node_devs_stats = {} - ret = rpc_client.get_lvol_stats() - if ret: - node_devs_stats = {b['name']: b for b in ret['bdevs']} + try: + ret = rpc_client.get_lvol_stats() + if ret: + node_devs_stats = {b['name']: b for b in ret['bdevs']} + except Exception as e: + logger.error(e) + continue devices_records = [] for device in node.nvme_devices: @@ -204,7 +203,11 @@ def add_cluster_stats(cl, records): if device.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: logger.info(f"Device is skipped: {device.get_id()} status: {device.status}") continue - capacity_dict = rpc_client.alceml_get_capacity(device.alceml_name) + try: + capacity_dict = rpc_client.alceml_get_capacity(device.alceml_name) + except Exception as e: + logger.error(e) + continue if device.nvme_bdev in node_devs_stats: stats_dict = node_devs_stats[device.nvme_bdev] record = add_device_stats(cl, device, capacity_dict, stats_dict) diff --git a/simplyblock_core/services/health_check_service.py b/simplyblock_core/services/health_check_service.py index bb48e9620..8fc5f0489 100644 --- a/simplyblock_core/services/health_check_service.py +++ b/simplyblock_core/services/health_check_service.py @@ -1,4 +1,5 @@ # coding=utf-8 +import threading 
import time from datetime import datetime @@ -10,10 +11,10 @@ from simplyblock_core.rpc_client import RPCClient from simplyblock_core import constants, db_controller, distr_controller, storage_node_ops -logger = utils.get_logger(__name__) - utils.init_sentry_sdk() +logger = utils.get_logger(__name__) + def set_node_health_check(snode, health_check_status): snode = db.get_storage_node_by_id(snode.get_id()) @@ -42,223 +43,242 @@ def set_device_health_check(cluster_id, device, health_check_status): return -# get DB controller -db = db_controller.DBController() +def check_node(snode): -logger.info("Starting health check service") -while True: - clusters = db.get_clusters() - for cluster in clusters: - cluster_id = cluster.get_id() - snodes = db.get_storage_nodes_by_cluster_id(cluster_id) - if not snodes: - logger.warning("storage nodes list is empty") - - for snode in snodes: - logger.info("Node: %s, status %s", snode.get_id(), snode.status) - - if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, - StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - logger.info(f"Node status is: {snode.status}, skipping") - set_node_health_check(snode, False) - for device in snode.nvme_devices: - set_device_health_check(cluster_id, device, False) - continue - - # 1- check node ping - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") - - # 2- check node API - node_api_check = health_controller._check_node_api(snode.mgmt_ip) - logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") - - # 3- check node RPC - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... 
{node_rpc_check}") - - is_node_online = ping_check and node_api_check and node_rpc_check - - health_check_status = is_node_online - if node_rpc_check: - logger.info(f"Node device count: {len(snode.nvme_devices)}") - node_devices_check = True - node_remote_devices_check = True - - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, - timeout=3, retry=2) - connected_devices = [] - - node_bdevs = rpc_client.get_bdevs() - if node_bdevs: - # node_bdev_names = [b['name'] for b in node_bdevs] - node_bdev_names = {} - for b in node_bdevs: - node_bdev_names[b['name']] = b - for al in b['aliases']: - node_bdev_names[al] = b - else: - node_bdev_names = {} - - subsystem_list = rpc_client.subsystem_list() or [] - subsystems = { - subsystem['nqn']: subsystem - for subsystem - in subsystem_list - } - - for device in snode.nvme_devices: - passed = True - - if device.io_error: - logger.info(f"Device io_error {device.get_id()}") - passed = False - - if device.status != NVMeDevice.STATUS_ONLINE: - logger.info(f"Device status {device.status}") - passed = False - - if snode.enable_test_device: - bdevs_stack = [device.nvme_bdev, device.testing_bdev, device.alceml_bdev, device.pt_bdev] - else: - bdevs_stack = [device.nvme_bdev, device.alceml_bdev, device.pt_bdev] - - logger.info(f"Checking Device: {device.get_id()}, status:{device.status}") - problems = 0 - for bdev in bdevs_stack: - if not bdev: - continue - - if not health_controller.check_bdev(bdev, bdev_names=node_bdev_names): - problems += 1 - passed = False - - logger.info(f"Checking Device's BDevs ... 
({(len(bdevs_stack) - problems)}/{len(bdevs_stack)})") - - passed &= health_controller.check_subsystem(device.nvmf_nqn, nqns=subsystems) - - set_device_health_check(cluster_id, device, passed) - if device.status == NVMeDevice.STATUS_ONLINE: - node_devices_check &= passed - - logger.info(f"Node remote device: {len(snode.remote_devices)}") - - for remote_device in snode.remote_devices: - org_dev = db.get_storage_device_by_id(remote_device.get_id()) - org_node = db.get_storage_node_by_id(remote_device.node_id) - if org_dev.status == NVMeDevice.STATUS_ONLINE and org_node.status == StorageNode.STATUS_ONLINE: - if health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names): - connected_devices.append(remote_device.get_id()) - continue - - if not org_dev.alceml_bdev: - logger.error(f"device alceml bdev not found!, {org_dev.get_id()}") - continue - - try: - storage_node_ops.connect_device( - f"remote_{org_dev.alceml_bdev}", org_dev, snode, - bdev_names=list(node_bdev_names), reattach=False, - ) - connected_devices.append(org_dev.get_id()) - sn = db.get_storage_node_by_id(snode.get_id()) - for d in sn.remote_devices: - if d.get_id() == remote_device.get_id(): - d.status = NVMeDevice.STATUS_ONLINE - sn.write_to_db() - break - distr_controller.send_dev_status_event(org_dev, NVMeDevice.STATUS_ONLINE, snode) - except RuntimeError: - logger.error(f"Failed to connect to device: {org_dev.get_id()}") - node_remote_devices_check = False - - connected_jms = [] - if snode.jm_device and snode.jm_device.get_id(): - jm_device = snode.jm_device - logger.info(f"Node JM: {jm_device.get_id()}") - if jm_device.jm_bdev in node_bdev_names: - logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... 
ok") - connected_jms.append(jm_device.get_id()) + snode = db.get_storage_node_by_id(snode.get_id()) + logger.info("Node: %s, status %s", snode.get_id(), snode.status) + + if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, + StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + logger.info(f"Node status is: {snode.status}, skipping") + set_node_health_check(snode, False) + for device in snode.nvme_devices: + set_device_health_check(snode.cluster_id, device, False) + return + + # 1- check node ping + ping_check = health_controller._check_node_ping(snode.mgmt_ip) + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") + + # 2- check node API + node_api_check = health_controller._check_node_api(snode.mgmt_ip) + logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") + + # 3- check node RPC + node_rpc_check = health_controller._check_node_rpc( + snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password) + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... 
{node_rpc_check}") + + is_node_online = ping_check and node_api_check and node_rpc_check + + health_check_status = is_node_online + if node_rpc_check: + logger.info(f"Node device count: {len(snode.nvme_devices)}") + node_devices_check = True + node_remote_devices_check = True + + rpc_client = RPCClient( + snode.mgmt_ip, snode.rpc_port, + snode.rpc_username, snode.rpc_password, + timeout=3, retry=2) + connected_devices = [] + + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + # node_bdev_names = [b['name'] for b in node_bdevs] + node_bdev_names = {} + for b in node_bdevs: + node_bdev_names[b['name']] = b + for al in b['aliases']: + node_bdev_names[al] = b + else: + node_bdev_names = {} + + subsystem_list = rpc_client.subsystem_list() or [] + subsystems = { + subsystem['nqn']: subsystem + for subsystem + in subsystem_list + } + + for device in snode.nvme_devices: + passed = True + + if device.io_error: + logger.info(f"Device io_error {device.get_id()}") + passed = False + + if device.status != NVMeDevice.STATUS_ONLINE: + logger.info(f"Device status {device.status}") + passed = False + + if snode.enable_test_device: + bdevs_stack = [device.nvme_bdev, device.testing_bdev, device.alceml_bdev, device.pt_bdev] + else: + bdevs_stack = [device.nvme_bdev, device.alceml_bdev, device.pt_bdev] + + logger.info(f"Checking Device: {device.get_id()}, status:{device.status}") + problems = 0 + for bdev in bdevs_stack: + if not bdev: + continue + + if not health_controller.check_bdev(bdev, bdev_names=node_bdev_names): + problems += 1 + passed = False + + logger.info(f"Checking Device's BDevs ... 
({(len(bdevs_stack) - problems)}/{len(bdevs_stack)})") + + passed &= health_controller.check_subsystem(device.nvmf_nqn, nqns=subsystems) + + set_device_health_check(snode.cluster_id, device, passed) + if device.status == NVMeDevice.STATUS_ONLINE: + node_devices_check &= passed + + logger.info(f"Node remote device: {len(snode.remote_devices)}") + + for remote_device in snode.remote_devices: + org_dev = db.get_storage_device_by_id(remote_device.get_id()) + org_node = db.get_storage_node_by_id(remote_device.node_id) + if org_dev.status == NVMeDevice.STATUS_ONLINE and org_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + if health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names): + connected_devices.append(remote_device.get_id()) + continue + + if not org_dev.alceml_bdev: + logger.error(f"device alceml bdev not found!, {org_dev.get_id()}") + continue + + try: + storage_node_ops.connect_device( + f"remote_{org_dev.alceml_bdev}", org_dev, snode, + bdev_names=list(node_bdev_names), reattach=False, + ) + connected_devices.append(org_dev.get_id()) + sn = db.get_storage_node_by_id(snode.get_id()) + for d in sn.remote_devices: + if d.get_id() == remote_device.get_id(): + d.status = NVMeDevice.STATUS_ONLINE + sn.write_to_db() + break + distr_controller.send_dev_status_event(org_dev, NVMeDevice.STATUS_ONLINE, snode) + except RuntimeError: + logger.error(f"Failed to connect to device: {org_dev.get_id()}") + node_remote_devices_check = False + + connected_jms = [] + if snode.jm_device and snode.jm_device.get_id(): + jm_device = snode.jm_device + logger.info(f"Node JM: {jm_device.get_id()}") + if jm_device.jm_bdev in node_bdev_names: + logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... ok") + connected_jms.append(jm_device.get_id()) + else: + logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... 
not found") + + if snode.enable_ha_jm: + logger.info(f"Node remote JMs: {len(snode.remote_jm_devices)}") + for remote_device in snode.remote_jm_devices: + if remote_device.remote_bdev: + check = health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names) + if check: + connected_jms.append(remote_device.get_id()) else: - logger.info(f"Checking jm bdev: {jm_device.jm_bdev} ... not found") - - if snode.enable_ha_jm: - logger.info(f"Node remote JMs: {len(snode.remote_jm_devices)}") - for remote_device in snode.remote_jm_devices: - if remote_device.remote_bdev: - check = health_controller.check_bdev(remote_device.remote_bdev, bdev_names=node_bdev_names) - if check: - connected_jms.append(remote_device.get_id()) - else: + node_remote_devices_check = False + + for jm_id in snode.jm_ids: + if jm_id and jm_id not in connected_jms: + for nd in db.get_storage_nodes(): + if nd.jm_device and nd.jm_device.get_id() == jm_id: + if nd.status == StorageNode.STATUS_ONLINE: node_remote_devices_check = False + break - for jm_id in snode.jm_ids: - if jm_id and jm_id not in connected_jms: - for nd in db.get_storage_nodes(): - if nd.jm_device and nd.jm_device.get_id() == jm_id: - if nd.status == StorageNode.STATUS_ONLINE: - node_remote_devices_check = False - break - - if not node_remote_devices_check and cluster.status in [ - Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - snode = db.get_storage_node_by_id(snode.get_id()) - snode.remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(snode) - snode.write_to_db() - - lvstore_check = True + if not node_remote_devices_check and cluster.status in [ + Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(snode) snode = db.get_storage_node_by_id(snode.get_id()) - if snode.lvstore_status == "ready" or snode.status == StorageNode.STATUS_ONLINE or \ - snode.lvstore_status == "failed" : + 
snode.remote_jm_devices = remote_jm_devices + snode.write_to_db() - lvstore_stack = snode.lvstore_stack + lvstore_check = True + snode = db.get_storage_node_by_id(snode.get_id()) + if snode.lvstore_status == "ready" or snode.status == StorageNode.STATUS_ONLINE or \ + snode.lvstore_status == "failed": + + lvstore_stack = snode.lvstore_stack + lvstore_check &= health_controller._check_node_lvstore( + lvstore_stack, snode, auto_fix=True, node_bdev_names=node_bdev_names) + + if snode.secondary_node_id: + + lvstore_check &= health_controller._check_node_hublvol( + snode, node_bdev_names=node_bdev_names, node_lvols_nqns=subsystems) + + second_node_1 = db.get_storage_node_by_id(snode.secondary_node_id) + if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: lvstore_check &= health_controller._check_node_lvstore( - lvstore_stack, snode, auto_fix=True, node_bdev_names=node_bdev_names) - - if snode.secondary_node_id: - - lvstore_check &= health_controller._check_node_hublvol( - snode, node_bdev_names=node_bdev_names, node_lvols_nqns=subsystems) - - second_node_1 = db.get_storage_node_by_id(snode.secondary_node_id) - if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: - lvstore_check &= health_controller._check_node_lvstore( - lvstore_stack, second_node_1, auto_fix=True, stack_src_node=snode) - sec_node_check = health_controller._check_sec_node_hublvol(second_node_1) - if not sec_node_check: - if snode.status == StorageNode.STATUS_ONLINE: - ret = second_node_1.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if ret: - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - # is_sec_node_leader = True - # check jc_compression status - jc_compression_is_active = second_node_1.rpc_client().jc_compression_get_status(snode.jm_vuid) - if not jc_compression_is_active: - lvstore_check &= health_controller._check_sec_node_hublvol(second_node_1, auto_fix=True) - - - lvol_port_check = False - # if node_api_check: - 
ports = [snode.lvol_subsys_port] - - if snode.lvstore_stack_secondary_1: - second_node_1 = db.get_storage_node_by_id(snode.lvstore_stack_secondary_1) - if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: - ports.append(second_node_1.lvol_subsys_port) - - for port in ports: - lvol_port_check = health_controller._check_port_on_node(snode, port) - logger.info( - f"Check: node {snode.mgmt_ip}, port: {port} ... {lvol_port_check}") - if not lvol_port_check and snode.status != StorageNode.STATUS_SUSPENDED: - tasks_controller.add_port_allow_task(snode.cluster_id, snode.get_id(), port) - - health_check_status = is_node_online and node_devices_check and node_remote_devices_check and lvstore_check - set_node_health_check(snode, bool(health_check_status)) + lvstore_stack, second_node_1, auto_fix=True, stack_src_node=snode) + sec_node_check = health_controller._check_sec_node_hublvol(second_node_1) + if not sec_node_check: + if snode.status == StorageNode.STATUS_ONLINE: + ret = second_node_1.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if ret: + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + # is_sec_node_leader = True + # check jc_compression status + jc_compression_is_active = second_node_1.rpc_client().jc_compression_get_status( + snode.jm_vuid) + if not jc_compression_is_active: + lvstore_check &= health_controller._check_sec_node_hublvol(second_node_1, + auto_fix=True) + + lvol_port_check = False + # if node_api_check: + ports = [snode.lvol_subsys_port] + + if snode.lvstore_stack_secondary_1: + second_node_1 = db.get_storage_node_by_id(snode.lvstore_stack_secondary_1) + if second_node_1 and second_node_1.status == StorageNode.STATUS_ONLINE: + ports.append(second_node_1.lvol_subsys_port) + + for port in ports: + try: + lvol_port_check = health_controller.check_port_on_node(snode, port) + logger.info( + f"Check: node {snode.mgmt_ip}, port: {port} ... 
{lvol_port_check}") + if not lvol_port_check and snode.status != StorageNode.STATUS_SUSPENDED: + tasks_controller.add_port_allow_task(snode.cluster_id, snode.get_id(), port) + except Exception: + logger.error("Check node port failed, connection error") + + health_check_status = is_node_online and node_devices_check and node_remote_devices_check and lvstore_check + set_node_health_check(snode, bool(health_check_status)) + time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) + + +def loop_for_node(snode): + while True: + try: + check_node(snode) + except Exception as e: + logger.error(e) + time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) + + +logger.info("Starting health check service") +db = db_controller.DBController() +threads_maps: dict[str, threading.Thread] = {} +while True: + clusters = db.get_clusters() + for cluster in clusters: + for node in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + node_id = node.get_id() + if node_id not in threads_maps or threads_maps[node_id].is_alive() is False: + t = threading.Thread(target=loop_for_node, args=(node,)) + t.start() + threads_maps[node_id] = t time.sleep(constants.HEALTH_CHECK_INTERVAL_SEC) diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py index 8486f3a32..79c492a40 100644 --- a/simplyblock_core/services/lvol_monitor.py +++ b/simplyblock_core/services/lvol_monitor.py @@ -60,8 +60,8 @@ def resume_comp(lvol): return rpc_client = RPCClient( node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=2) - ret, err = rpc_client.jc_compression_start(jm_vuid=node.jm_vuid) - if err and "code" in err and err["code"] != -2: + ret, err = rpc_client.jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if err: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task(node.cluster_id, node.get_id(), node.jm_vuid) @@ -118,21 +118,24 @@ def process_lvol_delete_finish(lvol): 
lvol_controller.delete_lvol_from_node(lvol.get_id(), leader_node.get_id()) return + if snode.get_id() == leader_node.get_id(): + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + else: + sec_node = db.get_storage_node_by_id(snode.get_id()) + # 3-1 async delete lvol bdev from primary primary_node = db.get_storage_node_by_id(leader_node.get_id()) if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if sec_node and sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + primary_node.lvol_del_sync_lock() ret = lvol_controller.delete_lvol_from_node(lvol.get_id(), primary_node.get_id(), del_async=True) if not ret: logger.error(f"Failed to delete lvol from primary_node node: {primary_node.get_id()}") # 3-2 async delete lvol bdev from secondary - if snode.get_id() == leader_node.get_id(): - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - else: - sec_node = db.get_storage_node_by_id(snode.get_id()) - - if sec_node: - tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}") + if sec_node and sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}", primary_node.get_id()) lvol_events.lvol_delete(lvol) lvol.remove(db.kv_store) @@ -159,195 +162,209 @@ def process_lvol_delete_try_again(lvol): lvol.write_to_db() -# get DB controller -db = db_controller.DBController() - -logger.info("Starting LVol monitor...") -while True: +def check_node(snode): + node_bdev_names = [] + node_lvols_nqns = {} + sec_node_bdev_names = {} + sec_node_lvols_nqns = {} + sec_node = None - for cluster in db.get_clusters(): + if snode.status in [StorageNode.STATUS_ONLINE, 
StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + node_bdevs = snode.rpc_client().get_bdevs() + if node_bdevs: + node_bdev_names = [b['name'] for b in node_bdevs] + for bdev in node_bdevs: + if "aliases" in bdev and bdev["aliases"]: + node_bdev_names.extend(bdev['aliases']) + ret = snode.rpc_client().subsystem_list() + if ret: + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub - if cluster.status in [Cluster.STATUS_INACTIVE, Cluster.STATUS_UNREADY, Cluster.STATUS_IN_ACTIVATION]: - logger.warning(f"Cluster {cluster.get_id()} is in {cluster.status} state, skipping") + if snode.secondary_node_id: + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + sec_rpc_client = RPCClient( + sec_node.mgmt_ip, sec_node.rpc_port, + sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) + ret = sec_rpc_client.get_bdevs() + if ret: + for bdev in ret: + sec_node_bdev_names[bdev['name']] = bdev + + ret = sec_rpc_client.subsystem_list() + if ret: + for sub in ret: + sec_node_lvols_nqns[sub['nqn']] = sub + + for lvol in db.get_lvols_by_node_id(snode.get_id()): + + if lvol.status == LVol.STATUS_IN_CREATION: continue - for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): - node_bdev_names = [] - node_lvols_nqns = {} - sec_node_bdev_names = {} - sec_node_lvols_nqns = {} - sec_node = None - + if lvol.status == lvol.STATUS_IN_DELETION: + # check leadership + leader_node = None + snode = db.get_storage_node_by_id(snode.get_id()) if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - node_bdevs = snode.rpc_client().get_bdevs() - if node_bdevs: - node_bdev_names = [b['name'] for b in node_bdevs] - for bdev in node_bdevs: - if "aliases" in bdev and bdev["aliases"]: - node_bdev_names.extend(bdev['aliases']) - ret = snode.rpc_client().subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - - if 
snode.secondary_node_id: - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - ret = sec_rpc_client.get_bdevs() - if ret: - for bdev in ret: - sec_node_bdev_names[bdev['name']] = bdev - - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - - for lvol in db.get_lvols_by_node_id(snode.get_id()): - - if lvol.status == LVol.STATUS_IN_CREATION: - continue - - if lvol.status == lvol.STATUS_IN_DELETION: - # check leadership - leader_node = None - snode = db.get_storage_node_by_id(snode.get_id()) - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = snode - - if not leader_node and sec_node: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = sec_node - - if not leader_node: - raise Exception("Failed to get leader node") - - if lvol.deletion_status == "" or lvol.deletion_status != leader_node.get_id(): - lvol_controller.delete_lvol_from_node(lvol.get_id(), leader_node.get_id()) - time.sleep(3) - - try: - ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status( - f"{lvol.lvs_name}/{lvol.lvol_bdev}") - except Exception as e: - logger.error(e) - # timeout detected, check other node - break - - if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed - process_lvol_delete_finish(lvol) - - elif ret == 1: # Async lvol deletion 
is in progress or queued - logger.info(f"LVol deletion in progress, id: {lvol.get_id()}") - pre_lvol_delete_rebalance() - - elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Async deletion is done, but leadership has changed (sync deletion is now blocked)") - - elif ret == 4: # No async delete request exists for this lvol - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("No async delete request exists for this lvol") - lvol = db.get_lvol_by_id(lvol.get_id()) - lvol.io_error = True - lvol.write_to_db() - set_lvol_status(lvol, LVol.STATUS_OFFLINE) - - elif ret == -1: # Operation not permitted - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Operation not permitted") - lvol = db.get_lvol_by_id(lvol.get_id()) - lvol.io_error = True - lvol.write_to_db() - set_lvol_status(lvol, LVol.STATUS_OFFLINE) - - elif ret == -2: # No such file or directory - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("No such file or directory") - process_lvol_delete_finish(lvol) - - elif ret == -5: # I/O error - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("I/O error") - process_lvol_delete_try_again(lvol) - - elif ret == -11: # Try again - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Try again") - process_lvol_delete_try_again(lvol) - - elif ret == -12: # Out of memory - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Out of memory") - process_lvol_delete_try_again(lvol) - - elif ret == -16: # Device or resource busy - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Device or resource busy") - process_lvol_delete_try_again(lvol) - - elif ret == 
-19: # No such device - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Finishing lvol delete") - process_lvol_delete_finish(lvol) - - elif ret == -35: # Leadership changed - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Leadership changed") - process_lvol_delete_try_again(lvol) - - elif ret == -36: # Failed to update lvol for deletion - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Failed to update lvol for deletion") - process_lvol_delete_try_again(lvol) - - else: # Failed to update lvol for deletion - logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") - logger.error("Failed to update lvol for deletion") - - continue - - passed = True - ret = health_controller.check_lvol_on_node( - lvol.get_id(), lvol.node_id, node_bdev_names, node_lvols_nqns) + ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) if not ret: - passed = False + raise Exception("Failed to get LVol info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = snode - if lvol.ha_type == "ha": - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - ret = health_controller.check_lvol_on_node( - lvol.get_id(), snode.secondary_node_id, sec_node_bdev_names, sec_node_lvols_nqns) - if not ret: - passed = False - else: - passed = True + if not leader_node and sec_node: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = sec_node + + if not leader_node: + raise Exception("Failed to get leader node") + + if lvol.deletion_status == "" or lvol.deletion_status != leader_node.get_id(): + lvol_controller.delete_lvol_from_node(lvol.get_id(), 
leader_node.get_id()) + time.sleep(3) + + try: + ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status( + f"{lvol.lvs_name}/{lvol.lvol_bdev}") + except Exception as e: + logger.error(e) + # timeout detected, check other node + break + + if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed + process_lvol_delete_finish(lvol) + + elif ret == 1: # Async lvol deletion is in progress or queued + logger.info(f"LVol deletion in progress, id: {lvol.get_id()}") + pre_lvol_delete_rebalance() + + elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Async deletion is done, but leadership has changed (sync deletion is now blocked)") + + elif ret == 4: # No async delete request exists for this lvol + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("No async delete request exists for this lvol") + lvol = db.get_lvol_by_id(lvol.get_id()) + lvol.io_error = True + lvol.write_to_db() + set_lvol_status(lvol, LVol.STATUS_OFFLINE) + + elif ret == -1: # Operation not permitted + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Operation not permitted") + lvol = db.get_lvol_by_id(lvol.get_id()) + lvol.io_error = True + lvol.write_to_db() + set_lvol_status(lvol, LVol.STATUS_OFFLINE) + + elif ret == -2: # No such file or directory + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("No such file or directory") + process_lvol_delete_finish(lvol) + + elif ret == -5: # I/O error + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("I/O error") + process_lvol_delete_try_again(lvol) + + elif ret == -11: # Try again + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Try again") + 
process_lvol_delete_try_again(lvol) + + elif ret == -12: # Out of memory + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Out of memory") + process_lvol_delete_try_again(lvol) + + elif ret == -16: # Device or resource busy + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Device or resource busy") + process_lvol_delete_try_again(lvol) + + elif ret == -19: # No such device + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Finishing lvol delete") + process_lvol_delete_finish(lvol) + + elif ret == -35: # Leadership changed + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Leadership changed") + process_lvol_delete_try_again(lvol) + + elif ret == -36: # Failed to update lvol for deletion + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Failed to update lvol for deletion") + process_lvol_delete_try_again(lvol) + + else: # Failed to update lvol for deletion + logger.info(f"LVol deletion error, id: {lvol.get_id()}, error code: {ret}") + logger.error("Failed to update lvol for deletion") - if snode.lvstore_status == "ready": + continue - logger.info(f"LVol: {lvol.get_id()}, is healthy: {passed}") - set_lvol_health_check(lvol, passed) - if passed: - set_lvol_status(lvol, LVol.STATUS_ONLINE) + passed = True + try: + ret = health_controller.check_lvol_on_node( + lvol.get_id(), lvol.node_id, node_bdev_names, node_lvols_nqns) + if not ret: + passed = False + except Exception as e: + logger.error(f"Failed to check lvol:{lvol.get_id()} on node: {lvol.node_id}") + logger.error(e) + + if lvol.ha_type == "ha": + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + try: + ret = health_controller.check_lvol_on_node( + lvol.get_id(), snode.secondary_node_id, sec_node_bdev_names, 
sec_node_lvols_nqns) + if not ret: + passed = False + else: + passed = True + except Exception as e: + logger.error(f"Failed to check lvol: {lvol.get_id()} on node: {snode.secondary_node_id}") + logger.error(e) + + if snode.lvstore_status == "ready": + + logger.info(f"LVol: {lvol.get_id()}, is healthy: {passed}") + set_lvol_health_check(lvol, passed) + if passed: + set_lvol_status(lvol, LVol.STATUS_ONLINE) + + if snode.lvstore_status == "ready": + + for snap in db.get_snapshots_by_node_id(snode.get_id()): + present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + set_snapshot_health_check(snap, present) - if snode.lvstore_status == "ready": - for snap in db.get_snapshots_by_node_id(snode.get_id()): - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) - set_snapshot_health_check(snap, present) +# get DB controller +db = db_controller.DBController() +logger.info("Starting LVol monitor...") +while True: + + for cluster in db.get_clusters(): + + if cluster.status in [Cluster.STATUS_INACTIVE, Cluster.STATUS_UNREADY, Cluster.STATUS_IN_ACTIVATION]: + logger.warning(f"Cluster {cluster.get_id()} is in {cluster.status} state, skipping") + continue + + for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): + try: + check_node(snode) + except Exception as e: + logger.error(e) time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/lvol_stat_collector.py b/simplyblock_core/services/lvol_stat_collector.py index 1933b6703..18f09d4ce 100644 --- a/simplyblock_core/services/lvol_stat_collector.py +++ b/simplyblock_core/services/lvol_stat_collector.py @@ -7,7 +7,6 @@ from simplyblock_core.models.lvol_model import LVol from simplyblock_core.models.stats import LVolStatObject, PoolStatObject from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient logger = utils.get_logger(__name__) @@ -212,68 +211,66 @@ def add_pool_stats(pool, 
records): continue if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + try: + rpc_client = snode.rpc_client(timeout=3, retry=2) + if snode.get_id() in all_node_bdev_names and all_node_bdev_names[snode.get_id()]: + node_bdev_names = all_node_bdev_names[snode.get_id()] + else: + node_bdevs = rpc_client.get_bdevs() + if node_bdevs: + node_bdev_names = {b['name']: b for b in node_bdevs} + all_node_bdev_names[snode.get_id()] = node_bdev_names - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=3, retry=2) - - if snode.get_id() in all_node_bdev_names and all_node_bdev_names[snode.get_id()]: - node_bdev_names = all_node_bdev_names[snode.get_id()] - else: - node_bdevs = rpc_client.get_bdevs() - if node_bdevs: - node_bdev_names = {b['name']: b for b in node_bdevs} - all_node_bdev_names[snode.get_id()] = node_bdev_names - - if snode.get_id() in all_node_lvols_nqns and all_node_lvols_nqns[snode.get_id()]: - node_lvols_nqns = all_node_lvols_nqns[snode.get_id()] - else: - ret = rpc_client.subsystem_list() - if ret: - node_lvols_nqns = {} - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - all_node_lvols_nqns[snode.get_id()] = node_lvols_nqns - - if snode.get_id() in all_node_lvols_stats and all_node_lvols_stats[snode.get_id()]: - node_lvols_stats = all_node_lvols_stats[snode.get_id()] - else: - ret = rpc_client.get_lvol_stats() - if ret: - node_lvols_stats = {} - for st in ret['bdevs']: - node_lvols_stats[st['name']] = st - all_node_lvols_stats[snode.get_id()] = node_lvols_stats - - if snode.secondary_node_id: - sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - - if sec_node.get_id() not in all_node_bdev_names or not all_node_bdev_names[sec_node.get_id()]: 
- ret = sec_rpc_client.get_bdevs() - if ret: - # node_bdev_names = {} - node_bdev_names = {b['name']: b for b in ret} - all_node_bdev_names[sec_node.get_id()] = node_bdev_names - - if sec_node.get_id() not in all_node_lvols_nqns or not all_node_lvols_nqns[sec_node.get_id()]: - ret = sec_rpc_client.subsystem_list() + if snode.get_id() in all_node_lvols_nqns and all_node_lvols_nqns[snode.get_id()]: + node_lvols_nqns = all_node_lvols_nqns[snode.get_id()] + else: + ret = rpc_client.subsystem_list() if ret: node_lvols_nqns = {} for sub in ret: node_lvols_nqns[sub['nqn']] = sub - all_node_lvols_nqns[sec_node.get_id()] = node_lvols_nqns + all_node_lvols_nqns[snode.get_id()] = node_lvols_nqns - if sec_node.get_id() not in all_node_lvols_stats or not all_node_lvols_stats[sec_node.get_id()]: - ret = sec_rpc_client.get_lvol_stats() + if snode.get_id() in all_node_lvols_stats and all_node_lvols_stats[snode.get_id()]: + node_lvols_stats = all_node_lvols_stats[snode.get_id()] + else: + ret = rpc_client.get_lvol_stats() if ret: - sec_node_lvols_stats = {} + node_lvols_stats = {} for st in ret['bdevs']: - sec_node_lvols_stats[st['name']] = st - all_node_lvols_stats[sec_node.get_id()] = sec_node_lvols_stats + node_lvols_stats[st['name']] = st + all_node_lvols_stats[snode.get_id()] = node_lvols_stats + except Exception as e: + logger.error(e) + + if snode.secondary_node_id: + sec_node = db.get_storage_node_by_id(snode.secondary_node_id) + if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: + try: + sec_rpc_client = sec_node.rpc_client(timeout=3, retry=2) + if sec_node.get_id() not in all_node_bdev_names or not all_node_bdev_names[sec_node.get_id()]: + ret = sec_rpc_client.get_bdevs() + if ret: + # node_bdev_names = {} + node_bdev_names = {b['name']: b for b in ret} + all_node_bdev_names[sec_node.get_id()] = node_bdev_names + if sec_node.get_id() not in all_node_lvols_nqns or not all_node_lvols_nqns[sec_node.get_id()]: + ret = sec_rpc_client.subsystem_list() + if ret: + 
node_lvols_nqns = {} + for sub in ret: + node_lvols_nqns[sub['nqn']] = sub + all_node_lvols_nqns[sec_node.get_id()] = node_lvols_nqns + + if sec_node.get_id() not in all_node_lvols_stats or not all_node_lvols_stats[sec_node.get_id()]: + ret = sec_rpc_client.get_lvol_stats() + if ret: + sec_node_lvols_stats = {} + for st in ret['bdevs']: + sec_node_lvols_stats[st['name']] = st + all_node_lvols_stats[sec_node.get_id()] = sec_node_lvols_stats + except Exception as e: + logger.error(e) for lvol in lvol_list: if lvol.status in [LVol.STATUS_IN_CREATION, LVol.STATUS_IN_DELETION]: diff --git a/simplyblock_core/services/main_distr_event_collector.py b/simplyblock_core/services/main_distr_event_collector.py index 31dffeda0..93e0ae4df 100644 --- a/simplyblock_core/services/main_distr_event_collector.py +++ b/simplyblock_core/services/main_distr_event_collector.py @@ -1,7 +1,7 @@ # coding=utf-8 import threading import time - +from datetime import datetime from simplyblock_core import constants, db_controller, utils, rpc_client, distr_controller from simplyblock_core.controllers import events_controller, device_controller @@ -9,9 +9,8 @@ from simplyblock_core.models.storage_node import StorageNode -logger = utils.get_logger(__name__) - utils.init_sentry_sdk() +logger = utils.get_logger(__name__) # get DB controller db = db_controller.DBController() @@ -19,7 +18,17 @@ EVENTS_LIST = ['SPDK_BDEV_EVENT_REMOVE', "error_open", 'error_read', "error_write", "error_unmap", "error_write_cannot_allocate"] -def process_device_event(event): + +def remove_remote_device_from_node(node_id, device_id): + node = db.get_storage_node_by_id(node_id) + for remote_dev in node.remote_devices: + if remote_dev.get_id() == device_id: + node.remote_devices.remove(remote_dev) + node.write_to_db() + break + + +def process_device_event(event, logger): if event.message in EVENTS_LIST: node_id = event.node_id storage_id = event.storage_id @@ -39,15 +48,31 @@ def process_device_event(event): event.status = 
'device_not_found' return - if device_obj.connecting_from_node == event_node_obj.get_id(): + if "timestamp" in event.object_dict: + ev_time = event.object_dict['timestamp'] + time_delta = datetime.now() - datetime.strptime(ev_time, '%Y-%m-%dT%H:%M:%S.%fZ') + if time_delta.total_seconds() > 8: + if snode.rpc_client().bdev_nvme_controller_list(device_obj.nvme_controller): + logger.info(f"event was fired {time_delta.total_seconds()} seconds ago, controller ok, skipping") + event.status = f'skipping_late_by_{int(time_delta.total_seconds())}s_but_controller_ok' + return + + logger.info(f"event was fired {time_delta.total_seconds()} seconds ago, checking controller filed") + event.status = f'late_by_{int(time_delta.total_seconds())}s' + + if device_obj.is_connection_in_progress_to_node(event_node_obj.get_id()): logger.warning("Connection attempt was found from node to device, sleeping 5 seconds") time.sleep(5) + device_obj.lock_device_connection(event_node_obj.get_id()) + if device_obj.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: logger.info(f"The device is not online, skipping. status: {device_obj.status}") event.status = f'skipped:dev_{device_obj.status}' distr_controller.send_dev_status_event(device_obj, device_obj.status, event_node_obj) + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return @@ -55,12 +80,16 @@ def process_device_event(event): distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) logger.info(f"Node is not online, skipping. 
status: {event_node_obj.status}") event.status = 'skipped:node_offline' + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return if device_node_obj.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) logger.info(f"Node is not online, skipping. status: {device_node_obj.status}") event.status = f'skipped:device_node_{device_node_obj.status}' + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) + device_obj.release_device_connection() return @@ -83,63 +112,37 @@ def process_device_event(event): device_controller.device_set_io_error(device_obj.get_id(), True) else: distr_controller.send_dev_status_event(device_obj, NVMeDevice.STATUS_UNAVAILABLE, event_node_obj) - event_node_obj = db.get_storage_node_by_id(event_node_obj.get_id()) - for dev in event_node_obj.remote_devices: - if dev.get_id() == device_obj.get_id(): - event_node_obj.remote_devices.remove(dev) - event_node_obj.write_to_db() - break + remove_remote_device_from_node(event_node_obj.get_id(), device_obj.get_id()) event.status = 'processed' + device_obj.release_device_connection() -def process_lvol_event(event): +def process_lvol_event(event, logger): if event.message in ["error_open", 'error_read', "error_write", "error_unmap"]: vuid = event.object_dict['vuid'] - # node_id = event.node_id - # storage_node_ops.set_node_status(node_id, StorageNode.STATUS_SUSPENDED) - # event_node_obj = db.get_storage_node_by_id(node_id) - # tasks_controller.add_node_to_auto_restart(event_node_obj) - - # lvols = [] - # for lv in db.get_lvols(): # pass - # if lv.node_id == node_id: - # lvols.append(lv) - # - # if not lvols: - # logger.error(f"LVols on node {node_id} not found") - # event.status = 'lvols_not_found' - # else: - # for lvol in lvols: - # if lvol.status == LVol.STATUS_ONLINE: - # 
logger.info("Setting LVol to offline") - # lvol.io_error = True - # old_status = lvol.status - # lvol.status = LVol.STATUS_OFFLINE - # lvol.write_to_db(db.kv_store) - # lvol_events.lvol_status_change(lvol, lvol.status, old_status, caused_by="monitor") - # lvol_events.lvol_io_error_change(lvol, True, False, caused_by="monitor") event.status = f'distr error {vuid}' else: logger.error(f"Unknown event message: {event.message}") event.status = "event_unknown" -def process_event(event): +def process_event(event, logger): if event.event == "device_status": if event.storage_id >= 0: - process_device_event(event) + process_device_event(event, logger) if event.vuid >= 0: - process_lvol_event(event) + process_lvol_event(event, logger) event.write_to_db(db.kv_store) def start_event_collector_on_node(node_id): + snode = db.get_storage_node_by_id(node_id) + logger.info(f"Starting Distr event collector on node: {node_id}") - snode = db.get_storage_node_by_id(node_id) client = rpc_client.RPCClient( snode.mgmt_ip, snode.rpc_port, @@ -151,6 +154,7 @@ def start_event_collector_on_node(node_id): while True: page = 1 events_groups = {} + events_list = [] while True: try: events = client.distr_status_events_discard_then_get( @@ -181,14 +185,17 @@ def start_event_collector_on_node(node_id): events_groups[sid][et][msg]: 1 # type: ignore else: events_groups[sid][et][msg].count += 1 # type: ignore - events_groups[sid][et][msg].write_to_db() # type: ignore - logger.info(f"Event {msg} already processed") continue event = events_controller.log_distr_event(snode.cluster_id, snode.get_id(), event_dict) logger.info(f"Processing event: {event.get_id()}") - process_event(event) + process_event(event, logger) events_groups[sid][et][msg] = event + events_list.append(event) + + for ev in events_list: + if ev.count > 1 : + ev.write_to_db(db.kv_store) logger.info(f"Discarding events: {len(events)}") client.distr_status_events_discard_then_get(len(events), 0) @@ -197,8 +204,7 @@ def 
start_event_collector_on_node(node_id): logger.info("no events found, sleeping") break except Exception as e: - logger.error("Failed to process distr events") - logger.exception(e) + logger.error(f"Failed to process distr events: {e}") break time.sleep(constants.DISTR_EVENT_COLLECTOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py index a99ed89f3..2910df3d6 100644 --- a/simplyblock_core/services/snapshot_monitor.py +++ b/simplyblock_core/services/snapshot_monitor.py @@ -8,7 +8,6 @@ from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.rpc_client import RPCClient logger = utils.get_logger(__name__) @@ -64,20 +63,22 @@ def process_snap_delete_finish(snap, leader_node): # 3-1 async delete lvol bdev from primary primary_node = db.get_storage_node_by_id(leader_node.get_id()) + non_leader_id = snode.secondary_node_id + if snode.get_id() != leader_node.get_id(): + non_leader_id = snode.get_id() + non_leader = db.get_storage_node_by_id(non_leader_id) if primary_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + if non_leader and non_leader.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + primary_node.lvol_del_sync_lock() ret, _ = primary_node.rpc_client().delete_lvol(snap.snap_bdev, del_async=True) if not ret: logger.error(f"Failed to delete snap from node: {snode.get_id()}") # 3-2 async delete lvol bdev from secondary - non_leader_id = snode.secondary_node_id - if snode.get_id() != leader_node.get_id(): - non_leader_id = snode.get_id() - - non_leader = db.get_storage_node_by_id(non_leader_id) - if non_leader: - tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), 
snap.snap_bdev) - + if non_leader and non_leader.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN, StorageNode.STATUS_UNREACHABLE]: + tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev, primary_node.get_id()) snapshot_events.snapshot_delete(snap) snap.remove(db.kv_store) @@ -95,6 +96,115 @@ def set_snap_offline(snap): sn.write_to_db() +def process_snap_delete(snap, snode): + # check leadership + leader_node = None + if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, + StorageNode.STATUS_DOWN]: + ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = snode + + if not leader_node and sec_node: + ret = sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) + if not ret: + raise Exception("Failed to get LVol store info") + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + leader_node = sec_node + + if not leader_node: + raise Exception("Failed to get leader node") + + if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): + + ret, _ = leader_node.rpc_client().delete_lvol(snap.snap_bdev) + if not ret: + logger.error(f"Failed to delete snap from node: {snode.get_id()}") + return False + snap = db.get_snapshot_by_id(snap.get_id()) + snap.deletion_status = leader_node.get_id() + snap.write_to_db() + + time.sleep(3) + + try: + ret = leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) + except Exception as e: + logger.error(e) + # timeout detected, check other node + return False + + if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed + process_snap_delete_finish(snap, leader_node) + + elif ret == 1: # Async lvol deletion is in progress or queued + 
logger.info(f"Snap deletion in progress, id: {snap.get_id()}") + + elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error( + "Async deletion is done, but leadership has changed (sync deletion is now blocked)") + + elif ret == 4: # No async delete request exists for this Snap + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No async delete request exists for this snap") + set_snap_offline(snap) + + elif ret == -1: # Operation not permitted + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Operation not permitted") + process_snap_delete_try_again(snap) + + elif ret == -2: # No such file or directory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such file or directory") + process_snap_delete_finish(snap, leader_node) + + elif ret == -5: # I/O error + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("I/O error") + process_snap_delete_try_again(snap) + + elif ret == -11: # Try again + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Try again") + process_snap_delete_try_again(snap) + + elif ret == -12: # Out of memory + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Out of memory") + process_snap_delete_try_again(snap) + + elif ret == -16: # Device or resource busy + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Device or resource busy") + process_snap_delete_try_again(snap) + + elif ret == -19: # No such device + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("No such device") + set_snap_offline(snap) + + elif ret == -35: # Leadership changed + logger.info(f"Snap deletion error, 
id: {snap.get_id()}, error code: {ret}") + logger.error("Leadership changed") + process_snap_delete_try_again(snap) + + elif ret == -36: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + process_snap_delete_try_again(snap) + + else: # Failed to update lvol for deletion + logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") + logger.error("Failed to update snapshot for deletion") + + + # get DB controller db = db_controller.DBController() @@ -109,159 +219,46 @@ def set_snap_offline(snap): for snode in db.get_storage_nodes_by_cluster_id(cluster.get_id()): node_bdev_names = [] - node_lvols_nqns = {} sec_node_bdev_names = {} - sec_node_lvols_nqns = {} sec_node = None if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: - - rpc_client = RPCClient( - snode.mgmt_ip, snode.rpc_port, - snode.rpc_username, snode.rpc_password, timeout=3, retry=2) - node_bdevs = rpc_client.get_bdevs() + rpc_client = snode.rpc_client(timeout=3, retry=2) + try: + node_bdevs = rpc_client.get_bdevs() + except Exception as e: + logger.error(e) + continue if node_bdevs: node_bdev_names = [b['name'] for b in node_bdevs] for bdev in node_bdevs: if "aliases" in bdev and bdev["aliases"]: node_bdev_names.extend(bdev['aliases']) - ret = rpc_client.subsystem_list() - if ret: - for sub in ret: - node_lvols_nqns[sub['nqn']] = sub - if snode.secondary_node_id: sec_node = db.get_storage_node_by_id(snode.secondary_node_id) - if sec_node and sec_node.status==StorageNode.STATUS_ONLINE: - sec_rpc_client = RPCClient( - sec_node.mgmt_ip, sec_node.rpc_port, - sec_node.rpc_username, sec_node.rpc_password, timeout=3, retry=2) - ret = sec_rpc_client.get_bdevs() + if sec_node and sec_node.status in [ + StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, StorageNode.STATUS_DOWN]: + sec_rpc_client = 
sec_node.rpc_client(timeout=3, retry=2) + try: + ret = sec_rpc_client.get_bdevs() + except Exception as e: + logger.error(e) + continue if ret: for bdev in ret: sec_node_bdev_names[bdev['name']] = bdev - ret = sec_rpc_client.subsystem_list() - if ret: - for sub in ret: - sec_node_lvols_nqns[sub['nqn']] = sub - - if snode.lvstore_status == "ready": - - for snap in db.get_snapshots_by_node_id(snode.get_id()): - if snap.status == SnapShot.STATUS_ONLINE: - - present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + for snap in db.get_snapshots_by_node_id(snode.get_id()): + if snap.status == SnapShot.STATUS_ONLINE: + present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names) + if snode.lvstore_status == "ready": set_snapshot_health_check(snap, present) - elif snap.status == SnapShot.STATUS_IN_DELETION: - - # check leadership - leader_node = None - if snode.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_SUSPENDED, - StorageNode.STATUS_DOWN]: - ret = snode.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if not ret: - raise Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = snode - - if not leader_node and sec_node: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(sec_node.lvstore) - if not ret: - raise Exception("Failed to get LVol store info") - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - leader_node = sec_node - - if not leader_node: - raise Exception("Failed to get leader node") - - if snap.deletion_status == "" or snap.deletion_status != leader_node.get_id(): - - ret, _ = leader_node.rpc_client().delete_lvol(snap.snap_bdev) - if not ret: - logger.error(f"Failed to delete snap from node: {snode.get_id()}") - continue - snap = db.get_snapshot_by_id(snap.get_id()) - snap.deletion_status = leader_node.get_id() - snap.write_to_db() - - time.sleep(3) - - try: - ret = 
leader_node.rpc_client().bdev_lvol_get_lvol_delete_status(snap.snap_bdev) - except Exception as e: - logger.error(e) - # timeout detected, check other node - break - - if ret == 0 or ret == 2: # Lvol may have already been deleted (not found) or delete completed - process_snap_delete_finish(snap, leader_node) - - elif ret == 1: # Async lvol deletion is in progress or queued - logger.info(f"Snap deletion in progress, id: {snap.get_id()}") - - elif ret == 3: # Async deletion is done, but leadership has changed (sync deletion is now blocked) - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error( - "Async deletion is done, but leadership has changed (sync deletion is now blocked)") - - elif ret == 4: # No async delete request exists for this Snap - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No async delete request exists for this snap") - set_snap_offline(snap) - - elif ret == -1: # Operation not permitted - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Operation not permitted") - process_snap_delete_try_again(snap) - - elif ret == -2: # No such file or directory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such file or directory") - process_snap_delete_finish(snap, leader_node) - - elif ret == -5: # I/O error - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("I/O error") - process_snap_delete_try_again(snap) - - elif ret == -11: # Try again - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Try again") - process_snap_delete_try_again(snap) - - elif ret == -12: # Out of memory - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Out of memory") - process_snap_delete_try_again(snap) - - elif ret == -16: # Device or resource busy - logger.info(f"Snap deletion 
error, id: {snap.get_id()}, error code: {ret}") - logger.error("Device or resource busy") - process_snap_delete_try_again(snap) - - elif ret == -19: # No such device - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("No such device") - set_snap_offline(snap) - - elif ret == -35: # Leadership changed - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Leadership changed") - process_snap_delete_try_again(snap) - - elif ret == -36: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - process_snap_delete_try_again(snap) - - else: # Failed to update lvol for deletion - logger.info(f"Snap deletion error, id: {snap.get_id()}, error code: {ret}") - logger.error("Failed to update snapshot for deletion") - + elif snap.status == SnapShot.STATUS_IN_DELETION: + try: + process_snap_delete(snap, snode) + except Exception as e: + logger.error(e) time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/spdk_http_proxy_server.py b/simplyblock_core/services/spdk_http_proxy_server.py index e0bb05bff..dd60a7111 100644 --- a/simplyblock_core/services/spdk_http_proxy_server.py +++ b/simplyblock_core/services/spdk_http_proxy_server.py @@ -6,18 +6,58 @@ import os import socket import sys +import threading +import time from http.server import HTTPServer from http.server import ThreadingHTTPServer from http.server import BaseHTTPRequestHandler +rpc_sock = '/mnt/ramdisk/spdk.sock' logger_handler = logging.StreamHandler(stream=sys.stdout) logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s')) logger = logging.getLogger() logger.addHandler(logger_handler) logger.setLevel(logging.INFO) +read_line_time_diff: dict = {} +recv_from_spdk_time_diff: dict = {} +def print_stats(): + while True: + try: + time.sleep(3) + t = time.time_ns() 
+ read_line_time_diff_max = max(list(read_line_time_diff.values())) + read_line_time_diff_avg = int(sum(list(read_line_time_diff.values()))/len(read_line_time_diff)) + last_3_sec = [] + for k,v in read_line_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + if len(last_3_sec) > 0: + read_line_time_diff_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + read_line_time_diff_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: read_line_time: max={read_line_time_diff_max} ns, avg={read_line_time_diff_avg} ns, last_3s_avg={read_line_time_diff_avg_last_3_sec} ns") + if len(read_line_time_diff) > 10000: + read_line_time_diff.clear() + + recv_from_spdk_time_max = max(list(recv_from_spdk_time_diff.values())) + recv_from_spdk_time_avg = int(sum(list(recv_from_spdk_time_diff.values()))/len(recv_from_spdk_time_diff)) + last_3_sec = [] + for k,v in recv_from_spdk_time_diff.items(): + if k > t - 3*1000*1000*1000: + last_3_sec.append(v) + if len(last_3_sec) > 0: + recv_from_spdk_time_avg_last_3_sec = int(sum(last_3_sec)/len(last_3_sec)) + else: + recv_from_spdk_time_avg_last_3_sec = 0 + logger.info(f"Periodic stats: {t}: recv_from_spdk_time: max={recv_from_spdk_time_max} ns, avg={recv_from_spdk_time_avg} ns, last_3s_avg={recv_from_spdk_time_avg_last_3_sec} ns") + if len(recv_from_spdk_time_diff) > 10000: + recv_from_spdk_time_diff.clear() + except Exception as e: + logger.error(e) + def get_env_var(name, default=None, is_required=False): if not name: @@ -31,12 +71,16 @@ def get_env_var(name, default=None, is_required=False): unix_sockets: list[socket] = [] # type: ignore[valid-type] def rpc_call(req): + logger.info(f"active threads: {threading.active_count()}") + logger.info(f"active unix sockets: {len(unix_sockets)}") req_data = json.loads(req.decode('ascii')) + req_time = time.time_ns() params = "" if "params" in req_data: params = str(req_data['params']) - logger.info(f"Request function: {str(req_data['method'])}, params: {params}") + 
logger.info(f"Request:{req_time} function: {str(req_data['method'])}, params: {params}") sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + unix_sockets.append(sock) sock.settimeout(TIMEOUT) sock.connect(rpc_sock) sock.sendall(req) @@ -48,7 +92,7 @@ def rpc_call(req): buf = '' closed = False response = None - + recv_from_spdk_time_start = time.time_ns() while not closed: newdata = sock.recv(1024*1024*1024) if newdata == b'': @@ -59,13 +103,18 @@ def rpc_call(req): except ValueError: continue # incomplete response; keep buffering break + recv_from_spdk_time_end = time.time_ns() + time_diff = recv_from_spdk_time_end - recv_from_spdk_time_start + logger.info(f"recv_from_spdk_time_diff: {time_diff}") + recv_from_spdk_time_diff[recv_from_spdk_time_start] = time_diff sock.close() + unix_sockets.remove(sock) if not response and len(buf) > 0: raise ValueError('Invalid response') - logger.debug(f"Response data: {buf}") + logger.info(f"Response:{req_time}") return buf @@ -74,7 +123,6 @@ class ServerHandler(BaseHTTPRequestHandler): server_session: list[int] = [] key = "" - def do_HEAD(self): self.send_response(200) self.send_header('Content-type', 'text/html') @@ -97,9 +145,14 @@ def do_INTERNALERROR(self): self.end_headers() def do_POST(self): + req_time = time.time_ns() + self.server_session.append(req_time) + logger.info(f"incoming request at: {req_time}") + logger.info(f"active server session: {len(self.server_session)}") if self.headers['Authorization'] != 'Basic ' + self.key: self.do_AUTHHEAD() else: + read_line_time_start = time.time_ns() if "Content-Length" in self.headers: data_string = self.rfile.read(int(self.headers['Content-Length'])) elif "chunked" in self.headers.get("Transfer-Encoding", ""): @@ -119,7 +172,10 @@ def do_POST(self): # Finally, a chunk size of 0 is an end indication if chunk_length == 0: break - + read_line_time_end = time.time_ns() + time_diff = read_line_time_end - read_line_time_start + logger.info(f"read_line_time_diff: {time_diff}") 
+ read_line_time_diff[read_line_time_start] = time_diff try: response = rpc_call(data_string) if response is not None: @@ -130,12 +186,14 @@ def do_POST(self): except ValueError: self.do_INTERNALERROR() + self.server_session.remove(req_time) def run_server(host, port, user, password, is_threading_enabled=False): # encoding user and password key = base64.b64encode((user+':'+password).encode(encoding='ascii')).decode('ascii') - + print_stats_thread = threading.Thread(target=print_stats, ) + print_stats_thread.start() try: ServerHandler.key = key httpd = (ThreadingHTTPServer if is_threading_enabled else HTTPServer)((host, port), ServerHandler) diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index e7f32ad82..24079d51c 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ b/simplyblock_core/services/storage_node_monitor.py @@ -3,7 +3,6 @@ import time from datetime import datetime, timezone - from simplyblock_core import constants, db_controller, cluster_ops, storage_node_ops, utils from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events, \ cluster_events @@ -15,7 +14,6 @@ logger = utils.get_logger(__name__) - # get DB controller db = db_controller.DBController() @@ -75,13 +73,16 @@ def get_next_cluster_status(cluster_id): continue online_nodes += 1 # check for jm rep tasks: - ret = node.rpc_client().jc_get_jm_status(node.jm_vuid) - if ret: - for jm in ret: - if ret[jm] is False: # jm is not ready (has active replication task) - jm_replication_tasks = True - logger.warning("Replication task found!") - break + if node.rpc_client().bdev_lvol_get_lvstores(node.lvstore): + try: + ret = node.rpc_client().jc_get_jm_status(node.jm_vuid) + for jm in ret: + if ret[jm] is False: # jm is not ready (has active replication task) + jm_replication_tasks = True + logger.warning("Replication task found!") + break + except Exception: + 
logger.warning("Failed to get replication task!") elif node.status == StorageNode.STATUS_REMOVED: pass else: @@ -115,11 +116,12 @@ def get_next_cluster_status(cluster_id): k = cluster.distr_npcs # if number of devices in the cluster unavailable on DIFFERENT nodes > k --> I cannot read and in some cases cannot write (suspended) - if affected_nodes == k and (not cluster.strict_node_anti_affinity or online_nodes >= (n+k)): + if affected_nodes == k and (not cluster.strict_node_anti_affinity or online_nodes >= (n + k)): return Cluster.STATUS_DEGRADED elif jm_replication_tasks: return Cluster.STATUS_DEGRADED - elif (affected_nodes > k or online_devices < (n + k) or (online_nodes < (n+k) and cluster.strict_node_anti_affinity)): + elif (affected_nodes > k or online_devices < (n + k) or ( + online_nodes < (n + k) and cluster.strict_node_anti_affinity)): return Cluster.STATUS_SUSPENDED else: return Cluster.STATUS_ACTIVE @@ -133,7 +135,7 @@ def update_cluster_status(cluster_id): for task in db.get_job_tasks(cluster_id): if task.status != JobSchedule.STATUS_DONE and task.function_name in [ JobSchedule.FN_DEV_MIG, JobSchedule.FN_NEW_DEV_MIG, JobSchedule.FN_FAILED_DEV_MIG]: - if task.retry == 0: + if "migration" not in task.function_params: first_iter_task_pending += 1 is_re_balancing = first_iter_task_pending > 0 cluster = db.get_cluster_by_id(cluster_id) @@ -149,7 +151,7 @@ def update_cluster_status(cluster_id): return if current_cluster_status == Cluster.STATUS_DEGRADED and next_current_status == Cluster.STATUS_ACTIVE: - # if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_UNREADY] and cluster_current_status == Cluster.STATUS_ACTIVE: + # if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_UNREADY] and cluster_current_status == Cluster.STATUS_ACTIVE: # cluster_ops.cluster_activate(cluster_id, True) cluster_ops.set_cluster_status(cluster_id, Cluster.STATUS_ACTIVE) return @@ -190,7 +192,6 @@ def update_cluster_status(cluster_id): 
cluster_ops.set_cluster_status(cluster_id, next_current_status) - def set_node_online(node): if node.status != StorageNode.STATUS_ONLINE: @@ -215,24 +216,56 @@ def set_node_online(node): if online_devices_list: tasks_controller.add_device_mig_task(online_devices_list, node.cluster_id) -def set_node_offline(node, set_devs_offline=False): - if node.status != StorageNode.STATUS_UNREACHABLE: - # set node unavailable - storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_UNREACHABLE) + update_cluster_status(cluster_id) + - # if set_devs_offline: - # # set devices unavailable - # for dev in node.nvme_devices: - # if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY]: - # device_controller.device_set_unavailable(dev.get_id()) +def set_node_offline(node): + if node.status != StorageNode.STATUS_OFFLINE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_OFFLINE) + for dev in node.nvme_devices: + if dev.status in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + device_controller.device_set_unavailable(dev.get_id()) + update_cluster_status(cluster_id) + # initiate restart + tasks_controller.add_node_to_auto_restart(node) + except Exception as e: + logger.debug("Setting node to OFFLINE state failed") + logger.error(e) + + +def set_node_unreachable(node): + if node.status != StorageNode.STATUS_UNREACHABLE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_UNREACHABLE) + update_cluster_status(cluster_id) + except Exception as e: + logger.debug("Setting node to UNREACHABLE state failed") + logger.error(e) + + +def set_node_schedulable(node): + if node.status != StorageNode.STATUS_SCHEDULABLE: + try: + storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_SCHEDULABLE) + # initiate shutdown + # initiate restart + tasks_controller.add_node_to_auto_restart(node) + for dev in node.nvme_devices: + if dev.status in [NVMeDevice.STATUS_ONLINE, 
NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + device_controller.device_set_unavailable(dev.get_id()) + update_cluster_status(cluster_id) + except Exception as e: + logger.debug("Setting node to SCHEDULABLE state failed") + logger.error(e) - # # set jm dev offline - # if node.jm_device.status != JMDevice.STATUS_UNAVAILABLE: - # device_controller.set_jm_device_state(node.jm_device.get_id(), JMDevice.STATUS_UNAVAILABLE) def set_node_down(node): if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: storage_node_ops.set_node_status(node.get_id(), StorageNode.STATUS_DOWN) + update_cluster_status(cluster_id) def node_rpc_timeout_check_and_report(node): @@ -246,10 +279,151 @@ def node_rpc_timeout_check_and_report(node): except Exception as e: logger.debug(e) # RPC timeout detected, send to cluster log - storage_events.snode_rpc_timeout(node, time.time()-start_time) + storage_events.snode_rpc_timeout(node, int(time.time() - start_time)) + return False + + +def node_port_check_fun(snode): + node_port_check = True + if snode.lvstore_status == "ready": + ports = [snode.nvmf_port] + if snode.lvstore_stack_secondary_1: + for n in db.get_primary_storage_nodes_by_secondary_node_id(snode.get_id()): + if n.lvstore_status == "ready": + ports.append(n.lvol_subsys_port) + if not snode.is_secondary_node: + ports.append(snode.lvol_subsys_port) + + for port in ports: + try: + ret = health_controller.check_port_on_node(snode, port) + logger.info(f"Check: node port {snode.mgmt_ip}, {port} ... {ret}") + node_port_check &= ret + except Exception: + logger.error("Check node port failed, connection error") + + node_data_nic_ping_check = False + for data_nic in snode.data_nics: + if data_nic.ip4_address: + data_ping_check = health_controller._check_node_ping(data_nic.ip4_address) + logger.info(f"Check: ping data nic {data_nic.ip4_address} ... 
{data_ping_check}") + node_data_nic_ping_check |= data_ping_check + + node_port_check &= node_data_nic_ping_check + + return node_port_check + + +class State: + counter = 0 +def increment(): + State.counter = 1 +def decrement(): + State.counter = 0 +def value(): + return State.counter + +def check_node(snode): + snode = db.get_storage_node_by_id(snode.get_id()) + + if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, + StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: + logger.info(f"Node status is: {snode.status}, skipping") + return False + + if snode.status == StorageNode.STATUS_ONLINE and snode.lvstore_status == "in_creation": + logger.info(f"Node lvstore is in creation: {snode.get_id()}, skipping") + return False + + logger.info(f"Checking node {snode.hostname}") + + + # 1- check node ping + ping_check = health_controller._check_node_ping(snode.mgmt_ip) + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") + if not ping_check: + logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}: FAILED") + set_node_unreachable(snode) + return False + + # 2- check node API + try: + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=10, retry=2) + ret, _ = snode_api.is_live() + logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {ret}") + if not ret: + logger.info("Check: node API failed, setting node unreachable") + set_node_unreachable(snode) + return False + except Exception as e: + logger.debug(e) + set_node_unreachable(snode) + return False + + # 3- check spdk process through node API + try: + snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=20, retry=2) + is_up, _ = snode_api.spdk_process_is_up( snode.rpc_port, snode.cluster_id) + logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... 
{bool(is_up)}") + if not is_up: + logger.info("Check: node API failed, setting node offline") + set_node_offline(snode) + return False + except Exception as e: + logger.debug(e) + return False + + # 4- check node rpc interface + node_rpc_check, node_rpc_check_1 = health_controller._check_node_rpc( + snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=20, retry=1) + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") + + #if RPC times out, we dont know if its due to node becoming unavailable or spdk hanging + #so we try it twice. If all other checks pass again, but only this one fails: it's the spdk process + if not node_rpc_check: + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}:TIMEOUT") + if value()==0: + increment() + return False + + decrement() + if not node_rpc_check or not node_rpc_check_1: + logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}:FAILED") + set_node_schedulable(snode) + return False + + #if not node_rpc_check and snode.get_id() not in node_rpc_timeout_threads: + # t = threading.Thread(target=node_rpc_timeout_check_and_report, args=(snode,)) + # t.start() + # node_rpc_timeout_threads[snode.get_id()] = t + + node_port_check = node_port_check_fun(snode) + + if not node_port_check: + cluster = db.get_cluster_by_id(snode.cluster_id) + if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + logger.error("Port check failed") + set_node_down(snode) + return True + + set_node_online(snode) + + +def loop_for_node(snode): + # global logger + # logger = logging.getLogger() + # logger_handler = logging.StreamHandler(stream=sys.stdout) + # logger_handler.setFormatter(logging.Formatter(f'%(asctime)s: node:{snode.mgmt_ip} %(levelname)s: %(message)s')) + # logger.addHandler(logger_handler) + while True: + check_node(snode) + logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} 
seconds") + time.sleep(constants.NODE_MONITOR_INTERVAL_SEC) logger.info("Starting node monitor") +threads_maps: dict[str, threading.Thread] = {} + while True: clusters = db.get_clusters() for cluster in clusters: @@ -259,166 +433,15 @@ def node_rpc_timeout_check_and_report(node): continue nodes = db.get_storage_nodes_by_cluster_id(cluster_id) - for snode in nodes: - - # get fresh node object, something could have changed until the last for loop is reached - snode = db.get_storage_node_by_id(snode.get_id()) - - if snode.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_UNREACHABLE, - StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: - logger.info(f"Node status is: {snode.status}, skipping") - continue - - if snode.status == StorageNode.STATUS_ONLINE and snode.lvstore_status == "in_creation": - logger.info(f"Node lvstore is in creation: {snode.get_id()}, skipping") - continue - - logger.info(f"Checking node {snode.hostname}") - - # 1- check node ping - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") - if not ping_check: - time.sleep(1) - ping_check = health_controller._check_node_ping(snode.mgmt_ip) - logger.info(f"Check 2: ping mgmt ip {snode.mgmt_ip} ... {ping_check}") - - # 2- check node API - node_api_check = health_controller._check_node_api(snode.mgmt_ip) - logger.info(f"Check: node API {snode.mgmt_ip}:5000 ... {node_api_check}") - - if snode.status == StorageNode.STATUS_SCHEDULABLE and not ping_check and not node_api_check: - continue - - spdk_process = False - if node_api_check: - # 3- check spdk_process - spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port, snode.cluster_id) - logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... 
{spdk_process}") - - # 4- check rpc - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=5, retry=2) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") - - if not node_rpc_check and snode.get_id() not in node_rpc_timeout_threads: - t = threading.Thread(target=node_rpc_timeout_check_and_report, args=(snode,)) + for node in nodes: + node_id = node.get_id() + if node_id not in threads_maps or threads_maps[node_id].is_alive() is False: + t = threading.Thread(target=loop_for_node, args=(node,)) t.start() - node_rpc_timeout_threads[snode.get_id()] = t - - if ping_check and node_api_check and spdk_process and not node_rpc_check: - start_time = time.time() - while time.time() < start_time + 60: - node_rpc_check = health_controller._check_node_rpc( - snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=5, retry=2) - logger.info(f"Check: node RPC {snode.mgmt_ip}:{snode.rpc_port} ... {node_rpc_check}") - if node_rpc_check: - break - - node_port_check = True - - if spdk_process and node_rpc_check and snode.lvstore_status == "ready": - ports = [snode.nvmf_port] - if snode.lvstore_stack_secondary_1: - for n in db.get_primary_storage_nodes_by_secondary_node_id(snode.get_id()): - if n.lvstore_status == "ready": - ports.append(n.lvol_subsys_port) - if not snode.is_secondary_node: - ports.append(snode.lvol_subsys_port) - - for port in ports: - ret = health_controller._check_port_on_node(snode, port) - logger.info(f"Check: node port {snode.mgmt_ip}, {port} ... {ret}") - node_port_check &= ret - - node_data_nic_ping_check = False - for data_nic in snode.data_nics: - if data_nic.ip4_address: - data_ping_check = health_controller._check_node_ping(data_nic.ip4_address) - logger.info(f"Check: ping data nic {data_nic.ip4_address} ... 
{data_ping_check}") - node_data_nic_ping_check |= data_ping_check - - node_port_check &= node_data_nic_ping_check - - cluster = db.get_cluster_by_id(cluster.get_id()) - - # is_node_online = ping_check and spdk_process and node_rpc_check and node_port_check - is_node_online = spdk_process or node_rpc_check - if is_node_online: - - if snode.status == StorageNode.STATUS_UNREACHABLE: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - # tasks_controller.add_node_to_auto_restart(snode) - set_node_online(snode) - continue - - if not node_port_check: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - logger.error("Port check failed") - set_node_down(snode) - continue - - set_node_online(snode) - - # # check JM device - # if snode.jm_device: - # if snode.jm_device.status in [JMDevice.STATUS_ONLINE, JMDevice.STATUS_UNAVAILABLE]: - # ret = health_controller.check_jm_device(snode.jm_device.get_id()) - # if ret: - # logger.info(f"JM bdev is online: {snode.jm_device.get_id()}") - # if snode.jm_device.status != JMDevice.STATUS_ONLINE: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) - # else: - # logger.error(f"JM bdev is offline: {snode.jm_device.get_id()}") - # if snode.jm_device.status != JMDevice.STATUS_UNAVAILABLE: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), - # JMDevice.STATUS_UNAVAILABLE) - else: - - if not ping_check and not node_api_check and not spdk_process: - # restart on new node - storage_node_ops.set_node_status(snode.get_id(), StorageNode.STATUS_SCHEDULABLE) - - elif ping_check and node_api_check and (not spdk_process or not node_rpc_check): - # add node to auto restart - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - if not spdk_process and not 
node_rpc_check: - logger.info("ping is fine, snodeapi is fine, But no spdk process and no rpc check, " - "So that we set device offline") - set_node_offline(snode, set_devs_offline=(not spdk_process and not node_rpc_check)) - try: - ret = snode.rpc_client(timeout=10).get_version() - if not ret: - logger.debug("False RPC response, adding node to auto restart") - tasks_controller.add_node_to_auto_restart(snode) - except Exception as e: - logger.debug("Timeout to get RPC response, skipping restart") - logger.error(e) - - elif not node_port_check: - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: - logger.error("Port check failed") - set_node_down(snode) - - else: - set_node_offline(snode, set_devs_offline=not spdk_process) - - if ping_check and node_api_check and spdk_process and not node_rpc_check: - # restart spdk proxy cont - if cluster.status in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_UNREADY, - Cluster.STATUS_SUSPENDED, Cluster.STATUS_READONLY]: - logger.info(f"Restarting spdk_proxy_{snode.rpc_port} on {snode.get_id()}") - snode_api = SNodeClient(f"{snode.mgmt_ip}:5000", timeout=60, retry=1) - ret, err = snode_api.spdk_proxy_restart(snode.rpc_port) - if ret: - logger.info(f"Restarting spdk_proxy on {snode.get_id()} successfully") - continue - if err: - logger.error(err) - - update_cluster_status(cluster_id) + threads_maps[node_id] = t - logger.info(f"Sleeping for {constants.NODE_MONITOR_INTERVAL_SEC} seconds") + try: + update_cluster_status(cluster_id) + except Exception: + logger.error("Error while updating cluster status") time.sleep(constants.NODE_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py index 7d0b3e89f..e3baeb7f0 100644 --- a/simplyblock_core/services/tasks_runner_failed_migration.py +++ b/simplyblock_core/services/tasks_runner_failed_migration.py @@ -87,8 +87,12 @@ def 
task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_failure_start( - distr_name, device.cluster_device_order, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_failure_start( + distr_name, device.cluster_device_order, qos_high_priority, job_size=constants.MIG_JOB_SIZE, jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(e) + rsp = False if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_jc_comp.py b/simplyblock_core/services/tasks_runner_jc_comp.py index 6caf85b19..9e1ce2368 100644 --- a/simplyblock_core/services/tasks_runner_jc_comp.py +++ b/simplyblock_core/services/tasks_runner_jc_comp.py @@ -46,9 +46,9 @@ task.write_to_db(db.kv_store) continue - node = db.get_storage_node_by_id(task.node_id) - - if not node: + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) @@ -88,12 +88,16 @@ jm_vuid = node.jm_vuid if "jm_vuid" in task.function_params: jm_vuid = task.function_params["jm_vuid"] - ret, err = rpc_client.jc_compression_start(jm_vuid=jm_vuid) + try: + ret, err = rpc_client.jc_suspend_compression(jm_vuid=jm_vuid, suspend=False) + except Exception as e: + logger.error(e) + continue if ret: task.function_result = f"JC {node.jm_vuid} compression resumed on node" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) - elif err and "code" in err and err["code"] == -2: + elif err: task.function_result = f"JC {node.jm_vuid} compression not needed" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) diff --git 
a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py index e325e3d7e..c00231d2c 100644 --- a/simplyblock_core/services/tasks_runner_migration.py +++ b/simplyblock_core/services/tasks_runner_migration.py @@ -62,16 +62,6 @@ def task_runner(task): except Exception as e: logger.error(f"Failed to get online since: {e}") - for dev in node.nvme_devices: - if dev.status not in [NVMeDevice.STATUS_ONLINE, - NVMeDevice.STATUS_FAILED_AND_MIGRATED, - NVMeDevice.STATUS_CANNOT_ALLOCATE]: - task.function_result = f"Some dev status is {dev.status }, retrying" - task.status = JobSchedule.STATUS_SUSPENDED - task.retry += 1 - task.write_to_db(db.kv_store) - return False - task.status = JobSchedule.STATUS_RUNNING task.function_result = "" task.write_to_db(db.kv_store) @@ -93,8 +83,12 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, - jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=constants.MIG_JOB_SIZE, + jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(e) + rsp = False if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task, retry later" @@ -112,7 +106,7 @@ def task_runner(task): allow_all_errors = False for node in db.get_storage_nodes_by_cluster_id(task.cluster_id): for dev in node.nvme_devices: - if dev.status in [NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE]: + if dev.status in [NVMeDevice.STATUS_READONLY, NVMeDevice.STATUS_CANNOT_ALLOCATE, NVMeDevice.STATUS_FAILED]: allow_all_errors = True break @@ -219,9 +213,12 @@ def _set_master_task_status(master_task, status): continue rpc_client = RPCClient( node.mgmt_ip, 
node.rpc_port, node.rpc_username, node.rpc_password, timeout=5, retry=2) - ret, err = rpc_client.jc_compression_start(jm_vuid=node.jm_vuid) - if err and "code" in err and err["code"] != -2: - logger.info("Failed to resume JC compression adding task...") - tasks_controller.add_jc_comp_resume_task(task.cluster_id, task.node_id, node.jm_vuid) + try: + ret, err = rpc_client.jc_suspend_compression(jm_vuid=node.jm_vuid, suspend=False) + if err: + logger.info("Failed to resume JC compression adding task...") + tasks_controller.add_jc_comp_resume_task(task.cluster_id, task.node_id, node.jm_vuid) + except Exception as e: + logger.error(e) time.sleep(3) diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py index 9feec7a56..db4143eec 100644 --- a/simplyblock_core/services/tasks_runner_new_dev_migration.py +++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py @@ -98,8 +98,12 @@ def task_runner(task): qos_high_priority = False if db.get_cluster_by_id(snode.cluster_id).is_qos_set(): qos_high_priority = True - rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64, - jobs=constants.MIG_PARALLEL_JOBS) + try: + rsp = rpc_client.distr_migration_expansion_start( + distr_name, qos_high_priority, job_size=constants.MIG_JOB_SIZE,jobs=constants.MIG_PARALLEL_JOBS) + except Exception as e: + logger.error(f"Failed to start migration : {e}") + rsp = False if not rsp: logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}") task.function_result = "Failed to start device migration task" diff --git a/simplyblock_core/services/tasks_runner_node_add.py b/simplyblock_core/services/tasks_runner_node_add.py index daeba918e..819e611d7 100644 --- a/simplyblock_core/services/tasks_runner_node_add.py +++ b/simplyblock_core/services/tasks_runner_node_add.py @@ -2,7 +2,7 @@ import time -from simplyblock_core import db_controller, 
storage_node_ops, utils +from simplyblock_core import db_controller, storage_node_ops, utils, constants from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster @@ -13,46 +13,67 @@ db = db_controller.DBController() -logger.info("Starting Tasks runner...") -while True: +def process_task(task): + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return False + + if task.retry >= task.max_retry: + task.function_result = "max retry reached" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: + task.function_result = "Cluster is in_activation, waiting" + task.status = JobSchedule.STATUS_NEW + task.write_to_db(db.kv_store) + return False + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + try: + res = storage_node_ops.add_node(**task.function_params) + msg = f"Node add result: {res}" + logger.info(msg) + task.function_result = msg + if res: + task.status = JobSchedule.STATUS_DONE + else: + task.retry += 1 + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return True + except Exception as e: + logger.error(e) + return False + + +logger.info("Starting Tasks runner node add...") +while True: clusters = db.get_clusters() if not clusters: logger.error("No clusters found!") else: for cl in clusters: - if cl.status == Cluster.STATUS_IN_ACTIVATION: - continue - tasks = db.get_job_tasks(cl.get_id(), reverse=False) for task in tasks: - + delay_seconds = constants.TASK_EXEC_INTERVAL_SEC if task.function_name == JobSchedule.FN_NODE_ADD: - if task.status != JobSchedule.STATUS_DONE: - + while task.status != JobSchedule.STATUS_DONE: # get new task object because it could be changed from cancel task task = db.get_task_by_id(task.uuid) - - 
if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: - task.function_result = "Cluster is in_activation, waiting" - task.status = JobSchedule.STATUS_NEW - task.write_to_db(db.kv_store) - continue - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - res = storage_node_ops.add_node(**task.function_params) - logger.info(f"Node add result: {res}") - task.function_result = str(res) - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - - time.sleep(5) + res = process_task(task) + if res: + if task.status == JobSchedule.STATUS_DONE: + break + else: + delay_seconds *= 2 + time.sleep(delay_seconds) + + time.sleep(constants.TASK_EXEC_INTERVAL_SEC) diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index 96ffc4664..1417c3a62 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -7,9 +7,8 @@ from simplyblock_core.fw_api_client import FirewallClient from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster -from simplyblock_core.models.nvme_device import NVMeDevice +from simplyblock_core.models.nvme_device import NVMeDevice, RemoteDevice from simplyblock_core.models.storage_node import StorageNode -from simplyblock_core.snode_client import SNodeClient logger = utils.get_logger(__name__) @@ -17,9 +16,234 @@ db = db_controller.DBController() +def exec_port_allow_task(task): + # get new task object because it could be changed from cancel task + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + node = 
db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + # check node ping + ping_check = health_controller._check_node_ping(node.mgmt_ip) + logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}") + if not ping_check: + time.sleep(1) + ping_check = health_controller._check_node_ping(node.mgmt_ip) + logger.info(f"Check 2: ping mgmt ip {node.mgmt_ip} ... {ping_check}") + + if not ping_check: + msg = "Node ping is false, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + # check node ping + logger.info("connect to remote devices") + nodes = db.get_storage_nodes_by_cluster_id(node.cluster_id) + # connect to remote devs + try: + node_bdevs = node.rpc_client().get_bdevs() + logger.debug(node_bdevs) + if node_bdevs: + node_bdev_names = {} + for b in node_bdevs: + node_bdev_names[b['name']] = b + for al in b['aliases']: + node_bdev_names[al] = b + else: + node_bdev_names = {} + remote_devices = [] + for nd in nodes: + if nd.get_id() == node.get_id() or nd.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: + continue + logger.info(f"Connecting to node {nd.get_id()}") + for index, dev in enumerate(nd.nvme_devices): + + if dev.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, + NVMeDevice.STATUS_CANNOT_ALLOCATE]: + logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}") + continue + + if not dev.alceml_bdev: + raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") + + remote_device = RemoteDevice() + remote_device.uuid = 
dev.uuid + remote_device.alceml_name = dev.alceml_name + remote_device.node_id = dev.node_id + remote_device.size = dev.size + remote_device.nvmf_multipath = dev.nvmf_multipath + remote_device.status = NVMeDevice.STATUS_ONLINE + remote_device.remote_bdev = storage_node_ops.connect_device( + f"remote_{dev.alceml_bdev}", dev, node, + bdev_names=list(node_bdev_names), reattach=False) + + remote_devices.append(remote_device) + if not remote_devices: + msg = "Node unable to connect to remote devs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + else: + node = db.get_storage_node_by_id(task.node_id) + node.remote_devices = remote_devices + node.write_to_db() + + logger.info("connect to remote JM devices") + remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(node) + if not remote_jm_devices or len(remote_jm_devices) < 2: + msg = "Node unable to connect to remote JMs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + else: + node = db.get_storage_node_by_id(task.node_id) + node.remote_jm_devices = remote_jm_devices + node.write_to_db() + + + except Exception as e: + logger.error(e) + msg = "Error when connect to remote devs, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + logger.info("Sending device status event") + for db_dev in node.nvme_devices: + distr_controller.send_dev_status_event(db_dev, db_dev.status, node) + + logger.info("Finished sending device status and now waiting 5s for JMs to connect") + time.sleep(5) + + sec_node = db.get_storage_node_by_id(node.secondary_node_id) + snode = db.get_storage_node_by_id(node.get_id()) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + try: + ret = 
sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) + if ret: + lvs_info = ret[0] + if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: + # is_sec_node_leader = True + # check jc_compression status + jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status(snode.jm_vuid) + retries = 10 + while jc_compression_is_active: + if retries <= 0: + logger.warning("Timeout waiting for JC compression task to finish") + break + retries -= 1 + logger.info( + f"JC compression task found on node: {sec_node.get_id()}, retrying in 60 seconds") + time.sleep(60) + jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status( + snode.jm_vuid) + except Exception as e: + logger.error(e) + return + + if node.lvstore_status == "ready": + lvstore_check = health_controller._check_node_lvstore(node.lvstore_stack, node, auto_fix=True) + if not lvstore_check: + msg = "Node LVolStore check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if node.secondary_node_id: + primary_hublvol_check = health_controller._check_node_hublvol(node) + if not primary_hublvol_check: + msg = "Node hublvol check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + sec_node = db.get_storage_node_by_id(node.secondary_node_id) + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + secondary_hublvol_check = health_controller._check_sec_node_hublvol(sec_node, auto_fix=True) + if not secondary_hublvol_check: + msg = "Secondary node hublvol check fail, retry later" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + try: + # wait for 
lvol sync delete + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + while lvol_sync_del_found: + logger.info("Lvol sync delete task found, waiting") + time.sleep(3) + lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) + + if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: + sec_rpc_client = sec_node.rpc_client() + ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid) + if not ret: + msg = "JM replication task found on secondary" + logger.warning(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True) + + except Exception as e: + logger.error(e) + return + + port_number = task.function_params["port_number"] + logger.info(f"Allow port {port_number} on node {node.get_id()}") + fw_api = FirewallClient(snode, timeout=5, retry=2) + port_type = "tcp" + if node.active_rdma: + port_type = "udp" + fw_api.firewall_set_port(port_number, port_type, "allow", node.rpc_port) + tcp_ports_events.port_allowed(node, port_number) + + task.function_result = f"Port {port_number} allowed on node" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + logger.info("Starting Tasks runner...") while True: - clusters = db.get_clusters() if not clusters: logger.error("No clusters found!") @@ -27,209 +251,10 @@ for cl in clusters: if cl.status == Cluster.STATUS_IN_ACTIVATION: continue - tasks = db.get_job_tasks(cl.get_id(), reverse=False) for task in tasks: - if task.function_name == JobSchedule.FN_PORT_ALLOW: if task.status != JobSchedule.STATUS_DONE: - - # get new task object because it could be changed from cancel task - task = db.get_task_by_id(task.uuid) - - if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - node = 
db.get_storage_node_by_id(task.node_id) - - if not node: - task.function_result = "node not found" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: - msg = f"Node is {node.status}, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - # check node ping - ping_check = health_controller._check_node_ping(node.mgmt_ip) - logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}") - if not ping_check: - time.sleep(1) - ping_check = health_controller._check_node_ping(node.mgmt_ip) - logger.info(f"Check 2: ping mgmt ip {node.mgmt_ip} ... {ping_check}") - - if not ping_check: - msg = "Node ping is false, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - # check node ping - logger.info("connect to remote devices") - nodes = db.get_storage_nodes_by_cluster_id(node.cluster_id) - # connect to remote devs - try: - node_bdevs = node.rpc_client().get_bdevs() - logger.debug(node_bdevs) - if node_bdevs: - node_bdev_names = {} - for b in node_bdevs: - node_bdev_names[b['name']] = b - for al in b['aliases']: - node_bdev_names[al] = b - else: - node_bdev_names = {} - remote_devices = [] - for nd in nodes: - if nd.get_id() == node.get_id() or nd.status not in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN]: - continue - logger.info(f"Connecting to node {nd.get_id()}") - for index, dev in enumerate(nd.nvme_devices): - - if dev.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_READONLY, - NVMeDevice.STATUS_CANNOT_ALLOCATE]: - logger.debug(f"Device is not online: {dev.get_id()}, status: {dev.status}") - continue - - if not dev.alceml_bdev: - raise ValueError(f"device alceml bdev not found!, {dev.get_id()}") - - dev.remote_bdev = 
storage_node_ops.connect_device( - f"remote_{dev.alceml_bdev}", dev, node, - bdev_names=list(node_bdev_names), reattach=False) - - remote_devices.append(dev) - if not remote_devices: - msg = "Node unable to connect to remote devs, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - else: - node = db.get_storage_node_by_id(task.node_id) - node.remote_devices = remote_devices - node.write_to_db() - - logger.info("connect to remote JM devices") - remote_jm_devices = storage_node_ops._connect_to_remote_jm_devs(node) - if not remote_jm_devices or len(remote_jm_devices) < 2: - msg = "Node unable to connect to remote JMs, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - else: - node = db.get_storage_node_by_id(task.node_id) - node.remote_jm_devices = remote_jm_devices - node.write_to_db() - - - except Exception as e: - logger.error(e) - msg = "Error when connect to remote devs, retry task" - logger.info(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - logger.info("Sending device status event") - for db_dev in node.nvme_devices: - distr_controller.send_dev_status_event(db_dev, db_dev.status) - - logger.info("Finished sending device status and now waiting 5s for JMs to connect") - time.sleep(5) - - sec_node = db.get_storage_node_by_id(node.secondary_node_id) - snode = db.get_storage_node_by_id(node.get_id()) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - ret = sec_node.rpc_client().bdev_lvol_get_lvstores(snode.lvstore) - if ret: - lvs_info = ret[0] - if "lvs leadership" in lvs_info and lvs_info['lvs leadership']: - # is_sec_node_leader = True - # check jc_compression status - jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status(snode.jm_vuid) - retries = 10 - 
while jc_compression_is_active: - if retries <= 0: - logger.warning("Timeout waiting for JC compression task to finish") - break - retries -= 1 - logger.info( - f"JC compression task found on node: {sec_node.get_id()}, retrying in 60 seconds") - time.sleep(60) - jc_compression_is_active = sec_node.rpc_client().jc_compression_get_status( - snode.jm_vuid) - - lvstore_check = True - if node.lvstore_status == "ready": - lvstore_check &= health_controller._check_node_lvstore(node.lvstore_stack, node, auto_fix=True) - if node.secondary_node_id: - lvstore_check &= health_controller._check_node_hublvol(node) - sec_node = db.get_storage_node_by_id(node.secondary_node_id) - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - lvstore_check &= health_controller._check_sec_node_hublvol(sec_node, auto_fix=True) - - if lvstore_check is False: - msg = "Node LVolStore check fail, retry later" - logger.warning(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - # wait for lvol sync delete - lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) - while lvol_sync_del_found: - logger.info("Lvol sync delete task found, waiting") - can_continue = False - time.sleep(3) - lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id) - - if sec_node and sec_node.status == StorageNode.STATUS_ONLINE: - sec_rpc_client = sec_node.rpc_client() - ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid) - if not ret: - msg = "JM replication task found on secondary" - logger.warning(msg) - task.function_result = msg - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - continue - sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True) - - port_number = 
task.function_params["port_number"] - snode_api = SNodeClient(f"{node.mgmt_ip}:5000", timeout=3, retry=2) - - logger.info(f"Allow port {port_number} on node {node.get_id()}") - - fw_api = FirewallClient(snode, timeout=5, retry=2) - port_type = "tcp" - if node.active_rdma: - port_type = "udp" - fw_api.firewall_set_port(port_number, port_type, "allow", node.rpc_port) - tcp_ports_events.port_allowed(node, port_number) - - task.function_result = f"Port {port_number} allowed on node" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) + exec_port_allow_task(task) time.sleep(5) diff --git a/simplyblock_core/services/tasks_runner_restart.py b/simplyblock_core/services/tasks_runner_restart.py index 2cfc82a53..61f8c5e6b 100644 --- a/simplyblock_core/services/tasks_runner_restart.py +++ b/simplyblock_core/services/tasks_runner_restart.py @@ -3,6 +3,7 @@ from simplyblock_core import constants, db_controller, storage_node_ops, utils from simplyblock_core.controllers import device_controller, health_controller, tasks_controller +from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.nvme_device import NVMeDevice from simplyblock_core.models.storage_node import StorageNode @@ -127,18 +128,19 @@ def task_runner_device(task): def task_runner_node(task): - node = db.get_storage_node_by_id(task.node_id) - if task.retry >= task.max_retry: - task.function_result = "max retry reached" + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: + task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) - storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_OFFLINE) return True - if not node: - task.function_result = "node not found" + if task.retry >= task.max_retry: + task.function_result = "max retry reached" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) + 
storage_node_ops.set_node_status(task.node_id, StorageNode.STATUS_OFFLINE) return True if node.status in [StorageNode.STATUS_REMOVED, StorageNode.STATUS_SCHEDULABLE, StorageNode.STATUS_DOWN]: @@ -171,6 +173,13 @@ def task_runner_node(task): task.status = JobSchedule.STATUS_RUNNING task.write_to_db(db.kv_store) + cluster = db.get_cluster_by_id(task.cluster_id) + if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: + task.function_result = f"Cluster is not active: {cluster.status}, retry" + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return False + # is node reachable? ping_check = health_controller._check_node_ping(node.mgmt_ip) logger.info(f"Check: ping mgmt ip {node.mgmt_ip} ... {ping_check}") @@ -191,19 +200,26 @@ def task_runner_node(task): return False - # shutting down node - logger.info(f"Shutdown node {node.get_id()}") - ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True) - if ret: - logger.info("Node shutdown succeeded") - - time.sleep(3) + try: + # shutting down node + logger.info(f"Shutdown node {node.get_id()}") + ret = storage_node_ops.shutdown_storage_node(node.get_id(), force=True) + if ret: + logger.info("Node shutdown succeeded") + time.sleep(3) + except Exception as e: + logger.error(e) + return False - # resetting node - logger.info(f"Restart node {node.get_id()}") - ret = storage_node_ops.restart_storage_node(node.get_id(), force=True) - if ret: - logger.info("Node restart succeeded") + try: + # resetting node + logger.info(f"Restart node {node.get_id()}") + ret = storage_node_ops.restart_storage_node(node.get_id(), force=True) + if ret: + logger.info("Node restart succeeded") + except Exception as e: + logger.error(e) + return False time.sleep(3) node = db.get_storage_node_by_id(task.node_id) diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py index fbf0c1ee4..ce41806a4 
100644 --- a/simplyblock_core/services/tasks_runner_sync_lvol_del.py +++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py @@ -37,6 +37,8 @@ task.function_result = "canceled" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + primary_node.lvol_del_sync_lock_reset() continue node = db.get_storage_node_by_id(task.node_id) @@ -45,6 +47,8 @@ task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + primary_node.lvol_del_sync_lock_reset() continue if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: @@ -73,5 +77,7 @@ task.function_result = f"bdev {lvol_bdev_name} deleted" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) + primary_node = db.get_storage_node_by_id(task.function_params["primary_node"]) + primary_node.lvol_del_sync_lock_reset() time.sleep(3) diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 23504ab0b..53ffe0583 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -40,8 +40,7 @@ def _request(self, method, path, payload=None): response = self.session.request(method, self.url+path, data=data, timeout=self.timeout, params=params) except Exception as e: - logger.error("Request failed: %s", e) - raise e + raise SNodeClientException(str(e)) logger.debug("Response: status_code: %s, content: %s", response.status_code, response.content) @@ -69,11 +68,14 @@ def _request(self, method, path, payload=None): if ret_code == 422: raise SNodeClientException(f"Request validation failed: '{response.text}'") - logger.error("Unknown http status: %s", ret_code) - return None, None + raise SNodeClientException(f"Unknown http status: {ret_code}") def is_live(self): - return self._request("GET", "check") + try: + return 
self._request("GET", "check") + except SNodeClientException: + logger.warning("Failed to call snode/check, trying snode/info") + return self.info() def info(self): return self._request("GET", "info") @@ -81,8 +83,7 @@ def info(self): def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None, fdb_connection=None, namespace=None, server_ip=None, rpc_port=None, rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None, - total_mem=None, system_mem=None, cluster_mode=None, socket=0, cluster_id=None): - + total_mem=None, system_mem=None, cluster_mode=None, socket=0, cluster_id=None, firewall_port=0): params = { "cluster_ip": cluster_ip, "server_ip": server_ip, @@ -118,6 +119,9 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None if cluster_id: params["cluster_id"] = cluster_id + if firewall_port: + params["firewall_port"] = firewall_port + params["socket"] = socket return self._request("POST", "spdk_process_start", params) def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id): diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index a7d7c9d74..adda8f4f5 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -26,7 +26,7 @@ from simplyblock_core.models.iface import IFace from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.lvol_model import LVol -from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice +from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice, RemoteDevice, RemoteJMDevice from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.models.cluster import Cluster @@ -38,6 +38,7 @@ from simplyblock_core.utils import pull_docker_image_with_retry import os + logger = utils.get_logger(__name__) @@ -58,73 +59,88 @@ def 
connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names: rpc_client = node.rpc_client() # check connection status - if device.connecting_from_node and device.connecting_from_node != node.get_id(): + if device.is_connection_in_progress_to_node(node.get_id()): logger.warning("This device is being connected to from other node, sleep for 5 seconds") time.sleep(5) - device.connecting_from_node = node.get_id() - device.write_to_db() + device.lock_device_connection(node.get_id()) ret = rpc_client.bdev_nvme_controller_list(name) if ret: - for controller in ret[0]["ctrlrs"]: + counter=0 + while(counter<5): + waiting = False + for controller in ret[0]["ctrlrs"]: controller_state = controller["state"] logger.info(f"Controller found: {name}, status: {controller_state}") - if controller_state == "deleting": - raise RuntimeError(f"Controller: {name}, status is {controller_state}") - - if reattach: - rpc_client.bdev_nvme_detach_controller(name) - time.sleep(1) - - bdev_name = None - - db_ctrl = DBController() - node = db_ctrl.get_storage_node_by_id(device.node_id) - if node.active_rdma: - tr_type = "RDMA" - else: - if node.active_tcp: - tr_type = "TCP" + if controller_state== "failed": + #we can remove the controller only for certain, if its failed. other states are intermediate and require retry. + rpc_client.bdev_nvme_detach_controller(name) + time.sleep(2) + break + elif controller_state == "resetting" or controller_state == "deleting" or controller_state == "reconnect_is_delayed": + if counter < 5: + time.sleep(2) + waiting = True + break + else: #this should never happen. It means controller is "hanging" in an intermediate state for more than 10 seconds. usually if some io is hanging. 
+ raise RuntimeError(f"Controller: {name}, status is {controller_state}") + if not waiting: + counter=5 + else: + counter+=1 + + #if reattach: + # rpc_client.bdev_nvme_detach_controller(name) + # time.sleep(1) + + # only if the controller is really gone we try to reattach it + if not rpc_client.bdev_nvme_controller_list(name): + bdev_name = None + + db_ctrl = DBController() + node = db_ctrl.get_storage_node_by_id(device.node_id) + if node.active_rdma: + tr_type = "RDMA" else: - msg = "target node to connect has no active fabric." - logger.error(msg) - raise RuntimeError(msg) - - for ip in device.nvmf_ip.split(","): - ret = rpc_client.bdev_nvme_attach_controller( + if node.active_tcp: + tr_type = "TCP" + else: + msg = "target node to connect has no active fabric." + logger.error(msg) + raise RuntimeError(msg) + + for ip in device.nvmf_ip.split(","): + ret = rpc_client.bdev_nvme_attach_controller( name, device.nvmf_nqn, ip, device.nvmf_port, tr_type, multipath=device.nvmf_multipath) - if not bdev_name and ret and isinstance(ret, list): - bdev_name = ret[0] - - if device.nvmf_multipath: - rpc_client.bdev_nvme_set_multipath_policy(bdev_name, "active_active") - - # wait 5 seconds after controller attach - time.sleep(5) - - if not bdev_name: - msg = "Bdev name not returned from controller attach" - logger.error(msg) - raise RuntimeError(msg) - bdev_found = False - for i in range(5): - ret = rpc_client.get_bdevs(bdev_name) - if ret: - bdev_found = True - break - else: - time.sleep(1) + if not bdev_name and ret and isinstance(ret, list): + bdev_name = ret[0] + + if device.nvmf_multipath: + rpc_client.bdev_nvme_set_multipath_policy(bdev_name, "active_active") + + if not bdev_name: + msg = "Bdev name not returned from controller attach" + logger.error(msg) + raise RuntimeError(msg) + bdev_found = False + for i in range(5): + ret = rpc_client.get_bdevs(bdev_name) + if ret: + bdev_found = True + break + else: + time.sleep(1) - device.connecting_from_node = "" - 
device.write_to_db() + device.release_device_connection() - if not bdev_found: - logger.error("Bdev not found after 5 attempts") - raise RuntimeError(f"Failed to connect to device: {device.get_id()}") + if not bdev_found: + logger.error("Bdev not found after 5 attempts") + raise RuntimeError(f"Failed to connect to device: {device.get_id()}") - return bdev_name + return bdev_name + return None def get_next_cluster_device_order(db_controller, cluster_id): @@ -172,14 +188,12 @@ def _search_for_partitions(rpc_client, nvme_device): def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): raid_bdev = f"raid_jm_{snode.get_id()}" - if len(jm_nvme_bdevs) > 1: - raid_level = "1" - ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) - if not ret: - logger.error(f"Failed to create raid_jm_{snode.get_id()}") - return False - else: - raid_bdev = jm_nvme_bdevs[0] + + raid_level = "1" + ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) + if not ret: + logger.error(f"Failed to create raid_jm_{snode.get_id()}") + return False alceml_id = snode.get_id() alceml_name = f"alceml_jm_{snode.get_id()}" @@ -696,7 +710,7 @@ def _connect_to_remote_devs( rpc_client = RPCClient( this_node.mgmt_ip, this_node.rpc_port, - this_node.rpc_username, this_node.rpc_password, timeout=3, retry=1) + this_node.rpc_username, this_node.rpc_password, timeout=5, retry=1) node_bdevs = rpc_client.get_bdevs() if node_bdevs: @@ -744,14 +758,21 @@ def _connect_to_remote_devs( node_bdev_names = [b['name'] for b in node_bdevs] for dev in devices_to_connect: + remote_bdev = RemoteDevice() + remote_bdev.uuid = dev.uuid + remote_bdev.alceml_name = dev.alceml_name + remote_bdev.node_id = dev.node_id + remote_bdev.size = dev.size + remote_bdev.status = NVMeDevice.STATUS_ONLINE + remote_bdev.nvmf_multipath = dev.nvmf_multipath for bdev in node_bdev_names: if bdev.startswith(f"remote_{dev.alceml_bdev}"): - dev.remote_bdev = bdev + remote_bdev.remote_bdev = bdev 
break - if not dev.remote_bdev: + if not remote_bdev.remote_bdev: logger.error(f"Failed to connect to remote device {dev.alceml_name}") continue - remote_devices.append(dev) + remote_devices.append(remote_bdev) return remote_devices @@ -790,6 +811,10 @@ def _connect_to_remote_jm_devs(this_node, jm_ids=None): if jm_dev and jm_dev not in remote_devices: remote_devices.append(jm_dev) + logger.debug(f"remote_devices: {remote_devices}") + allowed_node_statuses = [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_RESTARTING] + allowed_dev_statuses = [NVMeDevice.STATUS_ONLINE] + new_devs = [] for jm_dev in remote_devices: if not jm_dev.jm_bdev: @@ -806,15 +831,30 @@ def _connect_to_remote_jm_devs(this_node, jm_ids=None): if not org_dev or org_dev in new_devs or org_dev_node and org_dev_node.get_id() == this_node.get_id(): continue + if org_dev_node is not None and org_dev_node.status not in allowed_node_statuses: + logger.warning(f"Skipping node:{org_dev_node.get_id()} with status: {org_dev_node.status}") + continue + + if org_dev is not None and org_dev.status not in allowed_dev_statuses: + logger.warning(f"Skipping device:{org_dev.get_id()} with status: {org_dev.status}") + continue + + remote_device = RemoteJMDevice() + remote_device.uuid = org_dev.uuid + remote_device.alceml_name = org_dev.alceml_name + remote_device.node_id = org_dev.node_id + remote_device.size = org_dev.size + remote_device.jm_bdev = org_dev.jm_bdev + remote_device.status = NVMeDevice.STATUS_ONLINE + remote_device.nvmf_multipath = org_dev.nvmf_multipath try: - org_dev.remote_bdev = connect_device( + remote_device.remote_bdev = connect_device( f"remote_{org_dev.jm_bdev}", org_dev, this_node, bdev_names=node_bdev_names, reattach=True, - ) except RuntimeError: logger.error(f'Failed to connect to {org_dev.get_id()}') - new_devs.append(org_dev) + new_devs.append(remote_device) return new_devs @@ -933,7 +973,7 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, # 
Calculate pool count max_prov = 0 if node_config.get("max_size"): - max_prov = int(utils.parse_size(node_config.get("max_size"))) + max_prov = int(utils.parse_size(node_config.get("max_size"))) if max_prov < 0: logger.error(f"Incorrect max-prov value {max_prov}") return False @@ -1001,6 +1041,7 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, else: cluster_ip = utils.get_k8s_node_ip() + firewall_port = utils.get_next_fw_port(cluster_id) rpc_port = utils.get_next_rpc_port(cluster_id) rpc_user, rpc_pass = utils.generate_rpc_user_and_pass() mgmt_info = utils.get_mgmt_ip(node_info, iface_name) @@ -1027,7 +1068,6 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, total_mem += (n.spdk_mem + 500000000) logger.info("Deploying SPDK") - results = None l_cores = node_config.get("l-cores") spdk_cpu_mask = node_config.get("cpu_mask") for ssd in ssd_pcie: @@ -1039,8 +1079,7 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, - socket=node_socket, cluster_id=cluster_id) - + socket=node_socket, cluster_id=cluster_id, firewall_port=firewall_port) time.sleep(5) except Exception as e: @@ -1165,6 +1204,7 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, snode.iobuf_small_bufsize = small_bufsize or 0 snode.iobuf_large_bufsize = large_bufsize or 0 snode.enable_test_device = enable_test_device + snode.firewall_port = firewall_port if cluster.is_single_node: snode.physical_label = 0 @@ -1527,8 +1567,6 @@ def restart_storage_node( spdk_image=None, set_spdk_debug=None, small_bufsize=0, large_bufsize=0, force=False, node_ip=None, reattach_volume=False, clear_data=False, new_ssd_pcie=[], force_lvol_recreate=False): - db_controller = DBController() - kv_store = db_controller.kv_store db_controller = DBController() 
logger.info("Restarting storage node") @@ -1649,15 +1687,14 @@ def restart_storage_node( if max_prov > 0: try: - max_prov = int(utils.parse_size(max_prov)) - snode.max_prov = max_prov + max_prov = int(utils.parse_size(max_prov)) + snode.max_prov = max_prov except Exception as e: logger.debug(e) logger.error(f"Invalid max_prov value: {max_prov}") return False else: max_prov = snode.max_prov - if spdk_image: snode.spdk_image = spdk_image @@ -1698,6 +1735,9 @@ def restart_storage_node( spdk_debug = True snode.spdk_debug = spdk_debug + if minimum_sys_memory: + snode.minimum_sys_memory = minimum_sys_memory + cluster = db_controller.get_cluster_by_id(snode.cluster_id) if cluster.mode == "docker": @@ -1722,8 +1762,7 @@ def restart_storage_node( snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT, ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, - socket=snode.socket, - cluster_id=snode.cluster_id) + cluster_id=snode.cluster_id, socket=snode.socket, firewall_port=snode.firewall_port) except Exception as e: logger.error(e) @@ -1867,9 +1906,11 @@ def restart_storage_node( active_devices.append(db_dev) else: logger.info(f"Device not found: {db_dev.get_id()}") - db_dev.status = NVMeDevice.STATUS_REMOVED - removed_devices.append(db_dev) - # distr_controller.send_dev_status_event(db_dev, db_dev.status) + if db_dev.status == NVMeDevice.STATUS_NEW: + snode.nvme_devices.remove(db_dev) + else: + db_dev.status = NVMeDevice.STATUS_REMOVED + removed_devices.append(db_dev) jm_dev_sn = "" if snode.jm_device and "serial_number" in snode.jm_device.device_data_dict: @@ -1947,23 +1988,6 @@ def restart_storage_node( db_dev.health_check = True device_events.device_restarted(db_dev) snode.write_to_db(db_controller.kv_store) - # - # # make other nodes connect to the new devices - # logger.info("Make 
other nodes connect to the node devices") - # snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id) - # for node in snodes: - # if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE: - # continue - # node.remote_devices = _connect_to_remote_devs(node, force_connect_restarting_nodes=True) - # node.write_to_db(kv_store) - # - # logger.info(f"Sending device status event") - # snode = db_controller.get_storage_node_by_id(snode.get_id()) - # for db_dev in snode.nvme_devices: - # distr_controller.send_dev_status_event(db_dev, db_dev.status) - # - # if snode.jm_device and snode.jm_device.status in [JMDevice.STATUS_UNAVAILABLE, JMDevice.STATUS_ONLINE]: - # device_controller.set_jm_device_state(snode.jm_device.get_id(), JMDevice.STATUS_ONLINE) cluster = db_controller.get_cluster_by_id(snode.cluster_id) if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]: @@ -1979,7 +2003,7 @@ def restart_storage_node( except RuntimeError: logger.error('Failed to connect to remote devices') return False - node.write_to_db(kv_store) + node.write_to_db() logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) @@ -2026,8 +2050,7 @@ def restart_storage_node( except RuntimeError: logger.error('Failed to connect to remote devices') return False - node.write_to_db(kv_store) - + node.write_to_db() logger.info("Sending device status event") snode = db_controller.get_storage_node_by_id(snode.get_id()) for db_dev in snode.nvme_devices: @@ -2183,13 +2206,13 @@ def list_storage_devices(node_id, is_json): "Health": snode.jm_device.health_check }) - for device in snode.remote_devices: - logger.debug(device) + for remote_device in snode.remote_devices: + logger.debug(remote_device) logger.debug("*" * 20) - name = device.alceml_name - status = device.status - if device.remote_bdev: - name = device.remote_bdev + name = remote_device.alceml_name + status = 
remote_device.status + if remote_device.remote_bdev: + name = remote_device.remote_bdev try: org_dev = db_controller.get_storage_device_by_id(device.get_id()) status = org_dev.status @@ -2197,22 +2220,22 @@ def list_storage_devices(node_id, is_json): pass remote_devices.append({ - "UUID": device.uuid, + "UUID": remote_device.uuid, "Name": name, - "Size": utils.humanbytes(device.size), - "Node ID": device.node_id, + "Size": utils.humanbytes(remote_device.size), + "Node ID": remote_device.node_id, "Status": status, }) - for device in snode.remote_jm_devices: - logger.debug(device) + for remote_jm_device in snode.remote_jm_devices: + logger.debug(remote_jm_device) logger.debug("*" * 20) remote_devices.append({ - "UUID": device.uuid, - "Name": device.remote_bdev, - "Size": utils.humanbytes(device.size), - "Node ID": device.node_id, - "Status": device.status, + "UUID": remote_jm_device.uuid, + "Name": remote_jm_device.remote_bdev, + "Size": utils.humanbytes(remote_jm_device.size), + "Node ID": remote_jm_device.node_id, + "Status": remote_jm_device.status, }) data: dict[str, List[Any]] = { @@ -2395,27 +2418,26 @@ def suspend_storage_node(node_id, force=False): node.hublvol.nvmf_port, port_type, "block", snode.rpc_port, is_reject=True) fw_api.firewall_set_port( node.lvol_subsys_port, port_type, "block", snode.rpc_port, is_reject=True) + time.sleep(0.5) + rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) + rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) except Exception as e: logger.error(e) return False - time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(node.jm_vuid) try: fw_api.firewall_set_port( snode.hublvol.nvmf_port, port_type, "block", snode.rpc_port, is_reject=True) fw_api.firewall_set_port( snode.lvol_subsys_port, port_type, "block", snode.rpc_port, is_reject=True) + time.sleep(0.5) + rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) + 
rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) + time.sleep(1) except Exception as e: logger.error(e) return False - time.sleep(0.5) - rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False) - rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid) - time.sleep(1) - logger.info("Done") return True @@ -2459,7 +2481,7 @@ def resume_storage_node(node_id): return False if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) - snode.write_to_db(db_controller.kv_store) + snode.write_to_db() fw_api = FirewallClient(snode, timeout=20, retry=1) port_type = "tcp" @@ -3067,7 +3089,7 @@ def recreate_lvstore_on_sec(secondary_node): return False # sending to the node that is being restarted (secondary_node) with the secondary group jm_vuid (primary_node.jm_vuid) - ret = secondary_node.rpc_client().jc_suspend_compression(jm_vuid=primary_node.jm_vuid, suspend=False) + ret, err = secondary_node.rpc_client().jc_suspend_compression(jm_vuid=primary_node.jm_vuid, suspend=False) if not ret: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task( @@ -3152,6 +3174,7 @@ def recreate_lvstore(snode, force=False): ### 1- create distribs and raid ret, err = _create_bdev_stack(snode, []) + if err: logger.error(f"Failed to recreate lvstore on node {snode.get_id()}") logger.error(err) @@ -3421,7 +3444,7 @@ def get_sorted_ha_jms(current_node): continue mgmt_ips.append(jm_dev_to_mgmt_ip[jm_id]) out.append(jm_id) - return out[:constants.HA_JM_COUNT - 1] + return out[:current_node.ha_jm_count - 1] def get_node_jm_names(current_node, remote_node=None): @@ -3443,16 +3466,11 @@ def get_node_jm_names(current_node, remote_node=None): if remote_node.jm_device.get_id() == jm_id: jm_list.append(remote_node.jm_device.jm_bdev) continue - for jm_dev in remote_node.remote_jm_devices: - if jm_dev.get_id() == jm_id: - jm_list.append(jm_dev.remote_bdev) - break - else: - for jm_dev in current_node.remote_jm_devices: - if 
jm_dev.get_id() == jm_id: - jm_list.append(jm_dev.remote_bdev) - break - return jm_list[:constants.HA_JM_COUNT] + + jm_dev = DBController().get_jm_device_by_id(jm_id) + jm_list.append(f"remote_{jm_dev.jm_bdev}n1") + + return jm_list[:current_node.ha_jm_count] def get_secondary_nodes(current_node): @@ -3608,7 +3626,7 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo return False # sending to the other node (sec_node) with the primary group jm_vuid (snode.jm_vuid) - ret = sec_node.rpc_client().jc_suspend_compression(jm_vuid=snode.jm_vuid, suspend=False) + ret, err = sec_node.rpc_client().jc_suspend_compression(jm_vuid=snode.jm_vuid, suspend=False) if not ret: logger.info("Failed to resume JC compression adding task...") tasks_controller.add_jc_comp_resume_task(sec_node.cluster_id, sec_node.get_id(), jm_vuid=snode.jm_vuid) @@ -3637,6 +3655,7 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo return True + def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None): def _create_distr(snode, name, params): try: @@ -3836,7 +3855,7 @@ def dump_lvstore(node_id): logger.error("Storage node does not have lvstore") return False - rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=3, retry=0) + rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=120) logger.info(f"Dumping lvstore data on node: {snode.get_id()}") file_name = f"LVS_dump_{snode.hostname}_{snode.lvstore}_{str(datetime.datetime.now().isoformat())}.txt" file_path = f"/etc/simplyblock/{file_name}" diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 6add65d62..47f3c16e8 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -550,7 +550,7 @@ def calculate_pool_count(alceml_count, number_of_distribs, cpu_count, poller_cou large_pool_count = 48 * (alceml_count + 
number_of_distribs + 3 + poller_count) + ( 6 + alceml_count + number_of_distribs) * 32 + poller_number * 15 + 384 + 16 * poller_number + constants.EXTRA_LARGE_POOL_COUNT - return int(4.0 * small_pool_count), int(2.5 * large_pool_count) + return int(small_pool_count), int(large_pool_count) def calculate_minimum_hp_memory(small_pool_count, large_pool_count, lvol_count, max_prov, cpu_count): @@ -636,7 +636,7 @@ def get_logger(name=""): if not logg.hasHandlers(): logger_handler = logging.StreamHandler(stream=sys.stdout) - logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(levelname)s: %(message)s')) + logger_handler.setFormatter(logging.Formatter('%(asctime)s: %(thread)d: %(levelname)s: %(message)s')) logg.addHandler(logger_handler) # gelf_handler = GELFTCPHandler('0.0.0.0', constants.GELF_PORT) # logg.addHandler(gelf_handler) @@ -743,8 +743,6 @@ def first_six_chars(s: str) -> str: If the string is shorter than six characters, returns the entire string. """ return s[:6] - - def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: @@ -838,7 +836,7 @@ def get_next_rpc_port(cluster_id): from simplyblock_core.db_controller import DBController db_controller = DBController() - port = 8080 + port = constants.RPC_PORT_RANGE_START used_ports = [] for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id): if node.rpc_port > 0: @@ -853,6 +851,22 @@ def get_next_rpc_port(cluster_id): return 0 +def get_next_fw_port(cluster_id): + from simplyblock_core.db_controller import DBController + db_controller = DBController() + + port = constants.FW_PORT_START + used_ports = [] + for node in db_controller.get_storage_nodes_by_cluster_id(cluster_id): + if node.firewall_port > 0: + used_ports.append(node.firewall_port) + next_port = port + while True: + if next_port not in used_ports: + return next_port + next_port += 1 + + def get_next_dev_port(cluster_id): from simplyblock_core.db_controller import DBController db_controller = 
DBController() @@ -2110,7 +2124,7 @@ def patch_cr_node_status( ): """ Patch status.nodes[*] fields for a specific node identified by UUID. - + Operations: - Update a node (by uuid or mgmtIp) - Remove a node (by uuid or mgmtIp) diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 31b4912a3..51090b9b2 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -128,7 +128,7 @@ def scan_devices(): class SPDKParams(BaseModel): server_ip: str = Field(pattern=utils.IP_PATTERN) - rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=1, le=65536) + rpc_port: int = Field(constants.RPC_PORT_RANGE_START, ge=1, le=65536) rpc_username: str rpc_password: str ssd_pcie: Optional[List[str]] = Field(None) @@ -143,6 +143,7 @@ class SPDKParams(BaseModel): cluster_mode: str socket: Optional[int] = Field(None, ge=0) cluster_id: str + firewall_port: int = Field(constants.FW_PORT_START) @api.post('/spdk_process_start', responses={ @@ -155,7 +156,8 @@ def spdk_process_start(body: SPDKParams): ssd_pcie_list = " ".join(body.ssd_pcie) if body.ssd_pcie else "none" spdk_debug = '1' if body.spdk_debug else '' total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MiB') if body.total_mem else '' - spdk_mem_mib = core_utils.convert_size(body.spdk_mem, 'MiB') + # spdk_mem_mib = core_utils.convert_size(body.spdk_mem, 'MiB') + spdk_mem_mib = 0 node_docker = get_docker_client(timeout=60 * 3) for name in {f"/spdk_{body.rpc_port}", f"/spdk_proxy_{body.rpc_port}"}: @@ -190,6 +192,7 @@ def spdk_process_start(body: SPDKParams): f"PCI_ALLOWED={ssd_pcie_list}", f"TOTAL_HP={total_mem_mib}", f"NSOCKET={body.socket}", + f"FW_PORT={body.firewall_port}", ] # restart_policy={"Name": "on-failure", "MaximumRetryCount": 99} ) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py 
index 59a8ec607..5a2fe24c7 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -268,6 +268,8 @@ class SPDKParams(BaseModel): spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE) cluster_ip: str = Field(pattern=utils.IP_PATTERN) cluster_mode: str + socket: Optional[int] = Field(None, ge=0) + firewall_port: Optional[int] = Field(constants.FW_PORT_START) cluster_id: str @@ -356,7 +358,9 @@ def spdk_process_start(body: SPDKParams): 'CLUSTER_ID': first_six_cluster_id, 'SSD_PCIE': ssd_pcie_params, 'PCI_ALLOWED': ssd_pcie_list, - 'TOTAL_HP': total_mem_mib + 'TOTAL_HP': total_mem_mib, + 'NSOCKET': body.socket, + 'FW_PORT': body.firewall_port } if ubuntu_host: diff --git a/simplyblock_web/api/v1/storage_node.py b/simplyblock_web/api/v1/storage_node.py index b44313c11..f3ec2fbcd 100644 --- a/simplyblock_web/api/v1/storage_node.py +++ b/simplyblock_web/api/v1/storage_node.py @@ -249,6 +249,10 @@ def storage_node_add(): if 'iobuf_large_pool_count' in req_data: iobuf_large_pool_count = int(req_data['iobuf_large_pool_count']) + ha_jm_count = 3 + if 'ha_jm_count' in req_data: + ha_jm_count = int(req_data['ha_jm_count']) + tasks_controller.add_node_add_task(cluster_id, { "cluster_id": cluster_id, "node_addr": node_addr, @@ -264,6 +268,7 @@ def storage_node_add(): "enable_test_device": enable_test_device, "namespace": namespace, "enable_ha_jm": not disable_ha_jm, + "ha_jm_count": ha_jm_count, }) return utils.get_response(True) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index 5159425f1..f3d7bd33c 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -48,8 +48,9 @@ class StorageNodeParams(BaseModel): iobuf_small_pool_count: int = Field(0) iobuf_large_pool_count: int = Field(0) cr_name: str - cr_namespace: str + cr_namespace: str cr_plural: str + ha_jm_count: int = Field(3) @api.post('/', 
name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}}) @@ -75,6 +76,7 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams): 'cr_name': parameters.cr_name, 'cr_namespace': parameters.cr_namespace, 'cr_plural': parameters.cr_plural, + "ha_jm_count": parameters.ha_jm_count, } ) if not task_id_or_false: @@ -112,7 +114,7 @@ def delete( ) if none_or_false == False: # noqa raise ValueError('Failed to remove storage node') - + if force_delete: none_or_false = storage_node_ops.delete_storage_node( storage_node.get_id(), force=force_delete diff --git a/simplyblock_web/node_configure.py b/simplyblock_web/node_configure.py index 52dd02be2..ffb05915d 100755 --- a/simplyblock_web/node_configure.py +++ b/simplyblock_web/node_configure.py @@ -3,7 +3,6 @@ import argparse import logging -import os import sys from typing import List, Optional, cast @@ -16,6 +15,8 @@ ) from simplyblock_cli.clibase import range_type from simplyblock_web import node_utils_k8s +import os +import subprocess logger = logging.getLogger(__name__) logger.setLevel(constants.LOG_LEVEL) @@ -150,10 +151,8 @@ def parse_arguments() -> argparse.Namespace: dest='size_range', required=False ) - return parser.parse_args() - def validate_arguments(args: argparse.Namespace) -> None: """ Validate the provided command line arguments. 
@@ -168,8 +167,7 @@ def validate_arguments(args: argparse.Namespace) -> None: if not args.max_lvol: raise argparse.ArgumentError(None, '--max-lvol is required') if not args.max_prov: - args.max_prov = 0 - + args.max_prov=0 try: max_lvol = int(args.max_lvol) if max_lvol <= 0: @@ -204,7 +202,7 @@ def main() -> None: return if not args.max_prov: - args.max_prov = 0 + args.max_prov=0 validate_arguments(args) if _is_pod_present_for_node(): @@ -257,6 +255,28 @@ def main() -> None: size_range=args.size_range ) + logger.info("create RPC socket mount") + mount_point = "/mnt/ramdisk" + size = "1G" + fstab_entry = f"tmpfs {mount_point} tmpfs size={size},mode=1777,noatime 0 0\n" + + # 1️⃣ Create the mount point if it doesn't exist + os.makedirs(mount_point, exist_ok=True) + + # 2️⃣ Add to /etc/fstab if not already present + with open("/etc/fstab", "r+") as fstab: + lines = fstab.readlines() + if not any(mount_point in line for line in lines): + fstab.write(fstab_entry) + print(f"Added fstab entry for {mount_point}") + else: + print(f"fstab entry for {mount_point} already exists") + + # 3️⃣ Mount the RAM disk immediately + subprocess.run(["mount", mount_point], check=True) + + # 4️⃣ Verify + subprocess.run(["df", "-h", mount_point]) except argparse.ArgumentError as e: logger.error(f"Argument error: {e}") sys.exit(1) diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index 7963aa248..ec92c850f 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -93,6 +93,20 @@ spec: value: "{{ TOTAL_HP }}" - name: RPC_PORT value: "{{ RPC_PORT }}" + - name: NSOCKET + value: "{{ NSOCKET }}" + - name: FW_PORT + value: "{{ FW_PORT }}" + - name: SPDKCSI_SECRET + valueFrom: + secretKeyRef: + name: simplyblock-csi-secret + key: secret.json + - name: CLUSTER_CONFIG + valueFrom: + configMapKeyRef: + name: simplyblock-csi-cm + key: config.json lifecycle: 
postStart: exec: diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py index a610cd177..de72db274 100644 --- a/simplyblock_web/utils.py +++ b/simplyblock_web/utils.py @@ -148,7 +148,7 @@ def error_handler(exception: Exception): class RPCPortParams(BaseModel): - rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536) + rpc_port: int = Field(constants.RPC_PORT_RANGE_START, ge=0, le=65536) cluster_id: Optional[str] From 5f966949fb4c678a99df29163b1855a706f21121 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 28 Jan 2026 02:50:35 +0300 Subject: [PATCH 133/192] Enhance snapshot replication logic to support snapshot instances and streamline replication task handling --- .../controllers/tasks_controller.py | 23 ++++++++++ simplyblock_core/models/snapshot.py | 3 +- .../services/snapshot_replication.py | 45 ++++++++++--------- 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index a50de1b22..6cfc2c8a8 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -441,7 +441,30 @@ def get_snapshot_replication_task(cluster_id, snapshot_id, replicate_to_source): return False +def _check_snap_instance_on_node(snapshot_id: str , node_id: str): + snapshot = db.get_snapshot_by_id(snapshot_id) + for sn_inst in snapshot.instances: + if sn_inst.lvol.node_id == node_id: + logger.info("Snapshot instance found on node, skip adding replication task") + return + + if snapshot.snap_ref_id: + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + _check_snap_instance_on_node(prev_snap.get_id(), node_id) + + _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, snapshot.cluster_id, node_id, "", + function_params={"snapshot_id": snapshot.get_id(), "replicate_to_source": False, + "replicate_as_snap_instance": True}, + send_to_cluster_log=False) + + def add_snapshot_replication_task(cluster_id, 
node_id, snapshot_id, replicate_to_source=False): + if not replicate_to_source: + snapshot = db.get_snapshot_by_id(snapshot_id) + if snapshot.snap_ref_id: + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + _check_snap_instance_on_node(prev_snap.get_id(), node_id) + return _add_task(JobSchedule.FN_SNAPSHOT_REPLICATION, cluster_id, node_id, "", function_params={"snapshot_id": snapshot_id, "replicate_to_source": replicate_to_source}, send_to_cluster_log=False) diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py index 4be27c328..ab91a0087 100644 --- a/simplyblock_core/models/snapshot.py +++ b/simplyblock_core/models/snapshot.py @@ -33,4 +33,5 @@ class SnapShot(BaseModel): target_replicated_snap_uuid: str = "" source_replicated_snap_uuid: str = "" next_snap_uuid: str = "" - prev_snap_uuid: str = "" \ No newline at end of file + prev_snap_uuid: str = "" + instances: list = [] \ No newline at end of file diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index d11a5a28f..47f1c8c76 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -119,6 +119,7 @@ def process_snap_replicate_finish(task, snapshot): snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) remote_snode = db.get_storage_node_by_id(remote_lv.node_id) replicate_to_source = task.function_params["replicate_to_source"] + replicate_as_snap_instance = task.function_params["replicate_as_snap_instance"] target_prev_snap = None if replicate_to_source: org_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) @@ -127,11 +128,13 @@ def process_snap_replicate_finish(task, snapshot): except KeyError as e: logger.error(e) else: - if snapshot.prev_snap_uuid: + if snapshot.snap_ref_id: try: - prev_snap = db.get_snapshot_by_id(snapshot.prev_snap_uuid) - if prev_snap.target_replicated_snap_uuid: - target_prev_snap = 
db.get_snapshot_by_id(prev_snap.target_replicated_snap_uuid) + prev_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) + for sn_inst in prev_snap.instances: + if sn_inst.lvol.node_id == remote_snode.get_id(): + target_prev_snap = sn_inst + break except KeyError as e: logger.error(e) @@ -167,14 +170,6 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot_uuid = str(uuid.uuid4()) - if snapshot.status == SnapShot.STATUS_IN_REPLICATION: - snapshot.status = SnapShot.STATUS_ONLINE - if replicate_to_source: - snapshot.source_replicated_snap_uuid = new_snapshot_uuid - else: - snapshot.target_replicated_snap_uuid = new_snapshot_uuid - snapshot.write_to_db() - new_snapshot = SnapShot() new_snapshot.uuid = new_snapshot_uuid new_snapshot.cluster_id = remote_snode.cluster_id @@ -187,18 +182,28 @@ def process_snap_replicate_finish(task, snapshot): new_snapshot.snap_name = snapshot.snap_name new_snapshot.blobid = remote_lv.blobid new_snapshot.created_at = int(time.time()) - if replicate_to_source: - new_snapshot.target_replicated_snap_uuid = snapshot.uuid - else: - new_snapshot.source_replicated_snap_uuid = snapshot.uuid new_snapshot.status = SnapShot.STATUS_ONLINE - if target_prev_snap: - new_snapshot.prev_snap_uuid = target_prev_snap.get_id() - target_prev_snap.next_snap_uuid = new_snapshot_uuid - target_prev_snap.write_to_db() + snapshot.instances.append(new_snapshot) + if not replicate_as_snap_instance: + if replicate_to_source: + new_snapshot.target_replicated_snap_uuid = snapshot.uuid + snapshot.source_replicated_snap_uuid = new_snapshot_uuid + else: + snapshot.target_replicated_snap_uuid = new_snapshot_uuid + new_snapshot.source_replicated_snap_uuid = snapshot.uuid + + if target_prev_snap: + new_snapshot.prev_snap_uuid = target_prev_snap.get_id() + target_prev_snap.next_snap_uuid = new_snapshot_uuid + target_prev_snap.write_to_db() new_snapshot.write_to_db() + if snapshot.status == SnapShot.STATUS_IN_REPLICATION: + snapshot.status = SnapShot.STATUS_ONLINE + + 
snapshot.write_to_db() + # delete lvol object remote_lv.bdev_stack = [] remote_lv.write_to_db() From 105a14ff524052e08380dc72b6f3f04733ed3979 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 28 Jan 2026 03:56:39 +0300 Subject: [PATCH 134/192] Add replication-trigger command to start replication for logical volumes --- simplyblock_cli/cli-reference.yaml | 7 ++ simplyblock_cli/cli.py | 7 ++ simplyblock_cli/clibase.py | 3 + .../controllers/lvol_controller.py | 81 +++++++++++++++++++ 4 files changed, 98 insertions(+) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 547d4b277..c3f19a715 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1705,6 +1705,13 @@ commands: help: "Cluster UUID" dest: cluster_id type: str + - name: replication-trigger + help: "Start replication for lvol" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str - name: "control-plane" help: "Control plane commands" aliases: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index e7ea442a8..7ccb40697 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -555,6 +555,7 @@ def init_volume(self): self.init_volume__replication_start(subparser) self.init_volume__replication_stop(subparser) self.init_volume__replication_status(subparser) + self.init_volume__replication_trigger(subparser) def init_volume__add(self, subparser): @@ -674,6 +675,10 @@ def init_volume__replication_status(self, subparser): subcommand = self.add_sub_command(subparser, 'replication-status', 'Lists replication status') subcommand.add_argument('cluster_id', help='Cluster UUID', type=str) + def init_volume__replication_trigger(self, subparser): + subcommand = self.add_sub_command(subparser, 'replication-trigger', 'Start replication for lvol') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + def init_control_plane(self): subparser = self.add_command('control-plane', 
'Control plane commands', aliases=['cp','mgmt',]) @@ -1130,6 +1135,8 @@ def run(self): ret = self.volume__replication_stop(sub_command, args) elif sub_command in ['replication-status']: ret = self.volume__replication_status(sub_command, args) + elif sub_command in ['replication-trigger']: + ret = self.volume__replication_trigger(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 245aecc77..6b3d41aa2 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -594,6 +594,9 @@ def volume__replication_stop(self, sub_command, args): def volume__replication_status(self, sub_command, args): return snapshot_controller.list_replication_tasks(args.cluster_id) + def volume__replication_trigger(self, sub_command, args): + return lvol_controller.replication_trigger(args.lvol_id) + def control_plane__add(self, sub_command, args): cluster_id = args.cluster_id cluster_ip = args.cluster_ip diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 68a18214e..6d40a2ccc 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1773,6 +1773,87 @@ def inflate_lvol(lvol_id): logger.error(f"Failed to inflate LVol: {lvol_id}") return ret +def replication_trigger(lvol_id): + # create snapshot and replicate it + db_controller = DBController() + lvol = db_controller.get_lvol_by_id(lvol_id) + node = db_controller.get_storage_node_by_id(lvol.node_id) + snapshot_controller.add(lvol_id, f"replication_{lvol_id}") + + tasks = [] + snaps = [] + out = { + "lvol": lvol, + "last_snapshot_id": None, + "last_replication_time": None, + "last_replication_duration": None, + "replicated_count": None, + "snaps": None, + "tasks": None, + } + for task in db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = 
db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + + snaps.append(snap) + tasks.append(task) + # duration = "" + # try: + # if task.status == JobSchedule.STATUS_RUNNING: + # duration = utils.strfdelta_seconds(int(time.time()) - task.function_params["start_time"]) + # elif "end_time" in task.function_params: + # duration = utils.strfdelta_seconds( + # task.function_params["end_time"] - task.function_params["start_time"]) + # except Exception as e: + # logger.error(e) + # status = task.status + # if task.canceled: + # status = "cancelled" + # replicate_to = "target" + # if "replicate_to_source" in task.function_params: + # if task.function_params["replicate_to_source"] is True: + # replicate_to = "source" + # offset = 0 + # if "offset" in task.function_params: + # offset = task.function_params["offset"] + # data.append({ + # "Task ID": task.uuid, + # "Snapshot ID": snap.uuid, + # "Size": utils.humanbytes(snap.used_size), + # "Duration": duration, + # "Offset": offset, + # "Status": status, + # "Replicate to": replicate_to, + # "Result": task.function_result, + # "Cluster ID": task.cluster_id, + # }) + + if tasks: + tasks = sorted(tasks, key=lambda x: x.date) + snaps = sorted(snaps, key=lambda x: x.creation_dt) + out["snaps"] = snaps + out["tasks"] = tasks + out["replicated_count"] = len(snaps) + last_task = tasks[-1] + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + out["last_snapshot_id"] = last_snap.get_id() + out["last_replication_time"] = last_task.updated_at + if "end_time" in last_task.function_params: + duration = utils.strfdelta_seconds( + last_task.function_params["end_time"] - last_task.function_params["start_time"]) + else: + duration = utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) + out["last_replication_duration"] = duration + + return out + def replication_start(lvol_id): 
db_controller = DBController() try: From 1bdb533bfcdbc1e852f920db1702cc173a5a61db Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 28 Jan 2026 13:59:49 +0100 Subject: [PATCH 135/192] fixed UnboundLocalError: local variable 'os_endpoint' referenced before assignment (#855) --- simplyblock_core/cluster_ops.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index 78e06ccd7..c12322494 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -313,16 +313,17 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, cluster.fabric_tcp = protocols["tcp"] cluster.fabric_rdma = protocols["rdma"] cluster.is_single_node = is_single_node - if grafana_endpoint: - cluster.grafana_endpoint = grafana_endpoint - elif ingress_host_source == "hostip": - cluster.grafana_endpoint = f"http://{dev_ip}/grafana" - graylog_endpoint = f"http://{dev_ip}/graylog" - os_endpoint = f"http://{dev_ip}/opensearch" + + if ingress_host_source == "hostip": + base = dev_ip else: - cluster.grafana_endpoint = f"http://{dns_name}/grafana" - graylog_endpoint = f"http://{dns_name}/graylog" - os_endpoint = f"http://{dns_name}/opensearch" + base = dns_name + + graylog_endpoint = f"http://{base}/graylog" + os_endpoint = f"http://{base}/opensearch" + default_grafana = f"http://{base}/grafana" + + cluster.grafana_endpoint = grafana_endpoint or default_grafana cluster.enable_node_affinity = enable_node_affinity cluster.qpair_count = qpair_count or constants.QPAIR_COUNT cluster.client_qpair_count = client_qpair_count or constants.CLIENT_QPAIR_COUNT From e09135e33db35beb3ea3ad31c7b7b84b63a15366 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 29 Jan 2026 13:49:34 +0300 Subject: [PATCH 136/192] Adds JM device fixes to main (#856) --- simplyblock_core/storage_node_ops.py | 187 +++++++++++++++------------ 1 file changed, 107 insertions(+), 80 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index adda8f4f5..55fe98fd1 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -67,30 +67,30 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names: ret = rpc_client.bdev_nvme_controller_list(name) if ret: - counter=0 - while(counter<5): - waiting = False - for controller in ret[0]["ctrlrs"]: - controller_state = controller["state"] - logger.info(f"Controller found: {name}, status: {controller_state}") - if controller_state== "failed": - #we can remove the controller only for certain, if its failed. other states are intermediate and require retry. - rpc_client.bdev_nvme_detach_controller(name) - time.sleep(2) - break - elif controller_state == "resetting" or controller_state == "deleting" or controller_state == "reconnect_is_delayed": - if counter < 5: - time.sleep(2) - waiting = True - break - else: #this should never happen. It means controller is "hanging" in an intermediate state for more than 10 seconds. usually if some io is hanging. - raise RuntimeError(f"Controller: {name}, status is {controller_state}") - if not waiting: - counter=5 - else: - counter+=1 - - #if reattach: + counter = 0 + while (counter < 5): + waiting = False + for controller in ret[0]["ctrlrs"]: + controller_state = controller["state"] + logger.info(f"Controller found: {name}, status: {controller_state}") + if controller_state== "failed": + # we can remove the controller only for certain, if its failed. other states are intermediate and require retry. 
+ rpc_client.bdev_nvme_detach_controller(name) + time.sleep(2) + break + elif controller_state == "resetting" or controller_state == "deleting" or controller_state == "reconnect_is_delayed": + if counter < 5: + time.sleep(2) + waiting = True + break + else: # this should never happen. It means controller is "hanging" in an intermediate state for more than 10 seconds. usually if some io is hanging. + raise RuntimeError(f"Controller: {name}, status is {controller_state}") + if not waiting: + counter = 5 + else: + counter += 1 + + # if reattach: # rpc_client.bdev_nvme_detach_controller(name) # time.sleep(1) @@ -101,36 +101,36 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names: db_ctrl = DBController() node = db_ctrl.get_storage_node_by_id(device.node_id) if node.active_rdma: - tr_type = "RDMA" + tr_type = "RDMA" else: - if node.active_tcp: - tr_type = "TCP" - else: - msg = "target node to connect has no active fabric." - logger.error(msg) - raise RuntimeError(msg) + if node.active_tcp: + tr_type = "TCP" + else: + msg = "target node to connect has no active fabric." 
+ logger.error(msg) + raise RuntimeError(msg) for ip in device.nvmf_ip.split(","): - ret = rpc_client.bdev_nvme_attach_controller( - name, device.nvmf_nqn, ip, device.nvmf_port, tr_type, - multipath=device.nvmf_multipath) - if not bdev_name and ret and isinstance(ret, list): + ret = rpc_client.bdev_nvme_attach_controller( + name, device.nvmf_nqn, ip, device.nvmf_port, tr_type, + multipath=device.nvmf_multipath) + if not bdev_name and ret and isinstance(ret, list): bdev_name = ret[0] - if device.nvmf_multipath: + if device.nvmf_multipath: rpc_client.bdev_nvme_set_multipath_policy(bdev_name, "active_active") if not bdev_name: - msg = "Bdev name not returned from controller attach" - logger.error(msg) - raise RuntimeError(msg) + msg = "Bdev name not returned from controller attach" + logger.error(msg) + raise RuntimeError(msg) bdev_found = False for i in range(5): - ret = rpc_client.get_bdevs(bdev_name) - if ret: + ret = rpc_client.get_bdevs(bdev_name) + if ret: bdev_found = True break - else: + else: time.sleep(1) device.release_device_connection() @@ -187,13 +187,24 @@ def _search_for_partitions(rpc_client, nvme_device): def _create_jm_stack_on_raid(rpc_client, jm_nvme_bdevs, snode, after_restart): - raid_bdev = f"raid_jm_{snode.get_id()}" - - raid_level = "1" - ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) - if not ret: - logger.error(f"Failed to create raid_jm_{snode.get_id()}") - return False + if snode.jm_device and snode.jm_device.raid_bdev: + raid_bdev = snode.jm_device.raid_bdev + if raid_bdev.startswith("raid_jm_"): + raid_level = "1" + ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) + if not ret: + logger.error(f"Failed to create raid_jm_{snode.get_id()}") + return False + else: + if len(jm_nvme_bdevs) > 1: + raid_bdev = f"raid_jm_{snode.get_id()}" + raid_level = "1" + ret = rpc_client.bdev_raid_create(raid_bdev, jm_nvme_bdevs, raid_level) + if not ret: + logger.error(f"Failed to create 
raid_jm_{snode.get_id()}") + return False + else: + raid_bdev = jm_nvme_bdevs[0] alceml_id = snode.get_id() alceml_name = f"alceml_jm_{snode.get_id()}" @@ -625,7 +636,7 @@ def _prepare_cluster_devices_on_restart(snode, clear_data=False): # prepare JM device jm_device = snode.jm_device - if jm_device is None or jm_device.status == JMDevice.STATUS_REMOVED: + if jm_device is None: return True if not jm_device or not jm_device.uuid: @@ -634,20 +645,36 @@ def _prepare_cluster_devices_on_restart(snode, clear_data=False): jm_device.status = JMDevice.STATUS_UNAVAILABLE if jm_device.jm_nvme_bdev_list: - all_bdevs_found = True - for bdev_name in jm_device.jm_nvme_bdev_list: - ret = rpc_client.get_bdevs(bdev_name) + if len(jm_device.jm_nvme_bdev_list) == 1: + ret = rpc_client.get_bdevs(jm_device.jm_nvme_bdev_list[0]) if not ret: - logger.error(f"BDev not found: {bdev_name}") - all_bdevs_found = False - break - - if all_bdevs_found: + logger.error(f"BDev not found: {jm_device.jm_nvme_bdev_list[0]}") + jm_device.status = JMDevice.STATUS_REMOVED + return True ret = _create_jm_stack_on_raid(rpc_client, jm_device.jm_nvme_bdev_list, snode, after_restart=not clear_data) if not ret: logger.error("Failed to create JM device") return False + return True + jm_bdevs_found = [] + for bdev_name in jm_device.jm_nvme_bdev_list: + ret = rpc_client.get_bdevs(bdev_name) + if ret: + logger.info(f"JM bdev found: {bdev_name}") + jm_bdevs_found.append(bdev_name) + else: + logger.error(f"JM bdev not found: {bdev_name}") + + if len(jm_bdevs_found) > 1: + ret = _create_jm_stack_on_raid(rpc_client, jm_bdevs_found, snode, after_restart=not clear_data) + if not ret: + logger.error("Failed to create JM device") + return False + else: + logger.error("Only one jm nvme bdev found, setting jm device to removed") + jm_device.status = JMDevice.STATUS_REMOVED + return True else: nvme_bdev = jm_device.nvme_bdev @@ -973,7 +1000,7 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, # Calculate pool 
count max_prov = 0 if node_config.get("max_size"): - max_prov = int(utils.parse_size(node_config.get("max_size"))) + max_prov = int(utils.parse_size(node_config.get("max_size"))) if max_prov < 0: logger.error(f"Incorrect max-prov value {max_prov}") return False @@ -1167,6 +1194,8 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, snode.active_tcp = active_tcp snode.active_rdma = active_rdma + if 'cpu_count' in node_info: + snode.cpu = node_info['cpu_count'] if 'cpu_hz' in node_info: snode.cpu_hz = node_info['cpu_hz'] if 'memory' in node_info: @@ -1174,7 +1203,6 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, if 'hugepages' in node_info: snode.hugepages = node_info['hugepages'] - snode.cpu = len(utils.hexa_to_cpu_list(spdk_cpu_mask)) snode.l_cores = l_cores or "" snode.spdk_cpu_mask = spdk_cpu_mask or "" snode.spdk_mem = minimum_hp_memory @@ -1687,8 +1715,8 @@ def restart_storage_node( if max_prov > 0: try: - max_prov = int(utils.parse_size(max_prov)) - snode.max_prov = max_prov + max_prov = int(utils.parse_size(max_prov)) + snode.max_prov = max_prov except Exception as e: logger.debug(e) logger.error(f"Invalid max_prov value: {max_prov}") @@ -1864,11 +1892,14 @@ def restart_storage_node( logger.error("Failed to set jc singleton mask") return False + node_info, _ = snode_api.info() if not snode.ssd_pcie: - node_info, _ = snode_api.info() ssds = node_info['spdk_pcie_list'] else: - ssds = snode.ssd_pcie + ssds = [] + for ssd in snode.ssd_pcie: + if ssd in node_info['spdk_pcie_list']: + ssds.append(ssd) nvme_devs = addNvmeDevices(rpc_client, snode, ssds) if not nvme_devs: @@ -1929,8 +1960,7 @@ def restart_storage_node( snode.nvme_devices.append(dev) snode.write_to_db(db_controller.kv_store) - if node_ip: - + if node_ip and len(new_devices) > 0: # prepare devices on new node if snode.num_partitions_per_dev == 0 or snode.jm_percent == 0: @@ -2210,21 +2240,13 @@ def list_storage_devices(node_id, is_json): logger.debug(remote_device) 
logger.debug("*" * 20) name = remote_device.alceml_name - status = remote_device.status - if remote_device.remote_bdev: - name = remote_device.remote_bdev - try: - org_dev = db_controller.get_storage_device_by_id(device.get_id()) - status = org_dev.status - except KeyError: - pass remote_devices.append({ "UUID": remote_device.uuid, "Name": name, "Size": utils.humanbytes(remote_device.size), "Node ID": remote_device.node_id, - "Status": status, + "Status": remote_device.status, }) for remote_jm_device in snode.remote_jm_devices: @@ -2311,9 +2333,12 @@ def shutdown_storage_node(node_id, force=False): pci_address = [] for dev in snode.nvme_devices: if dev.pcie_address not in pci_address: - ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_nvme(dev.pcie_address) - logger.debug(ret) - pci_address.append(dev.pcie_address) + try: + ret = SNodeClient(snode.api_endpoint, timeout=30, retry=1).bind_device_to_nvme(dev.pcie_address) + logger.debug(ret) + pci_address.append(dev.pcie_address) + except Exception as e: + logger.debug(e) logger.info("Setting node status to offline") set_node_status(node_id, StorageNode.STATUS_OFFLINE) @@ -3027,7 +3052,8 @@ def set_node_status(node_id, status, reconnect_on_online=True): if snode.enable_ha_jm: snode.remote_jm_devices = _connect_to_remote_jm_devs(snode) snode.write_to_db(db_controller.kv_store) - distr_controller.send_cluster_map_to_node(snode) + for device in snode.nvme_devices: + distr_controller.send_dev_status_event(device, device.status, target_node=snode) for node in db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id): if node.get_id() == snode.get_id(): @@ -3036,7 +3062,8 @@ def set_node_status(node_id, status, reconnect_on_online=True): try: node.remote_devices = _connect_to_remote_devs(node) node.write_to_db() - distr_controller.send_cluster_map_to_node(node) + for device in node.nvme_devices: + distr_controller.send_dev_status_event(device, device.status, target_node=node) except 
RuntimeError: logger.error(f'Failed to connect to remote devices from node: {node.get_id()}') continue From c4bee3fc9c310e59e4301ecf6e92a1df680be941 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 29 Jan 2026 14:03:37 +0100 Subject: [PATCH 137/192] added check for spdk_container is running (#857) --- simplyblock_web/api/internal/storage_node/kubernetes.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index 5a2fe24c7..580921d58 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -526,11 +526,18 @@ def spdk_process_kill(query: utils.RPCPortParams): def _is_pod_up(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" + container_name = "spdk-container" try: resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: if pod.metadata.name.startswith(pod_name): - return pod.status.phase == "Running" + if pod.status.phase == "Running": + cs = next((c for c in pod.status.container_statuses if c.name == container_name),None) + if cs is None: + logger.error(f"Container '{container_name}' not found in pod '{pod_name}'") + return False + if cs.state.running: + return True except ApiException as e: logger.error(f"API error: {e}") return False From 56bd031acf45aca8b1af2122afd086d795218329 Mon Sep 17 00:00:00 2001 From: "Hamdy H. 
Khader" Date: Thu, 29 Jan 2026 17:01:28 +0300 Subject: [PATCH 138/192] Refactor lvol_controller.py to improve node validation during LVol creation (#858) --- .../controllers/lvol_controller.py | 36 +++---------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 6cdbfd476..96a9a0aeb 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -140,36 +140,17 @@ def _get_next_3_nodes(cluster_id, lvol_size=0): for node in snodes: if node.is_secondary_node: # pass continue - if node.status == node.STATUS_ONLINE: - lvol_count = len(db_controller.get_lvols_by_node_id(node.get_id())) if lvol_count >= node.max_lvol: continue - - # Validate Eligible nodes for adding lvol - # snode_api = SNodeClient(node.api_endpoint) - # result, _ = snode_api.info() - # memory_free = result["memory_details"]["free"] - # huge_free = result["memory_details"]["huge_free"] - # total_node_capacity = db_controller.get_snode_size(node.get_id()) - # error = utils.validate_add_lvol_or_snap_on_node(memory_free, huge_free, node.max_lvol, lvol_size, total_node_capacity, len(node.lvols)) - # if error: - # logger.warning(error) - # continue - # + if node.lvol_sync_del(): + logger.warning(f"LVol sync delete task found on node: {node.get_id()}, skipping") + continue online_nodes.append(node) - # node_stat_list = db_controller.get_node_stats(node, limit=1000) - # combined_record = utils.sum_records(node_stat_list) node_st = { - "lvol": lvol_count+1, - # "cpu": 1 + (node.cpu * node.cpu_hz), - # "r_io": combined_record.read_io_ps, - # "w_io": combined_record.write_io_ps, - # "r_b": combined_record.read_bytes_ps, - # "w_b": combined_record.write_bytes_ps + "lvol": lvol_count+1 } - node_stats[node.get_id()] = node_st if len(online_nodes) <= 1: @@ -463,14 +444,7 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, 
use_comp, nodes = _get_next_3_nodes(cl.get_id(), lvol.size) if not nodes: return False, "No nodes found with enough resources to create the LVol" - for n in nodes: - if n.lvol_sync_del(): - logger.warning(f"LVol sync delete task found on node: {n.get_id()}, skipping") - else: - host_node = n - break - if not host_node: - return False, "No nodes found with enough resources to create the LVol" + host_node = nodes[0] s_node = db_controller.get_storage_node_by_id(host_node.secondary_node_id) attr_name = f"active_{fabric}" From 8d6652fdea1e7982b01244990353a5a9abc161fd Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Mon, 2 Feb 2026 23:25:56 +0300 Subject: [PATCH 139/192] Update rpc_client.py (#860) --- simplyblock_core/rpc_client.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index e2235b8d9..c69d0f0c7 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -305,7 +305,7 @@ def ultra21_alloc_ns_init(self, pci_addr): } return self._request2("ultra21_alloc_ns_init", params) - def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None): + def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None, eui64=None): params = { "nqn": nqn, "namespace": { @@ -322,6 +322,10 @@ def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None) if nsid: params['namespace']['nsid'] = nsid + if euid: + params['namespace']['eui64'] = eui64 + + return self._request("nvmf_subsystem_add_ns", params) def nvmf_subsystem_remove_ns(self, nqn, nsid): From 0cdf8a62d6bc27578ff8ce9b588017f93243612e Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Mon, 2 Feb 2026 23:33:53 +0300 Subject: [PATCH 140/192] Update storage_node_ops.py (#861) --- simplyblock_core/storage_node_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 55fe98fd1..5a3744fc8 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -3415,7 +3415,7 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): return False, msg logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id, eui64=hex(lvol.vuid)) for iface in snode.data_nics: if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) From 9cb7636bb4de0a63fa083141caaae4e64906e7c3 Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Tue, 3 Feb 2026 20:10:44 +0300 Subject: [PATCH 141/192] Update rpc_client.py (#864) --- simplyblock_core/rpc_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index c69d0f0c7..d1f123b72 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -322,7 +322,7 @@ def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None, if nsid: params['namespace']['nsid'] = nsid - if euid: + if eui64: params['namespace']['eui64'] = eui64 From 3671b7b79c98601364ef7c96f92de32153bf0bb7 Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:23:50 +0300 Subject: [PATCH 142/192] Update lvol_controller.py (#865) --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 96a9a0aeb..6933c774d 100644 --- 
a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -714,7 +714,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid,hex(lvol.vuid)) if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) @@ -758,7 +758,7 @@ def recreate_lvol_on_node(lvol, snode, ha_inode_self=0, ana_state=None): # if namespace_found is False: logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, hex(lvol.vuid)) # if not ret: # return False, "Failed to add bdev to subsystem" From 1d3213b865fe9783aa2431d8382b226d4d11acce Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:27:26 +0300 Subject: [PATCH 143/192] Update storage_node_ops.py (#866) --- simplyblock_core/storage_node_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 5a3744fc8..a7516f958 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -3414,7 +3414,7 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): logger.error(msg) return False, msg - logger.info("Add BDev to subsystem") + logger.info("Add BDev to subsystem "+hex(lvol.vuid)) ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id, eui64=hex(lvol.vuid)) for iface in snode.data_nics: if iface.ip4_address and lvol.fabric == iface.trtype.lower(): From 9db81561a1132d479f81f0d6b9ceac1d3c91ac3b Mon Sep 17 
00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:56:11 +0300 Subject: [PATCH 144/192] Update lvol_controller.py (#867) --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 6933c774d..4e948ead2 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -714,7 +714,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid,hex(lvol.vuid)) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid,f"{lvol.vuid:016X}") if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) @@ -758,7 +758,7 @@ def recreate_lvol_on_node(lvol, snode, ha_inode_self=0, ana_state=None): # if namespace_found is False: logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, hex(lvol.vuid)) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, f"{lvol.vuid:016X}") # if not ret: # return False, "Failed to add bdev to subsystem" From adc9e062015baadc599f828a779278c89d31fc60 Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Tue, 3 Feb 2026 21:57:38 +0300 Subject: [PATCH 145/192] Update storage_node_ops.py (#868) --- simplyblock_core/storage_node_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index a7516f958..1e505b8bc 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ 
-3414,8 +3414,8 @@ def add_lvol_thread(lvol, snode, lvol_ana_state="optimized"): logger.error(msg) return False, msg - logger.info("Add BDev to subsystem "+hex(lvol.vuid)) - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id, eui64=hex(lvol.vuid)) + logger.info("Add BDev to subsystem "+f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, nsid=lvol.ns_id, eui64=f"{lvol.vuid:016X}") for iface in snode.data_nics: if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) From 37f46afed83f4f7f347ef048042b01f3e1962ae5 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 5 Feb 2026 15:07:30 +0300 Subject: [PATCH 146/192] fix 1 --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index ccc80eeba..4aefaa8e7 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -721,7 +721,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid,f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id) if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) @@ -765,7 +765,7 @@ def recreate_lvol_on_node(lvol, snode, ha_inode_self=0, ana_state=None): # if namespace_found is False: logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, 
lvol.guid, lvol.ns_id) # if not ret: # return False, "Failed to add bdev to subsystem" From 920034ef8a6784dfd05eaa9cf2246898e2d70f40 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 5 Feb 2026 15:34:06 +0300 Subject: [PATCH 147/192] fix 1 --- simplyblock_web/api/v1/lvol.py | 2 +- simplyblock_web/api/v2/volume.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/api/v1/lvol.py b/simplyblock_web/api/v1/lvol.py index 78d58b024..31443e6c8 100644 --- a/simplyblock_web/api/v1/lvol.py +++ b/simplyblock_web/api/v1/lvol.py @@ -316,7 +316,7 @@ def replication_start(uuid): except KeyError as e: return utils.get_response_error(str(e), 404) - ret = lvol_controller.replication_start(uuid) + ret = lvol_controller.replication_trigger(uuid) return utils.get_response(ret) @bp.route('/lvol/replication_stop/', methods=['PUT']) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index a7e23a2e0..052196411 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -187,7 +187,7 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: @instance_api.post('/replication_start', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) def replication_start(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: - if not lvol_controller.replication_start(volume.get_id()): + if not lvol_controller.replication_trigger(volume.get_id()): raise ValueError('Failed to start volume snapshot replication') return Response(status_code=204) From f81e198805a8c1fb2829c3b8f4c32d1c9f9d5ba0 Mon Sep 17 00:00:00 2001 From: Waleed Mousa <32266980+wmousa@users.noreply.github.com> Date: Thu, 5 Feb 2026 14:11:01 +0100 Subject: [PATCH 148/192] Hotfix to main (#862) * Prometheus hostpath (#761) * fixed merge conflict * fixed merge conflict * increased k8s fdb memory limit (#740) * added try and except to patch_prometheus_configmap func 
(#756) * use max_size instead as hugepage memory when set (#754) * use max_size instead as hugepage memory when set * fixed linter issue * fixed type check * fixed: Remove assignment to unused variable e * fixed: Remove extraneous prefix * comment out to using max_size value as hugpages * comment out max_size * setup ramdisk mount * use the max of huge_page_memory and max_size when max_size is set * Update storage_init_job.yaml.j2 * Update storage_init_job.yaml.j2 * Update storage_init_job.yaml.j2 * removed nsenter * fixed Syntax error: end of file unexpected * run systemctl ram mount disk on host * updated core Isolation * inherit from worker pool * schedule admin control replica pod on the different worker * check and compare hugepage before and after apply_config * increased node add to 16 retry * run upto 2 replicas of the ingress controller across workers * support 2 fdb coordinator failure * sleep core isolation job for 5mins * use emptyDir memory medium as socket directory (#783) * added graylog env GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE (#782) * addd endpoint bind_device_to_nvme in kubernetes (#784) * remove fdb customParameters * removed hostpath capacity * Update mongodb.yaml * Update app_k8s.yaml * added check for spdk_container is running * removed helm chart dependency * Revert "removed helm chart dependency" This reverts commit 65ffd7b63980a4085f9b80a6e07e298f06f697fc. * Revert "added check for spdk_container is running" This reverts commit 5f0e729125cfa46f1c5bcfbc214ceeaef7b76ac9. 
* Increase total sys HP memory with a buffer .5G for each sn node and and container limit * Change hugepages memory variable from MEM_GEGA to MEM_MEGA * Change hugepages memory variable from MEM_GEGA to MEM_MEGA2 * Add fix to p2p to allow passing non-exist pci * Add option --nvme-names to select nvme devices by their namespace names (#859) and also add --format-4k to sn add-node to force format the nvme devices with 4K * bind device to spdk before formatting it * Fix type issue * Make LVOL_NVMF_PORT_START configurable via environment variable * Fix sn restart with new device - check new pcie against the current ssds list for the node - bind new devices to spdk driver * Fix storage node key error on port allow task * Handle sync delete errors in tasks_runner_sync_lvol_del.py and update task status accordingly * Update env_var * fix --------- Co-authored-by: Geoffrey Israel Co-authored-by: hamdykhader --- simplyblock_cli/cli-reference.yaml | 13 ++ simplyblock_cli/cli.py | 2 + simplyblock_cli/clibase.py | 7 +- simplyblock_core/constants.py | 4 +- .../controllers/lvol_controller.py | 4 +- .../controllers/tasks_controller.py | 2 +- simplyblock_core/scripts/charts/Chart.yaml | 1 + .../scripts/charts/templates/app_k8s.yaml | 9 +- .../templates/csi-hostpath-controller.yaml | 217 ++++++++++++++++++ .../templates/csi-hostpath-driverinfo.yaml | 4 +- .../charts/templates/csi-hostpath-node.yaml | 163 +++++++++++++ .../charts/templates/foundationdb.yaml | 4 +- simplyblock_core/scripts/charts/values.yaml | 15 +- .../services/storage_node_monitor.py | 5 +- .../services/tasks_runner_node_add.py | 2 +- .../services/tasks_runner_port_allow.py | 6 +- .../services/tasks_runner_sync_lvol_del.py | 8 +- simplyblock_core/snode_client.py | 4 + simplyblock_core/storage_node_ops.py | 33 +-- simplyblock_core/utils/__init__.py | 162 ++++++++++--- .../api/internal/storage_node/docker.py | 8 + .../api/internal/storage_node/kubernetes.py | 12 +- simplyblock_web/api/v1/storage_node.py | 10 + 
simplyblock_web/api/v2/storage_node.py | 2 + simplyblock_web/node_configure.py | 15 +- simplyblock_web/templates/Untitled-1.j2 | 0 .../oc_storage_core_isolation.yaml.j2 | 12 +- .../templates/storage_deploy_spdk.yaml.j2 | 5 +- .../templates/storage_init_job.yaml.j2 | 54 ++++- 29 files changed, 704 insertions(+), 79 deletions(-) create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml create mode 100644 simplyblock_web/templates/Untitled-1.j2 diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 3ef6d71d8..1855384bc 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -102,6 +102,14 @@ commands: required: false type: str default: "" + - name: "--nvme-names" + help: "Comma separated list of nvme namespace names like nvme0n1,nvme1n1..." + description: > + Comma separated list of nvme namespace names like nvme0n1,nvme1n1... 
+ dest: nvme_names + required: false + type: str + default: "" - name: "--force" help: "Force format detected or passed nvme pci address to 4K and clean partitions" dest: force @@ -150,6 +158,11 @@ commands: dest: partitions type: int default: 1 + - name: "--format-4k" + help: "Force format nvme devices with 4K" + dest: format_4k + type: bool + action: store_true - name: "--jm-percent" help: "Number in percent to use for JM from each device" dest: jm_percent diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 3f85be70a..ef5449e53 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -96,6 +96,7 @@ def init_storage_node__configure(self, subparser): argument = subcommand.add_argument('--pci-blocked', help='Comma separated list of PCI addresses of Nvme devices to not use for storage devices', type=str, default='', dest='pci_blocked', required=False) argument = subcommand.add_argument('--device-model', help='NVMe SSD model string, example: --model PM1628, --device-model and --size-range must be set together', type=str, default='', dest='device_model', required=False) argument = subcommand.add_argument('--size-range', help='NVMe SSD device size range separated by -, can be X(m,g,t) or bytes as integer, example: --size-range 50G-1T or --size-range 1232345-67823987, --device-model and --size-range must be set together', type=str, default='', dest='size_range', required=False) + argument = subcommand.add_argument('--nvme-names', help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1...', type=str, default='', dest='nvme_names', required=False) argument = subcommand.add_argument('--force', help='Force format detected or passed nvme pci address to 4K and clean partitions', dest='force', action='store_true') def init_storage_node__configure_upgrade(self, subparser): @@ -114,6 +115,7 @@ def init_storage_node__add_node(self, subparser): subcommand.add_argument('node_addr', help='Address of storage node api to add, like :5000', 
type=str) subcommand.add_argument('ifname', help='Management interface name', type=str) argument = subcommand.add_argument('--journal-partition', help='1: auto-create small partitions for journal on nvme devices. 0: use a separate (the smallest) nvme device of the node for journal. The journal needs a maximum of 3 percent of total available raw disk space.', type=int, default=1, dest='partitions') + argument = subcommand.add_argument('--format-4k', help='Force format nvme devices with 4K', dest='format_4k', action='store_true') if self.developer_mode: argument = subcommand.add_argument('--jm-percent', help='Number in percent to use for JM from each device', type=int, default=3, dest='jm_percent') argument = subcommand.add_argument('--data-nics', help='Storage network interface names. currently one interface is supported.', type=str, dest='data_nics', nargs='+') diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 240df3381..58f01cae3 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -103,12 +103,15 @@ def storage_node__configure(self, sub_command, args): max_prov = utils.parse_size(max_size, assume_unit='G') pci_allowed = [] pci_blocked = [] + nvme_names = [] if args.pci_allowed: pci_allowed = [str(x) for x in args.pci_allowed.split(',')] if args.pci_blocked: pci_blocked = [str(x) for x in args.pci_blocked.split(',')] if (args.device_model and not args.size_range) or (not args.device_model and args.size_range): self.parser.error("device_model and size_range must be set together") + if args.nvme_names: + nvme_names = [str(x) for x in args.nvme_names.split(',')] use_pci_allowed = bool(args.pci_allowed) use_pci_blocked = bool(args.pci_blocked) use_model_range = bool(args.device_model and args.size_range) @@ -122,7 +125,7 @@ def storage_node__configure(self, sub_command, args): return storage_ops.generate_automated_deployment_config( args.max_lvol, max_prov, sockets_to_use,args.nodes_per_socket, pci_allowed, pci_blocked, 
force=args.force, device_model=args.device_model, - size_range=args.size_range, cores_percentage=cores_percentage) + size_range=args.size_range, cores_percentage=cores_percentage, nvme_names=nvme_names) def storage_node__deploy_cleaner(self, sub_command, args): storage_ops.deploy_cleaner() @@ -150,6 +153,7 @@ def storage_node__add_node(self, sub_command, args): enable_ha_jm = args.enable_ha_jm namespace = args.namespace ha_jm_count = args.ha_jm_count + format_4k = args.format_4k try: out = storage_ops.add_node( cluster_id=cluster_id, @@ -169,6 +173,7 @@ def storage_node__add_node(self, sub_command, args): id_device_by_nqn=args.id_device_by_nqn, partition_size=args.partition_size, ha_jm_count=ha_jm_count, + format_4k=format_4k ) except Exception as e: print(e) diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index ff5bd484f..23cb100d8 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -233,7 +233,9 @@ def get_config_var(name, default=None): # ports ranges RPC_PORT_RANGE_START = 8080 -LVOL_NVMF_PORT_START = 9100 NODE_NVMF_PORT_START=9060 NODE_HUBLVOL_PORT_START=9030 FW_PORT_START = 50001 +# todo(hamdy): make it configurable: sfam-2586 +LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "") +LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100 \ No newline at end of file diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 4e948ead2..64b985994 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -714,7 +714,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid,f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, 
lvol.ns_id) if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) @@ -758,7 +758,7 @@ def recreate_lvol_on_node(lvol, snode, ha_inode_self=0, ana_state=None): # if namespace_found is False: logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, f"{lvol.vuid:016X}") + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id) # if not ret: # return False, "Failed to add bdev to subsystem" diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index a51425861..cef49e85d 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -322,7 +322,7 @@ def add_new_device_mig_task(device_id): def add_node_add_task(cluster_id, function_params): return _add_task(JobSchedule.FN_NODE_ADD, cluster_id, "", "", - function_params=function_params, max_retry=11) + function_params=function_params, max_retry=16) def get_active_node_tasks(cluster_id, node_id): diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 671f39cfa..2790e31df 100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -25,6 +25,7 @@ dependencies: - name: prometheus version: "25.18.0" repository: "https://prometheus-community.github.io/helm-charts" + condition: monitoring.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index 1626292e2..148b450e7 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -5,7 +5,7 @@ metadata: name: simplyblock-admin-control namespace: {{ .Release.Namespace }} spec: - replicas: 1 + 
replicas: 2 selector: matchLabels: app: simplyblock-admin-control @@ -21,6 +21,13 @@ spec: serviceAccountName: simplyblock-sa hostNetwork: true dnsPolicy: ClusterFirstWithHostNet + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + app: simplyblock-admin-control + topologyKey: kubernetes.io/hostname containers: - name: simplyblock-control image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml new file mode 100644 index 000000000..153c29bda --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml @@ -0,0 +1,217 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpathplugin +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["get", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csistoragecapacities"] + verbs: ["get", "list", "watch", "create", "update", "delete"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch", "update", "get", "list", "watch"] + 
- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpathplugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: csi-hostpathplugin +subjects: + - kind: ServiceAccount + name: csi-hostpathplugin-sa + namespace: {{ .Release.Namespace }} +--- +kind: StatefulSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + serviceName: "csi-hostpathplugin" + # One replica only: + # Host path driver only works when everything runs + # on a single node. + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + serviceAccountName: csi-hostpathplugin-sa + containers: + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0 + args: + - "--drivername=hostpath.csi.k8s.io" + - "--v=5" + - "--endpoint=$(CSI_ENDPOINT)" + - "--nodeid=$(KUBE_NODE_NAME)" + # end hostpath args + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - 
mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0 + args: + - --csi-address=/csi/csi.sock + - --health-port=9898 + + - name: csi-provisioner + image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 + args: + - -v=5 + - --csi-address=/csi/csi.sock + - --feature-gates=Topology=true + - --enable-capacity + - --capacity-ownerref-level=0 # pod is owner + - --node-deployment=true + - --strict-topology=true + - --immediate-topology=false + - --worker-threads=5 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + # end csi-provisioner args + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + - name: csi-resizer + image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 + args: + - -v=5 + - -csi-address=/csi/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. 
+ privileged: true + volumeMounts: + - mountPath: /csi + name: socket-dir + + volumes: + - hostPath: + path: /var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. + # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir + # end csi volumes diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml index 2a9d7d044..2f6a64b14 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -17,8 +17,8 @@ spec: podInfoOnMount: true # No attacher needed. 
attachRequired: false - storageCapacity: false - # Kubernetes may use fsGroup to change permissions and ownership + storageCapacity: true + # Kubernetes may use fsGroup to change permissions and ownership # of the volume to match user requested fsGroup in the pod's SecurityPolicy fsGroupPolicy: File \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml new file mode 100644 index 000000000..07e08f36e --- /dev/null +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml @@ -0,0 +1,163 @@ + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-hostpath-node-sa + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: csi-hostpath-node +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: csi-hostpath-node +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: csi-hostpath-node +subjects: + - kind: ServiceAccount + name: csi-hostpath-node-sa + namespace: {{ .Release.Namespace }} +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-hostpathplugin + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin +spec: + selector: + matchLabels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + template: + metadata: + labels: + app.kubernetes.io/instance: hostpath.csi.k8s.io + app.kubernetes.io/part-of: csi-driver-host-path + 
app.kubernetes.io/name: csi-hostpathplugin + app.kubernetes.io/component: plugin + spec: + serviceAccountName: csi-hostpath-node-sa + containers: + - name: node-driver-registrar + image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 + args: + - --v=5 + - --csi-address=/csi/csi.sock + - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock + securityContext: + # This is necessary only for systems with SELinux, where + # non-privileged sidecar containers cannot access unix domain socket + # created by privileged CSI driver container. + privileged: true + env: + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /registration + name: registration-dir + - mountPath: /csi-data-dir + name: csi-data-dir + + - name: hostpath + image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 + args: + - --drivername=hostpath.csi.k8s.io + - --v=5 + - --endpoint=$(CSI_ENDPOINT) + - --nodeid=$(KUBE_NODE_NAME) + - --capacity=slow=10Gi + - --capacity=fast=100Gi + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + securityContext: + privileged: true + ports: + - containerPort: 9898 + name: healthz + protocol: TCP + livenessProbe: + failureThreshold: 5 + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 10 + timeoutSeconds: 3 + periodSeconds: 2 + volumeMounts: + - mountPath: /csi + name: socket-dir + - mountPath: /var/lib/kubelet/pods + mountPropagation: Bidirectional + name: mountpoint-dir + - mountPath: /var/lib/kubelet/plugins + mountPropagation: Bidirectional + name: plugins-dir + - mountPath: /csi-data-dir + name: csi-data-dir + - mountPath: /dev + name: dev-dir + - name: liveness-probe + volumeMounts: + - mountPath: /csi + name: socket-dir + image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 + args: + - 
--csi-address=/csi/csi.sock + - --health-port=9898 + + volumes: + - hostPath: + path: /var/lib/kubelet/plugins/csi-hostpath + type: DirectoryOrCreate + name: socket-dir + - hostPath: + path: /var/lib/kubelet/pods + type: DirectoryOrCreate + name: mountpoint-dir + - hostPath: + path: /var/lib/kubelet/plugins_registry + type: Directory + name: registration-dir + - hostPath: + path: /var/lib/kubelet/plugins + type: Directory + name: plugins-dir + - hostPath: + # 'path' is where PV data is persisted on host. + # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot + path: /var/lib/csi-hostpath-data/ + type: DirectoryOrCreate + name: csi-data-dir + - hostPath: + path: /dev + type: Directory + name: dev-dir diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml index 5020c2fea..96d1c1979 100644 --- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml +++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml @@ -53,7 +53,7 @@ spec: - /manager args: - "--health-probe-bind-address=:9443" - image: foundationdb/fdb-kubernetes-operator:v2.13.0 + image: foundationdb/fdb-kubernetes-operator:v2.18.0 name: manager env: - name: WATCH_NAMESPACE @@ -230,6 +230,8 @@ spec: processGroupIDLabels: - foundationdb.org/fdb-process-group-id minimumUptimeSecondsForBounce: 60 + databaseConfiguration: + redundancy_mode: triple processCounts: {{- if .Values.foundationdb.multiAZ }} cluster_controller: 1 diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 7272846d7..f6837654b 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -20,7 +20,7 @@ image: pullPolicy: "Always" ports: - lvolNvmfPortStart: 9100 + lvolNvmfPortStart: storageclass: allowedTopologyZones: [] @@ -58,7 +58,7 @@ opensearch: persistence: enabled: true storageClass: 
local-hostpath - size: 20Gi + size: 10Gi resources: requests: @@ -185,10 +185,21 @@ ingress: controller: hostNetwork: true dnsPolicy: ClusterFirstWithHostNet + replicaCount: 2 service: type: ClusterIP extraArgs: tcp-services-configmap: "simplyblock/simplyblock-tcp-services" + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - ingress + topologyKey: "kubernetes.io/hostname" nodeSelector: {} diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py index 24079d51c..bcc737bc3 100644 --- a/simplyblock_core/services/storage_node_monitor.py +++ b/simplyblock_core/services/storage_node_monitor.py @@ -431,17 +431,20 @@ def loop_for_node(snode): if cluster.status == Cluster.STATUS_IN_ACTIVATION: logger.info(f"Cluster status is: {cluster.status}, skipping monitoring") continue - + logger.info(f"Looping for cluster {cluster_id}") nodes = db.get_storage_nodes_by_cluster_id(cluster_id) for node in nodes: node_id = node.get_id() if node_id not in threads_maps or threads_maps[node_id].is_alive() is False: + logger.info(f"Creating thread for node {node_id}") t = threading.Thread(target=loop_for_node, args=(node,)) t.start() threads_maps[node_id] = t + logger.debug(threads_maps[node_id]) try: update_cluster_status(cluster_id) + logger.debug("Iteration has been finished...") except Exception: logger.error("Error while updating cluster status") time.sleep(constants.NODE_MONITOR_INTERVAL_SEC) diff --git a/simplyblock_core/services/tasks_runner_node_add.py b/simplyblock_core/services/tasks_runner_node_add.py index 819e611d7..263f2c73e 100644 --- a/simplyblock_core/services/tasks_runner_node_add.py +++ b/simplyblock_core/services/tasks_runner_node_add.py @@ -76,4 +76,4 @@ def process_task(task): delay_seconds *= 2 time.sleep(delay_seconds) - time.sleep(constants.TASK_EXEC_INTERVAL_SEC) + time.sleep(30) 
diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py index 1417c3a62..fd706b18a 100644 --- a/simplyblock_core/services/tasks_runner_port_allow.py +++ b/simplyblock_core/services/tasks_runner_port_allow.py @@ -26,9 +26,9 @@ def exec_port_allow_task(task): task.write_to_db(db.kv_store) return - node = db.get_storage_node_by_id(task.node_id) - - if not node: + try: + node = db.get_storage_node_by_id(task.node_id) + except KeyError: task.function_result = "node not found" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py index ce41806a4..bce8692c3 100644 --- a/simplyblock_core/services/tasks_runner_sync_lvol_del.py +++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py @@ -71,8 +71,12 @@ if "code" in err and err["code"] == -19: logger.error(f"Sync delete completed with error: {err}") else: - logger.error( - f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + msg = f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}" + logger.error(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + continue task.function_result = f"bdev {lvol_bdev_name} deleted" task.status = JobSchedule.STATUS_DONE diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py index 53ffe0583..efb724355 100644 --- a/simplyblock_core/snode_client.py +++ b/simplyblock_core/snode_client.py @@ -157,6 +157,10 @@ def bind_device_to_nvme(self, device_pci): params = {"device_pci": device_pci} return self._request("POST", "bind_device_to_nvme", params) + def format_device_with_4k(self, device_pci): + params = {"device_pci": device_pci} + return self._request("POST", "format_device_with_4k", params) + def bind_device_to_spdk(self, device_pci): params = {"device_pci": 
device_pci} return self._request("POST", "bind_device_to_spdk", params) diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py index 1e505b8bc..8124170ae 100644 --- a/simplyblock_core/storage_node_ops.py +++ b/simplyblock_core/storage_node_ops.py @@ -913,7 +913,8 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, small_bufsize=0, large_bufsize=0, num_partitions_per_dev=0, jm_percent=0, enable_test_device=False, namespace=None, enable_ha_jm=False, cr_name=None, cr_namespace=None, cr_plural=None, - id_device_by_nqn=False, partition_size="", ha_jm_count=3): + id_device_by_nqn=False, partition_size="", ha_jm_count=3, format_4k=False): + snode_api = SNodeClient(node_addr) node_info, _ = snode_api.info() if node_info.get("nodes_config") and node_info["nodes_config"].get("nodes"): @@ -1098,6 +1099,9 @@ def add_node(cluster_id, node_addr, iface_name, data_nics_list, l_cores = node_config.get("l-cores") spdk_cpu_mask = node_config.get("cpu_mask") for ssd in ssd_pcie: + if format_4k: + snode_api.format_device_with_4k(ssd) + snode_api.bind_device_to_spdk(ssd) snode_api.bind_device_to_spdk(ssd) try: results, err = snode_api.spdk_process_start( @@ -1783,7 +1787,14 @@ def restart_storage_node( results = None try: if new_ssd_pcie and type(new_ssd_pcie) is list: - snode.ssd_pcie.extend(new_ssd_pcie) + for new_ssd in new_ssd_pcie: + if new_ssd not in snode.ssd_pcie: + try: + snode_api.bind_device_to_spdk(new_ssd) + except Exception as e: + logger.error(e) + snode.ssd_pcie.append(new_ssd) + fdb_connection = cluster.db_connection results, err = snode_api.spdk_process_start( snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection, @@ -2731,7 +2742,7 @@ def upgrade_automated_deployment_config(): def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0, force=False, device_model="", size_range=""): + cores_percentage=0, 
force=False, device_model="", size_range="", nvme_names=None, k8s=False): # we need minimum of 6 VPCs. RAM 4GB min. Plus 0.2% of the storage. total_cores = os.cpu_count() or 0 if total_cores < 6: @@ -2743,7 +2754,7 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod nodes_config, system_info = utils.generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, cores_percentage, force=force, - device_model=device_model, size_range=size_range) + device_model=device_model, size_range=size_range, nvme_names=nvme_names) if not nodes_config or not nodes_config.get("nodes"): return False utils.store_config_file(nodes_config, constants.NODES_CONFIG_FILE, create_read_only_file=True) @@ -2755,7 +2766,8 @@ def generate_automated_deployment_config(max_lvol, max_prov, sockets_to_use, nod for node_config in nodes_config["nodes"]: numa = node_config["socket"] huge_page_memory_dict[numa] = huge_page_memory_dict.get(numa, 0) + node_config["huge_page_memory"] - utils.create_rpc_socket_mount() + if not k8s: + utils.create_rpc_socket_mount() # for numa, huge_page_memory in huge_page_memory_dict.items(): # num_pages = huge_page_memory // (2048 * 1024) # utils.set_hugepages_if_needed(numa, num_pages) @@ -2855,15 +2867,8 @@ def deploy_cleaner(): scripts.deploy_cleaner() -def clean_devices(config_path): - with open(config_path) as f: - cfg = json.load(f) - ssd_pcis = [ - pci - for node in cfg.get("nodes", []) - for pci in node.get("ssd_pcis", []) - ] - utils.clean_devices(ssd_pcis) +def clean_devices(config_path, format=True, force=False): + utils.clean_devices(config_path, format=format, force=force) def get_host_secret(node_id): diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 47f3c16e8..ac38b54af 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -1,4 +1,5 @@ # coding=utf-8 +import glob import json import logging import math @@ -11,8 
+12,8 @@ import uuid import time from datetime import datetime, timezone +from typing import Union, Any, Optional, Tuple, List, Dict, Iterable from docker import DockerClient -from typing import Union, Any, Optional, Tuple, List, Dict from kubernetes import client, config from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \ V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \ @@ -743,6 +744,8 @@ def first_six_chars(s: str) -> str: If the string is shorter than six characters, returns the entire string. """ return s[:6] + + def nearest_upper_power_of_2(n): # Check if n is already a power of 2 if (n & (n - 1)) == 0: @@ -1264,9 +1267,10 @@ def get_nvme_pci_devices(): return [], [] -def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range): +def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range, nvme_names): pci_addresses, blocked_devices = get_nvme_pci_devices() ssd_pci_set = set(pci_addresses) + claim_devices_to_nvme() # Normalize SSD PCI addresses and user PCI list if pci_allowed: @@ -1278,10 +1282,10 @@ def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range): # Check for unmatched addresses unmatched = user_pci_set - ssd_pci_set if unmatched: - logger.error(f"Invalid PCI addresses: {', '.join(unmatched)}") - return [] - - pci_addresses = list(user_pci_set) + logger.warn(f"Invalid PCI addresses: {', '.join(unmatched)}") + pci_addresses = user_pci_set & ssd_pci_set + else: + pci_addresses = list(user_pci_set) for pci in pci_addresses: pci_utils.ensure_driver(pci, 'nvme', override=True) logger.debug(f"Found nvme devices are {pci_addresses}") @@ -1289,6 +1293,9 @@ def detect_nvmes(pci_allowed, pci_blocked, device_model, size_range): pci_addresses = query_nvme_ssd_by_model_and_size(device_model, size_range) logger.debug(f"Found nvme devices are {pci_addresses}") pci_allowed = pci_addresses + elif nvme_names: + pci_addresses = 
query_nvme_ssd_by_namespace_names(nvme_names) + pci_allowed = pci_addresses elif pci_blocked: user_pci_set = set( addr if len(addr.split(":")[0]) == 4 else f"0000:{addr}" @@ -1351,10 +1358,10 @@ def get_core_indexes(core_to_index, list_of_cores): def build_unisolated_stride( - all_cores: List[int], - num_unisolated: int, - client_qpair_count: int, - pool_stride: int = 2, + all_cores: List[int], + num_unisolated: int, + client_qpair_count: int, + pool_stride: int = 2, ) -> List[int]: """ Build a list of 'unisolated' CPUs by picking from per-qpair pools. @@ -1392,7 +1399,7 @@ def build_unisolated_stride( # Build pools pool_size = math.ceil(total / client_qpair_count) - pools = [cores[i * pool_size : min((i + 1) * pool_size, total)] for i in range(client_qpair_count)] + pools = [cores[i * pool_size: min((i + 1) * pool_size, total)] for i in range(client_qpair_count)] pools = [p for p in pools if p] # drop empties # Per-pool index (within each pool) @@ -1455,8 +1462,7 @@ def generate_core_allocation(cores_by_numa, sockets_to_use, nodes_per_socket, co continue all_cores = sorted(cores_by_numa[numa_node]) num_unisolated = calculate_unisolated_cores(all_cores, cores_percentage) - unisolated = build_unisolated_stride(all_cores,num_unisolated,constants.CLIENT_QPAIR_COUNT) - + unisolated = build_unisolated_stride(all_cores, num_unisolated, constants.CLIENT_QPAIR_COUNT) available_cores = [c for c in all_cores if c not in unisolated] q1 = len(available_cores) // 4 @@ -1597,7 +1603,7 @@ def regenerate_config(new_config, old_config, force=False): def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_allowed, pci_blocked, - cores_percentage=0, force=False, device_model="", size_range=""): + cores_percentage=0, force=False, device_model="", size_range="", nvme_names=None): system_info = {} nodes_config: dict = {"nodes": []} @@ -1605,7 +1611,7 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a 
validate_sockets(sockets_to_use, cores_by_numa) logger.debug(f"Cores by numa {cores_by_numa}") nics = detect_nics() - nvmes = detect_nvmes(pci_allowed, pci_blocked, device_model, size_range) + nvmes = detect_nvmes(pci_allowed, pci_blocked, device_model, size_range, nvme_names) if not nvmes: logger.error( "There are no enough SSD devices on system, you may run 'sbctl sn clean-devices', to clean devices stored in /etc/simplyblock/sn_config_file") @@ -1755,6 +1761,29 @@ def generate_configs(max_lvol, max_prov, sockets_to_use, nodes_per_socket, pci_a return final_config, system_info +def get_nvme_name_from_pci(pci_address): + # Search for the PCI address in the sysfs tree for NVMe devices + path = f"/sys/bus/pci/devices/{pci_address}/nvme/nvme*" + matches = glob.glob(path) + + if matches: + # returns 'nvme0' + return os.path.basename(matches[0]) + return None + + +def format_device_with_4k(pci_device): + try: + nvme_device = get_nvme_name_from_pci(pci_device) + nvme_device_path = f"/dev/{nvme_device}n1" + clean_partitions(nvme_device_path) + nvme_json_string = get_idns(nvme_device_path) + lbaf_id = find_lbaf_id(nvme_json_string, 0, 12) + format_nvme_device(nvme_device_path, lbaf_id) + except Exception as e: + logger.error(f"Failed to format device with 4K {e}") + + def set_hugepages_if_needed(node, hugepages_needed, page_size_kb=2048): """Set hugepages for a specific NUMA node if current number is less than needed.""" hugepage_path = f"/sys/devices/system/node/node{node}/hugepages/hugepages-{page_size_kb}kB/nr_hugepages" @@ -2623,7 +2652,34 @@ def get_idns(nvme_device: str): return "Error: The 'nvme' command was not found. Is 'nvme-cli' installed?" +def is_namespace_4k_from_nvme_list(device_path: str) -> bool: + """ + Returns True if nvme list JSON shows SectorSize == 4096 for the given DevicePath + (e.g. '/dev/nvme3n1'). 
+ """ + try: + out = subprocess.check_output(["nvme", "list", "--output-format", "json"], text=True) + data = json.loads(out) + + for dev in data.get("Devices", []): + if dev.get("DevicePath") == device_path: + return int(dev.get("SectorSize", 0)) == 4096 + + # Not found in list + return False + + except subprocess.CalledProcessError: + print("Error: nvme list failed") + return False + except (ValueError, json.JSONDecodeError) as e: + print(f"Error parsing nvme list output: {e}") + return False + + def format_nvme_device(nvme_device: str, lbaf_id: int): + if is_namespace_4k_from_nvme_list(nvme_device): + logger.debug(f"Device {nvme_device} already formatted with 4K...skipping") + return command = ['nvme', 'format', nvme_device, f"--lbaf={lbaf_id}", '--force'] print(" ".join(command)) try: @@ -2720,9 +2776,58 @@ def query_nvme_ssd_by_model_and_size(model: str, size_range: str) -> list: return pci_lst -def clean_devices(nvme_devices_list): - for pci in nvme_devices_list: - pci_utils.ensure_driver(pci, 'nvme') +def query_nvme_ssd_by_namespace_names(nvme_names: Iterable[str]) -> List[str]: + """ + Match NVMe devices by namespace names (e.g. nvme0n1, nvme1n1) using nvme list -v JSON output. + Returns a de-duplicated list of PCI addresses (e.g. 0000:00:03.0). 
+ """ + nvme_names = list(nvme_names or []) + if not nvme_names: + print("No NVMe device names specified.") + return [] + + wanted = set(nvme_names) + + json_string = get_nvme_list_verbose() # should return the JSON string shown in your example + data = json.loads(json_string) + + out: List[str] = [] + seen = set() + + for dev in data.get("Devices", []): + for subsys in dev.get("Subsystems", []): + for ctrl in subsys.get("Controllers", []): + addr = ctrl.get("Address") + for ns in ctrl.get("Namespaces", []) or []: + ns_name = ns.get("NameSpace") # <-- exact key in your JSON + if ns_name in wanted and addr and addr not in seen: + seen.add(addr) + out.append(addr) + break + + return out + + +def claim_devices_to_nvme(config_path=""): + config_path = config_path or constants.NODES_CONFIG_FILE + nvme_devices_list = [] + try: + with open(config_path) as f: + cfg = json.load(f) + nvme_devices_list = [ + pci + for node in cfg.get("nodes", []) + for pci in node.get("ssd_pcis", []) + ] + for pci in nvme_devices_list: + pci_utils.ensure_driver(pci, 'nvme') + except Exception as e: + print(f"An unexpected error occurred: {e}") + return nvme_devices_list + + +def clean_devices(config_path, format, force): + nvme_devices_list = claim_devices_to_nvme(config_path) try: json_string = get_nvme_list_verbose() data = json.loads(json_string) @@ -2741,16 +2846,19 @@ def clean_devices(nvme_devices_list): "NAMESPACE": controller.get("Namespaces")[0].get("NameSpace") }) nvme_devices += f"/dev/{controller.get('Namespaces')[0].get('NameSpace')} " - logger.warning(f"Formating Nvme devices {nvme_devices}") - answer = input("Type YES/Y to continue: ").strip().lower() - if answer not in ("yes", "y"): - logger.warning("Aborted by user.") - exit(1) + if format: + logger.warning(f"Formating Nvme devices {nvme_devices}") + if not force: + answer = input("Type YES/Y to continue: ").strip().lower() + if answer not in ("yes", "y"): + logger.warning("Aborted by user.") + exit(1) + + for mapping in 
controllers_list: + if mapping['PCI_Address'] in nvme_devices_list: + nvme_device_path = f"/dev/{mapping['NAMESPACE']}" + clean_partitions(nvme_device_path) - for mapping in controllers_list: - if mapping['PCI_Address'] in nvme_devices_list: - nvme_device_path = f"/dev/{mapping['NAMESPACE']}" - clean_partitions(nvme_device_path) except json.JSONDecodeError as e: logger.error(f"Error decoding JSON: {e}") diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index 51090b9b2..6eb95a5f0 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -18,6 +18,7 @@ from simplyblock_core import scripts, constants, shell_utils, utils as core_utils import simplyblock_core.utils.pci as pci_utils +import simplyblock_core.utils as init_utils from simplyblock_web import utils, node_utils logger = core_utils.get_logger(__name__) @@ -538,6 +539,13 @@ def delete_gpt_partitions_for_dev(body: utils.DeviceParams): SYSTEM_ID = CLOUD_INFO["id"] +@api.post('/format_device_with_4k') +def format_device_with_4k(body: utils.DeviceParams): + pci_utils.ensure_driver(body.device_pci, 'nvme') + init_utils.format_device_with_4k(body.device_pci) + return utils.get_response(True) + + @api.post('/bind_device_to_spdk') def bind_device_to_spdk(body: utils.DeviceParams): device_path = pci_utils.device(body.device_pci) diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py index 580921d58..65ad28a3b 100644 --- a/simplyblock_web/api/internal/storage_node/kubernetes.py +++ b/simplyblock_web/api/internal/storage_node/kubernetes.py @@ -526,18 +526,11 @@ def spdk_process_kill(query: utils.RPCPortParams): def _is_pod_up(rpc_port, cluster_id): k8s_core_v1 = core_utils.get_k8s_core_client() pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}" - container_name = "spdk-container" try: resp = 
k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace()) for pod in resp.items: if pod.metadata.name.startswith(pod_name): - if pod.status.phase == "Running": - cs = next((c for c in pod.status.container_statuses if c.name == container_name),None) - if cs is None: - logger.error(f"Container '{container_name}' not found in pod '{pod_name}'") - return False - if cs.state.running: - return True + return pod.status.phase == "Running" except ApiException as e: logger.error(f"API error: {e}") return False @@ -679,6 +672,7 @@ def is_alive(): def spdk_proxy_restart(query: utils.RPCPortParams): return utils.get_response(True) +api.post('/bind_device_to_nvme')(snode_ops.bind_device_to_nvme) api.post('/bind_device_to_spdk')(snode_ops.bind_device_to_spdk) @@ -686,3 +680,5 @@ def spdk_proxy_restart(query: utils.RPCPortParams): api.get('/ifc_is_roce')(snode_ops.ifc_is_roce) +api.post('/format_device_with_4k')(snode_ops.format_device_with_4k) + diff --git a/simplyblock_web/api/v1/storage_node.py b/simplyblock_web/api/v1/storage_node.py index f3ec2fbcd..b3f0925bf 100644 --- a/simplyblock_web/api/v1/storage_node.py +++ b/simplyblock_web/api/v1/storage_node.py @@ -253,6 +253,15 @@ def storage_node_add(): if 'ha_jm_count' in req_data: ha_jm_count = int(req_data['ha_jm_count']) + format_4k = False + param = req_data.get('format_4k') + if param: + if isinstance(param, bool): + format_4k = param + elif isinstance(param, str): + format_4k = param == "true" + + tasks_controller.add_node_add_task(cluster_id, { "cluster_id": cluster_id, "node_addr": node_addr, @@ -269,6 +278,7 @@ def storage_node_add(): "namespace": namespace, "enable_ha_jm": not disable_ha_jm, "ha_jm_count": ha_jm_count, + "format_4k": format_4k }) return utils.get_response(True) diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py index f3d7bd33c..7d27ecc5e 100644 --- a/simplyblock_web/api/v2/storage_node.py +++ b/simplyblock_web/api/v2/storage_node.py @@ -51,6 +51,7 @@ class 
StorageNodeParams(BaseModel): cr_namespace: str cr_plural: str ha_jm_count: int = Field(3) + format_4k: bool = Field(False) @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}}) @@ -77,6 +78,7 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams): 'cr_namespace': parameters.cr_namespace, 'cr_plural': parameters.cr_plural, "ha_jm_count": parameters.ha_jm_count, + "format_4k": parameters.format_4k, } ) if not task_id_or_false: diff --git a/simplyblock_web/node_configure.py b/simplyblock_web/node_configure.py index ffb05915d..ff5a2434d 100755 --- a/simplyblock_web/node_configure.py +++ b/simplyblock_web/node_configure.py @@ -151,6 +151,14 @@ def parse_arguments() -> argparse.Namespace: dest='size_range', required=False ) + parser.add_argument( + '--nvme-devices', + help='Comma separated list of nvme namespace names like nvme0n1,nvme1n1...', + type=str, + default='', + dest='nvme_names', + required=False + ) return parser.parse_args() def validate_arguments(args: argparse.Namespace) -> None: @@ -235,11 +243,14 @@ def main() -> None: # Process PCI device filters pci_allowed: List[str] = [] pci_blocked: List[str] = [] + nvme_names: List[str] = [] if args.pci_allowed: pci_allowed = [pci.strip() for pci in args.pci_allowed.split(',') if pci.strip()] if args.pci_blocked: pci_blocked = [pci.strip() for pci in args.pci_blocked.split(',') if pci.strip()] + if args.nvme_names: + nvme_names = [nvme_name.strip() for nvme_name in args.nvme_names.split(',') if nvme_name.strip()] # Generate the deployment configuration generate_automated_deployment_config( @@ -252,7 +263,9 @@ def main() -> None: cores_percentage=args.cores_percentage, force=args.force, device_model=args.device_model, - size_range=args.size_range + size_range=args.size_range, + nvme_names=nvme_names, + k8s=True ) logger.info("create RPC socket mount") diff --git a/simplyblock_web/templates/Untitled-1.j2 
b/simplyblock_web/templates/Untitled-1.j2 new file mode 100644 index 000000000..e69de29bb diff --git a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 index 74f66721d..85bfd0f7b 100644 --- a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 +++ b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 @@ -74,7 +74,7 @@ spec: operator: In values: - worker - - worker-isolated + - worker-isolated-{{ HOSTNAME }} nodeSelector: matchLabels: kubernetes.io/hostname: {{ HOSTNAME }} @@ -87,7 +87,7 @@ spec: metadata: name: worker-isolated-{{ HOSTNAME }} labels: - machineconfiguration.openshift.io/role: worker-isolated + machineconfiguration.openshift.io/role: worker-isolated-{{ HOSTNAME }} spec: kernelArguments: - "nohz_full=${ISOLATED_CORES}" @@ -102,11 +102,11 @@ spec: apiVersion: machineconfiguration.openshift.io/v1 kind: KubeletConfig metadata: - name: set-static-cpu-manager + name: set-static-cpu-manager-{{ HOSTNAME }} spec: machineConfigPoolSelector: matchLabels: - machineconfiguration.openshift.io/role: worker-isolated + machineconfiguration.openshift.io/role: worker-isolated-{{ HOSTNAME }} kubeletConfig: cpuManagerPolicy: static cpuManagerReconcilePeriod: 5s @@ -117,5 +117,5 @@ spec: echo "[INFO] Marking node as configured." touch "$MARKER" - echo "[INFO] Node is rebooting. Sleeping indefinitely to stop pipeline..." - sleep infinity + echo "[INFO] Node is rebooting. Sleeping for 5 minutes to stop pipeline gracefully..." 
+ sleep 300 \ No newline at end of file diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index ec92c850f..8e2e40f34 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -24,8 +24,9 @@ spec: operator: Exists volumes: - name: socket-dir - hostPath: - path: /mnt/ramdisk + emptyDir: + medium: Memory + sizeLimit: 1Gi - name: host-sys hostPath: path: /sys diff --git a/simplyblock_web/templates/storage_init_job.yaml.j2 b/simplyblock_web/templates/storage_init_job.yaml.j2 index 074f501b5..2d59571c7 100644 --- a/simplyblock_web/templates/storage_init_job.yaml.j2 +++ b/simplyblock_web/templates/storage_init_job.yaml.j2 @@ -19,6 +19,9 @@ spec: operator: Exists volumes: + - name: etc-systemd + hostPath: + path: /etc/systemd/ - name: host-proc hostPath: path: /proc @@ -27,8 +30,9 @@ spec: image: simplyblock/ubuntu-tools:22.04 securityContext: privileged: true - volumeMounts: + - name: etc-systemd + mountPath: /etc/systemd/ - name: host-proc mountPath: /proc command: ["/bin/sh", "-c"] @@ -37,7 +41,10 @@ spec: set -e echo "--- Starting init setup ---" - + + HUGEPAGES_BEFORE=$(grep HugePages_Total /proc/meminfo | awk '{print $2}') + echo "[INFO] Hugepages before: $HUGEPAGES_BEFORE" + NODE_IP=$(ip route get 1.1.1.1 | grep -oE 'src [0-9.]+' | awk '{print $2}') echo "Detected node IP: $NODE_IP" @@ -53,6 +60,47 @@ spec: OS_ID="$(cat /proc/version | awk '{print $3}' | awk -F'-' '{print $NF}')" if [ "$OS_ID" != "talos" ]; then + + echo "--- Creating RAM disk systemd unit on host ---" + + + UNIT_PATH="/etc/systemd/system/var-mnt-ramdisk.mount" + + echo "Writing systemd unit to $UNIT_PATH" + + + cat < "$UNIT_PATH" + [Unit] + Description=1G RAM disk at /var/mnt/ramdisk + After=local-fs-pre.target + Before=local-fs.target + + [Mount] + What=tmpfs + Where=/var/mnt/ramdisk + Type=tmpfs + Options=size=1G,mode=1777 + + [Install] + 
WantedBy=local-fs.target + EOF + + echo "Starting RAM disk mounting." + nsenter --target 1 --mount --uts --ipc --net --pid -- /bin/sh -c ' + if command -v systemctl >/dev/null 2>&1; then + echo "Reloading systemd..." + systemctl daemon-reload || echo "systemd reload failed" + + echo "Enabling mount unit..." + systemctl enable var-mnt-ramdisk.mount || echo "enable failed" + + echo "Starting mount unit..." + systemctl start var-mnt-ramdisk.mount || echo "start failed (check logs or unit file)" + else + echo "systemctl not found; skipping RAM disk mounting" + fi + ' + HUGEPAGES_AFTER=$(grep HugePages_Total /proc/meminfo | awk '{print $2}') echo "[INFO] Hugepages after: $HUGEPAGES_AFTER" @@ -68,7 +116,7 @@ spec: fi else echo "Talos detected - skipping nsenter and kubelet restart." - echo "Use 'talosctl service kubelet restart -n $NODE_IP' to restart the node kubelet" + echo "Use '\''talosctl service kubelet restart -n $NODE_IP'\'' to restart the node kubelet" fi echo "--- Init setup complete ---" From 745eeb520294699d862b676380ba8ae6125ef14c Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Thu, 5 Feb 2026 15:27:27 +0100 Subject: [PATCH 149/192] skip rendering prometheus configmap on helm upgrade (#869) * avoid rendering prometheus configmap on helm upgrade * updated prometheus configmap patch --- simplyblock_core/cluster_ops.py | 6 +++--- .../charts/templates/monitoring_configmap.yaml | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index c12322494..0e1d2e2b2 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -354,8 +354,10 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, elif mode == "kubernetes": logger.info("Retrieving foundationdb connection string...") fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE) - db_connection = fdb_cluster_string + + logger.info("Patching 
prometheus configmap...") + utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) if not disable_monitoring: if ingress_host_source == "hostip": @@ -367,8 +369,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass, _add_graylog_input(graylog_endpoint, monitoring_secret) _create_update_user(cluster.uuid, cluster.grafana_endpoint, monitoring_secret, cluster.secret) - if mode == "kubernetes": - utils.patch_prometheus_configmap(cluster.uuid, cluster.secret) cluster.db_connection = db_connection cluster.status = Cluster.STATUS_UNREADY diff --git a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml index 497760180..7f621fbb1 100644 --- a/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml +++ b/simplyblock_core/scripts/charts/templates/monitoring_configmap.yaml @@ -1,12 +1,17 @@ - +{{- $name := printf "%s-simplyblock-prometheus-config" .Release.Name -}} +{{- $existing := (lookup "v1" "ConfigMap" .Release.Namespace $name) -}} apiVersion: v1 kind: ConfigMap metadata: - name: {{ .Release.Name }}-simplyblock-prometheus-config + name: {{ $name }} labels: app: simplyblock-prometheus namespace: {{ .Release.Namespace }} data: + {{- if $existing }} + prometheus.yml: | +{{ index $existing.data "prometheus.yml" | indent 4 }} + {{- else }} prometheus.yml: | global: scrape_interval: 30s @@ -14,7 +19,6 @@ data: monitor: 'codelab-monitor' scrape_configs: - - job_name: 'cluster_metrics' static_configs: - targets: ['simplyblock-webappapi:5000'] @@ -23,6 +27,7 @@ data: basic_auth: username: password: + {{- end }} --- apiVersion: v1 From ff20e2efd78998a32401dbe82e1b8aab47266a4e Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 6 Feb 2026 14:43:49 +0300 Subject: [PATCH 150/192] Fix 1 --- .../controllers/lvol_controller.py | 88 ++++++++++++------- simplyblock_web/api/v2/dtos.py | 4 +- simplyblock_web/api/v2/volume.py | 3 +- 3 files changed, 62 insertions(+), 33 
deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 4aefaa8e7..1a4c35cc4 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1256,6 +1256,62 @@ def list_lvols_mem(is_json, is_csv): return utils.print_table(data) +def get_replication_info(lvol_id_or_name): + db_controller = DBController() + lvol = None + for lv in db_controller.get_lvols(): # pass + if lv.get_id() == lvol_id_or_name or lv.lvol_name == lvol_id_or_name: + lvol = lv + break + + if not lvol: + logger.error(f"LVol id or name not found: {lvol_id_or_name}") + return False + + tasks = [] + snaps = [] + out = { + "last_snapshot_id": None, + "last_replication_time": None, + "last_replication_duration": None, + "replicated_count": None, + "snaps": None, + "tasks": None, + } + node = db_controller.get_storage_node_by_id(lvol.node_id) + for task in db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol.get_id(): + continue + snaps.append(snap) + tasks.append(task) + + if tasks: + tasks = sorted(tasks, key=lambda x: x.date) + snaps = sorted(snaps, key=lambda x: x.creation_dt) + out["snaps"] = snaps + out["tasks"] = tasks + out["replicated_count"] = len(snaps) + last_task = tasks[-1] + last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) + out["last_snapshot_id"] = last_snap.get_id() + out["last_replication_time"] = last_task.updated_at + if "end_time" in last_task.function_params: + duration = utils.strfdelta_seconds( + last_task.function_params["end_time"] - last_task.function_params["start_time"]) + else: + duration = utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) + 
out["last_replication_duration"] = duration + + return out + + def get_lvol(lvol_id_or_name, is_json): db_controller = DBController() lvol = None @@ -1272,6 +1328,7 @@ def get_lvol(lvol_id_or_name, is_json): del data['nvme_dev'] + if is_json: return json.dumps(data, indent=2) else: @@ -1775,39 +1832,8 @@ def replication_trigger(lvol_id): if snap.lvol.get_id() != lvol_id: continue - snaps.append(snap) tasks.append(task) - # duration = "" - # try: - # if task.status == JobSchedule.STATUS_RUNNING: - # duration = utils.strfdelta_seconds(int(time.time()) - task.function_params["start_time"]) - # elif "end_time" in task.function_params: - # duration = utils.strfdelta_seconds( - # task.function_params["end_time"] - task.function_params["start_time"]) - # except Exception as e: - # logger.error(e) - # status = task.status - # if task.canceled: - # status = "cancelled" - # replicate_to = "target" - # if "replicate_to_source" in task.function_params: - # if task.function_params["replicate_to_source"] is True: - # replicate_to = "source" - # offset = 0 - # if "offset" in task.function_params: - # offset = task.function_params["offset"] - # data.append({ - # "Task ID": task.uuid, - # "Snapshot ID": snap.uuid, - # "Size": utils.humanbytes(snap.used_size), - # "Duration": duration, - # "Offset": offset, - # "Status": status, - # "Replicate to": replicate_to, - # "Result": task.function_result, - # "Cluster ID": task.cluster_id, - # }) if tasks: tasks = sorted(tasks, key=lambda x: x.date) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index b30e3ee26..8a18e2f03 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -267,9 +267,10 @@ class VolumeDTO(BaseModel): max_r_mbytes: util.Unsigned max_w_mbytes: util.Unsigned capacity: CapacityStatDTO + rep_info: Optional[dict] = None @staticmethod - def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None): + def from_model(model: LVol, 
request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None, rep_info=None): return VolumeDTO( id=UUID(model.get_id()), name=model.lvol_name, @@ -315,4 +316,5 @@ def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optiona max_r_mbytes=model.r_mbytes_per_sec, max_w_mbytes=model.w_mbytes_per_sec, capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), + rep_info=rep_info ) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 052196411..de1e52228 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -136,7 +136,8 @@ def get(request: Request, cluster: Cluster, pool: StoragePool, volume: Volume) - ret = db.get_lvol_stats(volume, 1) if ret: stat_obj = ret[0] - return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj) + rep_info = lvol_controller.get_replication_info(volume.get_id()) + return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj, rep_info) class UpdatableLVolParams(BaseModel): From 73f71e05fd6cd8d525fc8f3a784fe12565b1d20b Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Mon, 9 Feb 2026 10:47:50 +0100 Subject: [PATCH 151/192] removed csi hostpath controller and node (#870) --- .../templates/csi-hostpath-controller.yaml | 217 ------------------ .../templates/csi-hostpath-driverinfo.yaml | 2 +- .../charts/templates/csi-hostpath-node.yaml | 163 ------------- 3 files changed, 1 insertion(+), 381 deletions(-) delete mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml delete mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml deleted file mode 100644 index 153c29bda..000000000 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml +++ /dev/null @@ -1,217 +0,0 @@ -apiVersion: v1 
-kind: ServiceAccount -metadata: - name: csi-hostpathplugin-sa - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: csi-hostpathplugin -rules: - - apiGroups: [""] - resources: ["persistentvolumes"] - verbs: ["get", "list", "watch", "create", "delete", "update", "patch"] - - apiGroups: [""] - resources: ["persistentvolumeclaims"] - verbs: ["get", "list", "watch", "update"] - - apiGroups: [""] - resources: ["persistentvolumeclaims/status"] - verbs: ["get", "update", "patch"] - - apiGroups: ["storage.k8s.io"] - resources: ["volumeattachments"] - verbs: ["get", "list", "watch", "update"] - - apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["csinodes"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["storageclasses"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["csistoragecapacities"] - verbs: ["get", "list", "watch", "create", "update", "delete"] - - apiGroups: [""] - resources: ["events"] - verbs: ["create", "patch", "update", "get", "list", "watch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: csi-hostpathplugin -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: csi-hostpathplugin -subjects: - - kind: ServiceAccount - name: csi-hostpathplugin-sa - namespace: {{ .Release.Namespace }} ---- -kind: StatefulSet -apiVersion: apps/v1 -metadata: - name: csi-hostpathplugin - labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin -spec: - serviceName: "csi-hostpathplugin" - # One replica only: - # Host path driver only works when everything runs - # on a single node. 
- replicas: 1 - selector: - matchLabels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - template: - metadata: - labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - spec: - serviceAccountName: csi-hostpathplugin-sa - containers: - - name: hostpath - image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0 - args: - - "--drivername=hostpath.csi.k8s.io" - - "--v=5" - - "--endpoint=$(CSI_ENDPOINT)" - - "--nodeid=$(KUBE_NODE_NAME)" - # end hostpath args - env: - - name: CSI_ENDPOINT - value: unix:///csi/csi.sock - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - securityContext: - privileged: true - ports: - - containerPort: 9898 - name: healthz - protocol: TCP - livenessProbe: - failureThreshold: 5 - httpGet: - path: /healthz - port: healthz - initialDelaySeconds: 10 - timeoutSeconds: 3 - periodSeconds: 2 - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /var/lib/kubelet/pods - mountPropagation: Bidirectional - name: mountpoint-dir - - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - name: plugins-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - mountPath: /dev - name: dev-dir - - - name: liveness-probe - volumeMounts: - - mountPath: /csi - name: socket-dir - image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0 - args: - - --csi-address=/csi/csi.sock - - --health-port=9898 - - - name: csi-provisioner - image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0 - args: - - -v=5 - - --csi-address=/csi/csi.sock - - --feature-gates=Topology=true - - --enable-capacity - - --capacity-ownerref-level=0 # pod is owner - - --node-deployment=true - - --strict-topology=true - - --immediate-topology=false - - 
--worker-threads=5 - env: - - name: NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - # end csi-provisioner args - securityContext: - # This is necessary only for systems with SELinux, where - # non-privileged sidecar containers cannot access unix domain socket - # created by privileged CSI driver container. - privileged: true - volumeMounts: - - mountPath: /csi - name: socket-dir - - - name: csi-resizer - image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0 - args: - - -v=5 - - -csi-address=/csi/csi.sock - securityContext: - # This is necessary only for systems with SELinux, where - # non-privileged sidecar containers cannot access unix domain socket - # created by privileged CSI driver container. - privileged: true - volumeMounts: - - mountPath: /csi - name: socket-dir - - volumes: - - hostPath: - path: /var/lib/kubelet/plugins/csi-hostpath - type: DirectoryOrCreate - name: socket-dir - - hostPath: - path: /var/lib/kubelet/pods - type: DirectoryOrCreate - name: mountpoint-dir - - hostPath: - path: /var/lib/kubelet/plugins_registry - type: Directory - name: registration-dir - - hostPath: - path: /var/lib/kubelet/plugins - type: Directory - name: plugins-dir - - hostPath: - # 'path' is where PV data is persisted on host. 
- # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot - path: /var/lib/csi-hostpath-data/ - type: DirectoryOrCreate - name: csi-data-dir - - hostPath: - path: /dev - type: Directory - name: dev-dir - # end csi volumes diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml index 2f6a64b14..f735a3b34 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -17,7 +17,7 @@ spec: podInfoOnMount: true # No attacher needed. attachRequired: false - storageCapacity: true + storageCapacity: false # Kubernetes may use fsGroup to change permissions and ownership # of the volume to match user requested fsGroup in the pod's SecurityPolicy fsGroupPolicy: File diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml deleted file mode 100644 index 07e08f36e..000000000 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml +++ /dev/null @@ -1,163 +0,0 @@ - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: csi-hostpath-node-sa - namespace: {{ .Release.Namespace }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: csi-hostpath-node -rules: - - apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list", "watch"] - - apiGroups: ["storage.k8s.io"] - resources: ["volumeattachments"] - verbs: ["get", "list", "watch", "update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: csi-hostpath-node -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: csi-hostpath-node -subjects: - - kind: ServiceAccount - name: csi-hostpath-node-sa - namespace: {{ .Release.Namespace }} ---- -kind: DaemonSet -apiVersion: apps/v1 
-metadata: - name: csi-hostpathplugin - labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin -spec: - selector: - matchLabels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - template: - metadata: - labels: - app.kubernetes.io/instance: hostpath.csi.k8s.io - app.kubernetes.io/part-of: csi-driver-host-path - app.kubernetes.io/name: csi-hostpathplugin - app.kubernetes.io/component: plugin - spec: - serviceAccountName: csi-hostpath-node-sa - containers: - - name: node-driver-registrar - image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0 - args: - - --v=5 - - --csi-address=/csi/csi.sock - - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock - securityContext: - # This is necessary only for systems with SELinux, where - # non-privileged sidecar containers cannot access unix domain socket - # created by privileged CSI driver container. 
- privileged: true - env: - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /registration - name: registration-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - - name: hostpath - image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0 - args: - - --drivername=hostpath.csi.k8s.io - - --v=5 - - --endpoint=$(CSI_ENDPOINT) - - --nodeid=$(KUBE_NODE_NAME) - - --capacity=slow=10Gi - - --capacity=fast=100Gi - env: - - name: CSI_ENDPOINT - value: unix:///csi/csi.sock - - name: KUBE_NODE_NAME - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: spec.nodeName - securityContext: - privileged: true - ports: - - containerPort: 9898 - name: healthz - protocol: TCP - livenessProbe: - failureThreshold: 5 - httpGet: - path: /healthz - port: healthz - initialDelaySeconds: 10 - timeoutSeconds: 3 - periodSeconds: 2 - volumeMounts: - - mountPath: /csi - name: socket-dir - - mountPath: /var/lib/kubelet/pods - mountPropagation: Bidirectional - name: mountpoint-dir - - mountPath: /var/lib/kubelet/plugins - mountPropagation: Bidirectional - name: plugins-dir - - mountPath: /csi-data-dir - name: csi-data-dir - - mountPath: /dev - name: dev-dir - - name: liveness-probe - volumeMounts: - - mountPath: /csi - name: socket-dir - image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0 - args: - - --csi-address=/csi/csi.sock - - --health-port=9898 - - volumes: - - hostPath: - path: /var/lib/kubelet/plugins/csi-hostpath - type: DirectoryOrCreate - name: socket-dir - - hostPath: - path: /var/lib/kubelet/pods - type: DirectoryOrCreate - name: mountpoint-dir - - hostPath: - path: /var/lib/kubelet/plugins_registry - type: Directory - name: registration-dir - - hostPath: - path: /var/lib/kubelet/plugins - type: Directory - name: plugins-dir - - hostPath: - # 'path' is where PV data is persisted on host. 
- # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot - path: /var/lib/csi-hostpath-data/ - type: DirectoryOrCreate - name: csi-data-dir - - hostPath: - path: /dev - type: Directory - name: dev-dir From 3d4d935843ae54af85b1918c1a7a5c8c13311bc9 Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Mon, 9 Feb 2026 13:37:33 +0300 Subject: [PATCH 152/192] Add ptpl_file parameter to namespace in rpc_client --- simplyblock_core/rpc_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index d1f123b72..81989e2e9 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -324,6 +324,7 @@ def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None, if eui64: params['namespace']['eui64'] = eui64 + params['namespace']['ptpl_file'] = "/mnt/ns_resv"+hex(eui64)+".json" return self._request("nvmf_subsystem_add_ns", params) From 9fa83937df605c80312111ee85a7b7594df5d373 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 9 Feb 2026 15:51:28 +0300 Subject: [PATCH 153/192] fix typo --- simplyblock_core/controllers/lvol_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 1a4c35cc4..df294ec32 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1294,7 +1294,7 @@ def get_replication_info(lvol_id_or_name): if tasks: tasks = sorted(tasks, key=lambda x: x.date) - snaps = sorted(snaps, key=lambda x: x.creation_dt) + snaps = sorted(snaps, key=lambda x: x.created_at) out["snaps"] = snaps out["tasks"] = tasks out["replicated_count"] = len(snaps) From 3bffc55cf4498856c0ef30287af6c986b54074e0 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 9 Feb 2026 17:01:40 +0300 Subject: 
[PATCH 154/192] fix rep status return output --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index df294ec32..6486f4e54 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1295,8 +1295,8 @@ def get_replication_info(lvol_id_or_name): if tasks: tasks = sorted(tasks, key=lambda x: x.date) snaps = sorted(snaps, key=lambda x: x.created_at) - out["snaps"] = snaps - out["tasks"] = tasks + out["snaps"] = [s.to_dict() for s in snaps] + out["tasks"] = [t.to_dict() for t in tasks] out["replicated_count"] = len(snaps) last_task = tasks[-1] last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) From 16c4f6e735bef613d88335a45de8da148d66b6e6 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 9 Feb 2026 22:11:51 +0300 Subject: [PATCH 155/192] fix: handle missing replicate_as_snap_instance parameter --- simplyblock_core/services/snapshot_replication.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/services/snapshot_replication.py b/simplyblock_core/services/snapshot_replication.py index 47f1c8c76..2549b8546 100644 --- a/simplyblock_core/services/snapshot_replication.py +++ b/simplyblock_core/services/snapshot_replication.py @@ -119,7 +119,10 @@ def process_snap_replicate_finish(task, snapshot): snode.rpc_client().bdev_nvme_detach_controller(remote_lv.top_bdev) remote_snode = db.get_storage_node_by_id(remote_lv.node_id) replicate_to_source = task.function_params["replicate_to_source"] - replicate_as_snap_instance = task.function_params["replicate_as_snap_instance"] + if "replicate_as_snap_instance" in task.function_params: + replicate_as_snap_instance = task.function_params["replicate_as_snap_instance"] + else: + replicate_as_snap_instance = False target_prev_snap = None if 
replicate_to_source: org_snap = db.get_snapshot_by_id(snapshot.snap_ref_id) From 03da3e2f5199de713f9d44f3338d0d357a9bdd87 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 10 Feb 2026 16:11:01 +0300 Subject: [PATCH 156/192] fix: use unique UUID for snapshot replication identifier --- simplyblock_core/controllers/lvol_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 6486f4e54..2cbaa37a4 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1809,7 +1809,7 @@ def replication_trigger(lvol_id): db_controller = DBController() lvol = db_controller.get_lvol_by_id(lvol_id) node = db_controller.get_storage_node_by_id(lvol.node_id) - snapshot_controller.add(lvol_id, f"replication_{lvol_id}") + snapshot_controller.add(lvol_id, f"replication_{uuid.uuid4()}") tasks = [] snaps = [] From a3ce25f4546e126105eb30270c040154e08aabb0 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 10 Feb 2026 16:16:56 +0300 Subject: [PATCH 157/192] fix: improve replication duration calculation logic --- simplyblock_core/controllers/lvol_controller.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 2cbaa37a4..52f27c31f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1845,11 +1845,13 @@ def replication_trigger(lvol_id): last_snap = db_controller.get_snapshot_by_id(last_task.function_params["snapshot_id"]) out["last_snapshot_id"] = last_snap.get_id() out["last_replication_time"] = last_task.updated_at - if "end_time" in last_task.function_params: - duration = utils.strfdelta_seconds( - last_task.function_params["end_time"] - last_task.function_params["start_time"]) - else: - duration = 
utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) + duration = 0 + if "start_time" in last_task.function_params: + if "end_time" in last_task.function_params: + duration = utils.strfdelta_seconds( + last_task.function_params["end_time"] - last_task.function_params["start_time"]) + else: + duration = utils.strfdelta_seconds(int(time.time()) - last_task.function_params["start_time"]) out["last_replication_duration"] = duration return out From 80e7e4309b5f75b313226518a4ee16061b52b59c Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:28:43 +0300 Subject: [PATCH 158/192] Update lvol_controller.py (#871) --- simplyblock_core/controllers/lvol_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 64b985994..26140dcca 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -714,7 +714,7 @@ def add_lvol_on_node(lvol, snode, is_primary=True): return False, f"Failed to create listener for {lvol.get_id()}" logger.info("Add BDev to subsystem") - ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id) + ret = rpc_client.nvmf_subsystem_add_ns(lvol.nqn, lvol.top_bdev, lvol.uuid, lvol.guid, lvol.ns_id, f"{lvol.vuid:016X}") if not ret: return False, "Failed to add bdev to subsystem" lvol.ns_id = int(ret) From 0b804b6c6193022250f04096480d4dbe9dcbd559 Mon Sep 17 00:00:00 2001 From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:37:22 +0300 Subject: [PATCH 159/192] Fix ptpl_file path construction for eui64 (#872) --- simplyblock_core/rpc_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 81989e2e9..eba4a30f3 100644 
--- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -324,7 +324,7 @@ def nvmf_subsystem_add_ns(self, nqn, dev_name, uuid=None, nguid=None, nsid=None, if eui64: params['namespace']['eui64'] = eui64 - params['namespace']['ptpl_file'] = "/mnt/ns_resv"+hex(eui64)+".json" + params['namespace']['ptpl_file'] = "/mnt/ns_resv"+eui64+".json" return self._request("nvmf_subsystem_add_ns", params) From bdca1961ea7f42d92aeb735764d7c432c161f494 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 11 Feb 2026 16:14:06 +0100 Subject: [PATCH 160/192] reverted api v2 field to id from uuid (#873) --- simplyblock_web/api/v2/dtos.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index b30e3ee26..552220bba 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -40,7 +40,7 @@ def from_model(model: StatsObject): class ClusterDTO(BaseModel): - uuid: UUID + id: UUID name: Optional[str] nqn: str status: Literal['active', 'read_only', 'inactive', 'suspended', 'degraded', 'unready', 'in_activation', 'in_expansion'] @@ -61,7 +61,7 @@ class ClusterDTO(BaseModel): @staticmethod def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None): return ClusterDTO( - uuid=UUID(model.get_id()), + id=UUID(model.get_id()), name=model.cluster_name, nqn=model.nqn, status=model.status, # type: ignore @@ -181,7 +181,7 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id class StorageNodeDTO(BaseModel): - uuid: UUID + id: UUID status: str hostname: str cpu: int @@ -198,7 +198,7 @@ class StorageNodeDTO(BaseModel): @staticmethod def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None): return StorageNodeDTO( - uuid=UUID(model.get_id()), + id=UUID(model.get_id()), status=model.status, hostname=model.hostname, cpu=model.cpu, From ee06ee3e64d15b69822f0fbd781d139e34188d79 Mon Sep 17 00:00:00 2001 From: hamdykhader 
Date: Fri, 13 Feb 2026 12:53:45 +0300 Subject: [PATCH 161/192] feat: add replicate_lvol_on_target_cluster function and API endpoint --- .../controllers/lvol_controller.py | 65 ++++++++++++++++++- simplyblock_core/controllers/lvol_events.py | 4 ++ simplyblock_web/api/v2/volume.py | 5 ++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 2e191cc39..df09e88a4 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1,4 +1,5 @@ # coding=utf-8 +import copy import logging as lg import json import math @@ -1920,4 +1921,66 @@ def replication_stop(lvol_id, delete=False): if snap.lvol.uuid == lvol.uuid: tasks_controller.cancel_task(task.uuid) - return True \ No newline at end of file + return True + + +def replicate_lvol_on_target_cluster(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + if not lvol.replication_node_id: + logger.error(f"LVol: {lvol_id} replication node id not found") + return False + + target_node = db_controller.get_storage_node_by_id(lvol.replication_node_id) + if not target_node: + logger.error(f"Node not found: {lvol.replication_node_id}") + return False + + if target_node.status != StorageNode.STATUS_ONLINE: + logger.error(f"Node is not online!: {target_node}, status: {target_node.status}") + return False + + source_node = db_controller.get_storage_node_by_id(lvol.node_id) + source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + # create lvol on target node + new_lvol = copy.deepcopy(lvol) + new_lvol.uuid = str(uuid.uuid4()) + new_lvol.create_dt = str(datetime.now()) + new_lvol.node_id = target_node.get_id() + new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] + new_lvol.replication_node_id = "" + new_lvol.do_replicate = False + 
new_lvol.cloned_from_snap = "" + new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool + + new_lvol.write_to_db(db_controller.kv_store) + + lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) + if error: + logger.error(error) + lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.lvol_uuid = lvol_bdev['uuid'] + new_lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid'] + + secondary_node = db_controller.get_storage_node_by_id(target_node.secondary_node_id) + if secondary_node.status == StorageNode.STATUS_ONLINE: + lvol_bdev, error = add_lvol_on_node(new_lvol, secondary_node, is_primary=False) + if error: + logger.error(error) + # remove lvol from primary + ret = delete_lvol_from_node(new_lvol, target_node) + if not ret: + logger.error("") + lvol.remove(db_controller.kv_store) + return False, error + + lvol_events.lvol_replicated(lvol, new_lvol) + + return new_lvol.lvol_uuid diff --git a/simplyblock_core/controllers/lvol_events.py b/simplyblock_core/controllers/lvol_events.py index 91b91027b..c4f2abde8 100644 --- a/simplyblock_core/controllers/lvol_events.py +++ b/simplyblock_core/controllers/lvol_events.py @@ -117,3 +117,7 @@ def lvol_health_check_change(lvol, new_state, old_status, caused_by=ec.CAUSED_BY def lvol_io_error_change(lvol, new_state, old_status, caused_by=ec.CAUSED_BY_CLI): _lvol_event(lvol, f"LVol IO Error changed from: {old_status} to: {new_state}", caused_by, ec.EVENT_STATUS_CHANGE) + +def lvol_replicated(lvol, new_lvol, caused_by=ec.CAUSED_BY_CLI): + _lvol_event(lvol, f"LVol Replicated, {lvol.get_id()}, new lvol: {new_lvol.get_id()}", caused_by, ec.EVENT_STATUS_CHANGE) + diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index de1e52228..9ac6ef087 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -260,3 +260,8 @@ def create_snapshot( cluster_id=cluster.get_id(), pool_id=pool.get_id(), snapshot_id=snapshot_id, ) return 
Response(status_code=201, headers={'Location': entity_url}) + + +@instance_api.get('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') +def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): + return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) From f85b354fe9ad87086d941231bc217519008cb04f Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Feb 2026 15:46:00 +0300 Subject: [PATCH 162/192] fix: change replicate_lvol endpoint from GET to POST --- simplyblock_web/api/v2/volume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 9ac6ef087..fcce15aa6 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -262,6 +262,6 @@ def create_snapshot( return Response(status_code=201, headers={'Location': entity_url}) -@instance_api.get('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') +@instance_api.post('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) From a3b4b0ee7b326bb990d274736165730ec0764b37 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Feb 2026 16:25:26 +0300 Subject: [PATCH 163/192] fix: set lvs_name for bdev_lvol in replication process --- simplyblock_core/controllers/lvol_controller.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index df09e88a4..bc3f01f57 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1958,6 +1958,12 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.cloned_from_snap = "" new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool + 
lvol.lvs_name = target_node.lvstore + for stack in lvol.bdev_stack: + if stack["type"] == "bdev_lvol": + stack["params"]["lvs_name"] = new_lvol.lvs_name + break + new_lvol.write_to_db(db_controller.kv_store) lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) From 3e070702742d4c219fce1d92eaab5682eb99795d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Feb 2026 17:11:21 +0300 Subject: [PATCH 164/192] wip --- .../controllers/lvol_controller.py | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index bc3f01f57..f75c9e1bc 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1957,12 +1957,30 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.do_replicate = False new_lvol.cloned_from_snap = "" new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool + new_lvol.lvs_name = target_node.lvstore - lvol.lvs_name = target_node.lvstore - for stack in lvol.bdev_stack: - if stack["type"] == "bdev_lvol": - stack["params"]["lvs_name"] = new_lvol.lvs_name - break + new_lvol.bdev_stack = [ + { + "type": "bdev_lvol_clone", + "name": lvol.top_bdev, + "params": { + "snapshot_name": lvol.snapshot_name, + "clone_name": lvol.lvol_bdev + } + } + ] + + if new_lvol.crypto_bdev: + new_lvol.bdev_stack.append({ + "type": "crypto", + "name": lvol.crypto_bdev, + "params": { + "name": lvol.crypto_bdev, + "base_name": lvol.top_bdev, + "key1": lvol.crypto_key1, + "key2": lvol.crypto_key2, + } + }) new_lvol.write_to_db(db_controller.kv_store) From 2f1f86f2d0b7204dbf624d9ff40dd81a4e5a567b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Feb 2026 16:41:50 +0300 Subject: [PATCH 165/192] adds lvol clone stack --- .../controllers/lvol_controller.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git 
a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index f75c9e1bc..d53e7963f 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1947,6 +1947,31 @@ def replicate_lvol_on_target_cluster(lvol_id): source_node = db_controller.get_storage_node_by_id(lvol.node_id) source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + + snaps = [] + snapshot_name = None + for task in db_controller.get_job_tasks(source_node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + snaps.append(snap) + + if snaps: + snaps = sorted(snaps, key=lambda x: x.creation_dt) + last_snapshot = snaps[-1] + rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) + snapshot_name = rep_snap.snap_bdev + + if not snapshot_name: + logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") + return False + # create lvol on target node new_lvol = copy.deepcopy(lvol) new_lvol.uuid = str(uuid.uuid4()) @@ -1964,7 +1989,7 @@ def replicate_lvol_on_target_cluster(lvol_id): "type": "bdev_lvol_clone", "name": lvol.top_bdev, "params": { - "snapshot_name": lvol.snapshot_name, + "snapshot_name": snapshot_name, "clone_name": lvol.lvol_bdev } } From c4f4f444bd5f1332e6e94da6932d3ce73d676de2 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Feb 2026 18:49:48 +0300 Subject: [PATCH 166/192] fix: update sorting key for snapshots from creation_dt to created_at --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index d53e7963f..85eb8f3de 100644 --- 
a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1838,7 +1838,7 @@ def replication_trigger(lvol_id): if tasks: tasks = sorted(tasks, key=lambda x: x.date) - snaps = sorted(snaps, key=lambda x: x.creation_dt) + snaps = sorted(snaps, key=lambda x: x.created_at) out["snaps"] = snaps out["tasks"] = tasks out["replicated_count"] = len(snaps) @@ -1963,7 +1963,7 @@ def replicate_lvol_on_target_cluster(lvol_id): snaps.append(snap) if snaps: - snaps = sorted(snaps, key=lambda x: x.creation_dt) + snaps = sorted(snaps, key=lambda x: x.created_at) last_snapshot = snaps[-1] rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) snapshot_name = rep_snap.snap_bdev From 2374c6288b69915dd9062491caa117cd9aea82b3 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Feb 2026 19:44:28 +0300 Subject: [PATCH 167/192] feat: add configuration for MCP and implement device status reset functionality --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 85eb8f3de..4aa8e3fc1 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1983,6 +1983,7 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.cloned_from_snap = "" new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool new_lvol.lvs_name = target_node.lvstore + new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.bdev_stack = [ { From 60371c98629c74f0be63662a134a6f14944e8660 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 17 Feb 2026 16:21:48 +0300 Subject: [PATCH 168/192] set snapshot name when creating lvol no target cluster --- simplyblock_core/controllers/lvol_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/lvol_controller.py 
b/simplyblock_core/controllers/lvol_controller.py index 4aa8e3fc1..8b0cb84a5 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1984,6 +1984,7 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool new_lvol.lvs_name = target_node.lvstore new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" + new_lvol.snapshot_name = snapshot_name new_lvol.bdev_stack = [ { From 777c9804d65663fc12a28d3103a30ad3ef3d768c Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 17 Feb 2026 18:28:50 +0300 Subject: [PATCH 169/192] return lvol on target if exists fix new lvol health check --- simplyblock_core/controllers/lvol_controller.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 8b0cb84a5..cf5cb9e70 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1948,6 +1948,11 @@ def replicate_lvol_on_target_cluster(lvol_id): source_node = db_controller.get_storage_node_by_id(lvol.node_id) source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): + if lv.nqn == lvol.nqn: + logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + return lv.get_id() + snaps = [] snapshot_name = None for task in db_controller.get_job_tasks(source_node.cluster_id): @@ -1980,11 +1985,12 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] new_lvol.replication_node_id = "" new_lvol.do_replicate = False - new_lvol.cloned_from_snap = "" + new_lvol.cloned_from_snap = snapshot_name new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool new_lvol.lvs_name = 
target_node.lvstore new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.snapshot_name = snapshot_name + new_lvol.status = LVol.STATUS_IN_CREATION new_lvol.bdev_stack = [ { @@ -2014,7 +2020,7 @@ def replicate_lvol_on_target_cluster(lvol_id): lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) if error: logger.error(error) - lvol.remove(db_controller.kv_store) + new_lvol.remove(db_controller.kv_store) return False, error new_lvol.lvol_uuid = lvol_bdev['uuid'] @@ -2029,9 +2035,11 @@ def replicate_lvol_on_target_cluster(lvol_id): ret = delete_lvol_from_node(new_lvol, target_node) if not ret: logger.error("") - lvol.remove(db_controller.kv_store) + new_lvol.remove(db_controller.kv_store) return False, error + new_lvol.status = LVol.STATUS_ONLINE + new_lvol.write_to_db(db_controller.kv_store) lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid From 2357bb938888bf9945efe7b9858327be68027c9b Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 17 Feb 2026 19:01:52 +0300 Subject: [PATCH 170/192] fix lvol list --- simplyblock_core/controllers/lvol_controller.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index cf5cb9e70..59c58fdb2 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -1954,7 +1954,7 @@ def replicate_lvol_on_target_cluster(lvol_id): return lv.get_id() snaps = [] - snapshot_name = None + snapshot = None for task in db_controller.get_job_tasks(source_node.cluster_id): if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: logger.debug(task) @@ -1971,9 +1971,9 @@ def replicate_lvol_on_target_cluster(lvol_id): snaps = sorted(snaps, key=lambda x: x.created_at) last_snapshot = snaps[-1] rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) - snapshot_name = rep_snap.snap_bdev + snapshot 
= rep_snap - if not snapshot_name: + if not snapshot: logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") return False @@ -1985,11 +1985,11 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] new_lvol.replication_node_id = "" new_lvol.do_replicate = False - new_lvol.cloned_from_snap = snapshot_name + new_lvol.cloned_from_snap = snapshot.get_id() new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool new_lvol.lvs_name = target_node.lvstore new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" - new_lvol.snapshot_name = snapshot_name + new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.status = LVol.STATUS_IN_CREATION new_lvol.bdev_stack = [ @@ -1997,7 +1997,7 @@ def replicate_lvol_on_target_cluster(lvol_id): "type": "bdev_lvol_clone", "name": lvol.top_bdev, "params": { - "snapshot_name": snapshot_name, + "snapshot_name": snapshot.snap_bdev, "clone_name": lvol.lvol_bdev } } From edb9a9fd53a2d56cb728241ad67d1b7832a9ded4 Mon Sep 17 00:00:00 2001 From: Geoffrey Israel Date: Wed, 18 Feb 2026 12:35:25 +0100 Subject: [PATCH 171/192] updated _ReplicationParams field (#847) * updated _ReplicationParams field * pool list return uuid intead of id * lvol list return uuid intead of id * lvol list return do_replicate * added service snapshot-replication * don't fails upon cr patch failure * added imagepullpolicy * removed csi configmap and secret from spdk-pod * update crs name * updated csi hostpath configuration * updated csi hostpath configuration * updated rpc_client logger message * updated env_var file * fixed snap param name created_at * updated snapshotreplications crd * reverted api v2 field to id from uuid * updated env_var --- simplyblock_core/rpc_client.py | 2 +- simplyblock_core/scripts/charts/Chart.yaml | 1 - ...ck.io_simplyblocksnapshotreplications.yaml | 133 ++++++++++++++++++ .../scripts/charts/templates/app_k8s.yaml | 32 +++++ 
.../templates/csi-hostpath-driverinfo.yaml | 2 +- .../charts/templates/csi-hostpath-plugin.yaml | 1 + .../charts/templates/simplyblock-manager.yaml | 5 +- simplyblock_core/scripts/charts/values.yaml | 28 ++-- simplyblock_core/utils/__init__.py | 6 +- simplyblock_web/api/v2/cluster.py | 6 +- simplyblock_web/api/v2/dtos.py | 2 + .../templates/storage_deploy_spdk.yaml.j2 | 10 -- 12 files changed, 194 insertions(+), 34 deletions(-) create mode 100644 simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 40f25a578..abfd5a216 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -109,7 +109,7 @@ def _request2(self, method, params=None): if params: payload['params'] = params try: - logger.debug("From: %s, Requesting from: %s, method: %s, params: %s",self.ip_address, method, params) + logger.debug("From: %s, Requesting method: %s, params: %s", self.ip_address, method, params) response = self.session.post(self.url, data=json.dumps(payload), timeout=self.timeout) except Exception: raise RPCException("connection error") diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml index 2790e31df..671f39cfa 100644 --- a/simplyblock_core/scripts/charts/Chart.yaml +++ b/simplyblock_core/scripts/charts/Chart.yaml @@ -25,7 +25,6 @@ dependencies: - name: prometheus version: "25.18.0" repository: "https://prometheus-community.github.io/helm-charts" - condition: monitoring.enabled - name: ingress-nginx version: 4.10.1 repository: "https://kubernetes.github.io/ingress-nginx" diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml new file mode 100644 index 000000000..8edf42432 --- /dev/null +++ 
b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml @@ -0,0 +1,133 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: simplyblocksnapshotreplications.simplyblock.simplyblock.io +spec: + group: simplyblock.simplyblock.io + names: + kind: SimplyBlockSnapshotReplication + listKind: SimplyBlockSnapshotReplicationList + plural: simplyblocksnapshotreplications + singular: simplyblocksnapshotreplication + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SimplyBlockSnapshotReplication is the Schema for the simplyblocksnapshotreplications + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of SimplyBlockSnapshotReplication + properties: + interval: + description: 'snapshot replication interval in seconds (default: 60sec)' + format: int32 + type: integer + sourceCluster: + description: Source cluster for the snapshots + type: string + targetCluster: + description: Target cluster for replication + type: string + targetPool: + description: Target cluster pool for replication + type: string + timeout: + description: snapshot replication timeout + format: int32 + type: integer + volumeIDs: + description: 'Optional: list of volumes to replicate. Empty means + all volumes' + items: + type: string + type: array + required: + - sourceCluster + - targetCluster + - targetPool + type: object + status: + description: status defines the observed state of SimplyBlockSnapshotReplication + properties: + configured: + type: boolean + volumes: + description: Per-volume replication status + items: + description: VolumeReplicationStatus tracks the replication state + of an individual volume + properties: + errors: + description: 'Optional: list of errors encountered for this + volume' + items: + description: ReplicationError stores timestamped error messages + properties: + message: + type: string + timestamp: + format: date-time + type: string + required: + - message + - timestamp + type: object + type: array + lastReplicationTime: + description: Timestamp of the last successful replication for + this volume + format: date-time + type: string + lastSnapshotID: + description: Last snapshot ID replicated for this volume + type: string + phase: + description: Current phase for this volume + enum: + - Pending + - Running + - Completed + - Failed + - Paused + type: string + replicatedCount: + description: Number of snapshots successfully replicated + format: int32 + type: 
integer + volumeID: + description: Volume ID + type: string + required: + - volumeID + type: object + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index 148b450e7..82f1d4f2c 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -195,6 +195,7 @@ spec: - name: storage-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/storage_node_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -207,6 +208,7 @@ spec: - name: mgmt-node-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/mgmt_node_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" env: - name: BACKEND_TYPE value: "k8s" @@ -221,6 +223,7 @@ spec: - name: lvol-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/lvol_stat_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -233,6 +236,7 @@ spec: - name: main-distr-event-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/main_distr_event_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -245,6 +249,7 @@ spec: - name: capacity-and-stats-collector image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/capacity_and_stats_collector.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -257,6 +262,7 @@ spec: - name: capacity-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/cap_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -269,6 +275,7 @@ spec: - name: health-check image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/health_check_service.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -281,6 +288,7 @@ spec: - name: device-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/device_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -293,6 +301,7 @@ spec: - name: lvol-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/lvol_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -305,6 +314,7 @@ spec: - name: snapshot-monitor image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/snapshot_monitor.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -368,6 +378,7 @@ spec: - name: tasks-node-add-runner image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_node_add.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" env: - name: LVOL_NVMF_PORT_START value: "{{ .Values.ports.lvolNvmfPortStart }}" @@ -382,6 +393,7 @@ spec: - name: tasks-runner-restart image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_restart.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -394,6 +406,7 @@ spec: - name: tasks-runner-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -406,6 +419,7 @@ spec: - name: tasks-runner-failed-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_failed_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -418,6 +432,7 @@ spec: - name: tasks-runner-cluster-status image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_cluster_status.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -430,6 +445,7 @@ spec: - name: tasks-runner-new-device-migration image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_new_dev_migration.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -442,6 +458,7 @@ spec: - name: tasks-runner-port-allow image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_port_allow.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -454,6 +471,7 @@ spec: - name: tasks-runner-jc-comp-resume image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_jc_comp.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} @@ -466,6 +484,20 @@ spec: - name: tasks-runner-sync-lvol-del image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" +{{- with (include "simplyblock.commonContainer" . 
| fromYaml) }} + env: +{{ toYaml .env | nindent 12 }} + volumeMounts: +{{ toYaml .volumeMounts | nindent 12 }} + resources: +{{ toYaml .resources | nindent 12 }} +{{- end }} + + - name: tasks-runner-snapshot-replication + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + command: ["python", "simplyblock_core/services/snapshot_replication.py"] + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" {{- with (include "simplyblock.commonContainer" . | fromYaml) }} env: {{ toYaml .env | nindent 12 }} diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml index f735a3b34..2a9d7d044 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml @@ -18,7 +18,7 @@ spec: # No attacher needed. attachRequired: false storageCapacity: false - # Kubernetes may use fsGroup to change permissions and ownership + # Kubernetes may use fsGroup to change permissions and ownership # of the volume to match user requested fsGroup in the pod's SecurityPolicy fsGroupPolicy: File \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml index 721815fa5..aa645bff4 100644 --- a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml +++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml @@ -229,3 +229,4 @@ spec: path: /dev type: Directory name: dev-dir + \ No newline at end of file diff --git a/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml index 257be5ae1..cca5e522d 100644 --- a/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml +++ b/simplyblock_core/scripts/charts/templates/simplyblock-manager.yaml 
@@ -23,7 +23,7 @@ spec: fsGroup: 65532 serviceAccountName: simplyblock-manager containers: - - image: simplyblock/simplyblock-manager:main + - image: simplyblock/simplyblock-manager:snapshot_replication imagePullPolicy: Always name: manager env: @@ -145,6 +145,7 @@ rules: - simplyblockstoragenodes - simplyblockdevices - simplyblocktasks + - simplyblocksnapshotreplications verbs: - create - delete @@ -162,6 +163,7 @@ rules: - simplyblockstoragenodes/finalizers - simplyblockdevices/finalizers - simplyblocktasks/finalizers + - simplyblocksnapshotreplications/finalizers verbs: - update - delete @@ -174,6 +176,7 @@ rules: - simplyblockstoragenodes/status - simplyblockdevices/status - simplyblocktasks/status + - simplyblocksnapshotreplications/status verbs: - get - patch diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index f6837654b..919b20319 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -16,11 +16,11 @@ observability: image: simplyblock: repository: "public.ecr.aws/simply-block/simplyblock" - tag: "improved_user_experience" + tag: "snapshot_replication_operator_support" pullPolicy: "Always" ports: - lvolNvmfPortStart: + lvolNvmfPortStart: 9100 storageclass: allowedTopologyZones: [] @@ -58,7 +58,7 @@ opensearch: persistence: enabled: true storageClass: local-hostpath - size: 10Gi + size: 20Gi resources: requests: @@ -205,7 +205,7 @@ ingress: simplyblock: cluster: - clusterName: demo-cluster + clusterName: simplyblock-cluster mgmtIfc: eth0 fabric: tcp isSingleNode: false @@ -217,29 +217,29 @@ simplyblock: provCapCrit: 150 pool: - name: demo-pool + name: simplyblock-pool capacityLimit: 100Gi lvol: - name: demo-lvol + name: simplyblock-lvol storageNodes: - name: demo-node - clusterImage: public.ecr.aws/simply-block/simplyblock:improved_user_experience + name: simplyblock-node + clusterImage: 
public.ecr.aws/simply-block/simplyblock:snapshot_replication_operator_support mgmtIfc: eth0 maxLVol: 10 - maxSize: 150G + maxSize: 0 partitions: 0 corePercentage: 65 spdkDebug: false coreIsolation: false workerNodes: - - vm02.simplyblock3.localdomain - - vm03.simplyblock3.localdomain - - vm04.simplyblock3.localdomain + - israel-storage-node-1 + - israel-storage-node-2 + - israel-storage-node-3 devices: - name: demo-devices + name: simplyblock-devices tasks: - name: demo-task + name: simplyblock-task diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py index 649c1eb75..6e15fba9c 100644 --- a/simplyblock_core/utils/__init__.py +++ b/simplyblock_core/utils/__init__.py @@ -2145,7 +2145,7 @@ def patch_cr_status( body=body, ) except ApiException as e: - raise RuntimeError( + logger.error( f"Failed to patch status for {name}: {e.reason} {e.body}" ) @@ -2251,7 +2251,7 @@ def patch_cr_node_status( ) except ApiException as e: - raise RuntimeError( + logger.error( f"Failed to patch node for {name}: {e.reason} {e.body}" ) @@ -2358,7 +2358,7 @@ def patch_cr_lvol_status( ) except ApiException as e: - raise RuntimeError( + logger.error( f"Failed to patch lvol status for {name}: {e.reason} {e.body}" ) diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py index f7b87aa29..9447ff718 100644 --- a/simplyblock_web/api/v2/cluster.py +++ b/simplyblock_web/api/v2/cluster.py @@ -18,9 +18,9 @@ class _ReplicationParams(BaseModel): - snapshot_replication_target_cluster: Optional[str] - snapshot_replication_timeout: Optional[str] - target_pool: Optional[str] + snapshot_replication_target_cluster: str + snapshot_replication_timeout: int = 0 + target_pool: Optional[str] = None class _UpdateParams(BaseModel): management_image: Optional[str] diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 070a215ce..2832372cd 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -261,6 
+261,7 @@ class VolumeDTO(BaseModel): crypto_key: Optional[Tuple[str, str]] high_availability: bool lvol_priority_class: util.Unsigned + do_replicate: bool = False max_namespace_per_subsys: int max_rw_iops: util.Unsigned max_rw_mbytes: util.Unsigned @@ -310,6 +311,7 @@ def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optiona blobid=model.blobid, ns_id=model.ns_id, lvol_priority_class=model.lvol_priority_class, + do_replicate=model.do_replicate, max_namespace_per_subsys=model.max_namespace_per_subsys, max_rw_iops=model.rw_ios_per_sec, max_rw_mbytes=model.rw_mbytes_per_sec, diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 index 8e2e40f34..105ee1157 100644 --- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 +++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 @@ -98,16 +98,6 @@ spec: value: "{{ NSOCKET }}" - name: FW_PORT value: "{{ FW_PORT }}" - - name: SPDKCSI_SECRET - valueFrom: - secretKeyRef: - name: simplyblock-csi-secret - key: secret.json - - name: CLUSTER_CONFIG - valueFrom: - configMapKeyRef: - name: simplyblock-csi-cm - key: config.json lifecycle: postStart: exec: From 38a7d92afdf9dbb43497c86003c094f9fc4156f4 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 18 Feb 2026 14:58:40 +0300 Subject: [PATCH 172/192] return new lvol connection string on lvol connect if cluster is suspended and lvol is replicated --- simplyblock_core/controllers/lvol_controller.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 59c58fdb2..0bf680a67 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -14,6 +14,7 @@ from simplyblock_core.controllers import snapshot_controller, pool_controller, lvol_events, tasks_controller, \ snapshot_events from simplyblock_core.db_controller import 
DBController +from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.pool import Pool from simplyblock_core.models.lvol_model import LVol @@ -1345,6 +1346,16 @@ def connect_lvol(uuid, ctrl_loss_tmo=constants.LVOL_NVME_CONNECT_CTRL_LOSS_TMO): logger.error(e) return False + node = db_controller.get_storage_node_by_id(lvol.node_id) + cluster = db_controller.get_cluster_by_id(node.cluster_id) + if cluster.status == Cluster.STATUS_SUSPENDED and cluster.snapshot_replication_target_cluster: + logger.error("Cluster is suspended, looking for replicated lvol") + for lv in db_controller.get_lvols(cluster.snapshot_replication_target_cluster): + if lv.nqn == lvol.nqn: + logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + lvol = lv + break + out = [] nodes_ids = [] if lvol.ha_type == 'single': From e77ea4187964a8712636015aad57aa3969a3cfe9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 2 Mar 2026 16:31:05 +0300 Subject: [PATCH 173/192] feat: add endpoint to list replication tasks for a volume --- .../controllers/lvol_controller.py | 18 ++++++++++++++++++ simplyblock_web/api/v2/volume.py | 8 +++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 0bf680a67..bbb72342a 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2054,3 +2054,21 @@ def replicate_lvol_on_target_cluster(lvol_id): lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid + + +def list_replication_tasks(lvol_id): + db_controller = DBController() + lvol = db_controller.get_lvol_by_id(lvol_id) + node = db_controller.get_storage_node_by_id(lvol.node_id) + tasks = [] + for task in db_controller.get_job_tasks(node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + try: + 
snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + if snap.lvol.get_id() != lvol_id: + continue + tasks.append(task) + + return tasks diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index fcce15aa6..4797525af 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -11,7 +11,7 @@ from .cluster import Cluster from .pool import StoragePool -from .dtos import VolumeDTO, SnapshotDTO +from .dtos import VolumeDTO, SnapshotDTO, TaskDTO from . import util @@ -265,3 +265,9 @@ def create_snapshot( @instance_api.post('/replicate_lvol', name='clusters:storage-pools:volumes:replicate_lvol') def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) + + +@api.get('/', name='clusters:storage-pools:volumes:list_replication_tasks') +def list(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: + tasks = lvol_controller.list_replication_tasks(volume.get_id()) + return [TaskDTO.from_model(task) for task in tasks] From d2630eebd5abc71fbd783e56f27724f6516cdd3d Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 6 Mar 2026 12:27:39 +0100 Subject: [PATCH 174/192] updated endpoint and func list_replication_tasks --- simplyblock_core/scripts/charts/values.yaml | 4 ++-- simplyblock_web/api/v2/volume.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml index 919b20319..3c17f041e 100644 --- a/simplyblock_core/scripts/charts/values.yaml +++ b/simplyblock_core/scripts/charts/values.yaml @@ -16,7 +16,7 @@ observability: image: simplyblock: repository: "public.ecr.aws/simply-block/simplyblock" - tag: "snapshot_replication_operator_support" + tag: "main-sfam-2359" pullPolicy: "Always" ports: @@ -225,7 +225,7 @@ simplyblock: storageNodes: name: 
simplyblock-node - clusterImage: public.ecr.aws/simply-block/simplyblock:snapshot_replication_operator_support + clusterImage: public.ecr.aws/simply-block/simplyblock:main-sfam-2359 mgmtIfc: eth0 maxLVol: 10 maxSize: 0 diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 4797525af..e2053361d 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -267,7 +267,7 @@ def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) -@api.get('/', name='clusters:storage-pools:volumes:list_replication_tasks') -def list(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: +@api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') +def list_replication_tasks(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: tasks = lvol_controller.list_replication_tasks(volume.get_id()) return [TaskDTO.from_model(task) for task in tasks] From 79f156c4e0ee40ddf29eeb0fda9feaeda321212f Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Fri, 6 Mar 2026 12:55:07 +0100 Subject: [PATCH 175/192] update endpoint list_replication_tasks to use instance_api --- simplyblock_web/api/v2/volume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index e2053361d..2cba217ea 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -267,7 +267,7 @@ def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) -@api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') +@instance_api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') def list_replication_tasks(cluster: Cluster, pool: StoragePool, 
volume: Volume) -> List[TaskDTO]: tasks = lvol_controller.list_replication_tasks(volume.get_id()) return [TaskDTO.from_model(task) for task in tasks] From af8a43d1531d80ac5089c99a2979bdec2f673764 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Tue, 10 Mar 2026 15:04:04 +0100 Subject: [PATCH 176/192] updated snapshot replication crd --- ...ck.io_simplyblocksnapshotreplications.yaml | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml index 8edf42432..8eebd8370 100644 --- a/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml +++ b/simplyblock_core/scripts/charts/crds/simplyblock.simplyblock.io_simplyblocksnapshotreplications.yaml @@ -40,8 +40,24 @@ spec: spec: description: spec defines the desired state of SimplyBlockSnapshotReplication properties: + action: + enum: + - failback + type: string + excludeVolumeIDs: + description: 'Optional: volumes to exclude from failback.' + items: + type: string + type: array + includeVolumeIDs: + description: |- + Optional: only these volumes are included in failback. + If empty, all volumes are candidates unless excluded below. + items: + type: string + type: array interval: - description: 'snapshot replication interval in seconds (default: 60sec)' + description: 'snapshot replication interval in seconds (default: 300sec)' format: int32 type: integer sourceCluster: @@ -73,6 +89,11 @@ spec: properties: configured: type: boolean + observedFailbackGeneration: + description: The metadata.generation value for which failback was + last processed. 
+ format: int64 + type: integer volumes: description: Per-volume replication status items: From 90c133f0bf00364bbcf2a72d227ab7fe3faaa6dd Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 11 Mar 2026 15:52:41 +0300 Subject: [PATCH 177/192] feat: add suspend and resume commands for lvol subsystems --- simplyblock_cli/cli-reference.yaml | 18 ++++ simplyblock_cli/cli.py | 15 ++++ simplyblock_cli/clibase.py | 8 +- .../controllers/lvol_controller.py | 85 +++++++++++++++++-- simplyblock_web/api/v2/volume.py | 11 ++- 5 files changed, 129 insertions(+), 8 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 51e7d7293..d5d889a55 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1704,6 +1704,10 @@ commands: help: "Logical volume id" dest: lvol_id type: str + - name: "--replication-cluster-id" + help: "Cluster ID of the replication target cluster" + dest: replication_cluster_id + type: str - name: replication-stop help: "Stop snapshot replication taken from lvol" arguments: @@ -1725,6 +1729,20 @@ commands: help: "Logical volume id" dest: lvol_id type: str + - name: suspend + help: "Suspend lvol subsystems" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str + - name: resume + help: "Resume lvol subsystems" + arguments: + - name: "lvol_id" + help: "Logical volume id" + dest: lvol_id + type: str - name: "control-plane" help: "Control plane commands" aliases: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 045cc1153..1c5ed552c 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -558,6 +558,8 @@ def init_volume(self): self.init_volume__replication_stop(subparser) self.init_volume__replication_status(subparser) self.init_volume__replication_trigger(subparser) + self.init_volume__suspend(subparser) + self.init_volume__resume(subparser) def init_volume__add(self, subparser): @@ -668,6 +670,7 @@ def 
init_volume__inflate(self, subparser): def init_volume__replication_start(self, subparser): subcommand = self.add_sub_command(subparser, 'replication-start', 'Start snapshot replication taken from lvol') subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + argument = subcommand.add_argument('--replication-cluster-id', help='Cluster ID of the replication target cluster', type=str, dest='replication_cluster_id') def init_volume__replication_stop(self, subparser): subcommand = self.add_sub_command(subparser, 'replication-stop', 'Stop snapshot replication taken from lvol') @@ -681,6 +684,14 @@ def init_volume__replication_trigger(self, subparser): subcommand = self.add_sub_command(subparser, 'replication-trigger', 'Start replication for lvol') subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + def init_volume__suspend(self, subparser): + subcommand = self.add_sub_command(subparser, 'suspend', 'Suspend lvol subsystems') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + + def init_volume__resume(self, subparser): + subcommand = self.add_sub_command(subparser, 'resume', 'Resume lvol subsystems') + subcommand.add_argument('lvol_id', help='Logical volume id', type=str) + def init_control_plane(self): subparser = self.add_command('control-plane', 'Control plane commands', aliases=['cp','mgmt',]) @@ -1139,6 +1150,10 @@ def run(self): ret = self.volume__replication_status(sub_command, args) elif sub_command in ['replication-trigger']: ret = self.volume__replication_trigger(sub_command, args) + elif sub_command in ['suspend']: + ret = self.volume__suspend(sub_command, args) + elif sub_command in ['resume']: + ret = self.volume__resume(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index b4057e69e..2603bc574 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -591,7 +591,7 @@ def volume__inflate(self, sub_command, 
args): return lvol_controller.inflate_lvol(args.volume_id) def volume__replication_start(self, sub_command, args): - return lvol_controller.replication_start(args.lvol_id) + return lvol_controller.replication_start(args.lvol_id, args.replication_cluster_id) def volume__replication_stop(self, sub_command, args): return lvol_controller.replication_stop(args.lvol_id) @@ -602,6 +602,12 @@ def volume__replication_status(self, sub_command, args): def volume__replication_trigger(self, sub_command, args): return lvol_controller.replication_trigger(args.lvol_id) + def volume__suspend(self, sub_command, args): + return lvol_controller.suspend_lvol(args.lvol_id) + + def volume__resume(self, sub_command, args): + return lvol_controller.resume_lvol(args.lvol_id) + def control_plane__add(self, sub_command, args): cluster_id = args.cluster_id cluster_ip = args.cluster_ip diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index bbb72342a..53eebec09 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -253,7 +253,7 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= distr_vuid=0, max_rw_iops=0, max_rw_mbytes=0, max_r_mbytes=0, max_w_mbytes=0, with_snapshot=False, max_size=0, crypto_key1=None, crypto_key2=None, lvol_priority_class=0, uid=None, pvc_name=None, namespace=None, max_namespace_per_subsys=1, fabric="tcp", ndcs=0, npcs=0, - do_replicate=False): + do_replicate=False, replication_cluster_id=None): db_controller = DBController() logger.info(f"Adding LVol: {name}") @@ -474,7 +474,13 @@ def add_lvol_ha(name, size, host_id_or_name, ha_type, pool_id_or_name, use_comp= lvol.ndcs = cl.distr_ndcs lvol.do_replicate = bool(do_replicate) if lvol.do_replicate: - random_nodes = _get_next_3_nodes(cl.snapshot_replication_target_cluster, lvol.size) + if replication_cluster_id: + replication_cluster = 
db_controller.get_cluster_by_id(replication_cluster_id) + if not replication_cluster: + return False, f"Replication cluster not found: {replication_cluster_id}" + else: + replication_cluster_id = cl.snapshot_replication_target_cluster + random_nodes = _get_next_3_nodes(replication_cluster_id, lvol.size) lvol.replication_node_id = random_nodes[0].get_id() lvol_count = len(db_controller.get_lvols_by_node_id(host_node.get_id())) @@ -1868,7 +1874,7 @@ def replication_trigger(lvol_id): return out -def replication_start(lvol_id): +def replication_start(lvol_id, replication_cluster_id=None): db_controller = DBController() try: lvol = db_controller.get_lvol_by_id(lvol_id) @@ -1886,10 +1892,12 @@ def replication_start(lvol_id): excluded_nodes.append(org_snap.lvol.node_id) snode = db_controller.get_storage_node_by_id(lvol.node_id) cluster = db_controller.get_cluster_by_id(snode.cluster_id) - if not cluster.snapshot_replication_target_cluster: + if not replication_cluster_id: + replication_cluster_id = cluster.snapshot_replication_target_cluster + if not replication_cluster_id: logger.error(f"Cluster: {snode.cluster_id} not replicated") return False - random_nodes = _get_next_3_nodes(cluster.snapshot_replication_target_cluster, lvol.size) + random_nodes = _get_next_3_nodes(replication_cluster_id, lvol.size) for r_node in random_nodes: if r_node.get_id() not in excluded_nodes: logger.info(f"Replicating on node: {r_node.get_id()}") @@ -2072,3 +2080,70 @@ def list_replication_tasks(lvol_id): tasks.append(task) return tasks + + +def suspend_lvol(lvol_id): + + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info(f"suspending LVol subsystem: {lvol.get_id()}") + snode = db_controller.get_storage_node_by_id(lvol.node_id) + for iface in snode.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, 
iface.ip4_address)) + ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana_state="inaccessible") + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + if snode.secondary_node_id: + sec_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + if sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: + for iface in sec_node.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana_state="inaccessible") + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + return True + + +def resume_lvol(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + logger.info(f"suspending LVol subsystem: {lvol.get_id()}") + snode = db_controller.get_storage_node_by_id(lvol.node_id) + for iface in snode.data_nics: + if iface.ip4_address and lvol.fabric == iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state( + lvol.nqn, iface.ip4_address, lvol.subsys_port, is_optimized=True) + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + if snode.secondary_node_id: + sec_node = db_controller.get_storage_node_by_id(snode.secondary_node_id) + if sec_node.status in [StorageNode.STATUS_ONLINE, StorageNode.STATUS_DOWN, StorageNode.STATUS_SUSPENDED]: + for iface in sec_node.data_nics: + if iface.ip4_address and lvol.fabric == 
iface.trtype.lower(): + logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) + ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state( + lvol.nqn, iface.ip4_address, lvol.subsys_port, is_optimized=False) + if not ret: + logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") + return False + + return True diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 2cba217ea..53f7c17de 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -186,13 +186,20 @@ def inflate(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: return Response(status_code=204) +@instance_api.post('/replication_trigger', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) +def replication_trigger(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: + if not lvol_controller.replication_trigger(volume.get_id()): + raise ValueError('Failed to start volume snapshot replication') + + return Response(status_code=204) + @instance_api.post('/replication_start', name='clusters:storage-pools:volumes:replication_start', status_code=204, responses={204: {"content": None}}) def replication_start(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: - if not lvol_controller.replication_trigger(volume.get_id()): + if not lvol_controller.replication_start(volume.get_id(), cluster.get_id()): raise ValueError('Failed to start volume snapshot replication') return Response(status_code=204) - + @instance_api.post('/replication_stop', name='clusters:storage-pools:volumes:replication_stop', status_code=204, responses={204: {"content": None}}) def replication_stop(cluster: Cluster, pool: StoragePool, volume: Volume) -> Response: if not lvol_controller.replication_stop(volume.get_id()): From 7a28adbc16a3cf6596b29d2eaaab4693e3479242 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 11 
Mar 2026 15:55:07 +0300 Subject: [PATCH 178/192] feat: add configuration settings and utility scripts for volume management --- simplyblock_web/api/v2/volume.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 53f7c17de..0aeb01a2f 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -49,6 +49,7 @@ class _CreateParams(BaseModel): fabric: str = "tcp" max_namespace_per_subsys: int = 1 do_replicate: bool = False + replication_cluster_id: Optional[str] = None class _CloneParams(BaseModel): @@ -94,6 +95,7 @@ def add( fabric=data.fabric, max_namespace_per_subsys=data.max_namespace_per_subsys, do_replicate=data.do_replicate, + replication_cluster_id=data.replication_cluster_id, ) elif isinstance(data, _CloneParams): From 8961c8f44754df3aef487c12434bcbf53b09c8e7 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 11 Mar 2026 15:58:31 +0300 Subject: [PATCH 179/192] feat: add configuration settings, utility scripts, and endpoints for volume management --- simplyblock_web/api/v2/volume.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 0aeb01a2f..790dce6bd 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -280,3 +280,11 @@ def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume def list_replication_tasks(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: tasks = lvol_controller.list_replication_tasks(volume.get_id()) return [TaskDTO.from_model(task) for task in tasks] + +@instance_api.get('/suspend', name='clusters:storage-pools:volumes:suspend') +def suspend(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: + return lvol_controller.suspend_lvol(volume.get_id()) + +@instance_api.get('/resume', name='clusters:storage-pools:volumes:resume') +def resume(cluster: Cluster, pool: 
StoragePool, volume: Volume) -> List[TaskDTO]: + return lvol_controller.resume_lvol(volume.get_id()) From e28549aa113d8c1f0b7703a479e87d8787d61d01 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 12 Mar 2026 16:18:08 +0300 Subject: [PATCH 180/192] wip --- .../controllers/lvol_controller.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 53eebec09..9d0998956 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2147,3 +2147,120 @@ def resume_lvol(lvol_id): return False return True + + +def replicate_lvol_on_source_cluster(lvol_id): + db_controller = DBController() + try: + lvol = db_controller.get_lvol_by_id(lvol_id) + except KeyError as e: + logger.error(e) + return False + + source_node = db_controller.get_storage_node_by_id(lvol.node_id) + source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) + + if not source_node: + logger.error(f"Node not found: {lvol.node_id}") + return False + + if source_node.status != StorageNode.STATUS_ONLINE: + logger.error(f"Node is not online!: {source_node.get_id()}, status: {source_node.status}") + return False + + for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): + if lv.nqn == lvol.nqn: + logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + return lv.get_id() + + snaps = [] + snapshot = None + for task in db_controller.get_job_tasks(source_node.cluster_id): + if task.function_name == JobSchedule.FN_SNAPSHOT_REPLICATION: + logger.debug(task) + try: + snap = db_controller.get_snapshot_by_id(task.function_params["snapshot_id"]) + except KeyError: + continue + + if snap.lvol.get_id() != lvol_id: + continue + snaps.append(snap) + + if snaps: + snaps = sorted(snaps, key=lambda x: x.created_at) + last_snapshot = snaps[-1] + rep_snap = 
db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) + snapshot = rep_snap + + if not snapshot: + logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") + return False + + # create lvol on target node + new_lvol = copy.deepcopy(lvol) + new_lvol.uuid = str(uuid.uuid4()) + new_lvol.create_dt = str(datetime.now()) + new_lvol.node_id = target_node.get_id() + new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] + new_lvol.replication_node_id = "" + new_lvol.do_replicate = False + new_lvol.cloned_from_snap = snapshot.get_id() + new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool + new_lvol.lvs_name = target_node.lvstore + new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" + new_lvol.snapshot_name = snapshot.snap_bdev + new_lvol.status = LVol.STATUS_IN_CREATION + + new_lvol.bdev_stack = [ + { + "type": "bdev_lvol_clone", + "name": lvol.top_bdev, + "params": { + "snapshot_name": snapshot.snap_bdev, + "clone_name": lvol.lvol_bdev + } + } + ] + + if new_lvol.crypto_bdev: + new_lvol.bdev_stack.append({ + "type": "crypto", + "name": lvol.crypto_bdev, + "params": { + "name": lvol.crypto_bdev, + "base_name": lvol.top_bdev, + "key1": lvol.crypto_key1, + "key2": lvol.crypto_key2, + } + }) + + new_lvol.write_to_db(db_controller.kv_store) + + lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) + if error: + logger.error(error) + new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.lvol_uuid = lvol_bdev['uuid'] + new_lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid'] + + secondary_node = db_controller.get_storage_node_by_id(target_node.secondary_node_id) + if secondary_node.status == StorageNode.STATUS_ONLINE: + lvol_bdev, error = add_lvol_on_node(new_lvol, secondary_node, is_primary=False) + if error: + logger.error(error) + # remove lvol from primary + ret = delete_lvol_from_node(new_lvol, target_node) + if not ret: + logger.error("") + 
new_lvol.remove(db_controller.kv_store) + return False, error + + new_lvol.status = LVol.STATUS_ONLINE + new_lvol.write_to_db(db_controller.kv_store) + lvol_events.lvol_replicated(lvol, new_lvol) + + return new_lvol.lvol_uuid + From 34666665e9bbaa5e6db54dcd1bf7b870e6e7f590 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 12 Mar 2026 20:30:11 +0300 Subject: [PATCH 181/192] Adds replicate_lvol_on_source_cluster apiv2 --- .../controllers/lvol_controller.py | 19 ++++++++----------- simplyblock_web/api/v2/volume.py | 5 +++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 9d0998956..d90a889d4 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2189,9 +2189,7 @@ def replicate_lvol_on_source_cluster(lvol_id): if snaps: snaps = sorted(snaps, key=lambda x: x.created_at) - last_snapshot = snaps[-1] - rep_snap = db_controller.get_snapshot_by_id(last_snapshot.target_replicated_snap_uuid) - snapshot = rep_snap + snapshot = snaps[-1] if not snapshot: logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") @@ -2201,14 +2199,9 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol = copy.deepcopy(lvol) new_lvol.uuid = str(uuid.uuid4()) new_lvol.create_dt = str(datetime.now()) - new_lvol.node_id = target_node.get_id() - new_lvol.nodes = [target_node.get_id(), target_node.secondary_node_id] new_lvol.replication_node_id = "" new_lvol.do_replicate = False new_lvol.cloned_from_snap = snapshot.get_id() - new_lvol.pool_uuid = source_cluster.snapshot_replication_target_pool - new_lvol.lvs_name = target_node.lvstore - new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.status = LVol.STATUS_IN_CREATION @@ -2237,7 +2230,11 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.write_to_db(db_controller.kv_store) - 
lvol_bdev, error = add_lvol_on_node(new_lvol, target_node) + delete_lvol(lvol_id) + + time.sleep(3) + + lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) if error: logger.error(error) new_lvol.remove(db_controller.kv_store) @@ -2246,13 +2243,13 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.lvol_uuid = lvol_bdev['uuid'] new_lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid'] - secondary_node = db_controller.get_storage_node_by_id(target_node.secondary_node_id) + secondary_node = db_controller.get_storage_node_by_id(source_node.secondary_node_id) if secondary_node.status == StorageNode.STATUS_ONLINE: lvol_bdev, error = add_lvol_on_node(new_lvol, secondary_node, is_primary=False) if error: logger.error(error) # remove lvol from primary - ret = delete_lvol_from_node(new_lvol, target_node) + ret = delete_lvol_from_node(new_lvol, source_node) if not ret: logger.error("") new_lvol.remove(db_controller.kv_store) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 790dce6bd..2f56a8d2c 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -276,6 +276,11 @@ def replicate_lvol_on_target_cluster(cluster: Cluster, pool: StoragePool, volume return lvol_controller.replicate_lvol_on_target_cluster(volume.get_id()) +@instance_api.post('/replicate_lvol_on_source_cluster', name='clusters:storage-pools:volumes:replicate_lvol_on_source_cluster') +def replicate_lvol_on_source_cluster(cluster: Cluster, pool: StoragePool, volume: Volume): + return lvol_controller.replicate_lvol_on_source_cluster(volume.get_id()) + + @instance_api.get('/list_replication_tasks', name='clusters:storage-pools:volumes:list_replication_tasks') def list_replication_tasks(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: tasks = lvol_controller.list_replication_tasks(volume.get_id()) From b29038763603eed61ea73603b941d1e1347623d5 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Mar 2026 
14:11:21 +0300 Subject: [PATCH 182/192] fix 1 --- simplyblock_core/controllers/lvol_controller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index d90a889d4..23a25e328 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2096,7 +2096,7 @@ def suspend_lvol(lvol_id): for iface in snode.data_nics: if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) - ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana_state="inaccessible") + ret = snode.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana="inaccessible") if not ret: logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") return False @@ -2107,7 +2107,7 @@ def suspend_lvol(lvol_id): for iface in sec_node.data_nics: if iface.ip4_address and lvol.fabric == iface.trtype.lower(): logger.info("adding listener for %s on IP %s" % (lvol.nqn, iface.ip4_address)) - ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana_state="inaccessible") + ret = sec_node.rpc_client().nvmf_subsystem_listener_set_ana_state(lvol.nqn, iface.ip4_address, lvol.subsys_port, ana="inaccessible") if not ret: logger.error(f"Failed to set subsystem listener state for {lvol.nqn} on {iface.ip4_address}") return False From 28a21ab2f42f6257bbb4a0eec231b75a6ddb20ee Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Mar 2026 15:05:49 +0300 Subject: [PATCH 183/192] Adds 'from_source' attr to lvol model --- simplyblock_core/controllers/lvol_controller.py | 4 ++++ simplyblock_core/models/lvol_model.py | 1 + simplyblock_web/api/v2/dtos.py | 2 ++ 3 files changed, 7 
insertions(+) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 23a25e328..a00bca4e6 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2010,6 +2010,7 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.status = LVol.STATUS_IN_CREATION + lvol.from_source = True new_lvol.bdev_stack = [ { @@ -2257,6 +2258,9 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.status = LVol.STATUS_ONLINE new_lvol.write_to_db(db_controller.kv_store) + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.from_source = False + lvol.write_to_db() lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py index e82e4171d..a67032c53 100644 --- a/simplyblock_core/models/lvol_model.py +++ b/simplyblock_core/models/lvol_model.py @@ -68,6 +68,7 @@ class LVol(BaseModel): npcs: int = 0 do_replicate: bool = False replication_node_id: str = "" + from_source: bool = True def has_qos(self): return (self.rw_ios_per_sec > 0 or self.rw_mbytes_per_sec > 0 or self.r_mbytes_per_sec > 0 or self.w_mbytes_per_sec > 0) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 2832372cd..504ba784d 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -269,6 +269,8 @@ class VolumeDTO(BaseModel): max_w_mbytes: util.Unsigned capacity: CapacityStatDTO rep_info: Optional[dict] = None + from_source: bool = True + @staticmethod def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None, rep_info=None): From 0ffb1551a7b3970396c9928c74b60462bfe4fa09 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Fri, 13 Mar 2026 17:33:24 +0300 Subject: [PATCH 184/192] fix: update suspend 
and resume functions to return boolean values --- simplyblock_web/api/v2/volume.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py index 2f56a8d2c..ba342f071 100644 --- a/simplyblock_web/api/v2/volume.py +++ b/simplyblock_web/api/v2/volume.py @@ -287,9 +287,9 @@ def list_replication_tasks(cluster: Cluster, pool: StoragePool, volume: Volume) return [TaskDTO.from_model(task) for task in tasks] @instance_api.get('/suspend', name='clusters:storage-pools:volumes:suspend') -def suspend(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: +def suspend(cluster: Cluster, pool: StoragePool, volume: Volume) -> bool: return lvol_controller.suspend_lvol(volume.get_id()) @instance_api.get('/resume', name='clusters:storage-pools:volumes:resume') -def resume(cluster: Cluster, pool: StoragePool, volume: Volume) -> List[TaskDTO]: +def resume(cluster: Cluster, pool: StoragePool, volume: Volume) -> bool: return lvol_controller.resume_lvol(volume.get_id()) From cb20278dd20975022601e3635789403ce2ea57bc Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Mar 2026 16:15:36 +0300 Subject: [PATCH 185/192] fix: toggle 'from_source' attribute in lvol model during replication --- simplyblock_core/controllers/lvol_controller.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index a00bca4e6..b4d94b330 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2010,7 +2010,6 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.status = LVol.STATUS_IN_CREATION - lvol.from_source = True new_lvol.bdev_stack = [ { @@ -2060,6 +2059,9 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.status 
= LVol.STATUS_ONLINE new_lvol.write_to_db(db_controller.kv_store) + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.from_source = False + lvol.write_to_db() lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid @@ -2259,7 +2261,7 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.status = LVol.STATUS_ONLINE new_lvol.write_to_db(db_controller.kv_store) lvol = db_controller.get_lvol_by_id(lvol_id) - lvol.from_source = False + lvol.from_source = True lvol.write_to_db() lvol_events.lvol_replicated(lvol, new_lvol) From 3033eca13cdbd185791b436fa8b6302086b48225 Mon Sep 17 00:00:00 2001 From: geoffrey1330 Date: Mon, 16 Mar 2026 15:26:27 +0100 Subject: [PATCH 186/192] return from_source from api --- simplyblock_web/api/v2/dtos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py index 504ba784d..ca29bfae0 100644 --- a/simplyblock_web/api/v2/dtos.py +++ b/simplyblock_web/api/v2/dtos.py @@ -320,5 +320,6 @@ def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optiona max_r_mbytes=model.r_mbytes_per_sec, max_w_mbytes=model.w_mbytes_per_sec, capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()), - rep_info=rep_info + rep_info=rep_info, + from_source=model.from_source ) From 0aec9e58157a6a3aa9eb16930a23b68cf7608ba9 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Mar 2026 17:59:27 +0300 Subject: [PATCH 187/192] fix: update lvol UUID handling during replication process --- simplyblock_core/controllers/lvol_controller.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index b4d94b330..8fcfd2e82 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2171,10 +2171,10 @@ def replicate_lvol_on_source_cluster(lvol_id): 
logger.error(f"Node is not online!: {source_node.get_id()}, status: {source_node.status}") return False - for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): - if lv.nqn == lvol.nqn: - logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") - return lv.get_id() + # for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): + # if lv.nqn == lvol.nqn: + # logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") + # return lv.get_id() snaps = [] snapshot = None @@ -2200,10 +2200,6 @@ def replicate_lvol_on_source_cluster(lvol_id): # create lvol on target node new_lvol = copy.deepcopy(lvol) - new_lvol.uuid = str(uuid.uuid4()) - new_lvol.create_dt = str(datetime.now()) - new_lvol.replication_node_id = "" - new_lvol.do_replicate = False new_lvol.cloned_from_snap = snapshot.get_id() new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.status = LVol.STATUS_IN_CREATION @@ -2233,7 +2229,10 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.write_to_db(db_controller.kv_store) - delete_lvol(lvol_id) + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.uuid = str(uuid.uuid4()) + lvol.write_to_db() + delete_lvol(lvol.uuid) time.sleep(3) From bd1e38d07fd1056d986369ca2e7154275d581a85 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Mar 2026 18:07:15 +0300 Subject: [PATCH 188/192] fix: lvol delete on target --- simplyblock_core/controllers/lvol_controller.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 8fcfd2e82..eec7d3da7 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2014,10 +2014,10 @@ def replicate_lvol_on_target_cluster(lvol_id): new_lvol.bdev_stack = [ { "type": "bdev_lvol_clone", - "name": lvol.top_bdev, + "name": new_lvol.top_bdev, 
"params": { "snapshot_name": snapshot.snap_bdev, - "clone_name": lvol.lvol_bdev + "clone_name": new_lvol.lvol_bdev } } ] @@ -2025,12 +2025,12 @@ def replicate_lvol_on_target_cluster(lvol_id): if new_lvol.crypto_bdev: new_lvol.bdev_stack.append({ "type": "crypto", - "name": lvol.crypto_bdev, + "name": new_lvol.crypto_bdev, "params": { - "name": lvol.crypto_bdev, - "base_name": lvol.top_bdev, - "key1": lvol.crypto_key1, - "key2": lvol.crypto_key2, + "name": new_lvol.crypto_bdev, + "base_name": new_lvol.top_bdev, + "key1": new_lvol.crypto_key1, + "key2": new_lvol.crypto_key2, } }) From 547054f6cc26f09da0c8df1e6fe29ab453be8aa8 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 16 Mar 2026 19:06:15 +0300 Subject: [PATCH 189/192] feat: add configuration and utility scripts for managing storage nodes and volumes --- simplyblock_core/controllers/lvol_controller.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index eec7d3da7..6a162e7c2 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2202,6 +2202,7 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol = copy.deepcopy(lvol) new_lvol.cloned_from_snap = snapshot.get_id() new_lvol.snapshot_name = snapshot.snap_bdev + new_lvol.from_source = True new_lvol.status = LVol.STATUS_IN_CREATION new_lvol.bdev_stack = [ @@ -2259,9 +2260,6 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.status = LVol.STATUS_ONLINE new_lvol.write_to_db(db_controller.kv_store) - lvol = db_controller.get_lvol_by_id(lvol_id) - lvol.from_source = True - lvol.write_to_db() lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid From a21f00db5383d31a589313e85b882ee86fd41dfa Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 17 Mar 2026 17:32:15 +0300 Subject: [PATCH 190/192] fix: update lvol attributes for cloning and set from_source flag 
--- simplyblock_core/controllers/lvol_controller.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index 6a162e7c2..f25f8cec7 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2208,10 +2208,10 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.bdev_stack = [ { "type": "bdev_lvol_clone", - "name": lvol.top_bdev, + "name": new_lvol.top_bdev, "params": { "snapshot_name": snapshot.snap_bdev, - "clone_name": lvol.lvol_bdev + "clone_name": new_lvol.lvol_bdev } } ] @@ -2219,12 +2219,12 @@ def replicate_lvol_on_source_cluster(lvol_id): if new_lvol.crypto_bdev: new_lvol.bdev_stack.append({ "type": "crypto", - "name": lvol.crypto_bdev, + "name": new_lvol.crypto_bdev, "params": { - "name": lvol.crypto_bdev, - "base_name": lvol.top_bdev, - "key1": lvol.crypto_key1, - "key2": lvol.crypto_key2, + "name": new_lvol.crypto_bdev, + "base_name": new_lvol.top_bdev, + "key1": new_lvol.crypto_key1, + "key2": new_lvol.crypto_key2, } }) @@ -2232,6 +2232,7 @@ def replicate_lvol_on_source_cluster(lvol_id): lvol = db_controller.get_lvol_by_id(lvol_id) lvol.uuid = str(uuid.uuid4()) + lvol.from_source = True lvol.write_to_db() delete_lvol(lvol.uuid) From eb05502bcdc46c8549ce1844aead1da66b132d53 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 17 Mar 2026 20:37:37 +0300 Subject: [PATCH 191/192] refactor replicate_lvol_on_source_cluster --- .../controllers/lvol_controller.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index f25f8cec7..df9eda64e 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2161,7 +2161,6 @@ def replicate_lvol_on_source_cluster(lvol_id): return False 
source_node = db_controller.get_storage_node_by_id(lvol.node_id) - source_cluster = db_controller.get_cluster_by_id(source_node.cluster_id) if not source_node: logger.error(f"Node not found: {lvol.node_id}") @@ -2171,11 +2170,6 @@ def replicate_lvol_on_source_cluster(lvol_id): logger.error(f"Node is not online!: {source_node.get_id()}, status: {source_node.status}") return False - # for lv in db_controller.get_lvols(source_cluster.snapshot_replication_target_cluster): - # if lv.nqn == lvol.nqn: - # logger.info(f"LVol with same nqn already exists on target cluster: {lv.get_id()}") - # return lv.get_id() - snaps = [] snapshot = None for task in db_controller.get_job_tasks(source_node.cluster_id): @@ -2198,13 +2192,31 @@ def replicate_lvol_on_source_cluster(lvol_id): logger.error(f"Snapshot for replication not found for lvol: {lvol_id}") return False + original_lvol_id = lvol.get_id() + + lvol = db_controller.get_lvol_by_id(lvol_id) + lvol.uuid = str(uuid.uuid4()) + lvol.from_source = True + lvol.write_to_db() + + ret, err = delete_lvol(lvol.uuid) + if ret: + logger.info(f"deleted lvol: {lvol.uuid}, results {ret}") + if err: + logger.error(f"deleting lvol: {lvol.uuid} failed with error: {err}") + # create lvol on target node new_lvol = copy.deepcopy(lvol) + new_lvol.uuid = original_lvol_id + new_lvol.size = snapshot.lvol.size + new_lvol.max_size = snapshot.lvol.max_size + new_lvol.base_bdev = snapshot.lvol.base_bdev + new_lvol.lvol_bdev = f"CLN_{utils.get_random_vuid()}" + new_lvol.top_bdev = f"{new_lvol.lvs_name}/{new_lvol.lvol_bdev}" new_lvol.cloned_from_snap = snapshot.get_id() new_lvol.snapshot_name = snapshot.snap_bdev new_lvol.from_source = True new_lvol.status = LVol.STATUS_IN_CREATION - new_lvol.bdev_stack = [ { "type": "bdev_lvol_clone", @@ -2215,7 +2227,6 @@ def replicate_lvol_on_source_cluster(lvol_id): } } ] - if new_lvol.crypto_bdev: new_lvol.bdev_stack.append({ "type": "crypto", @@ -2228,15 +2239,7 @@ def replicate_lvol_on_source_cluster(lvol_id): } }) - 
new_lvol.write_to_db(db_controller.kv_store) - - lvol = db_controller.get_lvol_by_id(lvol_id) - lvol.uuid = str(uuid.uuid4()) - lvol.from_source = True - lvol.write_to_db() - delete_lvol(lvol.uuid) - - time.sleep(3) + new_lvol.write_to_db() lvol_bdev, error = add_lvol_on_node(new_lvol, source_node) if error: @@ -2261,6 +2264,7 @@ def replicate_lvol_on_source_cluster(lvol_id): new_lvol.status = LVol.STATUS_ONLINE new_lvol.write_to_db(db_controller.kv_store) + logger.info(new_lvol.to_dict()) lvol_events.lvol_replicated(lvol, new_lvol) return new_lvol.lvol_uuid From 362935abde5b9c04612120f48e1d26bc09e83309 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Wed, 18 Mar 2026 17:52:13 +0300 Subject: [PATCH 192/192] fix issue --- simplyblock_core/controllers/lvol_controller.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py index df9eda64e..2b0789cf8 100644 --- a/simplyblock_core/controllers/lvol_controller.py +++ b/simplyblock_core/controllers/lvol_controller.py @@ -2199,11 +2199,9 @@ def replicate_lvol_on_source_cluster(lvol_id): lvol.from_source = True lvol.write_to_db() - ret, err = delete_lvol(lvol.uuid) - if ret: - logger.info(f"deleted lvol: {lvol.uuid}, results {ret}") - if err: - logger.error(f"deleting lvol: {lvol.uuid} failed with error: {err}") + ret = delete_lvol(lvol.uuid) + if not ret: + logger.info(f"deleting lvol: {lvol.uuid} failed") # create lvol on target node new_lvol = copy.deepcopy(lvol)