diff --git a/isvctl/configs/providers/aws/config/eks.yaml b/isvctl/configs/providers/aws/config/eks.yaml index d11050da..64e985dd 100644 --- a/isvctl/configs/providers/aws/config/eks.yaml +++ b/isvctl/configs/providers/aws/config/eks.yaml @@ -112,6 +112,22 @@ commands: TF_VAR_test_pool_labels_json: '{"isv.ncp.validation/workload":"gpu-compute"}' TF_VAR_test_pool_node_type: "gpu" + # Throwaway pool for the delete leg (K8S06-03): created here, removed in + # the test phase. CPU-only with its own state file so no other check + # depends on it. + - name: create_test_delete_node_pool + phase: setup + command: "../scripts/eks/create_node_pool.sh" + output_schema: node_pool + timeout: 900 # 15 min - EKS node group join can take 5-8 min + env: + TF_AUTO_APPROVE: "true" + NODE_POOL_STATE_FILE: "terraform-delete.tfstate" + TF_VAR_test_pool_name: "isv-test-delete-pool" + TF_VAR_test_pool_instance_types: '["m6i.large"]' + TF_VAR_test_pool_desired_size: "1" + TF_VAR_test_pool_node_type: "cpu" + # Scale the CPU pool to a new target count. Re-applies the # terraform-node-pool module with a bumped desired_size; create_node_pool.sh # is reused because `terraform apply` is idempotent. The emitted @@ -134,6 +150,16 @@ commands: TF_VAR_test_pool_taints_json: '[{"key":"isv.ncp.validation/dedicated","value":"test","effect":"NoSchedule"}]' TF_VAR_test_pool_node_type: "cpu" + # Delete the throwaway pool (K8S06-03). In the test phase so the matching + # K8sNodePoolCheck can assert it reaches zero nodes after all test steps. + - name: delete_test_node_pool + phase: test + command: "../scripts/eks/destroy_node_pool.sh" + timeout: 900 + env: + TF_AUTO_APPROVE: "true" + NODE_POOL_STATE_FILE: "terraform-delete.tfstate" + # Teardown order: destroy both test node pools before tearing down the # cluster so their ENIs/instances are freed before the VPC comes down. # Each destroy targets its pool's state file (matching create). @@ -163,6 +189,16 @@ commands: TF_AUTO_APPROVE: "true" SHARED_VPC_CLUSTER_STATE_FILE: "terraform.tfstate" + # Safety net in case the test phase (which normally deletes it) didn't + # run; idempotent when the state is already empty. + - name: destroy_test_delete_node_pool + phase: teardown + command: "../scripts/eks/destroy_node_pool.sh" + timeout: 900 + env: + TF_AUTO_APPROVE: "true" + NODE_POOL_STATE_FILE: "terraform-delete.tfstate" + - name: teardown phase: teardown command: "../scripts/eks/teardown.sh" diff --git a/isvctl/configs/providers/aws/scripts/eks/setup.sh b/isvctl/configs/providers/aws/scripts/eks/setup.sh index 5682c6e9..3f07570b 100755 --- a/isvctl/configs/providers/aws/scripts/eks/setup.sh +++ b/isvctl/configs/providers/aws/scripts/eks/setup.sh @@ -411,6 +411,7 @@ EFS_SC=$(kubectl get sc -o json 2>/dev/null \ STATIC_VOLUME_HANDLE="" STATIC_DRIVER_NAME="" +STATIC_VOLUME_AZ="" if [ -n "$BLOCK_SC" ]; then NODE_AZ=$(kubectl get nodes -l nvidia.com/gpu.present=true \ -o jsonpath='{.items[0].metadata.labels.topology\.kubernetes\.io/zone}' 2>/dev/null || echo "") @@ -449,6 +450,16 @@ if [ -n "$BLOCK_SC" ]; then if [ -n "$STATIC_VOLUME_HANDLE" ]; then STATIC_DRIVER_NAME="ebs.csi.aws.com" + # EBS volumes are zonal; the static PV must pin its consumer pod to + # the volume's AZ or the attach hangs cross-zone. Read the actual AZ + # (covers both the freshly-created and reused-volume paths). + STATIC_VOLUME_AZ=$(aws ec2 describe-volumes \ + --volume-ids "$STATIC_VOLUME_HANDLE" \ + --region "$AWS_REGION" \ + --query 'Volumes[0].AvailabilityZone' --output text 2>/dev/null || echo "") + if [ "$STATIC_VOLUME_AZ" = "None" ]; then + STATIC_VOLUME_AZ="" + fi fi else echo "Warning: could not determine worker-node AZ; skipping standalone EBS volume creation" >&2 @@ -490,7 +501,8 @@ cat << EOF "shared_fs_storage_class": "${EFS_SC}", "nfs_storage_class": "${EFS_SC}", "static_volume_handle": "${STATIC_VOLUME_HANDLE}", - "static_driver_name": "${STATIC_DRIVER_NAME}" + "static_driver_name": "${STATIC_DRIVER_NAME}", + "static_volume_az": "${STATIC_VOLUME_AZ}" }, "aws": { "region": "${AWS_REGION}", diff --git a/isvctl/configs/suites/k8s.yaml b/isvctl/configs/suites/k8s.yaml index 7ec63de5..123f1f9f 100644 --- a/isvctl/configs/suites/k8s.yaml +++ b/isvctl/configs/suites/k8s.yaml @@ -284,6 +284,9 @@ tests: static_pv: volume_handle: "{{ steps.setup.csi.static_volume_handle | default('', true) }}" csi_driver: "{{ steps.setup.csi.static_driver_name | default('', true) }}" + # Zonal block volumes (EBS/PD/Disk) must pin their consumer pod to + # the volume's AZ; empty for zone-agnostic backends. + zone: "{{ steps.setup.csi.static_volume_az | default('', true) }}" fs_type: "ext4" capacity: "1Gi" access_mode: "ReadWriteOnce" diff --git a/isvtest/src/isvtest/validations/k8s_storage.py b/isvtest/src/isvtest/validations/k8s_storage.py index befc73b7..84216f11 100644 --- a/isvtest/src/isvtest/validations/k8s_storage.py +++ b/isvtest/src/isvtest/validations/k8s_storage.py @@ -1501,6 +1501,11 @@ class K8sCsiProvisioningModesCheck(BaseValidation): static_pv.capacity: PV ``spec.capacity.storage`` and the matching PVC request size (default: ``1Gi``). static_pv.access_mode: PV / PVC access mode (default: ``ReadWriteOnce``). + static_pv.zone: ``topology.kubernetes.io/zone`` the backing volume + lives in. Set for zonal block backends (AWS EBS, GCE PD, Azure + Disk) so the consumer pod is scheduled in the volume's zone and + the attach does not hang cross-zone. Unset for zone-agnostic + backends (e.g. EFS). bind_timeout_s: Max wait for PVC Bind and mount-pod Ready (default: 180). namespace_prefix: Prefix for the ephemeral namespace @@ -1561,6 +1566,7 @@ def run(self) -> None: capacity=str(static_pv_cfg.get("capacity") or "1Gi"), access_mode=str(static_pv_cfg.get("access_mode") or "ReadWriteOnce"), bind_timeout=bind_timeout, + zone=str(static_pv_cfg.get("zone") or ""), ) if not static_ok: any_failed = True @@ -1685,6 +1691,7 @@ def _run_static( capacity: str, access_mode: str, bind_timeout: int, + zone: str = "", ) -> bool: """Run the ``static`` subtest: pre-create PV + PVC → Bound → mount + canary.""" pvc_name = f"csi-prov-static-{uuid.uuid4().hex[:6]}" @@ -1698,6 +1705,7 @@ def _run_static( capacity=capacity, access_mode=access_mode, claim_name=pvc_name, + zone=zone, ) if returncode != 0: self.report_subtest( @@ -1778,6 +1786,7 @@ def _apply_pv( capacity: str, access_mode: str, claim_name: str, + zone: str = "", ) -> tuple[int, str]: """Render the static PV manifest and apply it.""" @@ -1792,6 +1801,7 @@ def _mutate(doc: dict[str, Any]) -> dict[str, Any]: access_mode=access_mode, claim_namespace=self._namespace, claim_name=claim_name, + zone=zone, ) return self._run_kubectl_apply(render_k8s_manifest(_PV_MANIFEST, _mutate)) @@ -1895,12 +1905,19 @@ def _set_pv_fields( access_mode: str, claim_namespace: str, claim_name: str, + zone: str = "", ) -> dict[str, Any]: """Mutate a parsed PersistentVolume manifest in place with the requested fields. ``claimRef`` pre-reserves the PV for the matching PVC so the binding is deterministic and cannot race against another claim landing in the same cluster while the static probe runs. + + When ``zone`` is set, a ``spec.nodeAffinity`` on + ``topology.kubernetes.io/zone`` pins the volume to that zone. Zonal block + backends (AWS EBS, GCE PD, Azure Disk) can only attach to a node in the + volume's own zone; without this the scheduler may place the consumer pod + in a different zone and the attach hangs until the mount timeout. """ metadata = doc.setdefault("metadata", {}) metadata["name"] = name @@ -1919,6 +1936,24 @@ def _set_pv_fields( "namespace": claim_namespace, "name": claim_name, } + if zone: + spec["nodeAffinity"] = { + "required": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "topology.kubernetes.io/zone", + "operator": "In", + "values": [zone], + } + ] + } + ] + } + } + else: + spec.pop("nodeAffinity", None) return doc @@ -1929,12 +1964,37 @@ def _set_mount_pod_fields( name: str, pvc_name: str, ) -> dict[str, Any]: - """Mutate a parsed mount-pod manifest in place, binding its single PVC volume.""" + """Mutate a parsed mount-pod manifest in place, binding its single PVC volume. + + The pod is kept off transient test-provisioning node pools (nodes carrying + the ``isv.ncp.validation/pool`` marker). Those pools are created/scaled/ + deleted within the same run by node-pool CRUD checks, so a freshly joined + node may not yet have the CSI node-plugin DaemonSet pod running - a probe + pod landing there hangs at mount until the bind timeout. Baseline cluster + nodes never carry the marker, so this is a no-op for providers that do not + provision test pools (single-node k3s/minikube/microk8s included). + """ metadata = doc.setdefault("metadata", {}) metadata["name"] = name metadata["namespace"] = namespace spec = doc.setdefault("spec", {}) + spec["affinity"] = { + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [ + { + "matchExpressions": [ + { + "key": "isv.ncp.validation/pool", + "operator": "DoesNotExist", + } + ] + } + ] + } + } + } volumes = spec.setdefault("volumes", []) if not volumes: volumes.append({"name": "data"}) diff --git a/isvtest/tests/test_k8s_storage.py b/isvtest/tests/test_k8s_storage.py index 035422ac..4902a495 100644 --- a/isvtest/tests/test_k8s_storage.py +++ b/isvtest/tests/test_k8s_storage.py @@ -1911,6 +1911,35 @@ def test_missing_sections_are_created(self) -> None: assert out["spec"]["csi"]["volumeHandle"] == "vh" assert out["spec"]["claimRef"]["name"] == "pvc" + def test_no_zone_omits_node_affinity(self) -> None: + """A zone-agnostic backend (e.g. EFS) must not pin the PV to a zone.""" + out = _set_pv_fields(self._base_doc(), **self._args()) + assert "nodeAffinity" not in out["spec"] + + def test_zone_sets_topology_node_affinity(self) -> None: + """A zonal block backend pins the PV to its volume's AZ.""" + out = _set_pv_fields(self._base_doc(), zone="us-west-2a", **self._args()) + terms = out["spec"]["nodeAffinity"]["required"]["nodeSelectorTerms"] + expr = terms[0]["matchExpressions"][0] + assert expr == { + "key": "topology.kubernetes.io/zone", + "operator": "In", + "values": ["us-west-2a"], + } + + def _args(self) -> dict[str, Any]: + """Common required kwargs for ``_set_pv_fields``.""" + return { + "name": "pv-z", + "driver": "ebs.csi.aws.com", + "volume_handle": "vol-1", + "fs_type": "ext4", + "capacity": "1Gi", + "access_mode": "ReadWriteOnce", + "claim_namespace": "ns", + "claim_name": "pvc", + } + class TestSetMountPodFields: """Tests for ``_set_mount_pod_fields`` - the BusyBox mount-pod mutator.""" @@ -1943,6 +1972,15 @@ def test_sets_name_namespace_and_pvc(self) -> None: # only touches volumes so the mount path stays /data as documented. assert out["spec"]["containers"][0]["volumeMounts"][0]["mountPath"] == "/data" + def test_excludes_test_pool_nodes(self) -> None: + """Probe pods must avoid transient test-pool nodes via node anti-affinity.""" + out = _set_mount_pod_fields(self._base_doc(), namespace="ns1", name="probe-1", pvc_name="pvc-1") + terms = out["spec"]["affinity"]["nodeAffinity"]["requiredDuringSchedulingIgnoredDuringExecution"][ + "nodeSelectorTerms" + ] + expr = terms[0]["matchExpressions"][0] + assert expr == {"key": "isv.ncp.validation/pool", "operator": "DoesNotExist"} + class TestK8sCsiProvisioningModesCheck: """Tests for ``K8sCsiProvisioningModesCheck``."""