From 65eb45dd328b94a21c20858e810093e87e07bfc0 Mon Sep 17 00:00:00 2001
From: luohua13
Date: Tue, 20 Jan 2026 15:37:31 +0800
Subject: [PATCH 1/3] Secure Accelerator Access Conformance Test

---
 ...ure_Accelerator_Access_Conformance_Test.md | 326 ++++++++++++++++++
 1 file changed, 326 insertions(+)
 create mode 100644 docs/en/solutions/AI/Secure_Accelerator_Access_Conformance_Test.md

diff --git a/docs/en/solutions/AI/Secure_Accelerator_Access_Conformance_Test.md b/docs/en/solutions/AI/Secure_Accelerator_Access_Conformance_Test.md
new file mode 100644
index 0000000..9ed736c
--- /dev/null
+++ b/docs/en/solutions/AI/Secure_Accelerator_Access_Conformance_Test.md
@@ -0,0 +1,326 @@
---
products:
  - Secure Accelerator Access Conformance Test
kind:
  - Article
---
# Secure Accelerator Access Conformance Test

## Requirement

**MUST**: Ensure that access to accelerators from within containers is properly isolated and mediated by the Kubernetes resource management framework (device plugin or DRA) and the container runtime, preventing unauthorized access or interference between workloads.

## Prerequisites

Before running the test, ensure you have:
- A Kubernetes cluster with at least **one GPU node containing 2 or more physical GPUs**
- Alauda Build of NVIDIA GPU Device Plugin installed (see Step 2 below)
- `kubectl` configured to access the cluster
- Appropriate permissions to create namespaces and pods

## Setup

### Step 1: Create an Alauda Container Platform Kubernetes Cluster with GPU Nodes

### Step 2: Install Alauda Build of NVIDIA GPU Device Plugin
https://docs.alauda.io/pgpu/0.17/install/install.html

### Step 3: Label GPU Nodes

Label all your GPU nodes to enable device plugin scheduling:

```bash
kubectl label nodes <node-name> nvidia-device-enable=pgpu
```

### Step 4: Verify GPU Capacity

Verify that at least one GPU node has 2 or more GPUs:

```bash
# Find nodes with 2 or more GPUs
kubectl get nodes -l nvidia-device-enable=pgpu -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.capacity.nvidia\.com/gpu}{"\n"}{end}' | \
  awk '$2 >= 2 {print $1 " has " $2 " GPUs"}'
```

## Tests Executed

### Test 1: "Cannot access devices if a pod doesn't request them"

**What it does**: Creates a pod that does NOT request any GPU devices and checks that the pod cannot access any GPUs.

**Why it matters**: This is a security test. If a pod doesn't ask for a GPU, it shouldn't be able to access one. The test verifies this by checking for GPU device files (`/dev/nvidia*`) and by running `nvidia-smi` inside the pod, expecting it to fail (command not found), which proves the pod has no access to GPUs.

### Test 2: "Must map devices to the right pods"

**What it does**: Creates TWO pods, each requesting 1 GPU, on the same node (which has at least 2 GPUs). It then runs `nvidia-smi -L` in both pods to list which GPU each pod can see.

**Why it matters**: This verifies isolation between pods. Each pod should see a different GPU: pod A shouldn't be able to see or access the GPU assigned to pod B. The test confirms this by checking that the GPU UUIDs reported by `nvidia-smi -L` differ between the two pods, proving that each pod has its own GPU assigned to it.
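Both tests hinge on whether the pod spec asks the device plugin for a GPU through its resource limits. As a reference point (this is not part of the test script itself), a minimal sketch of the two kinds of pods involved could look like the following; the pod and container names here are illustrative only:

```bash
# A pod WITHOUT a GPU request: the device plugin allocates nothing, so the
# container should contain no /dev/nvidia* device nodes and no NVIDIA tools.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: demo-no-gpu
spec:
  restartPolicy: Never
  containers:
  - name: probe
    image: nvidia/cuda:12.1.1-base-ubuntu22.04
    command: ["sleep", "60"]
---
# A pod WITH a single-GPU request: the kubelet asks the device plugin for one
# device and the container runtime exposes exactly that GPU to the container.
apiVersion: v1
kind: Pod
metadata:
  name: demo-one-gpu
spec:
  restartPolicy: Never
  containers:
  - name: probe
    image: nvidia/cuda:12.1.1-base-ubuntu22.04
    command: ["sleep", "60"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
```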
## Test Script

```bash
#!/bin/bash
set -e

# ================= CONFIG =================
NAMESPACE="secure-accelerator-access"
CUDA_IMAGE="${CUDA_IMAGE:-nvidia/cuda:12.1.1-base-ubuntu22.04}"

echo "=== Secure Accelerator Access Conformance Test ==="
echo ""
echo "Configuration:"
echo "  Namespace: $NAMESPACE"
echo "  CUDA Image: $CUDA_IMAGE"
echo ""

# ================ PRE-FLIGHT ================
echo "=== Pre-flight Checks ==="

if ! command -v kubectl &> /dev/null; then
    echo "❌ ERROR: kubectl not found."
    exit 1
fi
echo "✓ kubectl is available"

echo "Searching for GPU nodes with 2+ GPUs..."

GPU_NODES_INFO=$(kubectl get nodes -l nvidia-device-enable=pgpu \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.capacity.nvidia\.com/gpu}{"\n"}{end}')

if [ -z "$GPU_NODES_INFO" ]; then
    echo "❌ ERROR: No GPU node found."
    echo "Label GPU nodes with:"
    echo "  kubectl label node <node-name> nvidia-device-enable=pgpu"
    exit 1
fi

GPU_NODE=$(echo "$GPU_NODES_INFO" | awk '$2 >= 2 {print $1; exit}')
GPU_CAPACITY=$(echo "$GPU_NODES_INFO" | awk '$2 >= 2 {print $2; exit}')

if [ -z "$GPU_NODE" ]; then
    echo "❌ ERROR: No GPU node with 2+ GPUs found."
    echo "Current GPU nodes:"
    echo "$GPU_NODES_INFO"
    exit 1
fi

echo "✓ Selected GPU node: $GPU_NODE with $GPU_CAPACITY GPUs"

ELIGIBLE_COUNT=$(echo "$GPU_NODES_INFO" | awk '$2 >= 2 {count++} END {print count+0}')
if [ "$ELIGIBLE_COUNT" -gt 1 ]; then
    echo "  Note: Found $ELIGIBLE_COUNT eligible GPU nodes, using: $GPU_NODE"
fi

echo ""
echo "=== Starting Tests ==="
echo ""

kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

cleanup() {
    echo ""
    echo "Cleaning up namespace..."
    kubectl delete namespace "$NAMESPACE" --ignore-not-found=true
}
trap cleanup EXIT

# ==================== TEST 1 ====================
echo ""
echo "=== TEST 1: Access Denial (No GPU request) ==="

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: no-gpu-pod
  namespace: $NAMESPACE
spec:
  restartPolicy: Never
  containers:
  - name: test
    image: $CUDA_IMAGE
    # No GPU resources are requested here on purpose.
    command: ["/bin/bash", "-c"]
    args:
    - |
      ls -la /dev/nvidia* 2>/dev/null || echo "No nvidia devices found"
      echo ""
      echo "Running nvidia-smi..."
      if nvidia-smi --query-gpu=uuid --format=csv,noheader; then
        echo "ERROR: nvidia-smi succeeded - pod has GPU access!"
        exit 1
      else
        echo "SUCCESS: nvidia-smi failed (exit code \$?) - GPU access properly denied"
        exit 0
      fi
EOF

echo "Waiting for pod completion..."
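# TEST 1 pass criterion: the pod must run to completion (phase Succeeded) with
# container exit code 0, which the embedded script only returns when nvidia-smi
# is unavailable inside the container, i.e. no GPU access was granted.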
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded \
  -n "$NAMESPACE" pod/no-gpu-pod --timeout=300s

TEST1_EXIT_CODE=$(kubectl get pod -n "$NAMESPACE" no-gpu-pod \
  -o jsonpath='{.status.containerStatuses[0].state.terminated.exitCode}')

if [ "$TEST1_EXIT_CODE" -eq 0 ]; then
  echo "✅ TEST 1 PASSED"
else
  echo "❌ TEST 1 FAILED"
  kubectl logs -n "$NAMESPACE" no-gpu-pod
fi

kubectl delete pod -n "$NAMESPACE" no-gpu-pod --ignore-not-found=true

# ==================== TEST 2 ====================
echo ""
echo "=== TEST 2: GPU Isolation Between Pods ==="

GPU_CAPACITY=$(kubectl get node "$GPU_NODE" \
  -o jsonpath='{.status.capacity.nvidia\.com/gpu}')
echo "Node GPU capacity: $GPU_CAPACITY"

cat <<EOF | kubectl apply -f -
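# Minimal sketch of the two Test 2 pods (pod and container names are
# illustrative): each requests exactly one GPU and is pinned to the selected
# node so that both land on the same host.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod-1
  namespace: $NAMESPACE
spec:
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: $GPU_NODE
  containers:
  - name: cuda
    image: $CUDA_IMAGE
    command: ["sleep", "600"]
    resources:
      limits:
        nvidia.com/gpu: 1
---
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod-2
  namespace: $NAMESPACE
spec:
  restartPolicy: Never
  nodeSelector:
    kubernetes.io/hostname: $GPU_NODE
  containers:
  - name: cuda
    image: $CUDA_IMAGE
    command: ["sleep", "600"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF

# Wait for both pods, then compare the GPU UUID each one can see.
kubectl wait --for=condition=Ready -n "$NAMESPACE" pod/gpu-pod-1 pod/gpu-pod-2 --timeout=300s

GPU_UUID_1=$(kubectl exec -n "$NAMESPACE" gpu-pod-1 -- nvidia-smi --query-gpu=uuid --format=csv,noheader) || true
GPU_UUID_2=$(kubectl exec -n "$NAMESPACE" gpu-pod-2 -- nvidia-smi --query-gpu=uuid --format=csv,noheader) || true
echo "GPU UUID seen by gpu-pod-1: $GPU_UUID_1"
echo "GPU UUID seen by gpu-pod-2: $GPU_UUID_2"

# Isolation holds only if each pod sees a single, different GPU UUID.
if [ -n "$GPU_UUID_1" ] && [ -n "$GPU_UUID_2" ] && [ "$GPU_UUID_1" != "$GPU_UUID_2" ]; then
  echo "✅ TEST 2 PASSED: pods were assigned different GPUs"
  ISOLATION_TEST_PASSED=true
else
  echo "❌ TEST 2 FAILED: pods see the same GPU (or no GPU at all)"
  ISOLATION_TEST_PASSED=false
fi

kubectl delete pod -n "$NAMESPACE" gpu-pod-1 gpu-pod-2 --ignore-not-found=true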