SUSE AI: removed runtimeClassName nvidia (#36)

tbazant · web-flow · commit eb07f72a074a · 2026-04-23T10:50:43.000+02:00
* removed runtimeClassName nvidia across all AI files

* fixed and removed 1 new instance of runtimeClassName

* added info that setting runtimeClassName is no longer needed

* improved a link
diff --git a/DC-AI-deployment b/DC-AI-deployment
@@ -7,7 +7,6 @@ ADOC_POST=yes
 ADOC_TYPE=book
 ADOC_ATTRIBUTES=" --attribute env-daps=1"
 ADOC_ATTRIBUTES+=" --attribute PROF_PRODUCT=suseai"
-ADOC_ATTRIBUTES+=" --attribute PROF_PRODUCT=suseai"
 ADOC_ATTRIBUTES+=" --attribute PROF_DEPLOYMENT=standard"
 
 STYLEROOT="/usr/share/xml/docbook/stylesheet/suse2022-ns"
@@ -17,4 +16,5 @@ FALLBACK_STYLEROOT="/usr/share/xml/docbook/stylesheet/suse-ns"
 DOCBOOK5_RNG_URI="http://docbook.org/xml/5.2/rng/docbookxi.rng"
 
 #XSLTPARAM+=' --param toc.section.depth=2'
-#XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3'
+#XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3'
+#XSLTPARAM+=' --stringparam generate.toc="book title" '
diff --git a/references/ollama-helmchart.adoc b/references/ollama-helmchart.adoc
@@ -36,7 +36,6 @@ global:
 ingress:
   enabled: false
 defaultModel: "gemma:2b"
-runtimeClassName: nvidia
 ollama:
   models:
     pull:
diff --git a/references/owui-helm-overrides.adoc b/references/owui-helm-overrides.adoc
@@ -450,7 +450,6 @@ Following is an example of the `open-webui-pipelines-values.yaml` override file.
 
 [source,yaml]
 ----
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection
diff --git a/references/pytorch-helm-overrides.adoc b/references/pytorch-helm-overrides.adoc
@@ -11,7 +11,6 @@ include::../snippets/helm-chart-overrides-intro.adoc[]
 [source,yaml]
 ----
 # pytorch_custom_overrides.yaml
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection <.>
@@ -48,7 +47,6 @@ To create a ConfigMap, run the following command:
 [source,yaml]
 ----
 # pytorch_custom_overrides.yaml
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection 
@@ -82,7 +80,6 @@ Move the `entrypoint.sh` file plus any helper files under the `scripts/` directo
 [source,yaml]
 ----
 # pytorch_custom_overrides.yaml
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection <.>
@@ -117,7 +114,6 @@ For production use, we recommend using a storage solution suitable for persisten
 [source,yaml]
 ----
 # pytorch_custom_overrides.yaml
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection <.>
@@ -153,7 +149,6 @@ For production use, we recommend using a storage solution suitable for persisten
 [source,yaml]
 ----
 # pytorch_custom_overrides.yaml
-runtimeClassName: nvidia
 global:
   imagePullSecrets:
     - application-collection <.>
diff --git a/references/vllm-helm-overrides.adoc b/references/vllm-helm-overrides.adoc
@@ -111,7 +111,6 @@ The following {vllm} override file includes basic configuration options.
 * Access to a {huggingface} token (`HF_TOKEN`).
 * The model `meta-llama/Llama-3.1-8B-Instruct` from this example is a gated model that requires you to accept the agreement to access it.
 For more information, see link:https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct[].
-* The `runtimeClassName` specified here is `nvidia`.
 * Update the `storageClass:` entry for each `modelSpec`.
 
 [source,yaml]
@@ -121,7 +120,6 @@ global:
   imagePullSecrets:
   - application-collection
 servingEngineSpec:
-  runtimeClassName: "nvidia"
   modelSpec:
   - name: "llama3" <.>
     registry: "dp.apps.rancher.io" <.>
@@ -263,7 +261,6 @@ global:
   imagePullSecrets:
   - application-collection
 servingEngineSpec:
-  runtimeClassName: "nvidia"
   modelSpec:
   - name: "llama3"
     registry: "dp.apps.rancher.io"
@@ -383,7 +380,6 @@ global:
   imagePullSecrets:
   - application-collection
 servingEngineSpec:
-  runtimeClassName: "nvidia"
   modelSpec:
   - name: "mistral"
     registry: "dp.apps.rancher.io"
@@ -432,7 +428,6 @@ global:
   imagePullSecrets:
   - application-collection
 servingEngineSpec:
-  runtimeClassName: "nvidia"
   modelSpec:
   - name: "mistral"
     registry: "dp.apps.rancher.io"
diff --git a/tasks/NVIDIA-Operator-installation.adoc b/tasks/NVIDIA-Operator-installation.adoc
@@ -76,7 +76,14 @@ The NVIDIA operator restarts containerd with a hangup call which restarts RKE2.
 
 [IMPORTANT]
 ====
-The envvars `ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED`, `ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS` and `DEVICE_LIST_STRATEGY` are required to properly isolate GPU resources as explained in https://docs.google.com/document/d/1zy0key-EL6JH50MZgwg96RPYxxXXnVUdxLZwGiyqLd8/edit?tab=t.0[Preventing unprivileged access to GPUs in Kubernetes].
+The envvars `ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED`, `ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS` and `DEVICE_LIST_STRATEGY` are required to properly isolate GPU resources as explained in link:https://docs.google.com/document/d/1zy0key-EL6JH50MZgwg96RPYxxXXnVUdxLZwGiyqLd8/edit?tab=t.0[Preventing unprivileged access to GPUs in Kubernetes].
+====
+
+[IMPORTANT]
+====
+NVIDIA GPU Operator v25.10.x uses the link:https://github.com/cncf-tags/container-device-interface/blob/main/SPEC.md[Container Device Interface (CDI) specification], which simplifies operations.
+It is recommended that you enable CDI (the default) and the NRI plug-in on RKE2.
+With both features enabled, you no longer need to pass extra environment variables for security requirements or set `runtimeClassName: nvidia` in your pod specifications.
 ====
 
 [,yaml]
@@ -164,7 +171,6 @@ metadata:
   namespace: default
 spec:
   restartPolicy: OnFailure
-  runtimeClassName: nvidia
   containers:
   - name: cuda-container
     image: nvcr.io/nvidia/k8s/cuda-sample:nbody