diff --git a/DC-AI-deployment b/DC-AI-deployment index 378148f..c26fc56 100644 --- a/DC-AI-deployment +++ b/DC-AI-deployment @@ -16,5 +16,6 @@ FALLBACK_STYLEROOT="/usr/share/xml/docbook/stylesheet/suse-ns" # DocBook Validation DOCBOOK5_RNG_URI="http://docbook.org/xml/5.2/rng/docbookxi.rng" -#XSLTPARAM+=' --param toc.section.depth=2' -#XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3' \ No newline at end of file +XSLTPARAM+=' --param toc.section.depth=2' +XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3' +#XSLTPARAM+=' --stringparam generate.toc="book title" ' \ No newline at end of file diff --git a/DC-SLES-mcphost b/DC-SLES-mcphost index 5e93dc4..a9a5267 100644 --- a/DC-SLES-mcphost +++ b/DC-SLES-mcphost @@ -16,5 +16,5 @@ DOCBOOK5_RNG_URI="http://docbook.org/xml/5.2/rng/docbookxi.rng" PROFOS="sles" #PROFARCH="x86-64" -#XSLTPARAM+=' --param toc.section.depth=2' -#XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3' +XSLTPARAM+=' --param toc.section.depth=2' +XSLTPARAM+=' --param bubbletoc.section.depth=3 --param bubbletoc.max.depth=3' diff --git a/articles/ai-deployment-docinfo.xml b/articles/ai-deployment-docinfo.xml index 94b649a..c443b7f 100644 --- a/articles/ai-deployment-docinfo.xml +++ b/articles/ai-deployment-docinfo.xml @@ -1,4 +1,11 @@ + 2026-04-01 + + + Added topics that describe {kubeflow} installation and operation + + + 2026-03-09 diff --git a/articles/ai-deployment.adoc b/articles/ai-deployment.adoc index 37da9f5..8ee25e4 100644 --- a/articles/ai-deployment.adoc +++ b/articles/ai-deployment.adoc @@ -92,6 +92,20 @@ include::../tasks/litellm-installing.adoc[leveloffset=+2] include::../references/litellm-helm-overrides.adoc[leveloffset=+3] include::../references/litellm-helmchart.adoc[leveloffset=+3] include::../tasks/mlflow-installing.adoc[leveloffset=+2] +include::../tasks/kubeflow-installing.adoc[leveloffset=+2] +include::../tasks/kubeflow-accessing.adoc[leveloffset=+3] 
+:override-title: Configuration scenarios +include::../references/kubeflow-configuration-scenarios.adoc[leveloffset=+3] +:override-title: Hardening for production use +include::../tasks/kubeflow-hardening.adoc[leveloffset=+3] +:override-title: Managing user profiles and namespaces +include::../tasks/kubeflow-user-profiles.adoc[leveloffset=+3] +:override-title: Upgrade notes +include::../references/kubeflow-upgrade-notes.adoc[leveloffset=+3] +:override-title: Known limitations +include::../references/kubeflow-limitations.adoc[leveloffset=+3] +:override-title: Troubleshooting +include::../references/kubeflow-troubleshooting.adoc[leveloffset=+3] include::../tasks/ai-library-apps-verifying.adoc[leveloffset=+2] diff --git a/concepts/AI-intro-how-works.adoc b/concepts/AI-intro-how-works.adoc index cc3bcc8..aa8b0cf 100644 --- a/concepts/AI-intro-how-works.adoc +++ b/concepts/AI-intro-how-works.adoc @@ -91,11 +91,14 @@ The {mcp}-to-OpenAPI proxy server provided by {owui}. link:https://pytorch.org/[{pytorch}]:: An open source machine learning framework. -link:https://mlflow.org/[{mlflow}]:: -An open source platform to manage the machine learning lifecycle, including experimentation, reproducibility, deployment and a central model registry. - link:https://qdrant.tech/[{qdrant}]:: A vector database and similarity search engine for storing, searching and managing high-dimensional vectors. link:https://docs.litellm.ai/docs/[{litellm}]:: -An open source LLM proxy and abstraction layer that lets you interact with many large language model providers through a single, OpenAI-compatible API. \ No newline at end of file +An open source LLM proxy and abstraction layer that lets you interact with many large language model providers through a single, OpenAI-compatible API. + +link:https://mlflow.org/[{mlflow}]:: +An open source platform to manage the machine learning lifecycle, including experimentation, reproducibility, deployment and a central model registry. 
+ +link:https://www.kubeflow.org/[{kubeflow}]:: +An end-to-end machine learning platform on {kube}, packaged as a single {helm} umbrella chart. diff --git a/references/kubeflow-configuration-scenarios.adoc b/references/kubeflow-configuration-scenarios.adoc new file mode 100644 index 0000000..faf48c6 --- /dev/null +++ b/references/kubeflow-configuration-scenarios.adoc @@ -0,0 +1,297 @@ +[#kubeflow-configuration-scenarios] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += {kubeflow} configuration scenarios +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +Following is a set of common configuration scenarios for deploying {kubeflow}. +They range from simple non-production setups for development and testing to production-ready configurations with TLS and automated DNS. +endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-30 +:page-revdate: {revdate} + +[#kubeflow-config-scenarios-nodeport] +== Non-production: NodePort access (zero-configuration) + +No values override file is needed. +Install with the default values and access via NodePort or port-forward as described in xref:kubeflow-accessing[]. + +Default credentials: +[source] +---- +Email: user@example.com +Password: 12341234 +---- + +[#kubeflow-config-scenarios-http] +== Non-production: Named host name over HTTP + +Use this if you want a stable URL for a shared development cluster. +You can point `/etc/hosts` at the cluster IP or use `external-dns` to automate DNS. 
+ +[source,yaml] +---- +# kubeflow-override-values.yaml +kubeflow-istio-resources: + hostname: "kubeflow.dev.example.com" + externalDNSEnabled: false +---- + +After the installation, obtain the cluster IP and add a local DNS entry: + +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get svc istio -n istio-system + +# /etc/hosts entry (on your local machine or in the cluster) +192.168.1.100 kubeflow.dev.example.com +---- + +Navigate to: http://kubeflow.dev.example.com. + +[#kubeflow-config-scenarios-self-signed-tls] +== Non-production: Self-signed TLS + +Suitable for shared development clusters where you can distribute the self-signed CA manually. +Requires {certmanager}, which was installed by the `runMe.sh` script in xref:kubeflow-installing-kubernetes[]. + +[source,yaml] +---- +# kubeflow-override-values.yaml +kubeflow-istio-resources: + hostname: "kubeflow.dev.example.com" + externalDNSEnabled: false + tls: + source: "selfSigned" + credentialName: kubeflow-gateway-tls + httpsRedirect: true +---- + +The chart creates a self-signed `ClusterIssuer` and requests a `Certificate` automatically. +No `kubectl` steps are required beyond the {helm} install. + +Add the self-signed CA to your Web browser trust store to avoid certificate warnings. + +[#kubeflow-config-scenarios-lets-encrypt] +== Production: Let's Encrypt TLS + external-dns + +Recommended for Internet-facing production deployments. +Uses DNS-01 challenge via Cloudflare, so HTTP-01 port requirements are avoided. +Requires a Cloudflare API token with DNS edit access. 
+ +[source,yaml] +---- +# kubeflow-override-values.yaml + +# Access +kubeflow-istio-resources: + hostname: "kubeflow.example.com" + externalDNSEnabled: true + tls: + source: "letsEncrypt" + credentialName: kubeflow-gateway-tls + httpsRedirect: true + letsEncrypt: + email: "admin@example.com" # your ACME account email + server: prod # prod | staging (use staging first to test) + solver: cloudflare # dns01 via Cloudflare + # Configuring cloudflare since solver is cloudflare + cloudflare: + email: "admin@example.com" + apiTokenSecretRef: + name: cloudflare-api-key + key: apiKey + +# external-dns — watches the {istio} Gateway and creates/updates DNS records +externaldns: + enabled: true + provider: + name: cloudflare + cloudflare: + apiToken: "" # chart creates the Secret automatically + domainFilters: + - "example.com" + txtOwnerId: "kubeflow" # unique per cluster — prevents conflicts + sources: + - istio-gateway + env: + - name: CF_API_TOKEN + valueFrom: + secretKeyRef: + name: cloudflare-api-key + key: apiKey + +# Credentials — change ALL of these +auth: + oidc: + clientSecret: "" + initialUser: + email: "admin@example.com" + +dex: + config: + staticClients: + - id: kubeflow-oidc-authservice + redirectURIs: + - /oauth2/callback + name: kubeflow-oidc-authservice + secret: "" # must match auth.oidc.clientSecret + staticPasswords: + - email: "admin@example.com" + # Generate: htpasswd -nbBC 12 "" 'YourPassword' | tr -d ':\n' | sed 's/$2y/$2a/' + hash: "" + username: admin + userID: "1" + enablePasswordDB: true + +# Storage credentials — change these +pipelines: + seaweedfs: + accessKey: "" + secretKey: "" + mariadb: + backup: + enabled: true # recommended for production + schedule: "0 2 * * *" + storageSize: 20Gi + +# User namespace must use the same SeaweedFS credentials +user-namespace: + pipelines: + seaweedfs: + accessKey: "" # same as pipelines.seaweedfs.accessKey + secretKey: "" # same as pipelines.seaweedfs.secretKey + +# Optional hardening +networkPolicies: + 
enabled: false # (Experimental) set to 'true' when using CNI that enforces NetworkPolicy (Calico, Cilium, Canal) + +monitoring: + enabled: false # Leave this to 'false' as monitoring is not implemented yet +---- + +Installation: + +[source,bash,subs="+attributes"] +---- +{prompt_user}helm upgrade --install kubeflow \ + oci://registry.suse.com/ai/charts/kubeflow \ + --version 0.3.1 \ + -n kubeflow \ + --force-conflicts \ + --wait --timeout 15m \ + -f kubeflow-override-values.yaml +---- + +[IMPORTANT] +.Using `staging` first is strongly recommended +==== +Let's Encrypt rate-limits production certificate issuance. +Test with `server: staging` until the certificate is issued, then switch to `server: prod` and run `helm upgrade` again. +==== + +[#kubeflow-config-scenarios-byo-cert] +== Production: Bring-your-own certificate + +Use this if your organization manages TLS certificates through an existing PKI or secret manager. +Create your TLS Secret in the `istio-system` namespace before installing: + +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl create secret tls my-kubeflow-tls \ + --cert=path/to/tls.crt \ + --key=path/to/tls.key \ + -n istio-system +---- + +Then reference it in your override values file: + +[source,yaml] +---- +# kubeflow-override-values.yaml +kubeflow-istio-resources: + hostname: "kubeflow.example.com" + externalDNSEnabled: false # manage DNS separately + tls: + source: "secret" + existingSecret: "my-kubeflow-tls" + httpsRedirect: true + +# Change credentials as shown in the Let's Encrypt scenario above +auth: + oidc: + clientSecret: "" +[...] # (rest of credentials) +---- + +== Production: Use an existing external-dns and cert-manager + +If the existing environment already has `externa-dns` and `cert-manager`, {kubeflow} can make use of them if the following conditions are satisfied: + +* `external-dns` must be configured to watch for the `istio-gateway` source. 
+You can check the deployment with the `kubectl` command: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get deployment external-dns -n external-dns -o yaml | grep source= + - --source=service + - --source=ingress + - --source=istio-gateway +---- + +* A cluster issuer must exist in the environment and be configured to issue certificates from a production public CA such as Let's Encrypt. +You can check the deployment with the `kubectl` command: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get clusterissuer + +NAME READY AGE +letsencrypt-production True 75m +---- ++ +To configure {kubeflow} to use existing `external-dns` and `cert-manager`, reference it in your values: + +[source,yaml] +---- +# kubeflow-override-values.yaml +kubeflow-istio-resources: + hostname: "kubeflow.example.com" + externalDNSEnabled: true + tls: + source: "issuerRef" + httpsRedirect: true + + issuerRef: + name: letsencrypt-production + +externaldns: + enabled: false + +# Change credentials as shown in the Let's Encrypt scenario above +auth: + oidc: + clientSecret: "" +[...] # (rest of credentials) +---- + +[NOTE] +.Verify {istio} CRDs installation +==== +When adding `istio-gateway` as a source to `external-dns`, make sure the {istio} CRDs are installed. +Otherwise, `external-dns` pod may keep crashing with an error indicating a failure to list the {istio} gateway resource. +However, the error will eventually resolve after {istio} is installed by {kubeflow}. +==== diff --git a/references/kubeflow-limitations.adoc b/references/kubeflow-limitations.adoc new file mode 100644 index 0000000..5391be5 --- /dev/null +++ b/references/kubeflow-limitations.adoc @@ -0,0 +1,64 @@ +[#kubeflow-limitations] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += Known {kubeflow} limitations +endif::[] +// overridden abstract? 
+ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +The current {kubeflow} distribution has the following known limitations. +Understanding these limitations is crucial for successful deployment and operation. +endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-31 +:page-revdate: {revdate} + +* *Namespace names*, such as `kubeflow`, `knative-serving` or `istio-system`, are *hardcoded* in most templates. +Deploying to a non-standard namespace requires template modifications. + +* *Argo workflow executor image pull secrets* must be set via the `workflow-controller-configmap` + `workflowDefaults` field. +They cannot be set via {helm} values at runtime because the configmap is rendered at pod-create time, not {helm} time. + +* *`knativeServing.enabled` must be `true`* for KServe to function. +*Disabling Knative Serving* will cause KServe InferenceService resources to remain in a *non-Ready state*. + +* Only the *Cloudflare provider* is supported for external-dns out of the box. +Other providers (AWS Route53, Azure, GCP) require additional `externaldns.env` configuration. + +* *{seaweedfs} is single-node, non-replicated.* +Pipeline artifact storage has no HA. A {seaweedfs} pod restart causes a brief (~10–60s) S3 outage. +For production HA, replace {seaweedfs} with an external S3-compatible store. + +* *Not all {kubeflow} controllers support multiple replicas.* +Leader election is confirmed for katib-controller (v0.17+), training-operator, kserve-controller-manager, pvcviewer-controller, and model-registry-controller. +These are safe to scale via `ha-overrides.yaml`. +It is not confirmed for notebook-controller, profiles-controller, tensorboard-controller, and several KFP background workers. +Setting `replicaCount > 1` for these controllers causes undefined behavior, such as duplicate reconciliation or data corruption. 
+ +* *CRDs are not automatically upgraded by `helm upgrade`.* {kubeflow} CRDs are placed in `crds/` subdirectories. +{helm} intentionally skips them on upgrade. +After a chart version bump, run: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}find charts -path '*/crds/*.yaml' | xargs -I{} kubectl apply -f {} +---- + +* *Dex must be v0.23.0 (dex 2.42.0).* +v0.24.0 (dex 2.44.0) uses Go 1.25, which introduced strict IPv6 URL parsing. +{kube} API server addresses like `[10.43.0.1]:443` are rejected, crashing Dex on startup. +The chart pins v0.23.0. + +* *{seaweedfs} image is pulled from {dhub}* (`chrislusf/seaweedfs:4.00`). +Air-gapped clusters or environments with {dhub} pull-rate limits will fail to start {kubeflow} Pipelines. Mirror the image to a private registry and set `pipelines.seaweedfs.image.registry` to override. diff --git a/references/kubeflow-troubleshooting.adoc b/references/kubeflow-troubleshooting.adoc new file mode 100644 index 0000000..e0f6d44 --- /dev/null +++ b/references/kubeflow-troubleshooting.adoc @@ -0,0 +1,154 @@ +[#kubeflow-troubleshooting] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += {kubeflow} troubleshooting +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +Learn about solutions for common issues encountered when using {kubeflow}. 
+endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-31 +:page-revdate: {revdate} + +[role=qanda] +Pods are not starting — too many open files:: +Run on each cluster node: ++ +[source,bash,subs="+attributes"] +---- +{prompt_sudo}sysctl -w fs.inotify.max_user_watches=524288 +{prompt_sudo}sysctl -w fs.inotify.max_user_instances=512 + +# Persist across reboots +{prompt_user}echo "fs.inotify.max_user_watches=524288" | \ + sudo tee -a /etc/sysctl.d/99-kubeflow.conf +{prompt_user}echo "fs.inotify.max_user_instances=512" | \ + sudo tee -a /etc/sysctl.d/99-kubeflow.conf +---- + +Pods are stuck pending — storage issues:: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get pvc -n kubeflow +{prompt_user}kubectl describe pvc -n kubeflow +---- ++ +Ensure the default StorageClass exists: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get storageclass +---- ++ +If none is marked as default, patch one: ++ +[source,bash,subs="+attributes"] +---- +kubectl patch storageclass \ + -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +---- + +`metadata-grpc` CrashLoopBackOff (MLMD):: +The `metadb` database is created by a post-install hook job. 
If the job failed, rerun it: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl delete job metadb-init -n kubeflow --ignore-not-found +{prompt_user}helm template kubeflow charts/kubeflow -n kubeflow \ + --show-only 'charts/pipelines/templates/Job/metadb-init-kubeflow-Job.yaml' \ + | kubectl apply -n kubeflow -f - +---- + +TensorBoard unavailable / controller CrashLoopBackOff:: ++ +-- +Check that the `tensorboard-controller-config` ConfigMap contains the following required keys: + +* `ISTIO_HOST: "*"` +* `ISTIO_GATEWAY: kubeflow/kubeflow-gateway` (must be in the `namespace/name` format) + +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get configmap -n kubeflow -l app=tensorboard-controller -o yaml | grep -A5 'data:' +---- +-- ++ +If either the key is wrong or missing, upgrade the chart with the following overrides: ++ +[source,bash,subs="+attributes"] +---- +tensorboard-controller: + configMapData: + ISTIO_HOST: "*" + ISTIO_GATEWAY: kubeflow/kubeflow-gateway +---- + +KServe InferenceService not progressing:: +Verify that the `ClusterStorageContainer` CRD and a default object exist: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get crd clusterstoragecontainers.serving.kserve.io +{prompt_user}kubectl get clusterstoragecontainer default +---- ++ +If either is missing, rerun `helm upgrade` to install them. + +"RBAC: access denied" for user namespace traffic:: +{ranchera} {istio} uses the `istio` ServiceAccount (not `istio-ingressgateway-service-account`). +The `rancher-ingressgateway-access` AuthorizationPolicy in each user namespace handles this. 
++ +If you pre-created the AuthorizationPolicy with `kubectl` before the first {helm} install, you need to add {helm} ownership labels/annotations before upgrading: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl label authorizationpolicy rancher-ingressgateway-access \ + -n kubeflow-user-example-com app.kubernetes.io/managed-by=Helm --overwrite +{prompt_user}kubectl annotate authorizationpolicy rancher-ingressgateway-access \ + -n kubeflow-user-example-com \ + meta.helm.sh/release-name=kubeflow \ + meta.helm.sh/release-namespace=kubeflow --overwrite +---- + +{seaweedfs} pod stuck on ContainerCreating:: +On K3s 1.34 with cri-dockerd ({ranchera} Desktop), a race between the CNI and the container runtime can leave the {seaweedfs} pod stuck. +{seaweedfs} acquires a LevelDB file lock on startup. +If the pod is stuck, the lock is held and the next pod will also fail to start. +To recover, run the following commands: ++ +[source,bash,subs="+attributes"] +---- +# Find the Docker container ID for the stuck pod +{prompt_user}docker ps | grep seaweedfs + +# Release the lock +{prompt_user}docker kill + +# Delete the stuck pod — a new pod will start cleanly +{prompt_user}kubectl delete pod -n kubeflow -l app=seaweedfs +---- + +Dex login loop (infinite redirect):: +This usually means that oauth2-proxy is receiving a 403 from an Istio AuthorizationPolicy rather than from oauth2-proxy itself. +The Lua-redirect filter only converts 403 → 302 when the response includes a `set-cookie` header; AuthorizationPolicy 403 messages do not have one. 
++ +Check for sidecar-level AuthorizationPolicy denials: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl logs -n istio-system -l app=istiod | grep "RBAC" +{prompt_user}kubectl get authorizationpolicy -A +---- diff --git a/references/kubeflow-upgrade-notes.adoc b/references/kubeflow-upgrade-notes.adoc new file mode 100644 index 0000000..5101c3c --- /dev/null +++ b/references/kubeflow-upgrade-notes.adoc @@ -0,0 +1,94 @@ +[#kubeflow-upgrade-notes] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += {kubeflow} upgrade notes +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +Following are important considerations and specific instructions for managing and upgrading your {kubeflow} deployment. +They cover critical aspects like configuration conflicts, credential rotation, and potential service interruptions. +endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-31 +:page-revdate: {revdate} + +Always use `--force-conflicts`:: +The `--force-conflicts` flag is required for `helm upgrade`. +Certain components, such as `cert-manager-cainjector` and `istiod`, modify fields that {helm} also manages. +This flag tells {helm} to overwrite these external changes and reclaim ownership. ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}helm upgrade kubeflow \ + oci://registry.suse.com/ai/charts/kubeflow \ + --version 0.3.1 \ + -n kubeflow --force-conflicts --wait --timeout 15m +---- +Switching database engines or changing StorageClass:: +PVCs with the option `helm.sh/resource-policy: keep` are not deleted by {helm}. 
+When switching database images or StorageClasses, delete the PVC manually first: ++ +[source,bash,subs="+attributes"] +---- +# For KFP MariaDB (Rancher MariaDB StatefulSet) +{prompt_user}kubectl delete pvc -n kubeflow data-mysql-0 + +# For Katib MariaDB (standalone Deployment, not StatefulSet) +{prompt_user}kubectl delete pvc -n kubeflow katib-mysql +---- + +Training operator webhook secret migration (auto):: +On upgrade from chart versions prior to 0.3.0, a pre-upgrade hook automatically migrates the `training-operator-webhook-cert` Secret from type `kubernetes.io/tls` to `Opaque`. +No manual action is required. +If the `kubectl get jobs -n kubeflow` command reports a hook failure, delete the secret manually and rerun `helm upgrade`: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl delete secret training-operator-webhook-cert -n kubeflow --ignore-not-found +{prompt_user}helm upgrade kubeflow charts/kubeflow -n kubeflow --force-conflicts --wait --timeout 15m +---- + +Credential rotation ({seaweedfs}):: +Changing `pipelines.seaweedfs.accessKey` and `secretKey` requires restarting all KFP Deployments that read those credentials. +Pods do not automatically restart when a Secret changes: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl rollout restart deployment -n kubeflow \ + ml-pipeline ml-pipeline-ui ml-pipeline-persistenceagent ml-pipeline-scheduledworkflow +{prompt_user}kubectl rollout restart deployment -n kubeflow-user-example-com \ + ml-pipeline-ui-artifact +---- ++ +{seaweedfs} IAM accumulates credentials across restarts (the `postStart` hook adds, never removes). 
+To clean stale entries after a rotation: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl exec -n kubeflow deploy/seaweedfs -- \ + sh -c "printf 's3.configure -user -access_key -delete -apply\n' \ + | weed shell -master 127.0.0.1:9333" +---- + +{seaweedfs} upgrade downtime:: +{seaweedfs} uses a `Recreate` deployment strategy (single-node S3 store backed by a PVC). +During `helm upgrade`, the old pod is terminated before the new pod starts. +Expect a brief window of 10s to 60s where S3 artifact uploads and downloads are unavailable. +In-flight pipeline runs may stall. ++ +We recommend temporarily disabling active pipeline runs in the GUI before upgrading. + +Supply the full values file on upgrade:: +Running `helm upgrade --reuse-values` does not update the `user-namespace` Secret with new {seaweedfs} credentials. +Always supply the full values file on upgrade, or patch the Secret manually after upgrade. diff --git a/tasks/AI-deployment-ailibrary-installing.adoc b/tasks/AI-deployment-ailibrary-installing.adoc index 92e26bf..c4d20ea 100644 --- a/tasks/AI-deployment-ailibrary-installing.adoc +++ b/tasks/AI-deployment-ailibrary-installing.adoc @@ -134,7 +134,8 @@ ifeval::["{PROF_DEPLOYMENT}" == "standard"] .. Install {vllm} as described in xref:vllm-installing[]. .. Install {mcpo} as described in xref:mcpo-installing[]. .. Install {pytorch} as described in xref:pytorch-installing[]. -.. Install {mlflow} as described in xref:mlflow-installing[]. .. Install {qdrant} as described in xref:qdrant-installing[]. .. Install {litellm} as described in xref:litellm-installing[]. +.. Install {mlflow} as described in xref:mlflow-installing[]. +.. Install {kubeflow} as described in xref:kubeflow-installing[]. 
endif::[] diff --git a/tasks/kubeflow-accessing.adoc b/tasks/kubeflow-accessing.adoc new file mode 100644 index 0000000..63f4126 --- /dev/null +++ b/tasks/kubeflow-accessing.adoc @@ -0,0 +1,87 @@ +[#kubeflow-accessing] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += Accessing {kubeflow} +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +Once {kubeflow} is deployed, the next step is choosing how to access its Web interface. +The right method depends on your environment — whether you are running a local cluster, working inside a private network, or exposing {kubeflow} externally. + +The following options cover common access patterns, from simple local port forwarding to fully configured external endpoints with custom host names and TLS. +endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-27 +:page-revdate: {revdate} + +[#kubeflow-accessing-portforward] +== Option A: Port-forward (local dev) + +If NodePorts are not directly reachable, use port-forward instead: + +[source,bash,subs="+attributes"] +---- +# Run in a separate terminal and keep it open +{prompt_user}kubectl port-forward svc/istio -n istio-system 8080:80 +---- + +Navigate to \http://localhost:8080. 
+ +[#kubeflow-accessing-nodeport] +== Option B: NodePort (standard Linux cluster) + +The {istio} gateway Service is of type `LoadBalancer` and always has NodePorts assigned, even without a load balancer controller: + +[source,bash] +---- +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}') +HTTP_PORT=$(kubectl get svc istio -n istio-system -o jsonpath='{.spec.ports[?(@.port==80)].nodePort}') +echo "http://${NODE_IP}:${HTTP_PORT}" +---- + +[#kubeflow-accessing-external-ip] +== Option C: LoadBalancer external IP + +If your cluster has MetalLB or a cloud load-balancer controller, the Service receives an external IP: + +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl get svc istio -n istio-system # wait for EXTERNAL-IP +---- + +Navigate to \http://. + +[TIP] +==== +No Helm values changes are needed for options A, B, or C. +==== + +### Option D: Named host name (HTTP) + +Set a host name to restrict the gateway to a specific FQDN. +TLS is not required. + +See xref:kubeflow-config-scenarios-http[] for override values. + +### Option E — Named host name with TLS (HTTPS) + +See xref:kubeflow-config-scenarios-self-signed-tls[] or xref:kubeflow-config-scenarios-lets-encrypt[] for override values. + +[NOTE] +.`external-dns` is required for the Let's Encrypt issuer +==== +When using Let's Encrypt as the issuer for Gateway TLS certificates, `external-dns` is required for DNS-01 challenges. +Furthermore, a load balancer (i.e. MetalLB) must be used with `external-dns`. +This ensures that `external-dns` can properly obtain an external IP from the load balancer to create the DNS record. +==== diff --git a/tasks/kubeflow-hardening.adoc b/tasks/kubeflow-hardening.adoc new file mode 100644 index 0000000..12ec9d6 --- /dev/null +++ b/tasks/kubeflow-hardening.adoc @@ -0,0 +1,168 @@ +[#kubeflow-hardening-production] +include::../common/generic-attributes.adoc[] +// overridden title? 
+ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += Hardening {kubeflow} for production +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +The override value chart ships with defaults that are not suitable for production. +Update these values before exposing the deployment to any network or storing sensitive data. +endif::[] + +// erase the flag for future overrides +:override-abstract!: +:override-title!: + +:revdate: 2026-03-30 +:page-revdate: {revdate} + +[WARNING] +.Demo credentials +==== +By default (`global.demoMode: false`), the chart *fails at render time* with a security error if any well-known demo credential is still present. +To suppress this during local development, set `global.demoMode: true` in your override values file but *never set this in production*. +==== + +. *Change all default credentials.* +The credentials that trigger the render-time security check: ++ +[source,yaml] +---- +# in your kubeflow-override-values.yaml +auth: + oidc: + clientSecret: "" + cookieSecret: "" # generate: openssl rand -base64 32 + +dex: + config: + staticClients: + - id: kubeflow-oidc-authservice + redirectURIs: + - /oauth2/callback + name: kubeflow-oidc-authservice + secret: "" # must match auth.oidc.clientSecret above + staticPasswords: + - email: "admin@yourcompany.com" + # Generate: htpasswd -nbBC 12 "" 'YourPassword' | tr -d ':\n' | sed 's/$2y/$2a/' + hash: "" + username: admin + userID: "1" + enablePasswordDB: true + +pipelines: + seaweedfs: + accessKey: "" + secretKey: "" + +user-namespace: + pipelines: + seaweedfs: + accessKey: "" # must match pipelines.seaweedfs.accessKey + secretKey: "" # must match pipelines.seaweedfs.secretKey +---- ++ +[NOTE] +.{mariadb} {rootuser} password +==== +Both the KFP and Katib {mysql} secrets are *auto-generated* (24-char random password) on the first install. +They are preserved across upgrades with no action required. 
+To rotate them, delete the secret and run `helm upgrade` to regenerate: + +[source,bash,subs="+attributes"] +---- +{prompt_user}kubectl delete secret mysql-secret -n kubeflow # for KFP +{prompt_user}kubectl delete secret katib-mysql-secrets -n kubeflow # for Katib +{prompt_user}helm upgrade kubeflow . -f kubeflow-override-values.yaml -n kubeflow +---- +==== + +. *Use an external identity provider.* +Replace Dex static passwords with an LDAP, SAML, or upstream OIDC connector. +Add a `connectors` block to `dex.config` and remove `staticPasswords` and `enablePasswordDB: true`. + +. *NetworkPolicies.* +_NetworkPolicies_ are *disabled by default*. +They use an 'ingress-only deny-by-default' model where egress is unrestricted so that components can reach external services such as {huggingface} and container registries. +Such configurations are supported by Calico, Cilium, Canal, and any other CNI that enforces NetworkPolicy. +If your CNI does not enforce NetworkPolicy, enable it: ++ +[source,yaml] +---- +# in your kubeflow-override-values.yaml +networkPolicies: + enabled: true +---- + +. *Enable TLS.* +Refer to xref:kubeflow-config-scenarios-lets-encrypt[] or xref:kubeflow-config-scenarios-byo-cert[] for more details. + +. *Enable database backups.* ++ +[source,yaml] +---- +# in your kubeflow-override-values.yaml +pipelines: + mariadb: + backup: + enabled: true + schedule: "0 2 * * *" # daily at 02:00 UTC + storageSize: 20Gi +---- ++ +To restore the backup: ++ +[source,bash,subs="+attributes"] +---- +# List available backups +{prompt_user}kubectl exec -n kubeflow sts/mysql -- ls /backup/ + +# Restore +{prompt_user}kubectl exec -n kubeflow sts/mysql -- \ + sh -c "mariadb --ssl=false -u root < /backup/.sql" +---- + +. 
*Enable pre-install validation.* ++ +[source,yaml] +---- +# in your kubeflow-override-values.yaml +preflightChecks: + enabled: true +---- ++ +Runs a hook job before the installation that validates that the default `StorageClass` exists and that {certmanager} CRDs are registered. + +. *Enable High Availability.* +Apply `ha-overrides.yaml` (provided in the repository) on top of your base values to scale the Katib controller, training-operator, and KServe controller to 2 replicas. +KFP and Dex PodDisruptionBudgets are already enabled by default. ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}helm upgrade kubeflow . -f kubeflow-override-values.yaml -f ha-overrides.yaml -n kubeflow +---- ++ +PDBs protect against voluntary disruptions (node drains) but only provide meaningful coverage with 2 or more replicas. +With a single replica, the PDB allows full eviction. +See xref:kubeflow-limitations[Known limitations] for supported HA controllers. + +. *Apply resource quotas per user namespace.* ++ +[source,yaml] +---- +# in your kubeflow-override-values.yaml +additionalUsers: + - email: alice@example.com + namespace: alice + resourceQuota: + requests.cpu: "4" + requests.memory: "8Gi" + requests.nvidia.com/gpu: "1" +---- diff --git a/tasks/kubeflow-installing.adoc b/tasks/kubeflow-installing.adoc new file mode 100644 index 0000000..1f3b4b6 --- /dev/null +++ b/tasks/kubeflow-installing.adoc @@ -0,0 +1,253 @@ +[#kubeflow-installing] +include::../common/generic-attributes.adoc[] +// overridden title? +ifdef::override-title[] += {override-title} +endif::[] +ifndef::override-title[] += Installing {kubeflow} +endif::[] +// overridden abstract? +ifdef::override-abstract[] +{override-abstract} +endif::[] +ifndef::override-abstract[] +{kubeflow} is an end-to-end machine learning platform on {kube}, packaged as a single {helm} umbrella chart. +It targets {ranchera} / {rke2a} clusters using charts and containers from the {suse} {ailibrary}. 
+endif::[]
+
+// erase the flag for future overrides
+:override-abstract!:
+:override-title!:
+
+:revdate: 2026-03-26
+:page-revdate: {revdate}
+
+[#kubeflow-installing-app-details]
+== Details about the application
+
+Before deploying {kubeflow}, it is important to know more about the supported configurations and documentation.
+The following command provides the corresponding details:
+
+[source,subs="+attributes"]
+----
+{prompt_user}helm show values oci://dp.apps.rancher.io/charts/kubeflow
+----
+
+Alternatively, you can also refer to the {kubeflow} {helm} chart page on the {sappco} site at link:https://apps.rancher.io/applications/kubeflow[].
+It contains {kubeflow} dependencies, available versions and the link to pull the {kubeflow} container image.
+
+[TIP]
+.{kubeflow} repository
+====
+Referenced files, such as the `runMe.sh` script, are located in the link:https://github.com/SUSE/suse-ai-charts/tree/main/kubeflow[{kubeflow} repository].
+====
+
+[#kubeflow-installing-requirements]
+== Prerequisites
+
+|===
+| Requirement | Version | Notes
+
+| {kube}
+| >= 1.30
+| Tested on {rke2a} / K3s
+
+| {helm}
+| >= 4.0
+| Required for OCI chart support
+
+| {certmanager}
+| 1.19.3
+| Installed by the `runMe.sh` script or pre-installed manually
+
+| {istio}
+| 1.1.3
+| Installed by the `runMe.sh` script or pre-installed manually
+
+| Default StorageClass
+| —
+| Local Path Provisioner (development) or Longhorn (production)
+
+| {sappco} credentials
+| —
+| User name + token for `dp.apps.rancher.io` (application-collection secret)
+
+| {sregistry} credentials
+| —
+| User name + token for `registry.suse.com` (suse-ai-registry secret) — required when sub-charts pull from {sregistry}
+|===
+
+Storage class::
+All PVCs use the cluster default StorageClass unless `global.storageClass` is set.
+Most components only require RWO (ReadWriteOnce).
+The exception is the Katib PBT hyperparameter tuning algorithm, which requires RWX (ReadWriteMany) to share model checkpoints across trial pods.
+For single-node clusters, the Local Path Provisioner is sufficient for development (PBT will only work if all trial pods are scheduled on the same node).
+For production, Longhorn is recommended as it supports both RWO and RWX.
+
+Cloudflare API token::
+Only required if you want automatic DNS record management via `external-dns` or Let's Encrypt DNS-01 challenges.
+Not needed for basic installs.
+
+[NOTE]
+====
+We recommend using the latest versions of the {helm} charts.
+====
+
+[#kubeflow-installing-kubernetes]
+== Installation procedure
+
+include::../snippets/ai-library-requirement.adoc[]
+
+. Log in to the {sappco} registry.
++
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm registry login dp.apps.rancher.io \
+  --username=USERNAME \
+  --password=TOKEN
+----
+
+. Log in to {sregistry}.
++
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm registry login registry.suse.com \
+  --username=regcode \
+  --password=REGISTRATION_CODE
+----
+
+. Create namespaces.
++
+[source,bash]
+----
+for ns in cert-manager istio-system kubeflow kubeflow-user-example-com; do
+  kubectl create namespace "$ns" --dry-run=client -o yaml | kubectl apply -f -
+done
+----
+
+. Create {sappco} image pull secrets.
++
+[source,bash]
+----
+for ns in cert-manager istio-system kubeflow kubeflow-user-example-com; do
+  kubectl create secret docker-registry application-collection \
+    --docker-server=dp.apps.rancher.io \
+    --docker-username=USERNAME \
+    --docker-password=TOKEN \
+    -n "$ns" \
+    --dry-run=client -o yaml | kubectl apply -f -
+done
+----
+
+. Create {sregistry} image pull secrets.
++
+[source,bash]
+----
+for ns in cert-manager istio-system kubeflow kubeflow-user-example-com; do
+  kubectl create secret docker-registry suse-ai-registry \
+    --docker-server=registry.suse.com \
+    --docker-username=regcode \
+    --docker-password=REGISTRATION_CODE \
+    -n "$ns" \
+    --dry-run=client -o yaml | kubectl apply -f -
+done
+----
+
+. Label namespaces for {helm}.
++
+[source,bash]
+----
+for ns in kubeflow kubeflow-user-example-com; do
+  kubectl label namespace "$ns" app.kubernetes.io/managed-by=Helm --overwrite
+  kubectl annotate namespace "$ns" \
+    meta.helm.sh/release-name=kubeflow \
+    meta.helm.sh/release-namespace=kubeflow \
+    --overwrite
+done
+----
+
+. Install {certmanager}.
++
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm upgrade --install cert-manager oci://dp.apps.rancher.io/charts/cert-manager \
+  --version 1.19.3 \
+  --namespace cert-manager \
+  --set crds.enabled=true \
+  --set crds.keep=true \
+  --set global.imagePullSecrets[0].name=application-collection \
+  --wait --timeout 5m
+----
+
+. Install {istio} (required by {kubeflow}).
++
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm upgrade --install istio oci://dp.apps.rancher.io/charts/istio \
+  --version 1.1.3 \
+  --namespace istio-system \
+  --set global.imagePullSecrets[0].name=application-collection \
+  --set gateway.enabled=true \
+  --force-conflicts \
+  --server-side=true \
+  --wait --timeout 5m
+----
+
+. Install {kubeflow}. 
++ +Install directly from the OCI registry (no source checkout required): ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}helm upgrade --install kubeflow \ + oci://registry.suse.com/ai/charts/kubeflow \ + --version 0.3.1 \ + -n kubeflow \ + --force-conflicts \ <.> + --server-side=true \ + --wait --timeout 15m \ + -f my-values.yaml +---- +<.> `--force-conflicts` is required because cert-manager-cainjector, istiod (pilot-discovery), and the clusterrole-aggregation-controller modify fields (caBundle, webhook failurePolicy, aggregated RBAC rules) that {helm} tracks. +This flag lets {helm} reclaim ownership of those fields on each upgrade. + ++ +To apply a values override file, for example the `demo-overrides.yaml` provided in the link:https://github.com/SUSE/suse-ai-charts/tree/main/kubeflow[repo], run the following command. +**Never use `demo-overrides.yaml` in production**: ++ +[source,bash,subs="+attributes"] +---- +{prompt_user}helm upgrade --install kubeflow \ + oci://registry.suse.com/ai/charts/kubeflow \ + --version 0.3.1 \ + -n kubeflow \ + --force-conflicts \ + --server-side=true \ + --wait --timeout 15m \ + -f demo-overrides.yaml +---- + +ifdef::deployment_standard[] +[#kubeflow-upgrading] +== Upgrading {kubeflow} + +You can upgrade {kubeflow} to a specific version by running the following command: + +[source,subs="+attributes"] +---- +{prompt_user}helm upgrade kubeflow \ + oci://registry.suse.com/ai/charts/kubeflow \ + -n kubeflow \ + --version VERSION_NUMBER \ + --wait --timeout 15m \ + --force-conflicts <.> +---- +<.> Always use the `--force-conflicts` option during upgrades. +This flag lets {helm} reclaim ownership of modified fields on each upgrade. + +[TIP] +==== +If you omit the `--version` option, {kubeflow} gets upgraded to the latest available version. 
+====
+endif::[]
diff --git a/tasks/kubeflow-user-profiles.adoc b/tasks/kubeflow-user-profiles.adoc
new file mode 100644
index 0000000..9d640fc
--- /dev/null
+++ b/tasks/kubeflow-user-profiles.adoc
@@ -0,0 +1,93 @@
+[#kubeflow-user-profiles]
+include::../common/generic-attributes.adoc[]
+// overridden title?
+ifdef::override-title[]
+= {override-title}
+endif::[]
+ifndef::override-title[]
+= Managing {kubeflow} user profiles and namespaces
+endif::[]
+// overridden abstract?
+ifdef::override-abstract[]
+{override-abstract}
+endif::[]
+ifndef::override-abstract[]
+{kubeflow} uses a profile-per-user model.
+The default user namespace that {kubeflow} creates during the installation is `kubeflow-user-example-com`.
+endif::[]
+
+// erase the flag for future overrides
+:override-abstract!:
+:override-title!:
+
+:revdate: 2026-03-30
+:page-revdate: {revdate}
+
+[#kubeflow-user-profile-at-install]
+== Adding users during installation
+
+Insert or update the following snippet in your `kubeflow-override-values.yaml` file:
+
+[source,yaml]
+----
+# in your kubeflow-override-values.yaml
+user-namespace:
+  additionalUsers:
+    - email: tux@example.com
+      namespace: tux
+      resourceQuota:
+        requests.cpu: "4"
+        requests.memory: 8Gi
+    - email: geeko@example.com
+      namespace: geeko
+----
+
+[IMPORTANT]
+.Use an explicit namespace
+====
+The `namespace` field is optional but strongly recommended.
+Without it, the namespace is auto-generated from the e-mail by replacing `@` with `--` and `.` with `-`.
+E-mails that differ only by `.` vs `-` -- such as `tux.geeko@example.com` and `tux-geeko@example.com` -- produce the same auto-generated namespace.
+Use an explicit `namespace` to disambiguate.
+====
+
+[#kubeflow-user-profile-after-install]
+== Adding users after installation
+
+Pass the new user with `--set` flags and run `helm upgrade`:
+
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm upgrade kubeflow oci://registry.suse.com/ai/charts/kubeflow \
+  --version VERSION_NUMBER \
+  -n kubeflow --reuse-values \
+  --set "user-namespace.additionalUsers[0].email=tux@example.com" \
+  --set "user-namespace.additionalUsers[0].namespace=tux"
+----
+
+Alternatively, add the user to your values file and rerun the upgrade:
+
+[source,yaml]
+----
+# in your kubeflow-override-values.yaml
+user-namespace:
+  additionalUsers:
+    - email: tux@example.com
+      namespace: tux
+----
+
+[source,bash,subs="+attributes"]
+----
+{prompt_user}helm upgrade kubeflow oci://registry.suse.com/ai/charts/kubeflow \
+  --version VERSION_NUMBER \
+  -n kubeflow -f kubeflow-override-values.yaml
+----
+
+This creates the Profile CR and deploys all required KFP per-namespace resources (pipeline artifact server, visualization server, credentials, authorization policies) in a single step.
+
+[NOTE]
+====
+Do not add users by applying a `Profile` CR directly with `kubectl`.
+The profiles controller only creates namespace-level RBAC -- it does not deploy the KFP per-namespace resources that pipelines depend on.
+Users added this way will have an incomplete environment and the pipeline runs will fail.
+====
diff --git a/tasks/litellm-installing.adoc b/tasks/litellm-installing.adoc
index f389e96..8ef8edf 100644
--- a/tasks/litellm-installing.adoc
+++ b/tasks/litellm-installing.adoc
@@ -65,6 +65,7 @@ include::../snippets/ai-library-requirement.adoc[]
 -f litellm_custom_overrides.yaml
 ----
 
+ifdef::deployment_standard[]
 [#litellm-upgrading]
 == Upgrading {litellm}
 
@@ -83,6 +84,7 @@ You can upgrade {litellm} to a specific version by running the following command
 ====
 If you omit the `--version` option, {litellm} gets upgraded to the latest available version.
 ====
+endif::[]
 
 [#litellm-uninstalling]
 == Uninstalling {litellm}