From dbd7f42bf20ee2eec35e11fcba12e812de7e2f3e Mon Sep 17 00:00:00 2001 From: "Kratika.Patidar" Date: Sun, 31 May 2026 19:19:36 +0000 Subject: [PATCH 1/4] telemetry rollback and upgrade changes Signed-off-by: Kratika.Patidar --- rollback/playbooks/rollback_telemetry.yml | 28 +++-- rollback/roles/rollback_k8s/tasks/main.yml | 4 + .../tasks/verify_telemetry_rollback.yml | 104 ++++++++++++++++++ .../roles/upgrade_k8s/tasks/step_uncordon.yml | 12 ++ .../tasks/backup_telemetry.yml | 4 +- .../tasks/execute_telemetry_sh.yml | 55 +++++---- upgrade/roles/upgrade_telemetry/vars/main.yml | 14 +++ 7 files changed, 185 insertions(+), 36 deletions(-) create mode 100644 rollback/roles/rollback_k8s/tasks/verify_telemetry_rollback.yml diff --git a/rollback/playbooks/rollback_telemetry.yml b/rollback/playbooks/rollback_telemetry.yml index 5eeda1ed58..4c66cbb8dc 100644 --- a/rollback/playbooks/rollback_telemetry.yml +++ b/rollback/playbooks/rollback_telemetry.yml @@ -68,17 +68,25 @@ ansible.builtin.debug: msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress" - # TODO: Implement telemetry rollback steps per ESpec §4.8.5: - # 1. Helm uninstall new components (powerscale, vast, victorialogs, ufm) - # 2. Rollback Strimzi operator + Kafka brokers to previous version - # 3. Rollback VictoriaMetrics StatefulSet(s) to previous version - # 4. Rollback iDRAC telemetry receiver + pump images - # 5. Restore LDMS sampler/aggregator configs from backup - # 6. Rolling restart LDMS pods - # 7. Validate: all telemetry pods Running, metrics/logs flowing - - name: Telemetry rollback placeholder + # ── Telemetry rollback strategy ────────────────────────────────── + # The K8s rollback (etcd snapshot restore) restores ALL Kubernetes objects + # to their pre-upgrade (2.1) state, which includes all telemetry namespace + # resources: Deployments, StatefulSets, Services, ConfigMaps, Secrets, + # PVCs, Helm release secrets, and CRDs. 
+ # + # 2.2-only components (vector-ldms, vector-ome, victoria-logs cluster, + # victoria-metrics-operator CRDs, vlagent-vector, vmagent-vector) are + # automatically removed because they did not exist in the etcd snapshot. + # + # Post-K8s-rollback telemetry pod verification is performed by + # verify_telemetry_rollback.yml (Stage 8d) inside the rollback_k8s role. + - name: "Telemetry rollback — handled by K8s rollback (etcd restore)" ansible.builtin.debug: - msg: "Telemetry rollback tasks to be implemented (Helm uninstall, component rollback)" + msg: + - "Telemetry rollback is handled by K8s rollback via etcd snapshot restore." + - "The etcd snapshot contains the full 2.1 telemetry namespace state." + - "2.2-only components will be removed automatically." + - "Post-rollback telemetry pod verification runs in K8s rollback Stage 8d." - name: Mark telemetry rollback as completed ansible.builtin.copy: diff --git a/rollback/roles/rollback_k8s/tasks/main.yml b/rollback/roles/rollback_k8s/tasks/main.yml index 196b413c3b..1ff98d18f3 100644 --- a/rollback/roles/rollback_k8s/tasks/main.yml +++ b/rollback/roles/rollback_k8s/tasks/main.yml @@ -197,6 +197,10 @@ - name: "Stage 8c — Clean up stale CSI VolumeAttachments" ansible.builtin.include_tasks: cleanup_stale_volume_attachments.yml + # ── Stage 8d: Verify telemetry pods after etcd restore ─────── + - name: "Stage 8d — Verify telemetry rollback" + ansible.builtin.include_tasks: verify_telemetry_rollback.yml + # ── Stage 9: Restore BSS boot params and cloud-init ────────── - name: "Stage 9 — Restore BSS boot params and cloud-init" ansible.builtin.include_tasks: restore_bss_cloud_init.yml diff --git a/rollback/roles/rollback_k8s/tasks/verify_telemetry_rollback.yml b/rollback/roles/rollback_k8s/tasks/verify_telemetry_rollback.yml new file mode 100644 index 0000000000..f7461f381c --- /dev/null +++ b/rollback/roles/rollback_k8s/tasks/verify_telemetry_rollback.yml @@ -0,0 +1,104 @@ +# Copyright 2026 Dell Inc. or its subsidiaries. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# ============================================================================ +# Verify Telemetry Rollback (Post-K8s-Rollback) +# ============================================================================ +# After etcd restore, the 2.1 telemetry objects are restored and 2.2-only +# objects are removed. This task verifies: +# 1. All telemetry pods are displayed with their status +# 2. No 2.2-only components remain (vector-ldms, vector-ome, victoria-logs, +# victoria-metrics-operator, vlagent-vector, vmagent-vector) +# 3. 
Pod readiness summary +# +# Prerequisites (guaranteed by rollback_k8s/main.yml execution order): +# - kube_vip: set in load_version_vars.yml, added to inventory in +# load_rollback_status.yml, verified reachable in post_validation.yml +# ============================================================================ + +# ── Get all telemetry pods ──────────────────────────────────────────── +- name: "Telemetry rollback verification — Get all pods in telemetry namespace" + ansible.builtin.command: + cmd: kubectl get pods -n telemetry -o wide + delegate_to: "{{ kube_vip }}" + connection: ssh + register: telemetry_pods_result + changed_when: false + failed_when: false + +- name: "Display telemetry pods after K8s rollback" + ansible.builtin.debug: + msg: "{{ telemetry_pods_result.stdout_lines | default(['No pods found in telemetry namespace']) }}" + +# ── Check for stale 2.2-only components ─────────────────────────────── +- name: "Check for 2.2-only components that should have been removed by etcd restore" + ansible.builtin.command: + cmd: >- + kubectl get deploy,sts -n telemetry --no-headers + -o custom-columns='KIND:.kind,NAME:.metadata.name' + delegate_to: "{{ kube_vip }}" + connection: ssh + register: telemetry_resources_result + changed_when: false + failed_when: false + +- name: "Identify any 2.2-only telemetry components still present" + ansible.builtin.set_fact: + stale_22_components: >- + {{ telemetry_resources_result.stdout_lines | default([]) + | select('search', + 'vector-ldms|vector-ome|victoria-metrics-operator|vlagent-vector|vmagent-vector|vlstorage|vlinsert|vlselect|vlagent-vlagent|vmstorage-victoria|vmselect-victoria|vminsert-victoria|vmagent-vmagent') + | list }} + +- name: "Display 2.2-only component cleanup status" + ansible.builtin.debug: + msg: >- + {% if stale_22_components | length == 0 -%} + All 2.2-only components removed successfully by etcd restore.{%- else -%} + WARNING: {{ stale_22_components | length }} stale 2.2 component(s) still present: 
{{ stale_22_components | join(', ') }}{%- endif %} + +# ── Pod readiness summary ───────────────────────────────────────────── +- name: "Check telemetry pod readiness summary" + ansible.builtin.shell: + cmd: | + set -o pipefail # grep -c prints its own "0" and exits 1 on no match, so use "|| true" (not "|| echo 0") to avoid a doubled "0" + total=$(kubectl get pods -n telemetry --no-headers 2>/dev/null | grep -cv 'Completed' || true) + running=$(kubectl get pods -n telemetry --no-headers 2>/dev/null | grep -c 'Running' || true) + not_ready=$(kubectl get pods -n telemetry --no-headers 2>/dev/null | grep -v 'Running\|Completed' || echo "") + echo "TOTAL=${total}" + echo "RUNNING=${running}" + if [ -n "${not_ready}" ]; then + echo "NOT_READY:" + echo "${not_ready}" + fi + executable: /bin/bash + delegate_to: "{{ kube_vip }}" + connection: ssh + register: telemetry_summary + changed_when: false + failed_when: false + +- name: "Display telemetry pod readiness summary" + ansible.builtin.debug: + msg: "{{ telemetry_summary.stdout_lines | default(['Could not retrieve pod summary']) }}" + +- name: "Warn if any telemetry pods are not Running" + ansible.builtin.debug: + msg: >- + WARNING: Some telemetry pods are not in Running state after K8s rollback. + This may be transient — pods may need time to start after etcd restore. + If pods remain unhealthy after 5-10 minutes, manual investigation is needed. + when: + - telemetry_pods_result.stdout is defined + - telemetry_pods_result.stdout | regex_search('CrashLoopBackOff|Error|ImagePullBackOff|Pending|ContainerCreating') is not none diff --git a/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml b/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml index 4291804071..b0f26c8256 100644 --- a/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml +++ b/upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml @@ -12,8 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- +- name: Wait for API server to be reachable before uncordon + ansible.builtin.command: kubectl get --raw /healthz + delegate_to: "{{ kube_vip }}" + register: api_health + changed_when: false + retries: 30 + delay: 10 + until: api_health.rc == 0 + - name: Uncordon node {{ current_node_name }} ansible.builtin.command: kubectl uncordon {{ node_ip }} delegate_to: "{{ kube_vip }}" register: uncordon_result changed_when: true + retries: 5 + delay: 10 + until: uncordon_result.rc == 0 diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml index a4f132845b..e3ef517a86 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml @@ -72,10 +72,10 @@ failed_when: false - name: Backup telemetry.sh from control plane - ansible.builtin.fetch: + ansible.builtin.copy: src: /root/telemetry.sh dest: "{{ tel_backup_dir }}/telemetry.sh" - flat: true + mode: '0644' delegate_to: "{{ kube_vip }}" connection: ssh when: diff --git a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml index ca6593fd24..03c8b24ea6 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml @@ -34,9 +34,7 @@ - name: Fail if telemetry.sh is missing ansible.builtin.fail: - msg: >- - telemetry.sh not found at {{ telemetry_script_path }} on kube_vip. - Ensure the provision playbook has generated the telemetry deployment script. + msg: "{{ telemetry_sh_missing_msg }}" when: not (telemetry_sh_stat.stat.exists | default(false)) - name: Verify kustomization.yaml exists in deployments directory @@ -48,9 +46,7 @@ - name: Fail if kustomization.yaml is missing ansible.builtin.fail: - msg: >- - kustomization.yaml not found at {{ telemetry_kustomization_dir }}/kustomization.yaml on kube_vip. 
- Ensure the provision playbook has generated the kustomize deployment files. + msg: "{{ kustomization_missing_msg }}" when: not (kustomization_stat.stat.exists | default(false)) - name: Execute telemetry.sh and validate deployment @@ -98,7 +94,7 @@ - name: Fail telemetry.sh for non-Helm errors ansible.builtin.fail: - msg: "telemetry.sh failed with non-Helm error: {{ telemetry_sh_result.stderr }}" + msg: "{{ telemetry_sh_non_helm_error_msg }}" when: - telemetry_sh_result.rc != 0 - not (telemetry_sh_helm_error | default(false)) @@ -125,11 +121,20 @@ delegate_to: "{{ kube_vip }}" connection: ssh changed_when: false + failed_when: false + register: rollout_scale_result when: - idrac_sts_check.rc == 0 - idrac_replica_count.stdout | int > 0 - restore_replicas_result.changed | default(false) + - name: Warn if idrac-telemetry rollout did not complete in time + ansible.builtin.debug: + msg: "{{ idrac_rollout_timeout_warning_msg }}" + when: + - rollout_scale_result is defined + - rollout_scale_result.rc | default(0) != 0 + - name: Display replica restore status ansible.builtin.debug: msg: "{{ idrac_replica_restore_msg }}" @@ -156,11 +161,14 @@ failed_when: false # ── Post-deployment validation: Check for MySQL issues ── - - name: Check idrac-telemetry pod status after deployment + - name: Check idrac-telemetry pod container status after deployment ansible.builtin.shell: - cmd: > - kubectl get pods -n {{ telemetry_namespace }} -l app=idrac-telemetry - --no-headers -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,READY:.status.containerStatuses[*].ready + cmd: | + set -o pipefail + kubectl get pods -n {{ telemetry_namespace }} -l app=idrac-telemetry \ + --no-headers \ + -o custom-columns=NAME:.metadata.name,PHASE:.status.phase,CONTAINER_STATUSES:.status.containerStatuses[*].state.waiting.reason \ + 2>/dev/null || true delegate_to: "{{ kube_vip }}" connection: ssh register: idrac_pod_status_check @@ -170,14 +178,14 @@ - name: Display idrac-telemetry pod status 
ansible.builtin.debug: msg: "idrac-telemetry pod status: {{ idrac_pod_status_check.stdout }}" - when: idrac_pod_status_check.stdout != "" + when: idrac_pod_status_check.stdout | default('') != "" - name: Fail if idrac-telemetry MySQL container is in CrashLoopBackOff ansible.builtin.fail: msg: "{{ mysql_crash_error_msg }}" when: - - idrac_pod_status_check.stdout != "" - - "'CrashLoopBackOff' in idrac_pod_status_check.stdout or 'Error' in idrac_pod_status_check.stdout" + - idrac_pod_status_check.stdout | default('') != "" + - "'CrashLoopBackOff' in idrac_pod_status_check.stdout" - name: Generate telemetry pod status report ansible.builtin.command: @@ -205,10 +213,7 @@ - name: Fail if some pods are not ready ansible.builtin.fail: - msg: >- - {{ pods_not_ready_msg }} - Review the pod status report above and check pod logs for errors: - kubectl logs -n {{ telemetry_namespace }} + msg: "{{ pods_not_ready_detailed_msg }}" when: pods_not_ready.stdout | int > 0 - name: Display telemetry.sh success @@ -216,14 +221,16 @@ msg: "{{ telemetry_sh_success_msg }}" rescue: - - name: Display telemetry.sh failure details + - name: Display actual failing task details ansible.builtin.debug: msg: - - "{{ telemetry_sh_fail_msg }}" - - "stdout: {{ telemetry_sh_result.stdout | default('N/A') }}" - - "stderr: {{ telemetry_sh_result.stderr | default('N/A') }}" - - "rc: {{ telemetry_sh_result.rc | default('N/A') }}" + - "Telemetry deployment failed during post-deployment validation." + - "Failed task: {{ ansible_failed_task.name | default('unknown') }}" + - "Failure reason: {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('unknown')) }}" + - "telemetry.sh rc: {{ telemetry_sh_result.rc | default('N/A') }}" - name: Fail the telemetry upgrade ansible.builtin.fail: - msg: "Telemetry deployment failed. See error details above." 
+ msg: >- + Telemetry deployment failed at task '{{ ansible_failed_task.name | default('unknown') }}': + {{ ansible_failed_result.msg | default(ansible_failed_result.stderr | default('See error details above.')) }} diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 08a826acfc..5d64455301 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -111,7 +111,21 @@ telemetry_sh_helm_skip_msg: >- telemetry.sh had Helm errors (existing releases), but StatefulSet was already patched with terminationGracePeriodSeconds=120s for MySQL safety. Skipping full re-deployment. Image tags should be updated via kustomize if needed. +telemetry_sh_missing_msg: >- + telemetry.sh not found at {{ telemetry_script_path }} on kube_vip. + Ensure the provision playbook has generated the telemetry deployment script. +kustomization_missing_msg: >- + kustomization.yaml not found at {{ telemetry_kustomization_dir }}/kustomization.yaml on kube_vip. + Ensure the provision playbook has generated the kustomize deployment files. +telemetry_sh_non_helm_error_msg: "telemetry.sh failed with non-Helm error: {{ telemetry_sh_result.stderr }}" +idrac_rollout_timeout_warning_msg: >- + WARNING: idrac-telemetry rollout did not complete within 300s (rc={{ rollout_scale_result.rc }}). + Continuing with pod readiness check. pods_not_ready_msg: "Some telemetry pods are not ready after deployment." +pods_not_ready_detailed_msg: >- + {{ pods_not_ready_msg }} + Review the pod status report above and check pod logs for errors: + kubectl logs -n {{ telemetry_namespace }} mysql_crash_error_msg: | ERROR: idrac-telemetry MySQL container failed to start after graceful shutdown. 
Manual intervention required: From 64ef86267b71386c790fb3729330d2c9e6811b9a Mon Sep 17 00:00:00 2001 From: "Kratika.Patidar" Date: Mon, 1 Jun 2026 05:47:58 +0000 Subject: [PATCH 2/4] rollback and upgrade comments fixes Signed-off-by: Kratika.Patidar --- rollback/playbooks/rollback_telemetry.yml | 104 ------------------ .../tasks/backup_telemetry.yml | 3 +- .../tasks/execute_telemetry_sh.yml | 9 +- upgrade/roles/upgrade_telemetry/vars/main.yml | 12 +- 4 files changed, 17 insertions(+), 111 deletions(-) delete mode 100644 rollback/playbooks/rollback_telemetry.yml diff --git a/rollback/playbooks/rollback_telemetry.yml b/rollback/playbooks/rollback_telemetry.yml deleted file mode 100644 index 4c66cbb8dc..0000000000 --- a/rollback/playbooks/rollback_telemetry.yml +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- - -- name: Rollback Telemetry components - hosts: localhost - connection: local - gather_facts: false - vars: - rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml - component_name: telemetry - tasks: - - name: Read rollback_manifest.yml - ansible.builtin.slurp: - src: "{{ rollback_manifest_path }}" - register: raw_rollback_manifest - - - name: Parse rollback manifest - ansible.builtin.set_fact: - rollback_manifest: "{{ raw_rollback_manifest.content | b64decode | from_yaml }}" - - - name: Skip if telemetry already rolled back - ansible.builtin.meta: end_play - when: - - rollback_manifest.component_status[component_name] | default('pending') == 'completed' - - - name: "Mark as skipped — BuildStream terminal gate active (C-24)" - ansible.builtin.copy: - content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | combine({ - component_name: 'skipped' - }) - }) | to_nice_yaml }} - dest: "{{ rollback_manifest_path }}" - mode: '0644' - when: - - hostvars['localhost']['build_stream_terminal'] | default(false) | bool - - - name: "Skip — BuildStream terminal gate active (C-24)" - ansible.builtin.meta: end_play - when: - - hostvars['localhost']['build_stream_terminal'] | default(false) | bool - - - name: Set telemetry rollback status to in-progress - ansible.builtin.copy: - content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | combine({ - component_name: 'in-progress' - }) - }) | to_nice_yaml }} - dest: "{{ rollback_manifest_path }}" - mode: '0644' - - - name: "Display rollback status in-progress — {{ component_name }}" - ansible.builtin.debug: - msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: in-progress" - - # ── Telemetry rollback strategy ────────────────────────────────── - # The K8s rollback (etcd snapshot restore) restores ALL Kubernetes objects - # to their pre-upgrade (2.1) state, which includes all telemetry namespace - # 
resources: Deployments, StatefulSets, Services, ConfigMaps, Secrets, - # PVCs, Helm release secrets, and CRDs. - # - # 2.2-only components (vector-ldms, vector-ome, victoria-logs cluster, - # victoria-metrics-operator CRDs, vlagent-vector, vmagent-vector) are - # automatically removed because they did not exist in the etcd snapshot. - # - # Post-K8s-rollback telemetry pod verification is performed by - # verify_telemetry_rollback.yml (Stage 8d) inside the rollback_k8s role. - - name: "Telemetry rollback — handled by K8s rollback (etcd restore)" - ansible.builtin.debug: - msg: - - "Telemetry rollback is handled by K8s rollback via etcd snapshot restore." - - "The etcd snapshot contains the full 2.1 telemetry namespace state." - - "2.2-only components will be removed automatically." - - "Post-rollback telemetry pod verification runs in K8s rollback Stage 8d." - - - name: Mark telemetry rollback as completed - ansible.builtin.copy: - content: >- - {{ rollback_manifest | combine({ - 'component_status': rollback_manifest.component_status | combine({ - component_name: 'completed' - }) - }) | to_nice_yaml }} - dest: "{{ rollback_manifest_path }}" - mode: '0644' - - - name: "Display rollback status completed — {{ component_name }}" - ansible.builtin.debug: - msg: "[ROLLBACK] Component '{{ component_name }}' — status changed to: completed" diff --git a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml index e3ef517a86..ed0ff59f83 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml @@ -75,7 +75,8 @@ ansible.builtin.copy: src: /root/telemetry.sh dest: "{{ tel_backup_dir }}/telemetry.sh" - mode: '0644' + mode: "{{ executable_mode }}" + remote_src: true delegate_to: "{{ kube_vip }}" connection: ssh when: diff --git a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml 
b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml index 03c8b24ea6..ce4ed930f5 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml @@ -123,14 +123,17 @@ changed_when: false failed_when: false register: rollout_scale_result + retries: "{{ idrac_rollout_retries }}" + delay: "{{ idrac_rollout_delay }}" + until: rollout_scale_result.rc == 0 when: - idrac_sts_check.rc == 0 - idrac_replica_count.stdout | int > 0 - restore_replicas_result.changed | default(false) - - name: Warn if idrac-telemetry rollout did not complete in time - ansible.builtin.debug: - msg: "{{ idrac_rollout_timeout_warning_msg }}" + - name: Fail if idrac-telemetry rollout did not complete after retries + ansible.builtin.fail: + msg: "{{ idrac_rollout_fail_msg }}" when: - rollout_scale_result is defined - rollout_scale_result.rc | default(0) != 0 diff --git a/upgrade/roles/upgrade_telemetry/vars/main.yml b/upgrade/roles/upgrade_telemetry/vars/main.yml index 5d64455301..8326120799 100644 --- a/upgrade/roles/upgrade_telemetry/vars/main.yml +++ b/upgrade/roles/upgrade_telemetry/vars/main.yml @@ -23,6 +23,7 @@ telemetry_namespace: telemetry # OIM host for NFS share access oim_host: oim +executable_mode: "0755" # Upgrade directory paths (on k8s NFS share, resolved at runtime) telemetry_upgrade_dir: "{{ k8s_client_mount_path }}/upgrade/telemetry" @@ -41,6 +42,10 @@ telemetry_kustomization_dir: "{{ telemetry_deploy_dir }}/deployments" pod_wait_retries: 60 pod_wait_delay: 15 +# idrac-telemetry rollout wait configuration +idrac_rollout_retries: 3 +idrac_rollout_delay: 30 + # Victoria operator configuration # victoria_operator_pkg is loaded dynamically from service_k8s JSON in include_required_input.yml victoria_operator_release_name: victoria-metrics-operator @@ -118,9 +123,10 @@ kustomization_missing_msg: >- kustomization.yaml not found at {{ telemetry_kustomization_dir }}/kustomization.yaml on kube_vip. 
Ensure the provision playbook has generated the kustomize deployment files. telemetry_sh_non_helm_error_msg: "telemetry.sh failed with non-Helm error: {{ telemetry_sh_result.stderr }}" -idrac_rollout_timeout_warning_msg: >- - WARNING: idrac-telemetry rollout did not complete within 300s (rc={{ rollout_scale_result.rc }}). - Continuing with pod readiness check. +idrac_rollout_fail_msg: >- + idrac-telemetry rollout did not complete after {{ idrac_rollout_retries }} attempts (300s each). + The pods may not be in Running state. Check pod events: + kubectl describe pods -n {{ telemetry_namespace }} -l app=idrac-telemetry pods_not_ready_msg: "Some telemetry pods are not ready after deployment." pods_not_ready_detailed_msg: >- {{ pods_not_ready_msg }} From fc765dd3a563dff6d8cb47e60d638440aefd3d39 Mon Sep 17 00:00:00 2001 From: "Kratika.Patidar" Date: Mon, 1 Jun 2026 10:46:56 +0000 Subject: [PATCH 3/4] changing component and tag name from k8s -> k8s-telemetry Signed-off-by: Kratika.Patidar --- rollback/playbooks/rollback_k8s.yml | 4 ++-- rollback/rollback.yml | 22 +++++++------------ .../tasks/execute_telemetry_sh.yml | 6 +++++ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/rollback/playbooks/rollback_k8s.yml b/rollback/playbooks/rollback_k8s.yml index ac37866cf1..2833803cfa 100644 --- a/rollback/playbooks/rollback_k8s.yml +++ b/rollback/playbooks/rollback_k8s.yml @@ -20,7 +20,7 @@ vars: input_project_dir: "/opt/omnia/input/project_default" rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml - component_name: k8s + component_name: k8s-telemetry tasks: - name: Read rollback_manifest.yml ansible.builtin.slurp: @@ -86,7 +86,7 @@ vars: banner_k8s_not_configured: - "========================================================================" - - "[ROLLBACK] Component 'k8s' — SKIPPED" + - "[ROLLBACK] Component 'k8s-telemetry' — SKIPPED" - "========================================================================" - "Reason: service_k8s is not present in 
software_config.json softwares list." - "K8s cluster was not provisioned, skipping K8s rollback." diff --git a/rollback/rollback.yml b/rollback/rollback.yml index 6530fd21ec..beed0dafc6 100644 --- a/rollback/rollback.yml +++ b/rollback/rollback.yml @@ -45,7 +45,7 @@ rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml upgrade_lock_path: /opt/omnia/.data/upgrade_in_progress.lock rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock - all_rollback_components: [slurm, telemetry, k8s, build_stream, oim] + all_rollback_components: [slurm, k8s-telemetry, build_stream, oim] tasks: # ═══════════════════════════════════════════════════════════════ # PHASE 1: READ-ONLY GUARDS (no state mutation allowed here) @@ -196,8 +196,7 @@ backup_dir: "{{ oim_metadata.upgrade_backup_dir }}" component_status: slurm: "pending" - telemetry: "pending" - k8s: "pending" + k8s-telemetry: "pending" build_stream: "pending" oim: "pending" dest: "{{ rollback_manifest_path }}" @@ -259,7 +258,7 @@ - name: Identify components skipped by BuildStream terminal gate ansible.builtin.set_fact: bs_rollback_skipped: >- - {{ ['slurm', 'telemetry', 'k8s'] + {{ ['slurm', 'k8s-telemetry'] if (build_stream_terminal | bool) else [] }} @@ -282,20 +281,16 @@ # ────────────────────────────────────────────────────────────────────── # Rollback sub-flows (reverse order of upgrade): -# slurm → telemetry → k8s → build_stream → oim +# slurm → k8s-telemetry → build_stream → oim # Each reads rollback_manifest and skips if already 'completed'. 
# ────────────────────────────────────────────────────────────────────── - name: Rollback Slurm cluster (rolled back first — depends on K8s/OIM) ansible.builtin.import_playbook: playbooks/rollback_slurm.yml tags: [slurm] -- name: Rollback Telemetry components - ansible.builtin.import_playbook: playbooks/rollback_telemetry.yml - tags: [telemetry] - -- name: Rollback Kubernetes cluster +- name: Rollback Kubernetes cluster and Telemetry ansible.builtin.import_playbook: playbooks/rollback_k8s.yml - tags: [k8s] + tags: [k8s-telemetry] - name: Rollback BuildStream upgrade / enablement ansible.builtin.import_playbook: playbooks/rollback_build_stream.yml @@ -318,7 +313,7 @@ rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml upgrade_manifest_path: /opt/omnia/.data/upgrade_manifest.yml rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock - all_rollback_components: [slurm, telemetry, k8s, build_stream, oim] + all_rollback_components: [slurm, k8s-telemetry, build_stream, oim] tasks: - name: Read rollback_manifest.yml ansible.builtin.slurp: @@ -423,8 +418,7 @@ - "" - "Component Status:" - " slurm: {{ cleaned_component_status.slurm }}" - - " telemetry: {{ cleaned_component_status.telemetry }}" - - " k8s: {{ cleaned_component_status.k8s }}" + - " k8s-telemetry: {{ cleaned_component_status['k8s-telemetry'] }}" - " build_stream: {{ cleaned_component_status.build_stream }}" - " oim: {{ cleaned_component_status.oim }}" - "" diff --git a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml index ce4ed930f5..d3101d8935 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml @@ -77,6 +77,12 @@ changed_when: true timeout: 900 failed_when: false + retries: 2 + delay: 30 + until: > + telemetry_sh_result.rc == 0 or + (telemetry_sh_result.stderr is not search('Failed to find required executable') and + 
telemetry_sh_result.stderr is not search('podman.*not found')) - name: Check if telemetry.sh failed due to Helm errors ansible.builtin.set_fact: From 30ef1c1d39148e24a0d2cb70c9f56fdc305b6b56 Mon Sep 17 00:00:00 2001 From: "Kratika.Patidar" Date: Mon, 1 Jun 2026 10:56:38 +0000 Subject: [PATCH 4/4] lint Signed-off-by: Kratika.Patidar --- upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml index d3101d8935..1dac883990 100644 --- a/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml +++ b/upgrade/roles/upgrade_telemetry/tasks/execute_telemetry_sh.yml @@ -80,7 +80,7 @@ retries: 2 delay: 30 until: > - telemetry_sh_result.rc == 0 or + telemetry_sh_result.rc == 0 or (telemetry_sh_result.stderr is not search('Failed to find required executable') and telemetry_sh_result.stderr is not search('podman.*not found'))