Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions rollback/playbooks/rollback_k8s.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
vars:
input_project_dir: "/opt/omnia/input/project_default"
rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
component_name: k8s
component_name: k8s-telemetry
tasks:
- name: Read rollback_manifest.yml
ansible.builtin.slurp:
Expand Down Expand Up @@ -86,7 +86,7 @@
vars:
banner_k8s_not_configured:
- "========================================================================"
- "[ROLLBACK] Component 'k8s' — SKIPPED"
- "[ROLLBACK] Component 'k8s-telemetry' — SKIPPED"
- "========================================================================"
- "Reason: service_k8s is not present in software_config.json softwares list."
- "K8s cluster was not provisioned, skipping K8s rollback."
Expand Down
96 changes: 0 additions & 96 deletions rollback/playbooks/rollback_telemetry.yml

This file was deleted.

4 changes: 4 additions & 0 deletions rollback/roles/rollback_k8s/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@
- name: "Stage 8c — Clean up stale CSI VolumeAttachments"
ansible.builtin.include_tasks: cleanup_stale_volume_attachments.yml

# ── Stage 8d: Verify telemetry pods after etcd restore ───────
- name: "Stage 8d — Verify telemetry rollback"
ansible.builtin.include_tasks: verify_telemetry_rollback.yml

# ── Stage 9: Restore BSS boot params and cloud-init ──────────
- name: "Stage 9 — Restore BSS boot params and cloud-init"
ansible.builtin.include_tasks: restore_bss_cloud_init.yml
Expand Down
104 changes: 104 additions & 0 deletions rollback/roles/rollback_k8s/tasks/verify_telemetry_rollback.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2026 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# ============================================================================
# Verify Telemetry Rollback (Post-K8s-Rollback)
# ============================================================================
# After etcd restore, the 2.1 telemetry objects are restored and 2.2-only
# objects are removed. This task file only reports (every kubectl task below
# sets failed_when: false) and performs three checks:
# 1. Lists all telemetry pods with their status
# 2. Reports any 2.2-only components that remain (vector-ldms, vector-ome,
# victoria-logs, victoria-metrics-operator, vlagent-vector, vmagent-vector)
# 3. Prints a pod readiness summary
#
# Prerequisites (guaranteed by rollback_k8s/main.yml execution order):
# - kube_vip: set in load_version_vars.yml, added to inventory in
# load_rollback_status.yml, verified reachable in post_validation.yml
# ============================================================================

# ── Get all telemetry pods ────────────────────────────────────────────
- name: "Telemetry rollback verification — Get all pods in telemetry namespace"
  # Read-only listing executed on the host named by kube_vip over SSH.
  # A non-zero kubectl exit is tolerated (failed_when: false) so the
  # remaining verification tasks still run and report.
  delegate_to: "{{ kube_vip }}"
  connection: ssh
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - pods
      - "-n"
      - telemetry
      - "-o"
      - wide
  register: telemetry_pods_result
  failed_when: false
  changed_when: false

- name: "Display telemetry pods after K8s rollback"
  ansible.builtin.debug:
    # stdout_lines is an empty list (not undefined) when kubectl prints
    # nothing, so default() needs its second argument set to true in order
    # to fall back on empty values as well as undefined ones.
    msg: "{{ telemetry_pods_result.stdout_lines | default(['No pods found in telemetry namespace'], true) }}"

# ── Check for stale 2.2-only components ───────────────────────────────
- name: "Check for 2.2-only components that should have been removed by etcd restore"
  # Lists Deployments and StatefulSets in the telemetry namespace as
  # "<KIND> <NAME>" rows without a header line. Failures are tolerated
  # (failed_when: false); the result is inspected by later tasks.
  delegate_to: "{{ kube_vip }}"
  connection: ssh
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - "deploy,sts"
      - "-n"
      - telemetry
      - "--no-headers"
      - "-o"
      - "custom-columns=KIND:.kind,NAME:.metadata.name"
  register: telemetry_resources_result
  failed_when: false
  changed_when: false

- name: "Identify any 2.2-only telemetry components still present"
  vars:
    # Alternation of name fragments treated as 2.2-only workloads.
    stale_22_pattern: 'vector-ldms|vector-ome|victoria-metrics-operator|vlagent-vector|vmagent-vector|vlstorage|vlinsert|vlselect|vlagent-vlagent|vmstorage-victoria|vmselect-victoria|vminsert-victoria|vmagent-vmagent'
  ansible.builtin.set_fact:
    # Keep every "<KIND> <NAME>" row whose text matches the pattern; an
    # undefined stdout_lines is treated as an empty list.
    stale_22_components: "{{ (telemetry_resources_result.stdout_lines | default([])) | select('search', stale_22_pattern) | list }}"

- name: "Display 2.2-only component cleanup status"
  ansible.builtin.debug:
    # The listing task runs with failed_when: false, so a failed kubectl call
    # yields no rows and therefore an empty stale_22_components list. Check
    # the return code first so a failed lookup is not reported as a
    # successful cleanup.
    msg: >-
      {% if (telemetry_resources_result.rc | default(1)) != 0 -%}
      WARNING: Could not list telemetry deployments/statefulsets
      (rc={{ telemetry_resources_result.rc | default('n/a') }});
      removal of 2.2-only components was NOT verified.
      {%- elif stale_22_components | length == 0 -%}
      All 2.2-only components removed successfully by etcd restore.
      {%- else -%}
      WARNING: {{ stale_22_components | length }} stale 2.2 component(s) still present: {{ stale_22_components | join(', ') }}
      {%- endif %}

# ── Pod readiness summary ─────────────────────────────────────────────
- name: "Check telemetry pod readiness summary"
  # Emits TOTAL=<n>, RUNNING=<n> and, when applicable, a NOT_READY: section
  # listing every pod row that is neither Running nor Completed.
  # Report-only: never marks the host changed or failed.
  ansible.builtin.shell:
    cmd: |
      set -o pipefail
      # Take a single snapshot so TOTAL, RUNNING and NOT_READY all describe
      # the same cluster state (and kubectl is invoked only once).
      pods="$(kubectl get pods -n telemetry --no-headers 2>/dev/null || true)"
      # grep -c already prints "0" when nothing matches but exits 1.
      # "|| true" absorbs that status; "|| echo 0" would append a second
      # "0" and corrupt the value.
      total="$(printf '%s' "${pods}" | grep -cv 'Completed' || true)"
      running="$(printf '%s' "${pods}" | grep -c 'Running' || true)"
      not_ready="$(printf '%s' "${pods}" | grep -v 'Running\|Completed' || true)"
      echo "TOTAL=${total}"
      echo "RUNNING=${running}"
      if [ -n "${not_ready}" ]; then
        echo "NOT_READY:"
        echo "${not_ready}"
      fi
    executable: /bin/bash
  delegate_to: "{{ kube_vip }}"
  connection: ssh
  register: telemetry_summary
  changed_when: false
  failed_when: false

- name: "Display telemetry pod readiness summary"
  ansible.builtin.debug:
    # Pass true as the second default() argument so the fallback text is also
    # used when stdout_lines is defined but empty (e.g. the script produced
    # no output), not only when it is undefined.
    msg: "{{ telemetry_summary.stdout_lines | default(['Could not retrieve pod summary'], true) }}"

- name: "Warn if any telemetry pods are not Running"
  ansible.builtin.debug:
    msg: >-
      WARNING: Some telemetry pods are not in Running state after K8s rollback.
      This may be transient — pods may need time to start after etcd restore.
      If pods remain unhealthy after 5-10 minutes, manual investigation is needed.
  # The readiness summary prints a "NOT_READY:" marker whenever a pod row is
  # neither Running nor Completed. Keying off that marker covers every
  # non-Running status (ErrImagePull, Init:*, Terminating, Unknown, ...)
  # instead of a fixed list of status names, and avoids matching words that
  # merely appear elsewhere in the wide pod listing.
  when:
    - telemetry_summary.stdout is defined
    - "'NOT_READY:' in telemetry_summary.stdout"
22 changes: 8 additions & 14 deletions rollback/rollback.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
upgrade_lock_path: /opt/omnia/.data/upgrade_in_progress.lock
rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock
all_rollback_components: [slurm, telemetry, k8s, build_stream, oim]
all_rollback_components: [slurm, k8s-telemetry, build_stream, oim]
tasks:
# ═══════════════════════════════════════════════════════════════
# PHASE 1: READ-ONLY GUARDS (no state mutation allowed here)
Expand Down Expand Up @@ -196,8 +196,7 @@
backup_dir: "{{ oim_metadata.upgrade_backup_dir }}"
component_status:
slurm: "pending"
telemetry: "pending"
k8s: "pending"
k8s-telemetry: "pending"
build_stream: "pending"
oim: "pending"
dest: "{{ rollback_manifest_path }}"
Expand Down Expand Up @@ -259,7 +258,7 @@
- name: Identify components skipped by BuildStream terminal gate
ansible.builtin.set_fact:
bs_rollback_skipped: >-
{{ ['slurm', 'telemetry', 'k8s']
{{ ['slurm', 'k8s-telemetry']
if (build_stream_terminal | bool)
else [] }}

Expand All @@ -282,20 +281,16 @@

# ──────────────────────────────────────────────────────────────────────
# Rollback sub-flows (reverse order of upgrade):
# slurm → telemetry → k8s → build_stream → oim
# slurm → k8s-telemetry → build_stream → oim
# Each reads rollback_manifest and skips if already 'completed'.
# ──────────────────────────────────────────────────────────────────────
- name: Rollback Slurm cluster (rolled back first — depends on K8s/OIM)
ansible.builtin.import_playbook: playbooks/rollback_slurm.yml
tags: [slurm]

- name: Rollback Telemetry components
ansible.builtin.import_playbook: playbooks/rollback_telemetry.yml
tags: [telemetry]

- name: Rollback Kubernetes cluster
- name: Rollback Kubernetes cluster and Telemetry
ansible.builtin.import_playbook: playbooks/rollback_k8s.yml
tags: [k8s]
tags: [k8s-telemetry]

- name: Rollback BuildStream upgrade / enablement
ansible.builtin.import_playbook: playbooks/rollback_build_stream.yml
Expand All @@ -318,7 +313,7 @@
rollback_manifest_path: /opt/omnia/.data/rollback_manifest.yml
upgrade_manifest_path: /opt/omnia/.data/upgrade_manifest.yml
rollback_lock_path: /opt/omnia/.data/rollback_in_progress.lock
all_rollback_components: [slurm, telemetry, k8s, build_stream, oim]
all_rollback_components: [slurm, k8s-telemetry, build_stream, oim]
tasks:
- name: Read rollback_manifest.yml
ansible.builtin.slurp:
Expand Down Expand Up @@ -423,8 +418,7 @@
- ""
- "Component Status:"
- " slurm: {{ cleaned_component_status.slurm }}"
- " telemetry: {{ cleaned_component_status.telemetry }}"
- " k8s: {{ cleaned_component_status.k8s }}"
- " k8s-telemetry: {{ cleaned_component_status['k8s-telemetry'] }}"
- " build_stream: {{ cleaned_component_status.build_stream }}"
- " oim: {{ cleaned_component_status.oim }}"
- ""
Expand Down
12 changes: 12 additions & 0 deletions upgrade/roles/upgrade_k8s/tasks/step_uncordon.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
- name: Wait for API server to be reachable before uncordon
  # Polls the API server health endpoint from the kube_vip host until
  # kubectl exits 0, retrying up to 30 times with 10 seconds between tries.
  delegate_to: "{{ kube_vip }}"
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - "--raw"
      - /healthz
  register: api_health
  until: api_health.rc == 0
  retries: 30
  delay: 10
  changed_when: false

- name: Uncordon node {{ current_node_name }}
  # Marks the node schedulable again, retrying up to 5 times (10 s apart)
  # until kubectl exits 0.
  ansible.builtin.command:
    cmd: kubectl uncordon {{ node_ip }}
  delegate_to: "{{ kube_vip }}"
  register: uncordon_result
  # kubectl reports "already uncordoned" when the node was schedulable to
  # begin with; only report a change when the node state was actually altered.
  changed_when: "'already uncordoned' not in (uncordon_result.stdout | default(''))"
  retries: 5
  delay: 10
  until: uncordon_result.rc == 0
5 changes: 3 additions & 2 deletions upgrade/roles/upgrade_telemetry/tasks/backup_telemetry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,11 @@
failed_when: false

- name: Backup telemetry.sh from control plane
ansible.builtin.fetch:
ansible.builtin.copy:
src: /root/telemetry.sh
dest: "{{ tel_backup_dir }}/telemetry.sh"
flat: true
mode: "{{ executable_mode }}"
remote_src: true
delegate_to: "{{ kube_vip }}"
connection: ssh
when:
Expand Down
Loading
Loading