Skip to content

Commit 9a72899

Browse files
authored
Improve Pacemaker cluster bootstrap with version checks and better diagnostics (#460)
Add conditional logic for the logd service, ensuring it only starts when Pacemaker version is below 2.0.0, to maintain compatibility with SLE 16 where cluster-glue (logd) is deprecated. Additionally, implement a post-bootstrap health check for the Pacemaker service on AWS and GCP. If the service fails to start, the playbook now automatically collects and displays systemd status and journal logs to simplify troubleshooting.
1 parent f396f00 commit 9a72899

2 files changed

Lines changed: 76 additions & 0 deletions

File tree

ansible/playbooks/tasks/aws-cluster-bootstrap.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,19 @@
4949
# If either of the above conditions is false, the block `Create authkeys` is run.
5050
# It will shut down pacemaker on all nodes, write the authfile to the primary node, and then copy it to the other nodes.
5151
# Finally, it will notify a handler to start corosync.
52+
- name: Gather package facts
53+
ansible.builtin.package_facts:
54+
manager: auto
55+
56+
# ha_logd (logd) is provided by cluster-glue, which has been dropped in SLE 16.
57+
# Pacemaker >= 2.0.0 no longer uses ha_logd (adopted libqb logging instead),
58+
# so this is only needed for SLE 12 SP5 with Pacemaker 1.x.
5259
- name: Ensure logd is enabled and started
5360
ansible.builtin.systemd:
5461
name: logd
5562
state: started
5663
enabled: true
64+
when: ansible_facts.packages['pacemaker'][0].version is version('2.0.0', '<')
5765

5866
- name: Register authkey status
5967
ansible.builtin.stat:
@@ -143,6 +151,36 @@
143151
- name: Flush handler
144152
ansible.builtin.meta: flush_handlers
145153

154+
- name: Check pacemaker service status
155+
ansible.builtin.service_facts:
156+
157+
- name: Set pacemaker running status
158+
ansible.builtin.set_fact:
159+
pacemaker_is_running: "{{ ansible_facts.services['pacemaker.service'].state | default('') == 'running' }}"
160+
161+
- name: Collect pacemaker diagnostics on failure
162+
ansible.builtin.command: "{{ item }}" # noqa command-instead-of-module
163+
loop:
164+
- systemctl status pacemaker.service --no-pager -l
165+
- journalctl --no-pager -u pacemaker --lines=50
166+
register: pacemaker_diag
167+
failed_when: false
168+
changed_when: false
169+
when: not pacemaker_is_running | bool
170+
171+
- name: Display pacemaker diagnostics
172+
ansible.builtin.debug:
173+
msg: "{{ item.item }}:\n{{ item.stdout }}"
174+
loop: "{{ pacemaker_diag.results }}"
175+
loop_control:
176+
label: "{{ item.item }}"
177+
when: not pacemaker_is_running | bool
178+
179+
- name: Fail if pacemaker is not running
180+
ansible.builtin.fail:
181+
msg: "Pacemaker is not running after bootstrap. See diagnostics above."
182+
when: not pacemaker_is_running | bool
183+
146184
- name: Get DefaultTasksMax value
147185
ansible.builtin.command: # noqa command-instead-of-module
148186
cmd: systemctl --no-pager show

ansible/playbooks/tasks/gcp-cluster-bootstrap.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,19 @@
3030
# If either of the above conditions is false, the block `Create authkeys` is run.
3131
# It will shut down pacemaker on all nodes, write the authfile to the primary node, and then copy it to the other nodes.
3232
# Finally, it will notify a handler to start corosync.
33+
- name: Gather package facts
34+
ansible.builtin.package_facts:
35+
manager: auto
36+
37+
# ha_logd (logd) is provided by cluster-glue, which has been dropped in SLE 16.
38+
# Pacemaker >= 2.0.0 no longer uses ha_logd (adopted libqb logging instead),
39+
# so this is only needed for SLE 12 SP5 with Pacemaker 1.x.
3340
- name: Ensure logd is enabled and started
3441
ansible.builtin.systemd:
3542
name: logd
3643
state: started
3744
enabled: true
45+
when: ansible_facts.packages['pacemaker'][0].version is version('2.0.0', '<')
3846

3947
- name: Register authkey status
4048
ansible.builtin.stat:
@@ -124,6 +132,36 @@
124132
- name: Flush handler
125133
ansible.builtin.meta: flush_handlers
126134

135+
- name: Check pacemaker service status
136+
ansible.builtin.service_facts:
137+
138+
- name: Set pacemaker running status
139+
ansible.builtin.set_fact:
140+
pacemaker_is_running: "{{ ansible_facts.services['pacemaker.service'].state | default('') == 'running' }}"
141+
142+
- name: Collect pacemaker diagnostics on failure
143+
ansible.builtin.command: "{{ item }}" # noqa command-instead-of-module
144+
loop:
145+
- systemctl status pacemaker.service --no-pager -l
146+
- journalctl --no-pager -u pacemaker --lines=50
147+
register: pacemaker_diag
148+
failed_when: false
149+
changed_when: false
150+
when: not pacemaker_is_running | bool
151+
152+
- name: Display pacemaker diagnostics
153+
ansible.builtin.debug:
154+
msg: "{{ item.item }}:\n{{ item.stdout }}"
155+
loop: "{{ pacemaker_diag.results }}"
156+
loop_control:
157+
label: "{{ item.item }}"
158+
when: not pacemaker_is_running | bool
159+
160+
- name: Fail if pacemaker is not running
161+
ansible.builtin.fail:
162+
msg: "Pacemaker is not running after bootstrap. See diagnostics above."
163+
when: not pacemaker_is_running | bool
164+
127165
- name: Get DefaultTasksMax value
128166
ansible.builtin.command: # noqa command-instead-of-module
129167
cmd: systemctl --no-pager show

0 commit comments

Comments
 (0)