Skip to content

Commit e3c40d1

Browse files
authored
Add retry to GCE native fencing (#371)
Add a retry mechanism when creating the fence_gce resource (both primary and secondary). Fix Ansible lint warnings. Add a debug task to collect logs from the fence_gce RA.
1 parent 8b5b127 commit e3c40d1

2 files changed

Lines changed: 48 additions & 0 deletions

File tree

ansible/playbooks/tasks/aws-cluster-bootstrap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@
397397
cmd: >-
398398
crm resource clear
399399
rsc_ip_{{ sap_hana_install_sid }}_HDB{{ sap_hana_install_instance_number }}
400+
changed_when: true
400401
when:
401402
- is_primary
402403
- reg_vip_location.stdout | trim | split(' ') | last != primary_hostname

ansible/playbooks/tasks/gcp-cluster-bootstrap.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,11 +219,29 @@
219219
- name: Enable SBD [sbd]
220220
ansible.builtin.command:
221221
cmd: crm configure primitive rsc_iscsi_sbd stonith:external/sbd
222+
register: reg_iscsi
223+
changed_when: reg_iscsi.rc == 0
222224
when:
223225
- sbd_stonith | string | lower == 'false'
224226
- use_sbd | default(false) | bool
225227
- is_primary
226228

229+
- name: Collect fence_gce info (debug purpose)
230+
ansible.builtin.command: "{{ item }}"
231+
with_items:
232+
- which fence_gce
233+
- /usr/sbin/fence_gce --version
234+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o status"
235+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o list"
236+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o list-status"
237+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o metadata"
238+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o manpage"
239+
- "/usr/sbin/fence_gce -vvv -n {{ primary_hostname }} --zone {{ primary_zone }} -o monitor"
240+
when:
241+
- not (use_sbd | default(false) | bool)
242+
changed_when: false
243+
failed_when: false
244+
227245
# The following STONITH commands for GCP have been adapted from
228246
# https://cloud.google.com/solutions/sap/docs/sap-hana-ha-config-sles#create_the_fencing_device_resources
229247
- name: Configure GCP Native Fencing STONITH for Primary
@@ -236,6 +254,11 @@
236254
op monitor interval="300s" timeout="120s"
237255
op start interval="0" timeout="60s"
238256
meta target-role=Started
257+
register: reg_fence_gce_prim
258+
changed_when: reg_fence_gce_prim.rc == 0
259+
until: reg_fence_gce_prim is successful
260+
retries: 3
261+
delay: 30
239262
when:
240263
- is_primary
241264
- not (use_sbd | default(false) | bool)
@@ -252,6 +275,11 @@
252275
op monitor interval="300s" timeout="120s"
253276
op start interval="0" timeout="60s"
254277
meta target-role=Started
278+
register: reg_fence_gce_sec
279+
changed_when: reg_fence_gce_sec.rc == 0
280+
until: reg_fence_gce_sec is successful
281+
retries: 3
282+
delay: 30
255283
when:
256284
- is_primary
257285
- not (use_sbd | default(false) | bool)
@@ -260,6 +288,8 @@
260288
ansible.builtin.command: >
261289
crm configure location LOC_STONITH_{{ primary_hostname }} \
262290
rsc_gce_stonith_primary -inf: "{{ primary_hostname }}"
291+
register: reg_loc_prim
292+
changed_when: reg_loc_prim.rc == 0
263293
when:
264294
- is_primary
265295
- not (use_sbd | default(false) | bool)
@@ -268,6 +298,8 @@
268298
ansible.builtin.command: >
269299
crm configure location LOC_STONITH_{{ secondary_hostname }} \
270300
rsc_gce_stonith_secondary -inf: "{{ secondary_hostname }}"
301+
register: reg_loc_sec
302+
changed_when: reg_loc_sec.rc == 0
271303
when:
272304
- is_primary
273305
- not (use_sbd | default(false) | bool)
@@ -278,6 +310,8 @@
278310
crm configure property
279311
$id="cib-bootstrap-options"
280312
stonith-timeout=300s
313+
register: reg_st
314+
changed_when: reg_st.rc == 0
281315
when:
282316
- stonith_timeout != '300s'
283317
- is_primary
@@ -289,6 +323,8 @@
289323
crm configure property
290324
$id="cib-bootstrap-options"
291325
stonith-enabled=true
326+
register: reg_se
327+
changed_when: reg_se.rc == 0
292328
when:
293329
- stonith_enabled | string | lower != 'true'
294330
- is_primary
@@ -313,13 +349,17 @@
313349
crm configure rsc_defaults
314350
$id="rsc-options"
315351
migration-threshold=5000
352+
register: reg_mig_t
353+
changed_when: reg_mig_t.rc == 0
316354
when:
317355
- migration_threshold != '5000'
318356
- is_primary
319357

320358
- name: Set op_defaults timeout
321359
ansible.builtin.command:
322360
cmd: crm configure op_defaults timeout=600
361+
register: reg_op_tim
362+
changed_when: reg_op_tim.rc == 0
323363
when:
324364
- op_default_timeout != '600'
325365
- is_primary
@@ -334,6 +374,8 @@
334374
cidr_netmask=32
335375
nic=eth0
336376
op monitor interval=3600s timeout=60s
377+
register: reg_ip
378+
changed_when: reg_ip.rc == 0
337379
when:
338380
- rsc_ip | length == 0
339381
- is_primary
@@ -383,6 +425,7 @@
383425
cmd: >-
384426
crm resource clear
385427
rsc_ip_{{ sap_hana_install_sid }}_HDB{{ sap_hana_install_instance_number }}
428+
changed_when: true
386429
when:
387430
- is_primary
388431
- reg_vip_location.stdout | trim | split(' ') | last != primary_hostname
@@ -397,6 +440,8 @@
397440
binfile="/usr/bin/socat"
398441
cmdline_options="-U TCP-LISTEN:625{{ sap_hana_install_instance_number }},backlog=10,fork,reuseaddr /dev/null"
399442
op monitor timeout=20s interval=10s op_params depth=0
443+
register: reg_hc_prim
444+
changed_when: reg_hc_prim.rc == 0
400445
when:
401446
- is_primary
402447
- rsc_healthcheck_primary | length == 0
@@ -408,6 +453,8 @@
408453
grp_ip_hc
409454
rsc_ip_{{ sap_hana_install_sid }}_HDB{{ sap_hana_install_instance_number }}
410455
rsc_healthcheck_primary
456+
register: reg_grp_ip_hc
457+
changed_when: reg_grp_ip_hc.rc == 0
411458
when:
412459
- is_primary
413460
- grp_ip_hc | length == 0

0 commit comments

Comments
 (0)