diff --git a/parts/linux/cloud-init/artifacts/localdns.sh b/parts/linux/cloud-init/artifacts/localdns.sh index 8d54c4f1430..97e7598a998 100644 --- a/parts/linux/cloud-init/artifacts/localdns.sh +++ b/parts/linux/cloud-init/artifacts/localdns.sh @@ -61,11 +61,39 @@ CURL_COMMAND="curl -s http://${LOCALDNS_NODE_LISTENER_IP}:8181/ready" NETWORKCTL_RELOAD_CMD="networkctl reload" START_LOCALDNS_TIMEOUT=10 +LOCALDNS_PID_POLL_INTERVAL_SECONDS=0.1 +LOCALDNS_READY_POLL_INTERVAL_SECONDS=0.1 +LOCALDNS_READY_TIMEOUT_SECONDS=60 # DNS health check timeout. DNS_HEALTH_CHECK_TIMEOUT=2 DNS_HEALTH_CHECK_TRIES=2 +# Convert a wall-clock timeout budget into a poll count for the configured interval. +calculate_max_poll_attempts() { + local timeout_duration=$1 + local poll_interval_seconds=$2 + + awk -v timeout="${timeout_duration}" -v interval="${poll_interval_seconds}" ' + BEGIN { + if (timeout !~ /^[0-9]+$/ || interval !~ /^[0-9]+([.][0-9]+)?$/) { + exit 1 + } + + if (interval <= 0) { + exit 1 + } + + if (timeout == 0) { + print 0 + exit 0 + } + + printf "%d\n", int((timeout / interval) + 0.999999) + } + ' +} + # Function definitions used in this file. # functions defined until "${__SOURCED__:+return}" are sourced and tested in - # spec/parts/linux/cloud-init/artifacts/localdns_spec.sh. @@ -419,14 +447,21 @@ start_localdns() { ${COREDNS_COMMAND} & # Wait until the PID file is created. - local elapsed=0 + local attempts=0 + local max_attempts + max_attempts=$(calculate_max_poll_attempts "${START_LOCALDNS_TIMEOUT}" "${LOCALDNS_PID_POLL_INTERVAL_SECONDS}") || { + echo "Failed to calculate localdns PID poll attempts for timeout ${START_LOCALDNS_TIMEOUT} and interval ${LOCALDNS_PID_POLL_INTERVAL_SECONDS}." + return 1 + } + while [ ! -f "${LOCALDNS_PID_FILE}" ]; do - sleep 1 - elapsed=$((elapsed + 1)) - if [ "$elapsed" -ge "$START_LOCALDNS_TIMEOUT" ]; then + if [ "$attempts" -ge "$max_attempts" ]; then echo "Timed out waiting for CoreDNS to create PID file at ${LOCALDNS_PID_FILE}." return 1 fi + + sleep "${LOCALDNS_PID_POLL_INTERVAL_SECONDS}" + attempts=$((attempts + 1)) done COREDNS_PID="$(cat ${LOCALDNS_PID_FILE})" @@ -436,26 +471,38 @@ start_localdns() { # Wait for localdns to be ready to serve traffic. wait_for_localdns_ready() { - local maxattempts=$1 - local timeout_duration=$2 - declare -i attempts=0 - local starttime=$(date +%s) + local timeout_duration=$1 + local starttime + local currenttime + local elapsedtime + local attempts=0 + local max_attempts + + max_attempts=$(calculate_max_poll_attempts "${timeout_duration}" "${LOCALDNS_READY_POLL_INTERVAL_SECONDS}") || { + echo "Failed to calculate localdns readiness poll attempts for timeout ${timeout_duration} and interval ${LOCALDNS_READY_POLL_INTERVAL_SECONDS}." + return 1 + } + + starttime=$(date +%s) echo "Waiting for localdns to start and be able to serve traffic." until [ "$($CURL_COMMAND)" = "OK" ]; do - if [ $attempts -ge $maxattempts ]; then - echo "Localdns failed to come online after $maxattempts attempts." - return 1 - fi - # Check for timeout based on elapsed time. + # Keep both guards: elapsed time is the real wall-clock timeout, while max_attempts + # guarantees termination if date +%s stalls or does not advance as expected. currenttime=$(date +%s) elapsedtime=$((currenttime - starttime)) - if [ $elapsedtime -ge $timeout_duration ]; then + if [ "$elapsedtime" -ge "$timeout_duration" ]; then echo "Localdns failed to come online after $timeout_duration seconds (timeout)." return 1 fi - sleep 1 - ((attempts++)) + + if [ "$attempts" -ge "$max_attempts" ]; then + echo "Localdns failed to come online after ${max_attempts} attempts (safety limit for ${timeout_duration} seconds timeout)." + return 1 + fi + + sleep "${LOCALDNS_READY_POLL_INTERVAL_SECONDS}" + attempts=$((attempts + 1)) done echo "Localdns is online and ready to serve traffic." return 0 @@ -1059,7 +1106,7 @@ fi start_localdns || exit $ERR_LOCALDNS_FAIL # Wait to direct traffic to localdns until it's ready. -wait_for_localdns_ready 60 60 || exit $ERR_LOCALDNS_FAIL +wait_for_localdns_ready "${LOCALDNS_READY_TIMEOUT_SECONDS}" || exit $ERR_LOCALDNS_FAIL # Disable DNS from DHCP and point the system at localdns. # -------------------------------------------------------------------------------------------------------------------- diff --git a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh index d61cf348413..2d539be9ddb 100644 --- a/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/localdns_spec.sh @@ -662,6 +662,59 @@ EOF End +# This section tests - calculate_max_poll_attempts +# This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file. +#------------------------------------------------------------------------------------------------------------------------------------ + Describe 'calculate_max_poll_attempts' + setup() { + Include "./parts/linux/cloud-init/artifacts/localdns.sh" + } + BeforeEach 'setup' + It 'should return the exact attempt count when timeout divides evenly by the interval' + When call calculate_max_poll_attempts 2 0.5 + The status should be success + The output should eq "4" + End + + It 'should round up when timeout does not divide evenly by the interval' + When call calculate_max_poll_attempts 1 0.3 + The status should be success + The output should eq "4" + End + + It 'should return zero attempts for a zero timeout' + When call calculate_max_poll_attempts 0 0.1 + The status should be success + The output should eq "0" + End + + It 'should fail for a negative timeout' + When call calculate_max_poll_attempts -1 0.1 + The status should be failure + End + + It 'should fail for a zero interval' + When call calculate_max_poll_attempts 1 0 + The status should be failure + End + + It 'should fail for a non-numeric timeout' + When call calculate_max_poll_attempts abc 0.1 + The status should be failure + End + + It 'should fail for a fractional timeout' + When call calculate_max_poll_attempts 0.5 0.1 + The status should be failure + End + + It 'should fail for a non-numeric interval' + When call calculate_max_poll_attempts 1 abc + The status should be failure + End + End + + # This section tests - start_localdns # This function is defined in parts/linux/cloud-init/artifacts/localdns.sh file. #------------------------------------------------------------------------------------------------------------------------------------ @@ -669,10 +722,12 @@ EOF setup() { Include "./parts/linux/cloud-init/artifacts/localdns.sh" LOCALDNS_PID_FILE="/tmp/localdns.pid" + SLEEP_LOG_FILE="/tmp/localdns-start-sleep-log-$$" } cleanup() { rm -f "${LOCALDNS_PID_FILE}" rm -f ./mock-coredns.sh + rm -f "${SLEEP_LOG_FILE}" } BeforeEach 'setup' AfterEach 'cleanup' @@ -707,6 +762,37 @@ EOF The status should be failure The output should include "Timed out waiting for CoreDNS to create PID file" End + + It 'should poll for the PID file every 0.1 seconds' + mock_coredns() { + return 0 + } + COREDNS_COMMAND="mock_coredns" + START_LOCALDNS_TIMEOUT=1 + sleep() { + echo "sleep called with: $1" + } + When call start_localdns + The status should be failure + The output should include "sleep called with: 0.1" + End + + It 'should succeed after polling for the PID file every 0.1 seconds' + mock_coredns() { + return 0 + } + COREDNS_COMMAND="mock_coredns" + EXPECTED_SLEEP_LOG=$(printf '0.1\n') + sleep() { + echo "$1" >> "$SLEEP_LOG_FILE" + echo "12345" > "${LOCALDNS_PID_FILE}" + } + When call start_localdns + The status should be success + The output should include "Localdns PID is 12345." + The file "${LOCALDNS_PID_FILE}" should be exist + The contents of file "$SLEEP_LOG_FILE" should eq "$EXPECTED_SLEEP_LOG" + End End @@ -716,35 +802,152 @@ EOF Describe 'wait_for_localdns_ready' setup() { Include "./parts/linux/cloud-init/artifacts/localdns.sh" + DATE_SEQUENCE_FILE="/tmp/localdns-date-sequence-$$" + SLEEP_LOG_FILE="/tmp/localdns-sleep-log-$$" + } + cleanup() { + rm -f "$DATE_SEQUENCE_FILE" "${DATE_SEQUENCE_FILE}.next" "$SLEEP_LOG_FILE" } BeforeEach 'setup' + AfterEach 'cleanup' #------------------------- wait_for_localdns_ready ----------------------------------------------------------- It 'should return success if localdns is ready' CURL_COMMAND="echo OK" - MAX_ATTEMPTS=100 TIMEOUT=5 - When call wait_for_localdns_ready $MAX_ATTEMPTS $TIMEOUT + When call wait_for_localdns_ready $TIMEOUT The status should be success The output should include "Waiting for localdns to start and be able to serve traffic." The output should include "Localdns is online and ready to serve traffic." End - It 'should return failure if localdns is not ready, after timeout' + It 'should return failure if localdns is not ready after the wall-clock timeout' + CURL_COMMAND="echo NOTOK" + TIMEOUT=2 + EXPECTED_SLEEP_LOG=$(printf '0.1\n0.1\n') + # Expected date consumption order: + # 1. starttime initialization -> 100 + # 2. first loop timeout check -> 100 + # 3. second loop timeout check -> 101 + # 4. third loop timeout check -> 102 (triggers timeout after two sleeps) + cat > "$DATE_SEQUENCE_FILE" <&2 + return 1 + fi + + current_time=$(head -n 1 "$DATE_SEQUENCE_FILE") + tail -n +2 "$DATE_SEQUENCE_FILE" > "${DATE_SEQUENCE_FILE}.next" + mv "${DATE_SEQUENCE_FILE}.next" "$DATE_SEQUENCE_FILE" + + echo "$current_time" + } + sleep() { + echo "$1" >> "$SLEEP_LOG_FILE" + } + When call wait_for_localdns_ready $TIMEOUT + The status should be failure + The output should include "Localdns failed to come online after ${TIMEOUT} seconds (timeout)." + The contents of file "$SLEEP_LOG_FILE" should eq "$EXPECTED_SLEEP_LOG" + End + + It 'should prefer the wall-clock timeout message when timeout and attempt cap are both reached' CURL_COMMAND="echo NOTOK" - MAX_ATTEMPTS=1000 TIMEOUT=2 - When call wait_for_localdns_ready $MAX_ATTEMPTS $TIMEOUT + LOCALDNS_READY_POLL_INTERVAL_SECONDS=0.5 + EXPECTED_SLEEP_LOG=$(printf '0.5\n0.5\n0.5\n0.5\n') + # Expected date consumption order: + # 1. starttime initialization -> 100 + # 2. first loop timeout check -> 100 + # 3. second loop timeout check -> 100 + # 4. third loop timeout check -> 101 + # 5. fourth loop timeout check -> 101 + # 6. fifth loop timeout check -> 102 (wall-clock timeout and attempt cap both true) + cat > "$DATE_SEQUENCE_FILE" <&2 + return 1 + fi + + current_time=$(head -n 1 "$DATE_SEQUENCE_FILE") + tail -n +2 "$DATE_SEQUENCE_FILE" > "${DATE_SEQUENCE_FILE}.next" + mv "${DATE_SEQUENCE_FILE}.next" "$DATE_SEQUENCE_FILE" + + echo "$current_time" + } + sleep() { + echo "$1" >> "$SLEEP_LOG_FILE" + } + When call wait_for_localdns_ready $TIMEOUT The status should be failure The output should include "Localdns failed to come online after ${TIMEOUT} seconds (timeout)." + The output should not include "safety limit" + The contents of file "$SLEEP_LOG_FILE" should eq "$EXPECTED_SLEEP_LOG" End - It 'should return failure if localdns is not ready, after max attempts' + It 'should fail if readiness polling attempts cannot be calculated' CURL_COMMAND="echo NOTOK" - MAX_ATTEMPTS=2 - TIMEOUT=50 - When call wait_for_localdns_ready $MAX_ATTEMPTS $TIMEOUT + TIMEOUT=abc + When call wait_for_localdns_ready $TIMEOUT + The status should be failure + The output should include "Failed to calculate localdns readiness poll attempts for timeout ${TIMEOUT} and interval ${LOCALDNS_READY_POLL_INTERVAL_SECONDS}." + End + + It 'should return failure after derived max attempts when the clock does not advance' + CURL_COMMAND="echo NOTOK" + TIMEOUT=2 + LOCALDNS_READY_POLL_INTERVAL_SECONDS=0.5 + EXPECTED_SLEEP_LOG=$(printf '0.5\n0.5\n0.5\n0.5\n') + cat > "$DATE_SEQUENCE_FILE" <&2 + return 1 + fi + + current_time=$(head -n 1 "$DATE_SEQUENCE_FILE") + tail -n +2 "$DATE_SEQUENCE_FILE" > "${DATE_SEQUENCE_FILE}.next" + mv "${DATE_SEQUENCE_FILE}.next" "$DATE_SEQUENCE_FILE" + + echo "$current_time" + } + sleep() { + echo "$1" >> "$SLEEP_LOG_FILE" + } + When call wait_for_localdns_ready $TIMEOUT The status should be failure - The output should include "Localdns failed to come online after ${MAX_ATTEMPTS} attempts." + The output should include "Localdns failed to come online after 4 attempts (safety limit for ${TIMEOUT} seconds timeout)." + The contents of file "$SLEEP_LOG_FILE" should eq "$EXPECTED_SLEEP_LOG" End End