
Commit c8548f6

Skip failure reports for already failed nodes (valkey-io#2434)
This change avoids creating additional failure reports when a node is already marked as failed. The failure report count is never used after a node has been marked as failed, so there is no value in maintaining it beyond that point. Skipping it eliminates both the add and the delete failure-report operations, hence the performance benefit: in a 2000-node cluster (1000 primaries / 1000 replicas) with 330 nodes in the failed state, we observed an average 10% reduction in p99 CPU utilization with this change.

---------

Signed-off-by: Seungmin Lee <sungming@amazon.com>
1 parent eae23ba commit c8548f6

5 files changed with 92 additions and 19 deletions

src/cluster_legacy.c

Lines changed: 9 additions & 3 deletions
@@ -1784,15 +1784,21 @@ static void decodeFailureReportKey(unsigned char *buf, mstime_t *report_time, cl
  * 'failing' is the node that is in failure state according to the
  * 'sender' node.
  *
- * The function returns 0 if it early-exits (same sender & time bucket)
- * or updates a timestamp of an existing failure report from the same sender.
- * 1 is returned if a new failure report is created. */
+ * Returns 0:
+ * - The node is already in FAIL state
+ * - The same sender has already reported within the same time bucket
+ * - An existing report from 'sender' was refreshed (timestamp updated)
+ * Returns 1 if a brand new failure report entry is created. */
 int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
     unsigned char buf[FAILURE_REPORT_KEYLEN];
     mstime_t now = mstime();
     const mstime_t bucketed_time = (now / SEC_IN_MS) * SEC_IN_MS + SEC_IN_MS;
     int is_new = 1;
 
+    /* This avoids unnecessary iteration and memory ops, improving performance
+     * when handling repeated reports for already failed nodes. */
+    if (nodeFailed(failing)) return 0;
+
     /* Look for any existing entry from this sender and remove it */
     raxIterator ri;
     raxStart(&ri, failing->fail_reports);
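
The effect of the guard is easy to see in isolation. Below is a minimal, self-contained C sketch of the early-exit pattern, not the real implementation: clusterNode, nodeFailed(), the flag value, and the report counter are simplified stand-ins for the actual cluster structures, and only the bucketed_time computation mirrors the line shown above.

/* Sketch of the early-exit guard; types and counter are assumptions. */
#include <stdio.h>

typedef long long mstime_t;
#define SEC_IN_MS 1000
#define CLUSTER_NODE_FAIL (1 << 3) /* illustrative flag value */

typedef struct clusterNode {
    int flags;
    int fail_report_count; /* stands in for the fail_reports rax */
} clusterNode;

static int nodeFailed(const clusterNode *n) {
    return (n->flags & CLUSTER_NODE_FAIL) != 0;
}

/* Mirrors the patched clusterNodeAddFailureReport(): returns 1 only when
 * a brand new report is recorded, 0 when the node is already FAILed. */
static int addFailureReport(clusterNode *failing, mstime_t now) {
    /* New guard: once the node is FAILed, the report count is never read
     * again, so skip the rax iteration and add/delete bookkeeping. */
    if (nodeFailed(failing)) return 0;
    /* Round the timestamp up to the next whole second, as the real code
     * does, so repeated reports from one sender within a second dedupe. */
    mstime_t bucketed_time = (now / SEC_IN_MS) * SEC_IN_MS + SEC_IN_MS;
    (void)bucketed_time; /* the real code keys the rax entry with this */
    failing->fail_report_count++;
    return 1;
}

int main(void) {
    clusterNode n = {0, 0};
    printf("%d\n", addFailureReport(&n, 1699999999123)); /* 1: recorded */
    n.flags |= CLUSTER_NODE_FAIL;
    printf("%d\n", addFailureReport(&n, 1699999999500)); /* 0: skipped */
    return 0;
}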

src/commands.def

Lines changed: 4 additions & 2 deletions
@@ -415,7 +415,9 @@ const char *CLUSTER_BUMPEPOCH_Tips[] = {
 
 #ifndef SKIP_CMD_HISTORY_TABLE
 /* CLUSTER COUNT_FAILURE_REPORTS history */
-#define CLUSTER_COUNT_FAILURE_REPORTS_History NULL
+commandHistory CLUSTER_COUNT_FAILURE_REPORTS_History[] = {
+{"9.0.0","Ignore additional failure reports for a node which has been marked as failed."},
+};
 #endif
 
 #ifndef SKIP_CMD_TIPS_TABLE
@@ -1138,7 +1140,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = {
 {MAKE_CMD("addslotsrange","Assigns new hash slot ranges to a node.","O(N) where N is the total number of the slots between the start slot and end slot arguments.","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_ADDSLOTSRANGE_History,0,CLUSTER_ADDSLOTSRANGE_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_ADDSLOTSRANGE_Keyspecs,0,NULL,1),.args=CLUSTER_ADDSLOTSRANGE_Args},
 {MAKE_CMD("bumpepoch","Advances the cluster config epoch.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_BUMPEPOCH_History,0,CLUSTER_BUMPEPOCH_Tips,1,clusterCommand,2,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_BUMPEPOCH_Keyspecs,0,NULL,0)},
 {MAKE_CMD("cancelslotmigrations","Cancel all current ongoing slot migration operations.","O(N), where N is the number of slot migration operations being cancelled.","9.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_CANCELSLOTMIGRATIONS_History,0,CLUSTER_CANCELSLOTMIGRATIONS_Tips,0,clusterCommand,2,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_CANCELSLOTMIGRATIONS_Keyspecs,0,NULL,0)},
-{MAKE_CMD("count-failure-reports","Returns the number of active failure reports active for a node.","O(N) where N is the number of failure reports","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNT_FAILURE_REPORTS_History,0,CLUSTER_COUNT_FAILURE_REPORTS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_COUNT_FAILURE_REPORTS_Keyspecs,0,NULL,1),.args=CLUSTER_COUNT_FAILURE_REPORTS_Args},
+{MAKE_CMD("count-failure-reports","Returns the number of active failure reports for a node. No new reports are created once the node is marked as failed.","O(N) where N is the number of failure reports","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNT_FAILURE_REPORTS_History,1,CLUSTER_COUNT_FAILURE_REPORTS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_COUNT_FAILURE_REPORTS_Keyspecs,0,NULL,1),.args=CLUSTER_COUNT_FAILURE_REPORTS_Args},
 {MAKE_CMD("countkeysinslot","Returns the number of keys in a hash slot.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNTKEYSINSLOT_History,0,CLUSTER_COUNTKEYSINSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_COUNTKEYSINSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_COUNTKEYSINSLOT_Args},
 {MAKE_CMD("delslots","Sets hash slots as unbound for a node.","O(N) where N is the total number of hash slot arguments","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_DELSLOTS_History,0,CLUSTER_DELSLOTS_Tips,0,clusterCommand,-3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_DELSLOTS_Keyspecs,0,NULL,1),.args=CLUSTER_DELSLOTS_Args},
 {MAKE_CMD("delslotsrange","Sets hash slot ranges as unbound for a node.","O(N) where N is the total number of the slots between the start slot and end slot arguments.","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_DELSLOTSRANGE_History,0,CLUSTER_DELSLOTSRANGE_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_DELSLOTSRANGE_Keyspecs,0,NULL,1),.args=CLUSTER_DELSLOTSRANGE_Args},

src/commands/cluster-count-failure-reports.json

Lines changed: 7 additions & 1 deletion
@@ -1,12 +1,18 @@
 {
     "COUNT-FAILURE-REPORTS": {
-        "summary": "Returns the number of active failure reports active for a node.",
+        "summary": "Returns the number of active failure reports for a node. No new reports are created once the node is marked as failed.",
         "complexity": "O(N) where N is the number of failure reports",
         "group": "cluster",
         "since": "3.0.0",
         "arity": 3,
         "container": "CLUSTER",
         "function": "clusterCommand",
+        "history": [
+            [
+                "9.0.0",
+                "Ignore additional failure reports for a node which has been marked as failed."
+            ]
+        ],
         "command_flags": [
             "ADMIN",
             "STALE"

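Since the command tables are generated from these JSON files, the new history entry surfaces to clients through command introspection. A hypothetical valkey-cli session is sketched below; COMMAND DOCS itself is a real command, but the abbreviated reply shape shown here is an assumption and varies by client and version:

127.0.0.1:6379> COMMAND DOCS CLUSTER
... (reply abbreviated; under the "subcommands" entry for
"count-failure-reports", a "history" field now appears)
   "history"
   1) 1) "9.0.0"
      2) "Ignore additional failure reports for a node which has been marked as failed."
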
tests/unit/cluster/failure-marking.tcl

Lines changed: 64 additions & 10 deletions
@@ -100,16 +100,6 @@ tags {external:skip tls:skip cluster singledb} {
         wait_node_marked_fail 3 $replica_id
         wait_node_marked_fail 4 $replica_id
 
-        # Check if we got the right failure reports.
-        wait_for_condition 1000 50 {
-            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
-            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
-            [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
-            [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
-        } else {
-            fail "Cluster COUNT-FAILURE-REPORTS is not right."
-        }
-
         resume_process $replica_pid
 
         # Check there are no failure reports left.
@@ -124,3 +114,67 @@ tags {external:skip tls:skip cluster singledb} {
         }
     }
 }
+
+# Test that no new failure report is added once the node is already marked as FAIL
+start_cluster 3 1 {tags {external:skip cluster}} {
+    test "Primaries do not add failure-report after replica is already marked FAIL" {
+        # Primary nodes
+        set primary0 [srv 0 client];
+        set primary0_pid [srv 0 pid]
+        set primary1 [srv -1 client];
+        set primary1_pid [srv -1 pid]
+        set primary2 [srv -2 client];
+        set primary2_pid [srv -2 pid]
+
+        # Replica node
+        set replica0 [srv -3 client];
+        set replica0_pid [srv -3 pid]
+        set replica0_id [dict get [cluster_get_myself 3] id]
+
+        assert_equal [lindex [$primary0 role] 0] {master}
+        assert_equal [lindex [$primary1 role] 0] {master}
+        assert_equal [lindex [$primary2 role] 0] {master}
+        assert_equal [lindex [$replica0 role] 0] {slave}
+
+        # Ensure the replica is synced before simulating failure
+        wait_for_sync $replica0
+
+        # This prevents a quorum of alive masters from reaching FAIL immediately,
+        # so we can observe the PFAIL gossip and ensure failure reports get added.
+        pause_process $replica0_pid
+        pause_process $primary0_pid
+        pause_process $primary1_pid
+
+        # The active primary (primary2) should mark the replica PFAIL
+        wait_node_marked_pfail 2 $replica0_id
+
+        # Resume one paused primary (primary0) to reach a quorum of 2 masters
+        resume_process $primary0_pid
+
+        # Now the replica should transition to FAIL on the running primaries
+        wait_node_marked_fail 0 $replica0_id
+        wait_node_marked_fail 2 $replica0_id
+
+        # Resume the final paused primary (primary1);
+        # other nodes should not add a new failure report from primary1
+        resume_process $primary1_pid
+
+        # Ensure that primary0 and primary2 do not have more than one report
+        wait_for_condition 1000 50 {
+            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] < 2 &&
+            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] < 2
+        } else {
+            fail "primary0 or primary2 has more than one failure report"
+        }
+
+        # Bring the replica back online and verify cleanup
+        resume_process $replica0_pid
+        wait_for_condition 1000 50 {
+            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0 &&
+            [R 1 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0 &&
+            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0
+        } else {
+            fail "Failure-report lists were not cleared after replica recovery"
+        }
+    }
+}

tests/unit/cluster/human-announced-nodename.tcl

Lines changed: 8 additions & 3 deletions
@@ -1,5 +1,5 @@
 # Check if cluster's view of human announced nodename is reported in logs
-start_cluster 3 0 {tags {external:skip cluster}} {
+start_cluster 4 0 {tags {external:skip cluster}} {
     test "Set cluster human announced nodename and let it propagate" {
         for {set j 0} {$j < [llength $::servers]} {incr j} {
             R $j config set cluster-announce-hostname "host-$j.com"
@@ -18,12 +18,17 @@ start_cluster 3 0 {tags {external:skip cluster}} {
     test "Human nodenames are visible in log messages" {
         # Pause instance 0, so everyone thinks it is dead
         pause_process [srv 0 pid]
+        pause_process [srv -1 pid]
 
         # We're going to use a message we will know will be sent, node unreachable,
         # since it includes the other node gossiping.
-        wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500
-        wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 20 500
+        wait_for_log_messages -2 {"*Node * (nodename-3) reported node * (nodename-0) as not reachable*"} 0 20 500
+        wait_for_log_messages -3 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500
+
+        wait_for_log_messages -2 {"*Node * (nodename-3) reported node * (nodename-1) as not reachable*"} 0 20 500
+        wait_for_log_messages -3 {"*Node * (nodename-2) reported node * (nodename-1) as not reachable*"} 0 20 500
 
         resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
     }
 }
