
Commit c8548f6

Skip failure reports for already failed nodes (valkey-io#2434)
This change avoids creating additional failure reports when a node is already marked as failed. The failure report count is never used after a node has been marked as failed, so there is no value in maintaining it beyond that point. Skipping it eliminates both the add and the delete failure-report operations, hence the performance benefit: in a 2000-node cluster (1000 primaries / 1000 replicas) with 330 nodes in the failed state, we observed an average 10% reduction in p99 CPU utilization with this change.

---------

Signed-off-by: Seungmin Lee <sungming@amazon.com>
1 parent eae23ba commit c8548f6

5 files changed with 92 additions and 19 deletions

src/cluster_legacy.c

Lines changed: 9 additions & 3 deletions
@@ -1784,15 +1784,21 @@ static void decodeFailureReportKey(unsigned char *buf, mstime_t *report_time, cl
  * 'failing' is the node that is in failure state according to the
  * 'sender' node.
  *
- * The function returns 0 if it early-exits (same sender & time bucket)
- * or updates a timestamp of an existing failure report from the same sender.
- * 1 is returned if a new failure report is created. */
+ * Returns 0:
+ * - The node is already in FAIL state
+ * - The same sender has already reported within the same time bucket
+ * - An existing report from 'sender' was refreshed (timestamp updated)
+ * Returns 1 if a brand new failure report entry is created. */
 int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
     unsigned char buf[FAILURE_REPORT_KEYLEN];
     mstime_t now = mstime();
     const mstime_t bucketed_time = (now / SEC_IN_MS) * SEC_IN_MS + SEC_IN_MS;
     int is_new = 1;
 
+    /* This avoids unnecessary iteration and memory ops, improving performance
+     * when handling repeated reports for already failed nodes. */
+    if (nodeFailed(failing)) return 0;
+
     /* Look for any existing entry from this sender and remove it */
     raxIterator ri;
     raxStart(&ri, failing->fail_reports);
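
The effect of the guard is easy to see in isolation. Below is a minimal, self-contained C sketch of the early-exit pattern, not the real implementation: clusterNode, nodeFailed(), the flag value, and the report counter are simplified stand-ins for the actual cluster structures, and only the bucketed_time computation mirrors the line shown above.

/* Sketch of the early-exit guard; types and counter are assumptions. */
#include <stdio.h>

typedef long long mstime_t;
#define SEC_IN_MS 1000
#define CLUSTER_NODE_FAIL (1 << 3) /* illustrative flag value */

typedef struct clusterNode {
    int flags;
    int fail_report_count; /* stands in for the fail_reports rax */
} clusterNode;

static int nodeFailed(const clusterNode *n) {
    return (n->flags & CLUSTER_NODE_FAIL) != 0;
}

/* Mirrors the patched clusterNodeAddFailureReport(): returns 1 only when
 * a brand new report is recorded, 0 when the node is already FAILed. */
static int addFailureReport(clusterNode *failing, mstime_t now) {
    /* New guard: once the node is FAILed, the report count is never read
     * again, so skip the rax iteration and add/delete bookkeeping. */
    if (nodeFailed(failing)) return 0;
    /* Round the timestamp up to the next whole second, as the real code
     * does, so repeated reports from one sender within a second dedupe. */
    mstime_t bucketed_time = (now / SEC_IN_MS) * SEC_IN_MS + SEC_IN_MS;
    (void)bucketed_time; /* the real code keys the rax entry with this */
    failing->fail_report_count++;
    return 1;
}

int main(void) {
    clusterNode n = {0, 0};
    printf("%d\n", addFailureReport(&n, 1699999999123)); /* 1: recorded */
    n.flags |= CLUSTER_NODE_FAIL;
    printf("%d\n", addFailureReport(&n, 1699999999500)); /* 0: skipped */
    return 0;
}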

src/commands.def

Lines changed: 4 additions & 2 deletions
@@ -415,7 +415,9 @@ const char *CLUSTER_BUMPEPOCH_Tips[] = {
 
 #ifndef SKIP_CMD_HISTORY_TABLE
 /* CLUSTER COUNT_FAILURE_REPORTS history */
-#define CLUSTER_COUNT_FAILURE_REPORTS_History NULL
+commandHistory CLUSTER_COUNT_FAILURE_REPORTS_History[] = {
+{"9.0.0","Ignore additional failure reports for a node which has been marked as failed."},
+};
 #endif
 
 #ifndef SKIP_CMD_TIPS_TABLE
@@ -1138,7 +1140,7 @@ struct COMMAND_STRUCT CLUSTER_Subcommands[] = {
 {MAKE_CMD("addslotsrange","Assigns new hash slot ranges to a node.","O(N) where N is the total number of the slots between the start slot and end slot arguments.","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_ADDSLOTSRANGE_History,0,CLUSTER_ADDSLOTSRANGE_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_ADDSLOTSRANGE_Keyspecs,0,NULL,1),.args=CLUSTER_ADDSLOTSRANGE_Args},
 {MAKE_CMD("bumpepoch","Advances the cluster config epoch.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_BUMPEPOCH_History,0,CLUSTER_BUMPEPOCH_Tips,1,clusterCommand,2,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_BUMPEPOCH_Keyspecs,0,NULL,0)},
 {MAKE_CMD("cancelslotmigrations","Cancel all current ongoing slot migration operations.","O(N), where N is the number of slot migration operations being cancelled.","9.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_CANCELSLOTMIGRATIONS_History,0,CLUSTER_CANCELSLOTMIGRATIONS_Tips,0,clusterCommand,2,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_CANCELSLOTMIGRATIONS_Keyspecs,0,NULL,0)},
-{MAKE_CMD("count-failure-reports","Returns the number of active failure reports active for a node.","O(N) where N is the number of failure reports","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNT_FAILURE_REPORTS_History,0,CLUSTER_COUNT_FAILURE_REPORTS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_COUNT_FAILURE_REPORTS_Keyspecs,0,NULL,1),.args=CLUSTER_COUNT_FAILURE_REPORTS_Args},
+{MAKE_CMD("count-failure-reports","Returns the number of active failure reports for a node. No new reports are created once the node is marked as failed.","O(N) where N is the number of failure reports","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNT_FAILURE_REPORTS_History,1,CLUSTER_COUNT_FAILURE_REPORTS_Tips,1,clusterCommand,3,CMD_ADMIN|CMD_STALE,0,CLUSTER_COUNT_FAILURE_REPORTS_Keyspecs,0,NULL,1),.args=CLUSTER_COUNT_FAILURE_REPORTS_Args},
 {MAKE_CMD("countkeysinslot","Returns the number of keys in a hash slot.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_COUNTKEYSINSLOT_History,0,CLUSTER_COUNTKEYSINSLOT_Tips,0,clusterCommand,3,CMD_STALE,0,CLUSTER_COUNTKEYSINSLOT_Keyspecs,0,NULL,1),.args=CLUSTER_COUNTKEYSINSLOT_Args},
 {MAKE_CMD("delslots","Sets hash slots as unbound for a node.","O(N) where N is the total number of hash slot arguments","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_DELSLOTS_History,0,CLUSTER_DELSLOTS_Tips,0,clusterCommand,-3,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_DELSLOTS_Keyspecs,0,NULL,1),.args=CLUSTER_DELSLOTS_Args},
 {MAKE_CMD("delslotsrange","Sets hash slot ranges as unbound for a node.","O(N) where N is the total number of the slots between the start slot and end slot arguments.","7.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,CLUSTER_DELSLOTSRANGE_History,0,CLUSTER_DELSLOTSRANGE_Tips,0,clusterCommand,-4,CMD_NO_ASYNC_LOADING|CMD_ADMIN|CMD_STALE,0,CLUSTER_DELSLOTSRANGE_Keyspecs,0,NULL,1),.args=CLUSTER_DELSLOTSRANGE_Args},

src/commands/cluster-count-failure-reports.json

Lines changed: 7 additions & 1 deletion
@@ -1,12 +1,18 @@
 {
     "COUNT-FAILURE-REPORTS": {
-        "summary": "Returns the number of active failure reports active for a node.",
+        "summary": "Returns the number of active failure reports for a node. No new reports are created once the node is marked as failed.",
         "complexity": "O(N) where N is the number of failure reports",
         "group": "cluster",
         "since": "3.0.0",
         "arity": 3,
         "container": "CLUSTER",
         "function": "clusterCommand",
+        "history": [
+            [
+                "9.0.0",
+                "Ignore additional failure reports for a node which has been marked as failed."
+            ]
+        ],
         "command_flags": [
             "ADMIN",
             "STALE"

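Since the command tables are generated from these JSON files, the new history entry surfaces to clients through command introspection. A hypothetical valkey-cli session is sketched below; COMMAND DOCS itself is a real command, but the abbreviated reply shape shown here is an assumption and varies by client and version:

127.0.0.1:6379> COMMAND DOCS CLUSTER
... (reply abbreviated; under the "subcommands" entry for
"count-failure-reports", a "history" field now appears)
   "history"
   1) 1) "9.0.0"
      2) "Ignore additional failure reports for a node which has been marked as failed."
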
tests/unit/cluster/failure-marking.tcl

Lines changed: 64 additions & 10 deletions
@@ -100,16 +100,6 @@ tags {external:skip tls:skip cluster singledb} {
         wait_node_marked_fail 3 $replica_id
         wait_node_marked_fail 4 $replica_id
 
-        # Check if we got the right failure reports.
-        wait_for_condition 1000 50 {
-            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
-            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
-            [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
-            [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
-        } else {
-            fail "Cluster COUNT-FAILURE-REPORTS is not right."
-        }
-
         resume_process $replica_pid
 
         # Check there are no failure reports left.
@@ -124,3 +114,67 @@ tags {external:skip tls:skip cluster singledb} {
         }
     }
 }
+
+# Test that no new failure report is added once the node is already marked as FAIL
+start_cluster 3 1 {tags {external:skip cluster}} {
+    test "Primaries do not add failure-report after replica is already marked FAIL" {
+        # Primary nodes
+        set primary0 [srv 0 client];
+        set primary0_pid [srv 0 pid]
+        set primary1 [srv -1 client];
+        set primary1_pid [srv -1 pid]
+        set primary2 [srv -2 client];
+        set primary2_pid [srv -2 pid]
+
+        # Replica node
+        set replica0 [srv -3 client];
+        set replica0_pid [srv -3 pid]
+        set replica0_id [dict get [cluster_get_myself 3] id]
+
+        assert_equal [lindex [$primary0 role] 0] {master}
+        assert_equal [lindex [$primary1 role] 0] {master}
+        assert_equal [lindex [$primary2 role] 0] {master}
+        assert_equal [lindex [$replica0 role] 0] {slave}
+
+        # Ensure the replica is synced before simulating failure
+        wait_for_sync $replica0
+
+        # This prevents a quorum of alive masters from reaching FAIL immediately,
+        # so we can observe the PFAIL gossip and ensure failure reports get added.
+        pause_process $replica0_pid
+        pause_process $primary0_pid
+        pause_process $primary1_pid
+
+        # The active primary (primary2) should mark the replica PFAIL
+        wait_node_marked_pfail 2 $replica0_id
+
+        # Resume one paused primary (primary0) to reach a quorum of 2 masters
+        resume_process $primary0_pid
+
+        # Now the replica should transition to FAIL on the running primaries
+        wait_node_marked_fail 0 $replica0_id
+        wait_node_marked_fail 2 $replica0_id
+
+        # Resume the final paused primary (primary1);
+        # other nodes should not add a new failure report from primary1
+        resume_process $primary1_pid
+
+        # Ensure that primary0 and primary2 do not have more than one report
+        wait_for_condition 1000 50 {
+            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] < 2 &&
+            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] < 2
+        } else {
+            fail "primary0 or primary2 has more than one failure report"
+        }
+
+        # Bring the replica back online and verify cleanup
+        resume_process $replica0_pid
+        wait_for_condition 1000 50 {
+            [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0 &&
+            [R 1 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0 &&
+            [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica0_id] == 0
+        } else {
+            fail "Failure-report lists were not cleared after replica recovery"
+        }
+    }
+}

tests/unit/cluster/human-announced-nodename.tcl

Lines changed: 8 additions & 3 deletions
@@ -1,5 +1,5 @@
 # Check if cluster's view of human announced nodename is reported in logs
-start_cluster 3 0 {tags {external:skip cluster}} {
+start_cluster 4 0 {tags {external:skip cluster}} {
     test "Set cluster human announced nodename and let it propagate" {
         for {set j 0} {$j < [llength $::servers]} {incr j} {
             R $j config set cluster-announce-hostname "host-$j.com"
@@ -18,12 +18,17 @@ start_cluster 3 0 {tags {external:skip cluster}} {
     test "Human nodenames are visible in log messages" {
         # Pause instance 0, so everyone thinks it is dead
         pause_process [srv 0 pid]
+        pause_process [srv -1 pid]
 
         # We're going to use a message we will know will be sent, node unreachable,
         # since it includes the other node gossiping.
-        wait_for_log_messages -1 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500
-        wait_for_log_messages -2 {"*Node * (nodename-1) reported node * (nodename-0) as not reachable*"} 0 20 500
+        wait_for_log_messages -2 {"*Node * (nodename-3) reported node * (nodename-0) as not reachable*"} 0 20 500
+        wait_for_log_messages -3 {"*Node * (nodename-2) reported node * (nodename-0) as not reachable*"} 0 20 500
+
+        wait_for_log_messages -2 {"*Node * (nodename-3) reported node * (nodename-1) as not reachable*"} 0 20 500
+        wait_for_log_messages -3 {"*Node * (nodename-2) reported node * (nodename-1) as not reachable*"} 0 20 500
 
         resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
     }
 }
