From 13100a01d51a5bb45efd9806abded618e2065bdf Mon Sep 17 00:00:00 2001
From: Siddharth Suresh <siddharth@stellar.org>
Date: Fri, 15 May 2026 21:17:06 +0000
Subject: [PATCH] PCv2: skip log collection in signal-triggered cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PCv2 cleanup function was collecting logs from every worker pod before
calling `helm uninstall`. With up to 1024 workers and serial per-pod log
collection, the cleanup would routinely take minutes — far exceeding the ~5s
Jenkins gives between SIGTERM and SIGKILL. Result: aborted PCv2 runs were
leaving entire worker fleets running after the Jenkins job died, because
`helm uninstall` never got a chance to run.

Add a `signalTriggered` parameter to `cleanup`. Signal handlers
(ProcessExit, CancelKeyPress) now pass `true`, which skips log collection
and goes straight to `helm uninstall`. Normal / legitimate-failure callers
pass `false`, preserving the existing logs-first behavior so debugging
output isn't lost when pods would still be alive to read from anyway.
---
 .../MissionHistoryPubnetParallelCatchupV2.fs  | 66 +++++++++++++------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
index 9526d268..601e5e57 100644
--- a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
+++ b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -278,38 +278,64 @@ let collectLogsFromPods (context: MissionContext) =
         with ex ->
             LogWarn "Could not collect logs from pod %s (this is expected if pod doesn't exist): %s" podName ex.Message
 
-// Cleanup on exit
-let cleanup (context: MissionContext) =
+// Cleanup on exit. `signalTriggered` indicates we're running under a hard
+// deadline (Jenkins' SoftKillWaitSeconds, ~5s by default, before SIGKILL).
+// In that case we have to prioritize getting `helm uninstall` issued ahead
+// of the much-slower log collection — otherwise we get SIGKILLed mid-
+// collection and leak every worker pod, which is what we saw in practice
+// with a 1024-worker run aborted from Jenkins.
+let cleanup (signalTriggered: bool) (context: MissionContext) =
     if toPerformCleanup then
         toPerformCleanup <- false
-        LogInfo "Cleaning up resources for release: %s" helmReleaseName
 
-        // Try to collect logs from all worker pods before cleanup
-        try
-            LogInfo "Attempting to collect worker logs before cleanup..."
-            let stopwatch = Stopwatch.StartNew()
-            collectLogsFromPods context
-            stopwatch.Stop()
-            LogInfo "Log collection completed in %.2f seconds" stopwatch.Elapsed.TotalSeconds
-        with ex -> LogWarn "Failed to collect some or all worker logs: %s" ex.Message
-
-        RunShellCommand [| "helm"
-                           "uninstall"
-                           helmReleaseName |]
-        |> ignore
+        if signalTriggered then
+            // Abort path: resources first, logs are nice-to-have.
+            // Skip log collection entirely — even parallelized it can't beat
+            // Jenkins' ~5s grace before SIGKILL, and it can't beat the per-pod
+            // terminationGracePeriodSeconds (default 30s) when scaled to 1024
+            // workers. Whatever logs were captured inline by the failure
+            // handler in the main loop are still on disk.
+            LogInfo "Signal-triggered cleanup: uninstalling release %s" helmReleaseName
+
+            RunShellCommand [| "helm"
+                               "uninstall"
+                               helmReleaseName |]
+            |> ignore
+        else
+            // Normal / legitimate-failure path: pods are still alive through
+            // this entire window, so we can collect all logs before deleting.
+            LogInfo "Cleaning up resources for release: %s" helmReleaseName
+
+            try
+                LogInfo "Attempting to collect worker logs before cleanup..."
+                let stopwatch = Stopwatch.StartNew()
+                collectLogsFromPods context
+                stopwatch.Stop()
+                LogInfo "Log collection completed in %.2f seconds" stopwatch.Elapsed.TotalSeconds
+            with ex -> LogWarn "Failed to collect some or all worker logs: %s" ex.Message
+
+            RunShellCommand [| "helm"
+                               "uninstall"
+                               helmReleaseName |]
+            |> ignore
 
 let mutable cleanupContext : MissionContext option = None
 
+// NOTE: AppDomain.ProcessExit handlers have a soft ~2-second runtime budget
+// before .NET force-exits the process. If we ever observe that this budget is insufficient, switch to
+// `PosixSignalRegistration.Create(PosixSignal.SIGTERM, ...)`
+// which has no such budget and lets the handler run to completion within
+// Jenkins' full SoftKillWaitSeconds window (~5s default).
 System.AppDomain.CurrentDomain.ProcessExit.Add
     (fun _ ->
         match cleanupContext with
-        | Some ctx -> cleanup ctx
+        | Some ctx -> cleanup true ctx
         | None -> ())
 
 Console.CancelKeyPress.Add
     (fun _ ->
         match cleanupContext with
-        | Some ctx -> cleanup ctx
+        | Some ctx -> cleanup true ctx
         | None -> ()
 
         Environment.Exit(0))
@@ -412,7 +438,7 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
                 timeoutLeft <- timeoutLeft - jobMonitorStatusCheckIntervalSecs
                 if timeoutLeft <= 0 then failwith "job monitor not reachable"
         with ex ->
-            cleanup context
+            cleanup false context
             raise ex
 
-    cleanup context
+    cleanup false context