From a744f959ca8c6a0b8284d9415edc45033241ca73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?oliver=20k=C3=B6nig?= <okoenig@nvidia.com>
Date: Sat, 9 May 2026 11:52:32 +0000
Subject: [PATCH] fix(ci): unwedge approve-test-queue by cancelling orphaned
 runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real failure mode:
A CICD run on a force-pushed PR head was left in 'waiting' status with a
non-empty /pending_deployments list, but POST to approve it 422'd ('No
pending deployment requests'). Every cron tick fetched the same orphan
first, crashed (KeyError on a non-existent 'id' key), and exit(1)'d.
With MAX_CONCURRENCY=1 the queue stayed wedged for hours.

Fix:
1. After a failed approval POST (e.g. the 422 above), GET
   pending_deployments again to distinguish the two cases:
     - empty  -> race: another approver won; log it and skip.
     - non-empty -> orphan/wedged run: cancel it via
       POST /actions/runs/{id}/cancel so the queue can move on.
2. cancel_run() is a new helper: POST + raise_for_status, returns bool.
3. If a wedged run can't even be cancelled, set a flag and exit(1)
   after the loop finishes so the cron is loud about it (and other
   queue items still get processed first).
4. KeyError on the error path is gone — the only place a deployment
   id was printed referenced a key that doesn't exist on
   /pending_deployments items.

Signed-off-by: oliver könig <okoenig@nvidia.com>
---
 .github/workflows/cicd-approve-test-queue.yml | 44 +++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml
index 608005072b5d..f61d129fea2b 100644
--- a/.github/workflows/cicd-approve-test-queue.yml
+++ b/.github/workflows/cicd-approve-test-queue.yml
@@ -81,6 +81,20 @@ jobs:
                   return None
 
 
+          def cancel_run(run_id):
+              """Cancel workflow run ``run_id``; return True on success, False on any request error (which is printed)."""
+              url = f"{API_BASE}/actions/runs/{run_id}/cancel"
+              try:
+                  response = requests.post(url, headers=headers, timeout=30)  # bounded so a stalled connection cannot hang the cron job
+                  response.raise_for_status()
+                  return True
+              except requests.exceptions.RequestException as e:  # also covers Timeout and HTTPError
+                  print(f"Error cancelling run {run_id}: {e}")
+                  if e.response is not None:  # connection-level failures carry no response
+                      print(f"Response: {e.response.text}")
+                  return False
+
+
           def get_workflow_runs(status):
               """Get all workflow runs for a given status."""
               all_results = []
@@ -129,6 +143,7 @@ jobs:
 
           # Process each deployment
           print("Processing ...")
+          had_unrecoverable = False
           for workflow in pending_workflows:
               if total_workflows >= MAX_CONCURRENCY:
                   print("Maximum concurrency reached, stopping approvals")
@@ -139,7 +154,11 @@ jobs:
               print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")
 
               deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
-              deployment = make_request(deployment_url)[0]
+              deployments = make_request(deployment_url) or []
+              if not deployments:
+                  print(f"No pending deployments for run {workflow_id} (race: approved between list and GET), skipping")
+                  continue
+              deployment = deployments[0]
               environment_id = deployment["environment"]["id"]
 
               # Approve the deployment
@@ -152,9 +171,28 @@ jobs:
 
               if result:
                   total_workflows += 1
+                  continue
+
+              # POST failed. Distinguish race (someone else approved between our GET
+              # and POST) from a wedged run (deployment listed as pending but GitHub
+              # refuses to approve it — typically when the parent ref was force-pushed
+              # and the run was orphaned). The former is benign; the latter blocks
+              # every future cron tick because the run sits at the head of the
+              # waiting queue forever, so we cancel it.
+              deployments_after = make_request(deployment_url) or []
+              if not deployments_after:
+                  print(f"Run {workflow_id} approved by another path (race), skipping")
+                  continue
+
+              print(f"Run {workflow_id} is wedged: POST refused but pending_deployments still non-empty. Cancelling.")
+              if cancel_run(workflow_id):
+                  print(f"Cancelled wedged run {workflow_id}")
               else:
-                  print(f"Failed to approve deployment {deployment['id']}")
-                  exit(1)
+                  print(f"Could not cancel wedged run {workflow_id}; manual intervention required")
+                  had_unrecoverable = True
+
+          if had_unrecoverable:
+              exit(1)
 
           EOF
   notify: