From a744f959ca8c6a0b8284d9415edc45033241ca73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 9 May 2026 11:52:32 +0000 Subject: [PATCH] fix(ci): unwedge approve-test-queue by cancelling orphaned runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real failure mode: A CICD run on a force-pushed PR head was left in 'waiting' status with a non-empty /pending_deployments list, but POST to approve it 422'd ('No pending deployment requests'). Every cron tick fetched the same orphan first and crashed on the error path (KeyError on a non-existent 'id' key, raised while printing the failure message, before the intended exit(1) could run), so the job failed. With MAX_CONCURRENCY=1 the queue stayed wedged for hours. Fix: 1. After a failed approval POST (a 422 in the observed case; the script does not inspect the status code), GET pending_deployments again to distinguish the two cases: - empty -> race: another approver won, log it and skip without failing. - non-empty -> orphan/wedged run: cancel it via POST /actions/runs/{id}/cancel so the queue can move on. 2. cancel_run() is a new helper: POST + raise_for_status, returns True on success and False on any request error. 3. If a wedged run can't even be cancelled, collect a flag and exit(1) at end-of-loop so the cron is loud about it (and other queue items still get processed first). 4. KeyError on the error path is gone — the only place a deployment id was printed referenced a key that doesn't exist on /pending_deployments items. Signed-off-by: oliver könig --- .github/workflows/cicd-approve-test-queue.yml | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml index 608005072b5d..f61d129fea2b 100644 --- a/.github/workflows/cicd-approve-test-queue.yml +++ b/.github/workflows/cicd-approve-test-queue.yml @@ -81,6 +81,20 @@ jobs: return None + def cancel_run(run_id): + """Cancel a workflow run. 
Returns True on success.""" + url = f"{API_BASE}/actions/runs/{run_id}/cancel" + try: + response = requests.post(url, headers=headers) + response.raise_for_status() + return True + except requests.exceptions.RequestException as e: + print(f"Error cancelling run {run_id}: {str(e)}") + if hasattr(e.response, 'text'): + print(f"Response: {e.response.text}") + return False + + def get_workflow_runs(status): """Get all workflow runs for a given status.""" all_results = [] @@ -129,6 +143,7 @@ jobs: # Process each deployment print("Processing ...") + had_unrecoverable = False for workflow in pending_workflows: if total_workflows >= MAX_CONCURRENCY: print("Maximum concurrency reached, stopping approvals") @@ -139,7 +154,11 @@ jobs: print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") deployment_url = f"actions/runs/{workflow_id}/pending_deployments" - deployment = make_request(deployment_url)[0] + deployments = make_request(deployment_url) or [] + if not deployments: + print(f"No pending deployments for run {workflow_id} (race: approved between list and GET), skipping") + continue + deployment = deployments[0] environment_id = deployment["environment"]["id"] # Approve the deployment @@ -152,9 +171,28 @@ jobs: if result: total_workflows += 1 + continue + + # POST failed. Distinguish race (someone else approved between our GET + # and POST) from a wedged run (deployment listed as pending but GitHub + # refuses to approve it — typically when the parent ref was force-pushed + # and the run was orphaned). The former is benign; the latter blocks + # every future cron tick because the run sits at the head of the + # waiting queue forever, so we cancel it. + deployments_after = make_request(deployment_url) or [] + if not deployments_after: + print(f"Run {workflow_id} approved by another path (race), skipping") + continue + + print(f"Run {workflow_id} is wedged: POST refused but pending_deployments still non-empty. 
Cancelling.") + if cancel_run(workflow_id): + print(f"Cancelled wedged run {workflow_id}") else: - print(f"Failed to approve deployment {deployment['id']}") - exit(1) + print(f"Could not cancel wedged run {workflow_id}; manual intervention required") + had_unrecoverable = True + + if had_unrecoverable: + exit(1) EOF notify: