From 7017b385d201d0a624e32b599e13453a3ec15d12 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 27 Apr 2026 12:42:54 -0700 Subject: [PATCH 01/18] secure run_simulators workflow with OIDC and SSM --- .github/workflows/run-simulators.yml | 656 ++++++++++++++++++--------- 1 file changed, 445 insertions(+), 211 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 99f9fd43a..13c891929 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -1,222 +1,456 @@ name: run_simulators + on: # IMPORTANT: this workflow should only be triggered manually via the Actions # portal of the repo!!! Do not modify this workflow's trigger! workflow_dispatch: +# Deny GitHub token permissions by default; grant only what individual jobs need. permissions: {} +# Only one simulator run should use the shared EC2 instance/volume at a time. +concurrency: + group: sim + cancel-in-progress: false + +# Shared AWS config used by the EC2 and SSM commands. +env: + INSTANCE_ID: ${{ vars.AWS_EC2_INSTANCE_ID }} + AWS_DEFAULT_REGION: ${{ vars.AWS_REGION }} + jobs: - start_ec2_instance: - name: start_ec2_instance - runs-on: ubuntu-latest - concurrency: - group: sim - outputs: - volume_id: ${{ steps.create_volume_step.outputs.volume_id }} - env: - INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - steps: - - name: Create Volume from Latest Snapshot and Attach to Instance - id: create_volume_step - run: | - # Retrieve the latest snapshot ID - LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) - echo "Checking availability for snapshot: $LATEST_SNAPSHOT_ID" - - # Wait for the snapshot to complete - aws ec2 wait snapshot-completed --snapshot-ids $LATEST_SNAPSHOT_ID - echo "Snapshot is ready." - - # Create a new volume from the latest snapshot - volume_id=$(aws ec2 create-volume --snapshot-id $LATEST_SNAPSHOT_ID --availability-zone us-west-1b --volume-type gp3 --size 400 --throughput 250 --query "VolumeId" --output text) - echo "Created volume with ID: $volume_id" - - # Set volume_id as output - echo "volume_id=$volume_id" >> $GITHUB_OUTPUT - cat $GITHUB_OUTPUT - - # Wait until the volume is available - aws ec2 wait volume-available --volume-ids $volume_id - echo "Volume is now available" - - # Attach the volume to the instance - aws ec2 attach-volume --volume-id $volume_id --instance-id $INSTANCE_ID --device /dev/sda1 - echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - - - name: Start EC2 Instance - run: | - # Get the instance state - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - - # If the machine is stopping wait for it to fully stop - while [ "$instance_state" == "stopping" ]; do - echo "Instance is stopping, waiting for it to fully stop..." - sleep 10 - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - done - - # Check if instance state is "stopped" - if [[ "$instance_state" == "stopped" ]]; then - echo "Instance is stopped, starting it..." - aws ec2 start-instances --instance-ids $INSTANCE_ID - elif [[ "$instance_state" == "pending" ]]; then - echo "Instance startup is pending, continuing..." - elif [[ "$instance_state" == "running" ]]; then - echo "Instance is already running..." - exit 0 - else - echo "Unknown instance state: $instance_state" + start_ec2_instance: + name: start_ec2_instance + runs-on: ubuntu-latest + permissions: + id-token: write # Required for GitHub OIDC -> AWS role assumption. + outputs: + volume_id: ${{ steps.create_volume_step.outputs.volume_id }} + steps: + # Use OIDC to get short-lived AWS credentials instead of storing long-lived AWS keys. + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Create Volume from Latest Snapshot and Attach to Instance + id: create_volume_step + run: | + set -euo pipefail + + # Retrieve the latest snapshot ID owned by this AWS account. + LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) + echo "Checking availability for snapshot: $LATEST_SNAPSHOT_ID" + + # Wait for the snapshot to complete before creating the temporary test volume. + aws ec2 wait snapshot-completed --snapshot-ids "$LATEST_SNAPSHOT_ID" + echo "Snapshot is ready." + + # Create a temporary volume from the latest simulator snapshot. + volume_id=$(aws ec2 create-volume --snapshot-id "$LATEST_SNAPSHOT_ID" --availability-zone us-west-1b --volume-type gp3 --size 400 --throughput 250 --query "VolumeId" --output text) + echo "Created volume with ID: $volume_id" + + # Save the volume ID so the cleanup job can detach/delete it later. + echo "volume_id=$volume_id" >> "$GITHUB_OUTPUT" + + aws ec2 wait volume-available --volume-ids "$volume_id" + echo "Volume is now available" + + aws ec2 attach-volume --volume-id "$volume_id" --instance-id "$INSTANCE_ID" --device /dev/sda1 + echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" + + - name: Start EC2 Instance + run: | + set -euo pipefail + + # Start the simulator instance only if it is currently stopped. + instance_state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" | jq -r '.Reservations[].Instances[].State.Name') + + while [ "$instance_state" = "stopping" ]; do + echo "Instance is stopping, waiting for it to fully stop..." + sleep 10 + instance_state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" | jq -r '.Reservations[].Instances[].State.Name') + done + + if [ "$instance_state" = "stopped" ]; then + echo "Instance is stopped, starting it..." + aws ec2 start-instances --instance-ids "$INSTANCE_ID" + elif [ "$instance_state" = "pending" ]; then + echo "Instance startup is pending, continuing..." + elif [ "$instance_state" = "running" ]; then + echo "Instance is already running..." + exit 0 + else + echo "Unknown instance state: $instance_state" + exit 1 + fi + + echo "Waiting for instance status checks to pass..." + aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" + echo "Instance is now ready for use." + + check_simulator_version_updates: + name: check_simulator_version_updates + runs-on: ubuntu-latest + needs: start_ec2_instance + permissions: + id-token: write # Required because this job sends commands through AWS SSM. + steps: + # Use OIDC to get short-lived AWS credentials for SSM. + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Check for Simulator Version Updates + env: + GH_SHA: ${{ github.sha }} + run: | + set -euo pipefail + + # Build the remote shell script as a JSON array for AWS-RunShellScript. + commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' </dev/null || true) + + case "$status" in + Success) + break + ;; + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 - fi - - # Wait for instance status checks to pass - echo "Waiting for instance status checks to pass..." - aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID - echo "Instance is now ready for use." - - - check_simulator_version_updates: - name: check_simulator_version_updates - runs-on: ubuntu-latest - needs: start_ec2_instance - steps: - - name: Check for Simulator Version Updates - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST }} - USER_NAME: ${{ secrets.SSH_USERNAME }} - GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} - GH_REF: ${{ github.ref }} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -o SendEnv=GH_REF -i private_key ${USER_NAME}@${HOSTNAME} ' - cd /home/ubuntu/actions/ && - rm -rf Scenic && - git clone --branch $(basename "$GH_REF") --single-branch https://$GH_ACCESS_TOKEN@github.com/BerkeleyLearnVerify/Scenic.git && - cd Scenic && - python3 -m venv venv && - source venv/bin/activate && - python3 -m pip install -e .[test-full] && - python3 .github/check_latest_simulators.py - ' - - check_nvidia_smi: - name: check_nvidia_smi - runs-on: ubuntu-latest - needs: start_ec2_instance - continue-on-error: true - steps: - - name: Check NVIDIA SMI - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{ secrets.SSH_HOST}} - USER_NAME: ${{ secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - output=$(nvidia-smi) - echo "$output" - if [ -z "$output" ]; then - echo "NVIDIA Driver is not set" + ;; + Pending|InProgress|Delayed|"") + sleep 5 + ;; + *) + echo "Unexpected SSM status: $status" + sleep 5 + ;; + esac + done + + aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text + + check_nvidia_smi: + name: check_nvidia_smi + runs-on: ubuntu-latest + needs: start_ec2_instance + continue-on-error: true + permissions: + id-token: write # Required because this job sends commands through AWS SSM. + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Check NVIDIA SMI + run: | + set -euo pipefail + + # Run the GPU driver check remotely through SSM. + commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' <<'EOF' + set -euo pipefail + output=$(nvidia-smi) + echo "$output" + + if [ -z "$output" ]; then + echo "NVIDIA Driver is not set" + exit 1 + fi + EOF + ) + + command_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name AWS-RunShellScript \ + --parameters "commands=$commands" \ + --query 'Command.CommandId' \ + --output text) + + while true; do + status=$(aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text 2>/dev/null || true) + + case "$status" in + Success) + break + ;; + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 - fi - ' - - name: NVIDIA Driver is not set - if: ${{ failure() }} - run: | - echo "NVIDIA SMI is not working, please run the steps here on the instance:" - echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" - - run_carla_simulators: - name: run_carla_simulators - runs-on: ubuntu-latest - needs: [check_simulator_version_updates, check_nvidia_smi] - steps: - - name: Run CARLA Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{secrets.SSH_HOST}} - USER_NAME: ${{secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -i private_key ${USER_NAME}@${HOSTNAME} ' - cd /home/ubuntu/actions/Scenic && - source venv/bin/activate && - carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) && - for version in "${carla_versions[@]}"; do - echo "============================= CARLA $version =============================" - export CARLA_ROOT="$version" - pytest tests/simulators/carla - done - ' - - run_webots_simulators: - name: run_webots_simulators - runs-on: ubuntu-latest - needs: [check_simulator_version_updates, check_nvidia_smi] - steps: - - name: Run Webots Tests - env: - PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} - HOSTNAME: ${{secrets.SSH_HOST}} - USER_NAME: ${{secrets.SSH_USERNAME}} - run: | - echo "$PRIVATE_KEY" > private_key && chmod 600 private_key - ssh -o StrictHostKeyChecking=no -i private_key ${USER_NAME}@${HOSTNAME} ' - Xvfb :99 -screen 0 1024x768x16 & - cd /home/ubuntu/actions/Scenic && - source venv/bin/activate && - webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) && - export DISPLAY=:99 && - for version in "${webots_versions[@]}"; do - echo "============================= Webots $version =============================" - export WEBOTS_ROOT="$version" - pytest tests/simulators/webots - done - kill %1 - ' - - stop_ec2_instance: - name: stop_ec2_instance - runs-on: ubuntu-latest - needs: [start_ec2_instance, check_simulator_version_updates, check_nvidia_smi, run_carla_simulators, run_webots_simulators] - if: always() - env: - VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} - INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }} - steps: - - name: Stop EC2 Instance - run: | - # Get the instance state and stop it if running - instance_state=$(aws ec2 describe-instances --instance-ids $INSTANCE_ID | jq -r '.Reservations[].Instances[].State.Name') - if [[ "$instance_state" == "running" ]]; then - echo "Instance is running, stopping it..." - aws ec2 stop-instances --instance-ids $INSTANCE_ID - aws ec2 wait instance-stopped --instance-ids $INSTANCE_ID - echo "Instance has stopped." - elif [[ "$instance_state" == "stopped" ]]; then - echo "Instance is already stopped." - else - echo "Unexpected instance state: $instance_state" - exit 1 - fi - - - name: Detach Volume - run: | - # Detach the volume - aws ec2 detach-volume --volume-id $VOLUME_ID - aws ec2 wait volume-available --volume-ids $VOLUME_ID - echo "Volume $VOLUME_ID detached." - - - name: Delete Volume - run: | - # Delete the volume after snapshot is complete - aws ec2 delete-volume --volume-id $VOLUME_ID - echo "Volume $VOLUME_ID deleted." + ;; + Pending|InProgress|Delayed|"") + sleep 5 + ;; + *) + echo "Unexpected SSM status: $status" + sleep 5 + ;; + esac + done + + aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text + + - name: NVIDIA Driver is not set + if: ${{ failure() }} + run: | + echo "NVIDIA SMI is not working, please run the steps here on the instance:" + echo "https://scenic-lang.atlassian.net/wiki/spaces/KAN/pages/2785287/Setting+Up+AWS+VM?parentProduct=JSW&initialAllowedFeatures=byline-contributors.byline-extensions.page-comments.delete.page-reactions.inline-comments.non-licensed-share&themeState=dark%253Adark%2520light%253Alight%2520spacing%253Aspacing%2520colorMode%253Alight&locale=en-US#Install-NVIDIA-Drivers" + + run_carla_simulators: + name: run_carla_simulators + runs-on: ubuntu-latest + needs: [check_simulator_version_updates, check_nvidia_smi] + permissions: + id-token: write # Required because this job sends commands through AWS SSM. + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Run CARLA Tests + run: | + set -euo pipefail + + # Run CARLA tests once for each installed CARLA version on the instance. + commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' <<'EOF' + set -euo pipefail + cd /home/ubuntu/actions/Scenic + source venv/bin/activate + + carla_versions=($(find /software -maxdepth 1 -type d -name 'carla*')) + for version in "${carla_versions[@]}"; do + echo "============================= CARLA $version =============================" + export CARLA_ROOT="$version" + pytest tests/simulators/carla + done + EOF + ) + + command_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name AWS-RunShellScript \ + --parameters "commands=$commands" \ + --query 'Command.CommandId' \ + --output text) + + while true; do + status=$(aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text 2>/dev/null || true) + + case "$status" in + Success) + break + ;; + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + exit 1 + ;; + Pending|InProgress|Delayed|"") + sleep 10 + ;; + *) + echo "Unexpected SSM status: $status" + sleep 10 + ;; + esac + done + + aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text + + run_webots_simulators: + name: run_webots_simulators + runs-on: ubuntu-latest + needs: [check_simulator_version_updates, check_nvidia_smi] + permissions: + id-token: write # Required because this job sends commands through AWS SSM. + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Run Webots Tests + run: | + set -euo pipefail + + # Run Webots tests on a virtual display because Webots needs graphical support. + commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' <<'EOF' + set -euo pipefail + + Xvfb :99 -screen 0 1024x768x16 & + xvfb_pid=$! + trap "kill $xvfb_pid" EXIT + + cd /home/ubuntu/actions/Scenic + source venv/bin/activate + + webots_versions=($(find /software -maxdepth 1 -type d -name 'webots*')) + export DISPLAY=:99 + + for version in "${webots_versions[@]}"; do + echo "============================= Webots $version =============================" + export WEBOTS_ROOT="$version" + pytest tests/simulators/webots + done + EOF + ) + + command_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name AWS-RunShellScript \ + --parameters "commands=$commands" \ + --query 'Command.CommandId' \ + --output text) + + while true; do + status=$(aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text 2>/dev/null || true) + + case "$status" in + Success) + break + ;; + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + exit 1 + ;; + Pending|InProgress|Delayed|"") + sleep 10 + ;; + *) + echo "Unexpected SSM status: $status" + sleep 10 + ;; + esac + done + + aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text + + stop_ec2_instance: + name: stop_ec2_instance + runs-on: ubuntu-latest + permissions: + id-token: write # Required for GitHub OIDC -> AWS role assumption. + needs: + [ + start_ec2_instance, + check_simulator_version_updates, + check_nvidia_smi, + run_carla_simulators, + run_webots_simulators, + ] + if: always() # Run cleanup even if earlier jobs failed. + env: + VOLUME_ID: ${{ needs.start_ec2_instance.outputs.volume_id }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 + with: + role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ vars.AWS_REGION }} + allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + + - name: Stop EC2 Instance + run: | + set -euo pipefail + + instance_state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" | jq -r '.Reservations[].Instances[].State.Name') + if [ "$instance_state" = "running" ]; then + echo "Instance is running, stopping it..." + aws ec2 stop-instances --instance-ids "$INSTANCE_ID" + aws ec2 wait instance-stopped --instance-ids "$INSTANCE_ID" + echo "Instance has stopped." + elif [ "$instance_state" = "stopped" ]; then + echo "Instance is already stopped." + else + echo "Unexpected instance state: $instance_state" + exit 1 + fi + + - name: Detach Volume + run: | + set -euo pipefail + + # Detach the temporary volume created from the latest snapshot. + aws ec2 detach-volume --volume-id "$VOLUME_ID" + aws ec2 wait volume-available --volume-ids "$VOLUME_ID" + echo "Volume $VOLUME_ID detached." + + - name: Delete Volume + run: | + set -euo pipefail + + # Delete the temporary volume so repeated workflow runs do not leave extra storage behind. + aws ec2 delete-volume --volume-id "$VOLUME_ID" + echo "Volume $VOLUME_ID deleted." + From 2a94e89ca161bf6aeb47a195fd5c0e2756bd3d98 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 27 Apr 2026 13:46:39 -0700 Subject: [PATCH 02/18] run SSM commands with Bash --- .github/workflows/run-simulators.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 13c891929..f07220591 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -113,7 +113,9 @@ jobs: set -euo pipefail # Build the remote shell script as a JSON array for AWS-RunShellScript. + # AWS documents this pattern for running Bash scripts through Run Command. commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' < Date: Mon, 27 Apr 2026 14:24:21 -0700 Subject: [PATCH 03/18] set CARLA SSM runtime environment --- .github/workflows/run-simulators.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index f07220591..3f55bebec 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -270,6 +270,20 @@ jobs: commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' <<'EOF' #!/bin/bash set -euo pipefail + + export HOME=/home/ubuntu + export USER=ubuntu + export LOGNAME=ubuntu + export XDG_RUNTIME_DIR=/tmp/runtime-ubuntu + mkdir -p "$XDG_RUNTIME_DIR" + chmod 700 "$XDG_RUNTIME_DIR" + + whoami + echo "HOME=$HOME" + echo "USER=$USER" + echo "LOGNAME=$LOGNAME" + echo "XDG_RUNTIME_DIR=$XDG_RUNTIME_DIR" + cd /home/ubuntu/actions/Scenic source venv/bin/activate From ca6d79238d614dd18ed3ad58970ac8b37fbb9bda Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 27 Apr 2026 15:13:00 -0700 Subject: [PATCH 04/18] run CARLA tests as ubuntu under SSM --- .github/workflows/run-simulators.yml | 40 +++++++++++----------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 3f55bebec..41f90c698 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -113,7 +113,7 @@ jobs: set -euo pipefail # Build the remote shell script as a JSON array for AWS-RunShellScript. - # AWS documents this pattern for running Bash scripts through Run Command. + # The first remote command is #!/bin/bash so SSM runs the script with Bash. commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' < Date: Mon, 27 Apr 2026 15:46:24 -0700 Subject: [PATCH 05/18] fix repo ownership for SSM simulator tests --- .github/workflows/run-simulators.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 41f90c698..47f50747a 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -126,6 +126,9 @@ jobs: source venv/bin/activate python3 -m pip install -e .[test-full] python3 .github/check_latest_simulators.py + + # Later simulator tests run as ubuntu, so make sure the checked-out repo is writable by ubuntu. + chown -R ubuntu:ubuntu /home/ubuntu/actions/Scenic EOF ) From 324e759f792594d566944bf324bcf641b99ff6d6 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 27 Apr 2026 16:09:12 -0700 Subject: [PATCH 06/18] fix SSM checkout ownership before tests --- .github/workflows/run-simulators.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 47f50747a..b9d1a398b 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -122,13 +122,16 @@ jobs: git clone --no-checkout https://github.com/BerkeleyLearnVerify/Scenic.git Scenic cd Scenic git checkout --detach "$GH_SHA" + + # Later simulator tests run as ubuntu, so make sure the checked-out repo is writable by ubuntu + # before any Scenic-generated files are created. + chown -R ubuntu:ubuntu /home/ubuntu/actions/Scenic + chmod -R u+rwX /home/ubuntu/actions/Scenic + python3 -m venv venv source venv/bin/activate python3 -m pip install -e .[test-full] python3 .github/check_latest_simulators.py - - # Later simulator tests run as ubuntu, so make sure the checked-out repo is writable by ubuntu. - chown -R ubuntu:ubuntu /home/ubuntu/actions/Scenic EOF ) From b034417fa2a93231b9081ab62779b8afed6b97f9 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 28 Apr 2026 09:29:08 -0700 Subject: [PATCH 07/18] run simulator setup as ubuntu under SSM --- .github/workflows/run-simulators.yml | 36 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index b9d1a398b..1858e3f39 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -117,21 +117,27 @@ jobs: commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' < Date: Tue, 28 Apr 2026 16:02:38 -0700 Subject: [PATCH 08/18] cleanup --- .github/workflows/run-simulators.yml | 128 +++++++++++---------------- 1 file changed, 50 insertions(+), 78 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 1858e3f39..2dccad178 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -38,42 +38,42 @@ jobs: - name: Create Volume from Latest Snapshot and Attach to Instance id: create_volume_step run: | - set -euo pipefail - - # Retrieve the latest snapshot ID owned by this AWS account. + # Retrieve the latest snapshot ID LATEST_SNAPSHOT_ID=$(aws ec2 describe-snapshots --owner-ids self --query 'Snapshots | sort_by(@, &StartTime) | [-1].SnapshotId' --output text) echo "Checking availability for snapshot: $LATEST_SNAPSHOT_ID" - # Wait for the snapshot to complete before creating the temporary test volume. + # Wait for the snapshot to complete aws ec2 wait snapshot-completed --snapshot-ids "$LATEST_SNAPSHOT_ID" echo "Snapshot is ready." - # Create a temporary volume from the latest simulator snapshot. + # Create a new volume from the latest snapshot volume_id=$(aws ec2 create-volume --snapshot-id "$LATEST_SNAPSHOT_ID" --availability-zone us-west-1b --volume-type gp3 --size 400 --throughput 250 --query "VolumeId" --output text) echo "Created volume with ID: $volume_id" - # Save the volume ID so the cleanup job can detach/delete it later. + # Set volume_id as output echo "volume_id=$volume_id" >> "$GITHUB_OUTPUT" + # Wait until the volume is available aws ec2 wait volume-available --volume-ids "$volume_id" echo "Volume is now available" + # Attach the volume to the instance aws ec2 attach-volume --volume-id "$volume_id" --instance-id "$INSTANCE_ID" --device /dev/sda1 echo "Volume $volume_id attached to instance $INSTANCE_ID as /dev/sda1" - name: Start EC2 Instance run: | - set -euo pipefail - - # Start the simulator instance only if it is currently stopped. + # Get the instance state instance_state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" | jq -r '.Reservations[].Instances[].State.Name') + # If the machine is stopping wait for it to fully stop while [ "$instance_state" = "stopping" ]; do echo "Instance is stopping, waiting for it to fully stop..." sleep 10 instance_state=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" | jq -r '.Reservations[].Instances[].State.Name') done + # Check if instance state is "stopped" if [ "$instance_state" = "stopped" ]; then echo "Instance is stopped, starting it..." aws ec2 start-instances --instance-ids "$INSTANCE_ID" @@ -87,6 +87,7 @@ jobs: exit 1 fi + # Wait for instance status checks to pass echo "Waiting for instance status checks to pass..." aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" echo "Instance is now ready for use." @@ -110,32 +111,24 @@ jobs: env: GH_SHA: ${{ github.sha }} run: | - set -euo pipefail - # Build the remote shell script as a JSON array for AWS-RunShellScript. # The first remote command is #!/bin/bash so SSM runs the script with Bash. commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' < Date: Tue, 28 Apr 2026 16:06:21 -0700 Subject: [PATCH 09/18] cleanup --- .github/workflows/run-simulators.yml | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 2dccad178..16f91089c 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -147,19 +147,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'Status' \ + --query 'StatusDetails' \ --output text 2>/dev/null || true) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|DeliveryTimedOut|ExecutionTimedOut|Undeliverable|Terminated) + Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 ;; - Pending|InProgress|Delayed|"") + Pending|"In Progress"|InProgress|Delayed|"") sleep 5 ;; *) @@ -215,19 +215,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'Status' \ + --query 'StatusDetails' \ --output text 2>/dev/null || true) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|DeliveryTimedOut|ExecutionTimedOut|Undeliverable|Terminated) + Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 ;; - Pending|InProgress|Delayed|"") + Pending|"In Progress"|InProgress|Delayed|"") sleep 5 ;; *) @@ -293,19 +293,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'Status' \ + --query 'StatusDetails' \ --output text 2>/dev/null || true) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|DeliveryTimedOut|ExecutionTimedOut|Undeliverable|Terminated) + Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 ;; - Pending|InProgress|Delayed|"") + Pending|"In Progress"|InProgress|Delayed|"") sleep 10 ;; *) @@ -368,19 +368,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'Status' \ + --query 'StatusDetails' \ --output text 2>/dev/null || true) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|DeliveryTimedOut|ExecutionTimedOut|Undeliverable|Terminated) + Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true exit 1 ;; - Pending|InProgress|Delayed|"") + Pending|"In Progress"|InProgress|Delayed|"") sleep 10 ;; *) @@ -437,6 +437,7 @@ jobs: fi - name: Detach Volume + if: ${{ needs.start_ec2_instance.outputs.volume_id != '' }} run: | # Detach the volume aws ec2 detach-volume --volume-id "$VOLUME_ID" @@ -444,6 +445,7 @@ jobs: echo "Volume $VOLUME_ID detached." - name: Delete Volume + if: ${{ needs.start_ec2_instance.outputs.volume_id != '' }} run: | # Delete the volume after snapshot is complete aws ec2 delete-volume --volume-id "$VOLUME_ID" From 7675431632507a40d016735eb7a520610166ac99 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 28 Apr 2026 16:53:20 -0700 Subject: [PATCH 10/18] cleanup --- .github/workflows/run-simulators.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 16f91089c..91815390f 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -97,7 +97,7 @@ jobs: runs-on: ubuntu-latest needs: start_ec2_instance permissions: - id-token: write # Required because this job sends commands through AWS SSM. + id-token: write steps: # Use OIDC to get short-lived AWS credentials for SSM. - name: Configure AWS Credentials @@ -181,7 +181,7 @@ jobs: needs: start_ec2_instance continue-on-error: true permissions: - id-token: write # Required because this job sends commands through AWS SSM. + id-token: write steps: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -254,7 +254,7 @@ jobs: runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] permissions: - id-token: write # Required because this job sends commands through AWS SSM. + id-token: write steps: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -326,7 +326,7 @@ jobs: runs-on: ubuntu-latest needs: [check_simulator_version_updates, check_nvidia_smi] permissions: - id-token: write # Required because this job sends commands through AWS SSM. + id-token: write steps: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -400,7 +400,7 @@ jobs: name: stop_ec2_instance runs-on: ubuntu-latest permissions: - id-token: write # Required for GitHub OIDC -> AWS role assumption. + id-token: write needs: [ start_ec2_instance, From 0225fe9fe7773a34c8994ba1731d8a91a3a80fd8 Mon Sep 17 00:00:00 2001 From: lola Date: Wed, 29 Apr 2026 07:28:23 -0700 Subject: [PATCH 11/18] cleanup --- .github/workflows/run-simulators.yml | 49 +++++++++++++--------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 91815390f..15321c42d 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -5,25 +5,22 @@ on: # portal of the repo!!! Do not modify this workflow's trigger! workflow_dispatch: -# Deny GitHub token permissions by default; grant only what individual jobs need. permissions: {} -# Only one simulator run should use the shared EC2 instance/volume at a time. +# Ensure only one simulator workflow runs at a time concurrency: group: sim cancel-in-progress: false -# Shared AWS config used by the EC2 and SSM commands. env: - INSTANCE_ID: ${{ vars.AWS_EC2_INSTANCE_ID }} - AWS_DEFAULT_REGION: ${{ vars.AWS_REGION }} + INSTANCE_ID: ${{ secrets.AWS_EC2_INSTANCE_ID }} jobs: start_ec2_instance: name: start_ec2_instance runs-on: ubuntu-latest permissions: - id-token: write # Required for GitHub OIDC -> AWS role assumption. + id-token: write # This is required for OIDC to request the JWT outputs: volume_id: ${{ steps.create_volume_step.outputs.volume_id }} steps: @@ -31,9 +28,9 @@ jobs: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Create Volume from Latest Snapshot and Attach to Instance id: create_volume_step @@ -99,13 +96,13 @@ jobs: permissions: id-token: write steps: - # Use OIDC to get short-lived AWS credentials for SSM. + # Use OIDC to get AWS credentials for SSM. - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Check for Simulator Version Updates env: @@ -186,9 +183,9 @@ jobs: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Check NVIDIA SMI run: | @@ -259,9 +256,9 @@ jobs: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Run CARLA Tests run: | @@ -331,9 +328,9 @@ jobs: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Run Webots Tests run: | @@ -400,7 +397,7 @@ jobs: name: stop_ec2_instance runs-on: ubuntu-latest permissions: - id-token: write + id-token: write needs: [ start_ec2_instance, @@ -416,9 +413,9 @@ jobs: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 with: - role-to-assume: ${{ vars.AWS_ROLE_TO_ASSUME }} - aws-region: ${{ vars.AWS_REGION }} - allowed-account-ids: ${{ vars.AWS_ACCOUNT_ID }} + role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} + aws-region: ${{ secrets.AWS_REGION }} + allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} - name: Stop EC2 Instance run: | From 3560233e25ce31a7468d3b389df05b2b06bbc3b9 Mon Sep 17 00:00:00 2001 From: lola Date: Mon, 4 May 2026 10:27:36 -0700 Subject: [PATCH 12/18] clean up simulator workflow --- .github/workflows/run-simulators.yml | 66 ++++++++++------------------ 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 15321c42d..9920b9a74 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -115,8 +115,6 @@ jobs: # SSM runs as root, but CARLA cannot. Use ubuntu for checkout/install too # so generated files like parser.py are writable during simulator tests. - mkdir -p /home/ubuntu/actions && - chown -R ubuntu:ubuntu /home/ubuntu/actions && sudo -u ubuntu -H bash -lc ' cd /home/ubuntu/actions/ && rm -rf Scenic && @@ -144,23 +142,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'StatusDetails' \ - --output text 2>/dev/null || true) + --query 'Status' \ + --output text) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text exit 1 ;; - Pending|"In Progress"|InProgress|Delayed|"") - sleep 5 - ;; - *) - echo "Unexpected SSM status: $status" + Pending|InProgress|Delayed) sleep 5 ;; esac @@ -212,23 +206,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'StatusDetails' \ - --output text 2>/dev/null || true) + --query 'Status' \ + --output text) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text exit 1 ;; - Pending|"In Progress"|InProgress|Delayed|"") - sleep 5 - ;; - *) - echo "Unexpected SSM status: $status" + Pending|InProgress|Delayed) sleep 5 ;; esac @@ -290,23 +280,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'StatusDetails' \ - --output text 2>/dev/null || true) + --query 'Status' \ + --output text) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text exit 1 ;; - Pending|"In Progress"|InProgress|Delayed|"") - sleep 10 - ;; - *) - echo "Unexpected SSM status: $status" + Pending|InProgress|Delayed) sleep 10 ;; esac @@ -365,23 +351,19 @@ jobs: status=$(aws ssm get-command-invocation \ --command-id "$command_id" \ --instance-id "$INSTANCE_ID" \ - --query 'StatusDetails' \ - --output text 2>/dev/null || true) + --query 'Status' \ + --output text) case "$status" in Success) break ;; - Failed|Cancelled|TimedOut|Cancelling|"Delivery Timed Out"|"Execution Timed Out"|Undeliverable|Terminated|Incomplete|"Rate Exceeded") - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text || true - aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text || true + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text exit 1 ;; - Pending|"In Progress"|InProgress|Delayed|"") - sleep 10 - ;; - *) - echo "Unexpected SSM status: $status" + Pending|InProgress|Delayed) sleep 10 ;; esac From e5b60ec97ab263cd3aa0b473abad0c4f3bf901fa Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 5 May 2026 09:40:35 -0700 Subject: [PATCH 13/18] extend SSM execution timeout for CARLA tests --- .github/workflows/run-simulators.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 9920b9a74..5324931f0 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -269,10 +269,12 @@ jobs: EOF ) + # AWS-RunShellScript's executionTimeout defaults to 3600 seconds (1 hour). + # CARLA tests can take longer, so allow this SSM command to run for 3 hours. command_id=$(aws ssm send-command \ --instance-ids "$INSTANCE_ID" \ --document-name AWS-RunShellScript \ - --parameters "commands=$commands" \ + --parameters "commands=$commands,executionTimeout=10800" \ --query 'Command.CommandId' \ --output text) From 97d09d4b2dc7d5d135c0d017551ac80a8543ae72 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 12 May 2026 12:25:31 -0700 Subject: [PATCH 14/18] Install matching CARLA Python API during simulator tests --- .github/workflows/run-simulators.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 5324931f0..9defd7e65 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -257,12 +257,17 @@ jobs: commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' <<'EOF' #!/bin/bash sudo -u ubuntu -H bash -lc ' + set -e cd /home/ubuntu/actions/Scenic && source venv/bin/activate && - carla_versions=($(find /software -maxdepth 1 -type d -name "carla*")) && - for version in "${carla_versions[@]}"; do - echo "============================= CARLA $version =============================" - export CARLA_ROOT="$version" + carla_roots=($(find /software -maxdepth 1 -type d -name "carla*" | sort -V)) && + for carla_root in "${carla_roots[@]}"; do + carla_version="${carla_root#/software/carla}" + echo "============================= CARLA $carla_version =============================" + export CARLA_ROOT="$carla_root" + echo "Using CARLA_ROOT=$CARLA_ROOT" + echo "Installing Python CARLA API carla==$carla_version" + python3 -m pip install --force-reinstall "carla==$carla_version" pytest tests/simulators/carla done ' From 831fd474b607e38aa32aefd7e63dc9f981a08446 Mon Sep 17 00:00:00 2001 From: lola Date: Tue, 26 May 2026 16:48:03 -0700 Subject: [PATCH 15/18] Install CARLA APIs from local wheels in CI --- .github/workflows/run-simulators.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 9defd7e65..48b0c6f3f 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -265,9 +265,7 @@ jobs: carla_version="${carla_root#/software/carla}" echo "============================= CARLA $carla_version =============================" export CARLA_ROOT="$carla_root" - echo "Using CARLA_ROOT=$CARLA_ROOT" - echo "Installing Python CARLA API carla==$carla_version" - python3 -m pip install --force-reinstall "carla==$carla_version" + python3 -m pip install --force-reinstall "$CARLA_ROOT"/PythonAPI/carla/dist/carla-"$carla_version"-cp310-cp310-*.whl pytest tests/simulators/carla done ' From 7269c187af330fffe3b112277ba0dae1e3c93a17 Mon Sep 17 00:00:00 2001 From: lola Date: Wed, 27 May 2026 12:11:32 -0700 Subject: [PATCH 16/18] increase timeout for carla CI tests --- .github/workflows/run-simulators.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 48b0c6f3f..cc38cedd2 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -249,6 +249,9 @@ jobs: role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} aws-region: ${{ secrets.AWS_REGION }} allowed-account-ids: ${{ secrets.AWS_ACCOUNT_ID }} + # CARLA tests can exceed the default 1-hour role session. + # Keep this longer than the SSM command timeout so we can still fetch logs if SSM times out. + role-duration-seconds: 9000 # 2.5 hours - name: Run CARLA Tests run: | @@ -273,11 +276,11 @@ jobs: ) # AWS-RunShellScript's executionTimeout defaults to 3600 seconds (1 hour). - # CARLA tests can take longer, so allow this SSM command to run for 3 hours. + # Limit the CARLA SSM command to 2 hours so it ends before the AWS role session expires. command_id=$(aws ssm send-command \ --instance-ids "$INSTANCE_ID" \ --document-name AWS-RunShellScript \ - --parameters "commands=$commands,executionTimeout=10800" \ + --parameters "commands=$commands,executionTimeout=7200" \ --query 'Command.CommandId' \ --output text) From 86fb7548220921461945a2a1caeda200c1b6c46f Mon Sep 17 00:00:00 2001 From: lola Date: Fri, 29 May 2026 09:23:50 -0700 Subject: [PATCH 17/18] use helper script for ssm commands --- .github/scripts/run-ssm-command.sh | 44 +++++++ .github/workflows/run-simulators.yml | 174 +++------------------------ 2 files changed, 64 insertions(+), 154 deletions(-) create mode 100755 .github/scripts/run-ssm-command.sh diff --git a/.github/scripts/run-ssm-command.sh b/.github/scripts/run-ssm-command.sh new file mode 100755 index 000000000..b9a72723e --- /dev/null +++ b/.github/scripts/run-ssm-command.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +INSTANCE_ID="${1:?Missing instance ID}" +EXECUTION_TIMEOUT="${2:-3600}" + +# Build the remote shell script as a JSON array for AWS-RunShellScript. +commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end') + +# Run the script on the EC2 instance through SSM. +command_id=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name AWS-RunShellScript \ + --parameters "commands=$commands,executionTimeout=$EXECUTION_TIMEOUT" \ + --query 'Command.CommandId' \ + --output text) + +# Poll SSM until the remote command finishes, then print its output. +while true; do + status=$(aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text) + + case "$status" in + Success) + break + ;; + Failed|Cancelled|TimedOut|Cancelling) + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardOutputContent' --output text + aws ssm get-command-invocation --command-id "$command_id" --instance-id "$INSTANCE_ID" --query 'StandardErrorContent' --output text + exit 1 + ;; + Pending|InProgress|Delayed) + sleep 10 + ;; + esac +done + +aws ssm get-command-invocation \ + --command-id "$command_id" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index cc38cedd2..37f9122d0 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -95,7 +95,11 @@ jobs: needs: start_ec2_instance permissions: id-token: write + contents: read steps: + - name: Checkout repository + uses: actions/checkout@v4 + # Use OIDC to get AWS credentials for SSM. - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -108,9 +112,7 @@ jobs: env: GH_SHA: ${{ github.sha }} run: | - # Build the remote shell script as a JSON array for AWS-RunShellScript. - # The first remote command is #!/bin/bash so SSM runs the script with Bash. - commands=$(jq -Rs -c 'split("\n") | if .[-1] == "" then .[:-1] else . end' < Date: Fri, 29 May 2026 10:19:21 -0700 Subject: [PATCH 18/18] pin checkout action --- .github/workflows/run-simulators.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-simulators.yml b/.github/workflows/run-simulators.yml index 37f9122d0..9b952aec3 100644 --- a/.github/workflows/run-simulators.yml +++ b/.github/workflows/run-simulators.yml @@ -98,7 +98,9 @@ jobs: contents: read steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + with: + persist-credentials: false # Use OIDC to get AWS credentials for SSM. - name: Configure AWS Credentials @@ -140,7 +142,9 @@ jobs: contents: read steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + with: + persist-credentials: false - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -177,7 +181,9 @@ jobs: contents: read steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + with: + persist-credentials: false - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37 @@ -219,7 +225,9 @@ jobs: contents: read steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 + with: + persist-credentials: false - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@ec61189d14ec14c8efccab744f656cffd0e33f37