# feat: end-to-end CI tests for backup/restore flow (#22) #20

name: Deployment Verification

# Triggers: pushes and pull requests targeting main, a weekly scheduled run,
# and manual dispatch.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  schedule:
    # Mondays at 06:00 (GitHub evaluates cron in UTC). The weekly rebuild
    # catches upstream image drift: new Traefik, Keycloak, or Postgres patch
    # releases that break deployment.
    - cron: "0 6 * * 1"
  workflow_dispatch:

# One concurrency group per ref. An in-flight run is cancelled only when the
# triggering event is a pull request; other events are never cancelled.
concurrency:
  group: deployment-verification-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

# Workflow-wide default token scope: read-only access to repository contents.
permissions:
  contents: read
jobs:
lint:
  # Static checks only: ShellCheck over the shell scripts and actionlint over
  # the workflow definitions. Both tools run from their container images.
  name: Lint shell scripts and workflow YAML
  runs-on: ubuntu-latest
  timeout-minutes: 5
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6

    - name: ShellCheck
      # Runs the official koalaman/shellcheck-alpine image directly instead
      # of going through an intermediate GitHub Action, which removes one
      # supply-chain layer to pin and review. Both the repo-root scripts and
      # tests/ are covered, so the backup-restore-e2e runner is linted too.
      # The repository is mounted at /mnt, which is also the working
      # directory, so the relative globs expanded by the runner shell resolve
      # inside the container.
      # NOTE(review): `stable` is a mutable tag, unlike the version-pinned
      # actionlint image below — confirm this is intentional.
      run: >-
        docker run --rm -v "$PWD:/mnt" -w /mnt
        koalaman/shellcheck-alpine:stable
        shellcheck ./*.sh tests/*.sh

    - name: actionlint (GitHub Actions workflow linting)
      # Runs the rhysd/actionlint image pinned to a specific version.
      # It reports workflow typos, invalid references to jobs/outputs, and
      # common GitHub Actions footguns that a plain YAML parser does not
      # catch. actionlint itself is a single Go binary.
      run: >-
        docker run --rm -v "$PWD:/mnt" -w /mnt
        rhysd/actionlint:1.7.12
        -color
scan-trivy:
  name: Scan pinned upstream image with Trivy
  runs-on: ubuntu-latest
  timeout-minutes: 10
  permissions:
    contents: read
    # Needed by the SARIF upload step.
    security-events: write
  # A failing scan must not block the pipeline. Findings land in the
  # Security tab, where they are triaged and fixed through Dependabot
  # upstream-digest bumps. Blocking here would fail CI on every new CVE
  # disclosure, which is not actionable inside the PR being tested.
  continue-on-error: true
  strategy:
    # Let every matrix entry run to completion even if another one fails.
    fail-fast: false
    # One entry per upstream image, so findings appear in the GitHub
    # Security tab under distinct categories (trivy-postgres,
    # trivy-traefik, trivy-keycloak).
    matrix:
      include:
        - name: postgres
          image: 'postgres:16@sha256:71e27bf60b70bded003791b5573f8b808365613f341df20ffcf0c1ed7bc13ddf'
        - name: traefik
          image: 'traefik:3.2@sha256:e561a37f8710d9cf41c78bdf421d822b2c0b48267ec0552e644565fb55466ea9'
        - name: keycloak
          image: 'quay.io/keycloak/keycloak:26.2.5@sha256:4883630ef9db14031cde3e60700c9a9a8eaf1b5c24db1589d6a2d43de38ba2a9'
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6

    # Scan the digest-pinned image and write CRITICAL/HIGH findings that
    # have a fix available to a per-image SARIF file.
    - name: Trivy scan of ${{ matrix.name }}
      uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1  # v0.35.0
      with:
        image-ref: ${{ matrix.image }}
        severity: CRITICAL,HIGH
        ignore-unfixed: true
        format: sarif
        output: trivy-${{ matrix.name }}.sarif

    # Publish the SARIF file under a category named after the image.
    - name: Upload Trivy SARIF (${{ matrix.name }}) to GitHub Security
      uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225  # v4.35.2
      with:
        sarif_file: trivy-${{ matrix.name }}.sarif
        category: trivy-${{ matrix.name }}
deploy-and-test:
  # Brings the compose stack up with throwaway credentials, then polls the
  # application and the Traefik dashboard over HTTPS on the runner itself.
  name: docker compose up + HTTPS + Traefik dashboard smoke
  runs-on: ubuntu-latest
  # Wait for lint to pass so we don't burn the 15-minute compose-up slot
  # on a workflow that has shellcheck/actionlint errors. scan-trivy runs
  # in parallel (not a dependency) since findings don't block deployment.
  needs: lint
  timeout-minutes: 15
  permissions:
    contents: read
  env:
    NETWORK_ONE: keycloak-network
    NETWORK_TWO: traefik-network
    DOCKER_COMPOSE_FILE: keycloak-traefik-letsencrypt-docker-compose.yml
    APP_HOSTNAME: keycloak.heyvaldemar.net
    APP_TRAEFIK_HOSTNAME: traefik.keycloak.heyvaldemar.net
    COMPOSE_PROJECT_NAME: keycloak
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6

    - name: Create necessary Docker networks
      # Create each network only when it does not exist yet. Unlike a blanket
      # `|| true`, this tolerates a pre-existing network but still fails the
      # step when `docker network create` itself errors out.
      run: |
        for network in "$NETWORK_ONE" "$NETWORK_TWO"; do
          docker network inspect "$network" > /dev/null 2>&1 \
            || docker network create "$network"
        done

    - name: Generate test .env with ephemeral credentials
      # The real .env is gitignored. CI generates throwaway credentials so
      # `docker compose up` succeeds without committing secrets to the repo.
      # The heredoc is unquoted on purpose: ${APP_*} and $(openssl ...) are
      # expanded by the shell, while each `\$` is written out as a literal
      # `$`, so the basic-auth hash reaches the file with doubled `$$`.
      run: |
        cat > .env <<EOF
        TRAEFIK_IMAGE_TAG=traefik:3.2@sha256:e561a37f8710d9cf41c78bdf421d822b2c0b48267ec0552e644565fb55466ea9
        TRAEFIK_LOG_LEVEL=WARN
        TRAEFIK_ACME_EMAIL=ci@example.com
        TRAEFIK_HOSTNAME=${APP_TRAEFIK_HOSTNAME}
        TRAEFIK_BASIC_AUTH=traefikadmin:\$\$2y\$\$10\$\$sMzJfirKC75x/hVpiINeZOiSm.Jkity9cn4KwNkRvO7hSQVFc5FLO
        KEYCLOAK_POSTGRES_IMAGE_TAG=postgres:16@sha256:71e27bf60b70bded003791b5573f8b808365613f341df20ffcf0c1ed7bc13ddf
        KEYCLOAK_IMAGE_TAG=quay.io/keycloak/keycloak:26.2.5@sha256:4883630ef9db14031cde3e60700c9a9a8eaf1b5c24db1589d6a2d43de38ba2a9
        KEYCLOAK_DB_NAME=keycloakdb
        KEYCLOAK_DB_USER=keycloakdbuser
        KEYCLOAK_DB_PASSWORD=$(openssl rand -base64 24 | tr -d '/+=' | head -c 32)
        KEYCLOAK_ADMIN_USERNAME=keycloakadmin
        KEYCLOAK_ADMIN_PASSWORD=$(openssl rand -base64 24 | tr -d '/+=' | head -c 32)
        KEYCLOAK_HOSTNAME=${APP_HOSTNAME}
        KEYCLOAK_BACKUP_INIT_SLEEP=30m
        KEYCLOAK_BACKUP_INTERVAL=24h
        KEYCLOAK_POSTGRES_BACKUP_PRUNE_DAYS=7
        KEYCLOAK_POSTGRES_BACKUPS_PATH=/srv/keycloak-postgres/backups
        KEYCLOAK_POSTGRES_BACKUP_NAME=keycloak-postgres-backup
        EOF
        echo "Generated ephemeral .env for CI run"

    - name: Start up services using Docker Compose
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" up -d

    - name: Modify /etc/hosts for internal routing
      # Point both public hostnames at the runner's loopback address so the
      # curl probes below reach the local stack.
      run: |
        echo "127.0.0.1 $APP_HOSTNAME" | sudo tee -a /etc/hosts
        echo "127.0.0.1 $APP_TRAEFIK_HOSTNAME" | sudo tee -a /etc/hosts

    - name: Print Docker Compose services status
      run: docker ps

    - name: Wait for the application to be ready via Traefik
      # Polls every 10 seconds for up to 5 minutes. `-k` skips certificate
      # verification and `-f` makes HTTP error statuses count as "not ready".
      run: |
        echo "Checking the routing and availability of the application via Traefik..."
        # $APP_HOSTNAME is intentionally expanded by the inner bash -c
        # (which inherits the job-level env:), not by the outer shell.
        # shellcheck disable=SC2016
        timeout 5m bash -c 'while ! curl -fsSLk "https://$APP_HOSTNAME"; do
          echo "Waiting for the application to be ready..."
          sleep 10
        done'

    - name: Wait for the Traefik dashboard to be ready
      # The loop condition is the exit status of grep, not of curl: the
      # dashboard counts as ready once the reported HTTP status is 200 or
      # 401. The response body is discarded; only the status code is piped.
      run: |
        echo "Checking the routing and availability of the Traefik dashboard..."
        # Same deferred-expansion pattern as above.
        # shellcheck disable=SC2016
        timeout 5m bash -c 'while ! curl -fsSLk --write-out "%{http_code}" --output /dev/null "https://$APP_TRAEFIK_HOSTNAME" | grep -E "200|401"; do
          echo "Waiting for the Traefik dashboard to be ready..."
          sleep 10
        done'

    - name: Inspect Network Configuration
      run: |
        docker network inspect "$NETWORK_ONE"
        docker network inspect "$NETWORK_TWO"

    - name: Show container logs on failure
      if: failure()
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" logs

    - name: Shutdown Docker Compose services
      # Runs regardless of earlier step outcomes.
      if: always()
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" down
backup-restore-e2e:
  # Brings the compose stack up with shortened backup timings, waits for
  # postgres to accept connections, then runs the backup/restore test script.
  name: Backup + restore end-to-end smoke
  runs-on: ubuntu-latest
  # Parallel to deploy-and-test — backup/restore is orthogonal to HTTPS
  # routing, so one job failing doesn't mask the other. Both fan out from
  # `lint` so we don't burn the compose-up slot on a workflow with
  # shellcheck/actionlint errors.
  needs: lint
  timeout-minutes: 15
  permissions:
    contents: read
  env:
    NETWORK_ONE: keycloak-network
    NETWORK_TWO: traefik-network
    DOCKER_COMPOSE_FILE: keycloak-traefik-letsencrypt-docker-compose.yml
    COMPOSE_PROJECT_NAME: keycloak
  steps:
    - name: Checkout repository
      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6

    - name: Create necessary Docker networks
      # Create each network only when it does not exist yet. Unlike a blanket
      # `|| true`, this tolerates a pre-existing network but still fails the
      # step when `docker network create` itself errors out.
      run: |
        for network in "$NETWORK_ONE" "$NETWORK_TWO"; do
          docker network inspect "$network" > /dev/null 2>&1 \
            || docker network create "$network"
        done

    - name: Generate test .env with short backup intervals
      # CI tunes the backup loop timings from 30m/24h down to 10s/30s so
      # the backup cycle tests can complete in <5 min wall-clock. The
      # hostnames here are placeholders — this job never exercises Traefik
      # or Keycloak HTTPS routing (that's deploy-and-test's responsibility).
      # The heredoc is unquoted on purpose: $(openssl ...) is expanded by the
      # shell, while each `\$` is written out as a literal `$`.
      run: |
        cat > .env <<EOF
        TRAEFIK_IMAGE_TAG=traefik:3.2@sha256:e561a37f8710d9cf41c78bdf421d822b2c0b48267ec0552e644565fb55466ea9
        TRAEFIK_LOG_LEVEL=WARN
        TRAEFIK_ACME_EMAIL=ci@example.com
        TRAEFIK_HOSTNAME=traefik.keycloak.ci.example
        TRAEFIK_BASIC_AUTH=traefikadmin:\$\$2y\$\$10\$\$sMzJfirKC75x/hVpiINeZOiSm.Jkity9cn4KwNkRvO7hSQVFc5FLO
        KEYCLOAK_POSTGRES_IMAGE_TAG=postgres:16@sha256:71e27bf60b70bded003791b5573f8b808365613f341df20ffcf0c1ed7bc13ddf
        KEYCLOAK_IMAGE_TAG=quay.io/keycloak/keycloak:26.2.5@sha256:4883630ef9db14031cde3e60700c9a9a8eaf1b5c24db1589d6a2d43de38ba2a9
        KEYCLOAK_DB_NAME=keycloakdb
        KEYCLOAK_DB_USER=keycloakdbuser
        KEYCLOAK_DB_PASSWORD=$(openssl rand -base64 24 | tr -d '/+=' | head -c 32)
        KEYCLOAK_ADMIN_USERNAME=keycloakadmin
        KEYCLOAK_ADMIN_PASSWORD=$(openssl rand -base64 24 | tr -d '/+=' | head -c 32)
        KEYCLOAK_HOSTNAME=keycloak.ci.example
        KEYCLOAK_BACKUP_INIT_SLEEP=10s
        KEYCLOAK_BACKUP_INTERVAL=30s
        KEYCLOAK_POSTGRES_BACKUP_PRUNE_DAYS=7
        KEYCLOAK_POSTGRES_BACKUPS_PATH=/srv/keycloak-postgres/backups
        KEYCLOAK_POSTGRES_BACKUP_NAME=keycloak-postgres-backup
        EOF

    - name: Start up services using Docker Compose
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" up -d

    - name: Wait for postgres to become healthy
      # docker compose --wait would be nicer but isn't universally
      # available on the runner's compose version. Polling pg_isready
      # is equivalent and works everywhere.
      # Up to 60 attempts, 2 seconds apart. The container is looked up by
      # name filter on every attempt, and the user/database passed to
      # pg_isready match the values written to .env above.
      run: |
        for i in $(seq 1 60); do
          if docker exec "$(docker ps -aqf "name=${COMPOSE_PROJECT_NAME}-postgres" | head -1)" \
            pg_isready -q -U keycloakdbuser -d keycloakdb > /dev/null 2>&1; then
            echo "postgres ready after ${i} attempts"
            exit 0
          fi
          sleep 2
        done
        echo "postgres did not become ready within 120s" >&2
        exit 1

    - name: Print Docker Compose services status
      run: docker ps

    - name: Run backup/restore E2E tests
      run: ./tests/e2e-backup-restore.sh

    - name: Show container logs on failure
      if: failure()
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" logs

    - name: Shutdown Docker Compose services
      # Runs regardless of earlier step outcomes.
      if: always()
      run: docker compose -f "$DOCKER_COMPOSE_FILE" -p "$COMPOSE_PROJECT_NAME" down