SemiAnalysisAI · Oseltamivir · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,116 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Push to the feature branch runs a small GB200 NCCL smoke (no merge to main
+# needed); workflow_dispatch runs a chosen SKU + benchmark (the lane for B200,
+# DeepEP, and larger sweeps). Each job lands on the SKU's self-hosted runner and
+# invokes that SKU's launch script — the same launch_${RUNNER_NAME%%_*}.sh
+# convention the serving benchmarks use.
+
+on:
+  push:
+    branches:
+      - collectivex
+    paths:
+      - ".github/workflows/collectivex-experimental.yml"
+      - "experimental/CollectiveX/**"
+  workflow_dispatch:
+    inputs:
+      sku:
+        # Each option is a runner label (it feeds runs-on). The launcher script is
+        # picked from the prefix of runner.name, so only pools that have one are listed.
+        type: choice
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
+        options: [gb200, b200-dgxc, b200-multinode]
+        default: gb200
+      benchmark:
+        type: choice
+        description: Which benchmark to run
+        options: [nccl, deepep, all]
+        default: nccl
+      ops:
+        type: string
+        description: NCCL ops (space-separated); blank = default set
+        default: ""
+      min_bytes:
+        type: string
+        description: nccl-tests min message size
+        default: "8"
+      max_bytes:
+        type: string
+        description: nccl-tests max message size
+        default: "8G"
+      ngpus:
+        type: string
+        description: GPUs per node (blank = SKU default)
+        default: ""
+
+concurrency:  # one live run per ref + trigger + SKU; a newer run in the same group supersedes the older one
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'smoke' }}  # SKU in the key so dispatches to different runner pools do not cancel each other; push runs have no inputs and fall back to 'smoke'
+  cancel-in-progress: true
+
+permissions:  # workflow-wide GITHUB_TOKEN scope, applied to both jobs
+  contents: read  # read-only repository access, which is what actions/checkout needs
+
+jobs:
+  # Push trigger: a brief NCCL-only run on the GB200 pool, which has idle
+  # capacity, so it never auto-contends with the B200 serving sweep. CX_STAGE_DIR
+  # stages the GB200 runner workspace onto compute-visible Lustre.
+  smoke:
+    if: ${{ github.event_name == 'push' }}
+    timeout-minutes: 60
+    runs-on: gb200
+    env:
+      CX_STAGE_DIR: /mnt/lustre01/users-public/sa-shared/cx-stage
+      CX_BENCH: "nccl"
+      CX_NGPUS: "4"
+      CX_TIME: "20"
+      CX_MAX_BYTES: "1G"
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Launch GB200 NCCL smoke
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+      - name: Results summary
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+        if: ${{ always() }}
+      - name: Upload results
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        if: ${{ always() }}
+        with:
+          if-no-files-found: warn
+          path: experimental/CollectiveX/results/*.json
+          name: collectivex_smoke_gb200_${{ github.run_id }}
+
+  # workflow_dispatch trigger: runs the selected benchmark on the runner pool named by inputs.sku.
+  dispatch:
+    if: ${{ github.event_name == 'workflow_dispatch' }}
+    timeout-minutes: 120
+    runs-on: ${{ inputs.sku }}
+    env:
+      # Only gb200 (watchtower) gets the Lustre staging dir it needs; every other SKU receives an empty value.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      CX_BENCH: ${{ inputs.benchmark }}
+      CX_NGPUS: ${{ inputs.ngpus }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+      - name: Results summary
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
+        if: ${{ always() }}
+      - name: Upload results
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        if: ${{ always() }}
+        with:
+          if-no-files-found: warn
+          path: experimental/CollectiveX/results/*.json
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ github.run_id }}
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,12 @@
+# Python bytecode
+*.pyc
+__pycache__/
+# nccl-tests build cache produced inside the container
+.nccl-tests/
+# Run outputs stay untracked: the captured env embeds hostnames, GPU UUIDs and
+# NIC GUIDs. CI uploads them as workflow artifacts instead, and the sanitized
+# headline numbers are kept in CONTAINERS.md.
+results/raw_*.txt.stderr
+results/raw_*.txt
+results/plots/
+results/*.json
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
@@ -0,0 +1,58 @@
+# CollectiveX — container & library versions
+
+One **multi-arch** container (imported by tag, with its index digest recorded for provenance) is used for all NVIDIA SKUs, so B200
+(x86_64) and GB200 (aarch64) share a single reference and the cross-SKU
+comparison is truly same-image. Set in `launchers/common.sh` (`cx_default_image`).
+
+## Default container (all NVIDIA SKUs)
+
+- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`.
+- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch.
+- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `</dev/null` (a missing token fails fast instead of hanging). First import is multi-GB (~minutes); subsequent runs reuse the staged squash.
+- **Why v0.5.11-cu130 (chosen):** it's the newest cu130 release **pre-staged on BOTH clusters** — B200 `/home/sa-shared/containers/` (amd64 squash) and GB200 `/mnt/lustre01/users-public/sa-shared/` (arm64 squash), same filename — so neither side imports at all. (Shared cu130 multi-arch squashes across both clusters: v0.5.8.post1, v0.5.9, v0.5.11 — v0.5.11 is newest.) `v0.5.12-cu130` is staged on B200 but **not** GB200: its 62 layers overflow enroot's overlay-based squash creation on the GB200 kernel (`enroot-mksquashovlfs: failed to mount overlay … Invalid argument`), so it can't be the shared default.
+- **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP.
+- **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below.
+
+## Audited reference (cu130 lineage)
+
+Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.11-cu130` should match closely (same cu130 base); reconfirm on first run:
+
+| Component | Version |
+|---|---|
+| OS / arch | Ubuntu 24.04.3, aarch64 |
+| CUDA (`nvcc`) | 13.0 (V13.0.88) |
+| NCCL (system `/usr/include/nccl.h`) | 2.28.3; torch-bundled 2.27.7 |
+| PyTorch | 2.9.1+cu130 |
+| DeepEP | bundled in *that* image; **not** in the multi-arch default |
+| NVSHMEM | `libnvshmem_host.so.3` present |
+| OpenMPI / gcc / make | 4.1.6 / 13.3.0 / 4.3 |
+| GPU / driver | GB200, 580.126.20 |
+
+**Version caveat:** the nccl-tests binary links **system NCCL** (2.28.x), while torch/DeepEP use the **bundled** NCCL (2.27.x). Record both in provenance (env_capture does); don't compare an nccl-tests curve against a DeepEP run as if NCCL were identical.
+
+## Bundled-DeepEP reference images (not the default)
+
+If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch image, these arch-specific images bundle it (pin by digest):
+
+- B200 (amd64): `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b` (pre-staged on B200)
+- GB200 (arm64): `lmsysorg/sglang:deepseek-v4-grace-blackwell@sha256:4f583347d7ff08aef7e16dbb4985b2a7c147ff49a0c261d5e27b8f5f41719368` (staged on GB200 Lustre)
+
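+Select via `CX_IMAGE=…@sha256:…` on the launch script. This relies on both squashes already being staged: as noted under the default container, a fresh `enroot import` of a bare `repo@sha256:` ref hangs in non-interactive CI and a combined `tag@sha256:` ref returns 400.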
+
+## Cluster access / QOS
+
+- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account.
+- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there).
+
+## First real results (Milestone-0 spike, on the DeepSeek-V4 images)
+
+nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw:
+
+| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) |
+|---|---|---|
+| all_reduce | 835 GB/s | 689 GB/s |
+| all_gather | 653 | 658 |
+| reduce_scatter | 667 | 661 |
+| alltoall | 638 | 666 |
+
+(B200 and GB200 results carry distinct `comparison_key`s because the topology class is part of the key, so the two columns are labelled as different topologies rather than silently merged. Re-run on the multi-arch default to refresh both under one image.)
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
@@ -0,0 +1,107 @@
+# CollectiveX
+
+Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch
+adapters** (InferenceX-style `launch_<sku>.sh`) run **any benchmark** — selected
+by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions
+workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline
+already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL).
+
+> Experimental: WIP, not an official InferenceMAX result. All logic stays under
+> `experimental/CollectiveX/`; the only file outside is the orchestration-only
+> workflow.
+
+## Files
+
+| File | Role |
+|---|---|
+| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
+| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
+| `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) |
+| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
+| `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
+| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/all) |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL) |
+| `CONTAINERS.md` | the multi-arch container (tag + recorded index digest) and audited library versions |
+| `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
+| `tests/fixtures/` | captured nccl-tests output for offline parser checks |
+
+## Run
+
+### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
+
+- **push** to the `collectivex` branch that touches `experimental/CollectiveX/**` or the
+  workflow file itself → short **GB200 NCCL smoke** (idle capacity; never auto-contends with the B200 serving sweep).
+- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode),
+  `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's
+  self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`.
+
+Each job renders a results table to the **GitHub Actions job summary** (via
+`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
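+as an artifact. (The push smoke only fires once the branch is pushed to GitHub; GitHub only accepts `workflow_dispatch` for a workflow file that also exists on the default branch.)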
+
+### Directly on a cluster login node
+
+```bash
+# benchmark is selected by CX_BENCH (default nccl)
+bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB200, NCCL primitives
+CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
+bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
+bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
+```
+
+Knobs: `CX_BENCH` (nccl|deepep|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
+`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
+staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
+nothing). Results land in `experimental/CollectiveX/results/`.
+
+### Offline (no GPU) — verify the parser/JSON pipeline (run from `experimental/CollectiveX/`)
+
+```bash
+python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \
+  --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json
+python3 env_capture.py            # prints a (degraded, off-GPU) env record
+python3 plot.py --results-dir results --out-dir results/plots   # needs matplotlib
+```
+
+## Container
+
+One **multi-arch** image for all NVIDIA SKUs, imported by tag
+`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…`
+recorded for provenance). Imported by tag, not digest — enroot's anonymous
+Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See
+`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP
+DeepSeek-V4 fallback images.
+
+## How it runs (confirmed against the live clusters)
+
+- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if
+  missing) → `srun --container-image=… --container-mounts=<repo>:/ix` → in-container
+  `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account
+  `benchmark`.
+- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in
+  `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node
+  adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`).
+- The sglang image installs editable under `/workspace`, so the repo is mounted at
+  **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR`
+  rsyncs the tree to Lustre first.
+- Every result embeds an `env_capture` record and a `comparison_key`; topology
+  class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled
+  distinct, never silently overlaid.
+
+## Status & known risks
+
+- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed)
+  — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default;
+  validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9).
+- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds
+  it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
+  `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against
+  the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
+- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
+  compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
+  or srt-slurm. CX_BENCH=nccl only for now.
+- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep
+  partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open.
+
+Once the multi-arch image is validated end-to-end, freeze the schema from the
+artifacts (plan: "Freeze the contract").