From 83761d008389ebcbbcd10e54e6c116f85b267066 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:03:14 +0800
Subject: [PATCH 01/17] Add CollectiveX experimental cross-vendor collective/EP
 benchmark

Per-SKU launch adapters (launch_<sku>.sh) that run any benchmark via a CX_BENCH selector through a shared run_in_container.sh; multi-arch digest-pinned sglang container; NCCL-primitive + DeepEP dispatch/combine benchmarks with provenance + correctness gating; and an on:push workflow (GB200 NCCL smoke; workflow_dispatch for B200/DeepEP/larger sweeps).

Validated on hardware: NCCL primitives on B200 (8x NVLink) and GB200 (4x NVL72 MNNVL); DeepEP dispatch/combine on GB200 (correctness-gated).
---
 .../workflows/collectivex-experimental.yml    | 108 ++
 experimental/CollectiveX/.gitignore           |  12 +
 experimental/CollectiveX/CONTAINERS.md        |  57 ++
 experimental/CollectiveX/README.md            | 103 ++
 experimental/CollectiveX/env_capture.py       | 250 +++++
 experimental/CollectiveX/launchers/common.sh  |  99 ++
 .../launchers/launch_b200-dgxc-slurm.sh       | 101 ++
 .../CollectiveX/launchers/launch_b200-dgxc.sh |  64 ++
 .../CollectiveX/launchers/launch_gb200-nv.sh  |  67 ++
 .../CollectiveX/launchers/run_in_container.sh |  74 ++
 experimental/CollectiveX/plan.md              | 939 ++++++++++++++++++
 experimental/CollectiveX/plot.py              | 141 +++
 experimental/CollectiveX/requirements.txt     |   9 +
 experimental/CollectiveX/results/.gitkeep     |   3 +
 experimental/CollectiveX/run_deepep.py        | 260 +++++
 experimental/CollectiveX/run_nccl.py          | 262 +++++
 .../fixtures/all_reduce_perf_b200_8gpu.txt    |  50 +
 17 files changed, 2599 insertions(+)
 create mode 100644 .github/workflows/collectivex-experimental.yml
 create mode 100644 experimental/CollectiveX/.gitignore
 create mode 100644 experimental/CollectiveX/CONTAINERS.md
 create mode 100644 experimental/CollectiveX/README.md
 create mode 100644 experimental/CollectiveX/env_capture.py
 create mode 100644 experimental/CollectiveX/launchers/common.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_b200-dgxc.sh
 create mode 100644 experimental/CollectiveX/launchers/launch_gb200-nv.sh
 create mode 100644 experimental/CollectiveX/launchers/run_in_container.sh
 create mode 100644 experimental/CollectiveX/plan.md
 create mode 100644 experimental/CollectiveX/plot.py
 create mode 100644 experimental/CollectiveX/requirements.txt
 create mode 100644 experimental/CollectiveX/results/.gitkeep
 create mode 100644 experimental/CollectiveX/run_deepep.py
 create mode 100644 experimental/CollectiveX/run_nccl.py
 create mode 100644 experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
new file mode 100644
index 000000000..6b07c2d56
--- /dev/null
+++ b/.github/workflows/collectivex-experimental.yml
@@ -0,0 +1,108 @@
+name: CollectiveX Experimental
+
+# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
+# Push to the feature branch runs a small GB200 NCCL smoke (no merge to main
+# needed); workflow_dispatch runs a chosen SKU + benchmark (the lane for B200,
+# DeepEP, and larger sweeps). Each job lands on the SKU's self-hosted runner and
+# invokes that SKU's launch script — the same launch_${RUNNER_NAME%%_*}.sh
+# convention the serving benchmarks use.
+
+on:
+  push:
+    branches:
+      - collectivex
+    paths:
+      - 'experimental/CollectiveX/**'
+      - '.github/workflows/collectivex-experimental.yml'
+  workflow_dispatch:
+    inputs:
+      sku:
+        description: Self-hosted runner pool (label from .github/configs/runners.yaml)
+        type: choice
+        default: gb200
+        options: [gb200, b200, b200-multinode, b300, gb300]
+      benchmark:
+        description: Which benchmark to run
+        type: choice
+        default: nccl
+        options: [nccl, deepep, all]
+      ops:
+        description: NCCL ops (space-separated); blank = default set
+        type: string
+        default: ''
+      min_bytes:
+        description: nccl-tests min message size
+        type: string
+        default: '8'
+      max_bytes:
+        description: nccl-tests max message size
+        type: string
+        default: '8G'
+      ngpus:
+        description: GPUs per node (blank = SKU default)
+        type: string
+        default: ''
+
+concurrency:
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'smoke' }}-${{ inputs.benchmark || 'nccl' }}
+  cancel-in-progress: true  # keyed per SKU + benchmark so a new dispatch only supersedes its own lane
+
+permissions:
+  contents: read
+
+jobs:
+  # Push -> short GB200 NCCL smoke (idle capacity; never auto-contends with the
+  # B200 serving sweep). GB200 runner workspace is staged to compute-visible
+  # Lustre via CX_STAGE_DIR.
+  smoke:
+    if: github.event_name == 'push'
+    runs-on: gb200
+    timeout-minutes: 60
+    env:
+      CX_BENCH: nccl
+      CX_NGPUS: '4'
+      CX_MAX_BYTES: 1G
+      CX_TIME: '20'
+      CX_STAGE_DIR: /mnt/lustre01/users-public/sa-shared/cx-stage
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Launch GB200 NCCL smoke
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_smoke_gb200_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
+
+  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
+  dispatch:
+    if: github.event_name == 'workflow_dispatch'
+    runs-on: ${{ inputs.sku }}
+    timeout-minutes: 120
+    env:
+      CX_BENCH: ${{ inputs.benchmark }}
+      CX_OPS: ${{ inputs.ops }}
+      CX_MIN_BYTES: ${{ inputs.min_bytes }}
+      CX_MAX_BYTES: ${{ inputs.max_bytes }}
+      CX_NGPUS: ${{ inputs.ngpus }}
+      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
+      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
+        with: { clean: true }
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }}
+        env:
+          RUNNER_NAME: ${{ runner.name }}
+        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ github.run_id }}
+          path: experimental/CollectiveX/results/*.json
+          if-no-files-found: warn
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 000000000..4235a8ce9
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,12 @@
+# in-container nccl-tests build cache
+.nccl-tests/
+# python
+__pycache__/
+*.pyc
+# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs,
+# so keep results out of git (CI uploads them as workflow artifacts instead).
+# Sanitized headline numbers live in CONTAINERS.md.
+results/*.json
+results/plots/
+results/raw_*.txt
+results/raw_*.txt.stderr
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
new file mode 100644
index 000000000..94ab7377f
--- /dev/null
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -0,0 +1,57 @@
+# CollectiveX — container & library versions
+
+One **multi-arch, digest-pinned** container is used for all NVIDIA SKUs, so B200
+(x86_64) and GB200 (aarch64) share a single image reference and the cross-SKU
+comparison runs from one image index. Set in `launchers/common.sh` (`cx_default_image`).
+
+## Default container (all NVIDIA SKUs)
+
+- **Image (pin by digest):** `lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356` — the OCI image index for tag `v0.5.12-cu130`.
+- **Multi-arch manifest list:** linux/amd64 (`sha256:015f39a4…`) + linux/arm64 (`sha256:7a76819e…`). One digest; `enroot import` on each host pulls the matching arch. **Use the digest-only ref** (`repo@sha256:`) in `common.sh` — enroot 400s on a combined `tag@sha256:` reference.
+- **Importing needs registry creds:** anonymous Docker Hub pulls return 401 in ad-hoc SSH sessions; the CI runners import with their configured credentials (the serving sweeps pull images routinely), and already-staged squashes need no import. The refactored launcher path was validated on the already-staged `v0.5.11-cu130` (same multi-arch cu130 line).
+- **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP.
+- **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below.
+
+## Audited reference (cu130 lineage)
+
+Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.12-cu130` should match closely (same cu130 base); reconfirm on first run:
+
+| Component | Version |
+|---|---|
+| OS / arch | Ubuntu 24.04.3, aarch64 |
+| CUDA (`nvcc`) | 13.0 (V13.0.88) |
+| NCCL (system `/usr/include/nccl.h`) | 2.28.3; torch-bundled 2.27.7 |
+| PyTorch | 2.9.1+cu130 |
+| DeepEP | bundled in *that* image; **not** in the multi-arch default |
+| NVSHMEM | `libnvshmem_host.so.3` present |
+| OpenMPI / gcc / make | 4.1.6 / 13.3.0 / 4.3 |
+| GPU / driver | GB200, 580.126.20 |
+
+**Version caveat:** the nccl-tests binary links **system NCCL** (2.28.x), while torch/DeepEP use the **bundled** NCCL (2.27.x). Record both in provenance (env_capture does); don't compare an nccl-tests curve against a DeepEP run as if NCCL were identical.
+
+## Bundled-DeepEP reference images (not the default)
+
+If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch image, these arch-specific images bundle it (pin by digest):
+
+- B200 (amd64): `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b` (pre-staged on B200)
+- GB200 (arm64): `lmsysorg/sglang:deepseek-v4-grace-blackwell@sha256:4f583347d7ff08aef7e16dbb4985b2a7c147ff49a0c261d5e27b8f5f41719368` (staged on GB200 Lustre)
+
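+Select via `CX_IMAGE=lmsysorg/sglang@sha256:…` on the launch script (digest-only: drop the tag shown above, since enroot rejects a combined `tag@sha256:` reference).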
+
+## Cluster access / QOS
+
+- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account.
+- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there).
+
+## First real results (Milestone-0 spike, on the DeepSeek-V4 images)
+
+nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw:
+
+| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) |
+|---|---|---|
+| all_reduce | 835 GB/s | 689 GB/s |
+| all_gather | 653 | 658 |
+| reduce_scatter | 667 | 661 |
+| alltoall | 638 | 666 |
+
+(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.)
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 000000000..3b18c048d
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,103 @@
+# CollectiveX
+
+Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch
+adapters** (InferenceX-style `launch_<sku>.sh`) run **any benchmark** — selected
+by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions
+workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline
+already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL).
+
+> Experimental: WIP, not an official InferenceMAX result. All logic stays under
+> `experimental/CollectiveX/`; the only file outside is the orchestration-only
+> workflow.
+
+## Files
+
+| File | Role |
+|---|---|
+| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
+| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
+| `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) |
+| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
+| `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
+| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/all) |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL) |
+| `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
+| `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
+| `tests/fixtures/` | captured nccl-tests output for offline parser checks |
+
+## Run
+
+### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
+
+- **push** to the `collectivex` branch touching `experimental/CollectiveX/**` or the workflow file →
+  short **GB200 NCCL smoke** (idle capacity; never auto-contends with the B200 serving sweep).
+- **workflow_dispatch** → pick `sku` (gb200 / b200 / b200-multinode / …),
+  `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's
+  self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`.
+
+(The workflow only fires once the branch is pushed to GitHub.)
+
+### Directly on a cluster login node
+
+```bash
+# benchmark is selected by CX_BENCH (default nccl)
+bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB200, NCCL primitives
+CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
+bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
+bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
+```
+
+Knobs: `CX_BENCH` (nccl|deepep|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
+`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
+staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
+nothing). Results land in `experimental/CollectiveX/results/`.
+
+### Offline (no GPU) — verify the parser/JSON pipeline
+
+```bash
+python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \
+  --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json
+python3 env_capture.py            # prints a (degraded, off-GPU) env record
+python3 plot.py --results-dir results --out-dir results/plots   # needs matplotlib
+```
+
+## Container
+
+One **multi-arch, digest-pinned** image for all NVIDIA SKUs:
+`lmsysorg/sglang@sha256:4219…f356` (the index for tag `v0.5.12-cu130`; amd64 + arm64; digest-only ref). See
+`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the digest-pinned
+DeepSeek-V4 fallback images.
+
+## How it runs (confirmed against the live clusters)
+
+- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if
+  missing) → `srun --container-image=… --container-mounts=<repo>:/ix` → in-container
+  `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account
+  `benchmark`.
+- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in
+  `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node
+  adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`).
+- The sglang image installs editable under `/workspace`, so the repo is mounted at
+  **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR`
+  rsyncs the tree to Lustre first.
+- Every result embeds an `env_capture` record and a `comparison_key`; topology
+  class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled
+  distinct, never silently overlaid.
+
+## Status & known risks
+
+- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed)
+  — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default;
+  validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9).
+- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds
+  it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
+  `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against
+  the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
+- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
+  compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
+  or srt-slurm. CX_BENCH=nccl only for now.
+- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep
+  partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open.
+
+Once the multi-arch image is validated end-to-end, freeze the schema from the
+artifacts (plan: "Freeze the contract").
diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py
new file mode 100644
index 000000000..b906a0497
--- /dev/null
+++ b/experimental/CollectiveX/env_capture.py
@@ -0,0 +1,250 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — Layer-0 environment + topology capture.
+
+Emits a JSON document describing the node a collective benchmark ran on, so
+every result is provenance-tagged and a B200-vs-GB200 comparison is defensible.
+Standard library only (so it runs in any minimal container, and off-GPU it
+degrades gracefully instead of crashing). torch is used only if importable.
+
+Usage:
+    python env_capture.py --out results/env_b200-dgxc.json
+    python env_capture.py --redact --out env.json   # hash hostnames/IPs/UUIDs
+
+Importable:
+    from env_capture import capture_environment
+    env = capture_environment(redact=False)
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import platform
+import re
+import shutil
+import socket
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1
+
+# Env vars worth recording — transport/tuning knobs that change what a
+# collective actually does (esp. the GB200 MNNVL flags vs B200).
+ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_")
+ENV_EXACT = (
+    "CUDA_VISIBLE_DEVICES",
+    "CUDA_DEVICE_ORDER",
+    "SLURM_JOB_ID",
+    "SLURM_NNODES",
+    "SLURM_NTASKS",
+    "SLURM_JOB_PARTITION",
+    # Image identity — set by the launcher so the bundle records what ran.
+    "COLLECTIVEX_IMAGE",
+    "COLLECTIVEX_IMAGE_DIGEST",
+)
+
+
+def _run(cmd: list[str], timeout: int = 20) -> str | None:
+    """Run a command, return stdout (stripped) or None if unavailable."""
+    if shutil.which(cmd[0]) is None:
+        return None
+    try:
+        out = subprocess.run(
+            cmd, capture_output=True, text=True, timeout=timeout, check=False
+        )
+    except (subprocess.TimeoutExpired, OSError):
+        return None
+    if out.returncode != 0:
+        return None
+    return out.stdout.strip()
+
+
+def _redact(value: str | None) -> str | None:
+    """Swap a sensitive identifier for "redacted-" + the first 12 hex digits of its
+    SHA-256, so equal inputs stay joinable; falsy input (None, "") is returned as-is."""
+    if value:
+        return "redacted-" + hashlib.sha256(value.encode()).hexdigest()[:12]
+    return value
+
+
+def _gpus(redact: bool) -> dict:
+    """GPU inventory via nvidia-smi (None fields off-GPU)."""
+    info: dict = {"source": None, "count": None, "devices": []}
+    q = _run(
+        [
+            "nvidia-smi",
+            "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id",
+            "--format=csv,noheader,nounits",
+        ]
+    )
+    if q is None:
+        return info
+    info["source"] = "nvidia-smi"
+    devices = []
+    for line in q.splitlines():
+        parts = [p.strip() for p in line.split(",")]
+        if len(parts) < 5:
+            continue
+        name, uuid, mem_mib, cc, bus = parts[:5]
+        devices.append(
+            {
+                "name": name,
+                "uuid": _redact(uuid) if redact else uuid,
+                "memory_total_mib": int(mem_mib) if mem_mib.isdigit() else mem_mib,
+                "compute_capability": cc,
+                "pci_bus_id": _redact(bus) if redact else bus,
+            }
+        )
+    info["count"] = len(devices)
+    info["devices"] = devices
+    return info
+
+
+def _driver_cuda() -> dict:
+    out = _run(
+        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
+    )
+    driver = out.splitlines()[0].strip() if out else None
+    # `nvidia-smi` (no args) prints the CUDA driver-API version in its header.
+    cuda = None
+    header = _run(["nvidia-smi"])
+    if header:
+        m = re.search(r"CUDA Version:\s*([0-9.]+)", header)
+        if m:
+            cuda = m.group(1)
+    return {"driver_version": driver, "cuda_version": cuda}
+
+
+def _torch_info() -> dict:
+    """NCCL / torch build info — only if torch is importable in this env."""
+    info: dict = {"available": False}
+    try:
+        import torch  # type: ignore
+    except Exception:
+        return info
+    info["available"] = True
+    info["torch_version"] = torch.__version__
+    try:
+        info["cuda_runtime"] = torch.version.cuda
+    except Exception:
+        info["cuda_runtime"] = None
+    try:
+        if torch.cuda.is_available():
+            nccl = torch.cuda.nccl.version()
+            # version() returns an int (e.g. 22304) or a tuple, depending on build.
+            info["nccl_version"] = (
+                ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl
+            )
+            info["device_count"] = torch.cuda.device_count()
+            info["device_name"] = torch.cuda.get_device_name(0)
+            cc = torch.cuda.get_device_capability(0)
+            info["compute_capability"] = f"{cc[0]}.{cc[1]}"
+    except Exception as exc:  # pragma: no cover - hardware dependent
+        info["error"] = repr(exc)
+    return info
+
+
+def _topology(redact: bool) -> dict:
+    """GPU/NIC topology matrix + a fingerprint to gate comparability.
+
+    The fingerprint is the first 16 hex digits of a SHA-256 over the link-type
+    tokens found anywhere in the `nvidia-smi topo -m` output, in order of
+    appearance. All fields are None when the command is unavailable or fails."""
+    topo = _run(["nvidia-smi", "topo", "-m"])
+    if topo is None:
+        return {"source": None, "matrix": None, "fingerprint": None}
+    # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only —
+    # ignore GPU/NIC labels and whitespace so it's placement-stable.
+    tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", topo)
+    fingerprint = hashlib.sha256(" ".join(tokens).encode()).hexdigest()[:16]
+    return {
+        "source": "nvidia-smi topo -m",
+        # The matrix can contain hostnames in some setups; redact wholesale.
+        "matrix": ("<redacted>" if redact else topo),
+        "fingerprint": fingerprint,
+    }
+
+
+def _rdma(redact: bool) -> dict:
+    """RDMA/IB device names from `ibv_devinfo -l`, falling back to `ibstat -l`.
+
+    Returns {"available": bool, "devices": [...]}; names are hashed when `redact`."""
+    listing = _run(["ibv_devinfo", "-l"])
+    if listing:
+        lines = listing.splitlines()[1:]  # first line is a count
+    else:
+        # Run the fallback exactly once: a repeated call can fail or time out and
+        # return None even though the first one succeeded.
+        lines = (_run(["ibstat", "-l"]) or "").splitlines()
+    devices = [ln.strip() for ln in lines if ln.strip()]
+    return {
+        "available": bool(devices),
+        "devices": [_redact(d) if redact else d for d in devices],
+    }
+
+
+def _env_vars() -> dict:
+    """Allow-listed environment variables (ENV_EXACT names or ENV_PREFIXES prefixes), sorted by name."""
+    def wanted(name: str) -> bool:
+        return name in ENV_EXACT or name.startswith(ENV_PREFIXES)
+    picked = {name: val for name, val in os.environ.items() if wanted(name)}
+    return dict(sorted(picked.items()))
+
+
+def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict:
+    """Return a JSON-serializable environment/provenance record."""
+    host = socket.gethostname()
+    return {
+        "schema_version": SCHEMA_VERSION,
+        "captured_at": timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "redacted": redact,
+        "host": _redact(host) if redact else host,
+        "platform": {
+            "system": platform.system(),
+            "release": platform.release(),
+            "machine": platform.machine(),  # x86_64 vs aarch64 (B200 vs GB200)
+            "python": sys.version.split()[0],
+        },
+        "gpus": _gpus(redact),
+        "driver": _driver_cuda(),
+        "torch": _torch_info(),
+        "topology": _topology(redact),
+        "rdma": _rdma(redact),
+        "env": _env_vars(),
+    }
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture")
+    ap.add_argument("--out", help="write JSON here (default: stdout)")
+    ap.add_argument(
+        "--redact",
+        action="store_true",
+        help="hash hostnames / IPs / GPU UUIDs / IB GUIDs for shareable artifacts",
+    )
+    ap.add_argument(
+        "--timestamp",
+        help="ISO timestamp to stamp (default: now); pass one for reproducible bundles",
+    )
+    args = ap.parse_args()
+
+    env = capture_environment(redact=args.redact, timestamp=args.timestamp)
+    blob = json.dumps(env, indent=2)
+    if args.out:
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            fh.write(blob + "\n")
+        # A one-line human summary to stdout (the JSON is the artifact).
+        g = env["gpus"]
+        print(
+            f"env -> {args.out} | machine={env['platform']['machine']} "
+            f"gpus={g['count']} topo_fp={env['topology']['fingerprint']}"
+        )
+    else:
+        print(blob)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
new file mode 100644
index 000000000..445cdb5ca
--- /dev/null
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -0,0 +1,99 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
+cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }
+
+# Single multi-arch, digest-pinned container for ALL NVIDIA SKUs.
+# This is the OCI image index for tag `v0.5.12-cu130`, covering BOTH linux/amd64
+# (B200) and linux/arm64 (GB200); enroot import on each host pulls the matching
+# arch from the index. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# Pinned by DIGEST ONLY (no tag): enroot mis-parses a combined `tag@sha256` ref
+# and 400s at auth, so we use `repo@sha256:` — also the stricter pin.
+# NOTE: DeepEP is NOT bundled here -> run_in_container.sh builds it via
+# rebuild-deepep at job setup. (The arch-specific deepseek-v4-{blackwell,
+# grace-blackwell} images DO bundle DeepEP — see CONTAINERS.md — but are not
+# multi-arch and are not used by default.)
+CX_IMAGE_MULTIARCH="lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356"
+
+cx_default_image() {
+  case "$1" in
+    b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
+    *) cx_die "no default image for runner prefix: $1" ;;
+  esac
+}
+
+# cx_ensure_squash <squash_dir> <image>  ->  echoes the squash file path.
+# Imports via enroot only if a valid squash is not already present (flock-guarded,
+# mirroring runners/launch_b200-dgxc.sh). On failure: echoes nothing, returns non-zero.
+cx_ensure_squash() {
+  local squash_dir="$1" image="$2"
+  mkdir -p "$squash_dir" 2>/dev/null || true
+  local key sq locks
+  key="$(printf '%s' "$image" | sed 's#[/:@#]#_#g')"
+  sq="$squash_dir/${key}.sqsh"
+  locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true
+  (
+    flock -w 900 9 || cx_die "lock timeout for $sq"
+    if unsquashfs -l "$sq" >/dev/null 2>&1; then
+      cx_log "squash present: $sq"
+    else
+      cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)"
+      rm -f "$sq"
+      enroot import -o "$sq" "docker://$image" >&2 || cx_die "enroot import failed for $image"
+      unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq"
+    fi
+  ) 9>"$locks/${key}.lock" || return 1  # cx_die above exits only the ( ) subshell; propagate it
+  echo "$sq"
+}
+
+# cx_stage_repo <repo_root> <stage_dir>  ->  echoes the mount-source root.
+# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace
+# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that
+# compute-visible shared FS and mount from there. No-op (echo repo_root) when
+# stage_dir is empty or equals repo_root.
+cx_stage_repo() {
+  local repo_root="$1" stage_dir="${2:-}"
+  if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then
+    echo "$repo_root"; return 0
+  fi
+  mkdir -p "$stage_dir/experimental" || cx_die "cannot create stage dir $stage_dir"
+  cx_log "staging experimental/CollectiveX -> $stage_dir (compute-visible)"
+  rsync -a --delete \
+    --exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/' \
+    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >&2 \
+    || cx_die "rsync to stage dir failed"
+  echo "$stage_dir"
+}
+
+# cx_build_nccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# Runs IN-CONTAINER (login nodes have no nvcc). Cached per MPI flavor: build/.cx_mpi
+# records the flag, so an MPI=0 cache is never reused for an MPI=1 request (or vice versa).
+# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
+cx_build_nccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin stamp
+  dir="$parent/nccl-tests"; stamp="$dir/build/.cx_mpi"
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ] && [ "$(cat "$stamp" 2>/dev/null)" = "$mpi" ]; then
+    cx_log "nccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning nccl-tests -> $dir"
+    git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
+      || cx_die "git clone nccl-tests failed"
+  fi
+  rm -rf "$dir/build"; cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr})"  # drop stale / other-flavor objects first
+  make -C "$dir" -j MPI="$mpi" \
+       CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
+       NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
+       ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
+    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME; need nccl.h + libnccl)"
+  [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
+  printf '%s\n' "$mpi" > "$stamp"; echo "$dir/build"  # stamp the flavor only after a successful build
+}
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
new file mode 100644
index 000000000..a58411343
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64.
+#
+# The other half of the headline: the same primitives as single-node B200, but
+# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast
+# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL).
+#
+# Multi-node orchestration differs from single-node, so this adapter does NOT
+# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all
+# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl
+# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up).
+#
+# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible
+# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if
+# the runner workspace is not cross-mounted to compute.
+#
+# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+CX_BENCH="${CX_BENCH:-nccl}"
+[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}"
+NODES="${CX_NODES:-2}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+TOPO="b200-nvlink-island+cx7-ib"
+WORLD=$((NODES * GPUS_PER_NODE))
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+       --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \
+       --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"
+
+# 1) Build nccl-tests (MPI=1, own parent dir so the single-node MPI=0 cache is never reused) + capture env.
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source launchers/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests/mpi" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+# Must match step 1: cx_build_nccl_tests builds into <parent>/nccl-tests/build.
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/mpi/nccl-tests/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
+for op in $OPS; do
+  [ -n "${BIN[$op]:-}" ] || { cx_log "WARN: unknown op '$op' (no nccl-tests binary mapped) — skipping"; continue; }
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1 \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+  # 3) Parse on the login node (pure stdlib python; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
+
+cx_log "done — JSON artifacts under $CX_DIR/results/"
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
new file mode 100644
index 000000000..a1b5c0135
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64).
+#
+# Thin adapter: handles B200-specific allocation/container, then hands off to
+# launchers/run_in_container.sh which runs whichever benchmark CX_BENCH selects
+# (nccl | deepep | all). Mirrors runners/launch_b200-dgxc.sh (salloc + enroot
+# squash + srun --container) with all model-serving stripped.
+#
+# Run from inside the InferenceX checkout on the B200 login node:
+#     bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh           # nccl (default)
+#     CX_BENCH=deepep bash .../launch_b200-dgxc.sh                          # DeepEP (rebuild)
+#
+# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30)
+#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
+#   CX_NCCL_HOME(/usr) RUNNER_NAME(b200-dgxc) CX_DRYRUN(0; 1 = exit before salloc)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}"
+PARTITION="${CX_PARTITION:-gpu-2}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image b200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}"
+MOUNT_DIR=/ix   # in-container mount point for the (possibly staged) checkout
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"   # UTC stamp, exported as CX_TS below
+
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+export NCCL_CUMEM_ENABLE=1   # exported vars reach the container via srun --export=ALL
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+       --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"   # NOTE(review): resolved by job name + user; a same-named concurrent job could match
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
new file mode 100644
index 000000000..35cdb8e28
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
+#
+# Thin adapter: handles GB200-specific allocation/container/transport-env, then
+# hands off to launchers/run_in_container.sh which runs whichever benchmark
+# CX_BENCH selects (nccl | deepep | all). The same NCCL primitive shape that
+# runs on B200 (NVLink island + CX-7 IB across nodes) runs here entirely inside
+# the NVL72 NVLink (MNNVL) domain — that contrast is the headline.
+#
+# Run from inside the InferenceX checkout on the GB200 login node:
+#     bash experimental/CollectiveX/launchers/launch_gb200-nv.sh            # nccl (default)
+#     CX_BENCH=deepep bash .../launch_gb200-nv.sh                           # DeepEP (rebuild)
+#
+# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NGPUS(4) CX_TIME(30)
+#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
+#   CX_NCCL_HOME(/usr) RUNNER_NAME(gb200-nv) CX_DRYRUN(0; 1 = exit before salloc)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-gb200-nv}"
+PARTITION="${CX_PARTITION:-batch}"
+ACCOUNT="${CX_ACCOUNT:-benchmark}"
+NGPUS="${CX_NGPUS:-4}"                          # NVL72 compute tray = 4 GPU/node
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}"
+MOUNT_DIR=/ix   # in-container mount point for the (possibly staged) checkout
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"   # UTC stamp, exported as CX_TS below
+
+# Exported so srun --export=ALL carries them into run_in_container.sh.
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
+export CX_BENCH="${CX_BENCH:-nccl}"
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded.
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS (aarch64) bench=$CX_BENCH"
+cx_log "image=$IMAGE"
+SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
+       --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"   # NOTE(review): resolved by job name + user; a same-named concurrent job could match
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT   # release the allocation however the script exits
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
new file mode 100644
index 000000000..7729528b2
--- /dev/null
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter
+# (launch_<sku>.sh). The SKU adapter handles allocation/container/transport-env;
+# this script decides WHICH benchmark to run from CX_BENCH, so any benchmark can
+# be driven through any SKU's launch script. Writes provenance-tagged JSON to
+# results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector:        CX_BENCH = nccl | deepep | all          (default nccl)
+# NCCL knobs:      CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
+# DeepEP knobs:    CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE
+set -euo pipefail
+
+cd /ix/experimental/CollectiveX   # the adapters mount the checkout at /ix (MOUNT_DIR)
+# shellcheck source=common.sh
+source launchers/common.sh
+mkdir -p results
+
+: "${CX_RUNNER:?CX_RUNNER not set}"   # ${VAR:?msg}: abort with msg when unset or empty
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+CX_BENCH="${CX_BENCH:-nccl}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"      # optional; empty when the adapter did not set it
+ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json"   # written once here, passed to every suite via --env-json
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"
+
+run_nccl_suite() {
+  local tests_dir collective
+  tests_dir="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)"   # no MPI on one node: a single process drives -g N GPUs
+  for collective in ${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}; do
+    if ! python3 run_nccl.py --op "$collective" --nccl-tests-dir "$tests_dir" \
+      --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \
+      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${collective}_${CX_TS}.json" \
+      --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then
+      cx_log "WARN: nccl $collective failed"   # best-effort: keep going with the remaining ops
+    fi
+  done
+}
+
+run_deepep_suite() {
+  # DeepEP is not bundled in the multi-arch image. If the import fails, try
+  # rebuild-deepep.sh (srt-slurm setup script), re-check, and skip if still absent.
+  if ! python3 -c "import deep_ep" 2>/dev/null; then
+    if command -v rebuild-deepep.sh >/dev/null 2>&1; then
+      cx_log "building DeepEP via rebuild-deepep.sh"
+      rebuild-deepep.sh >&2 || cx_log "WARN: rebuild-deepep.sh failed"
+    fi
+    if ! python3 -c "import deep_ep" 2>/dev/null; then   # avoid launching a torchrun that cannot import
+      cx_log "WARN: deep_ep not importable (rebuild-deepep.sh missing or failed); skipping deepep"; return 0
+    fi
+  fi
+  torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \
+    --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
+    --dispatch-dtype "${CX_DISPATCH_DTYPE:-fp8}" \
+    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \
+    || cx_log "WARN: deepep run failed"
+}
+
+case "$CX_BENCH" in
+  nccl)   run_nccl_suite ;;
+  deepep) run_deepep_suite ;;
+  all)    run_nccl_suite; run_deepep_suite ;;
+  *)      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;;
+esac
+
+echo "=== results ==="; ls -1 results/*.json
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
new file mode 100644
index 000000000..365b23455
--- /dev/null
+++ b/experimental/CollectiveX/plan.md
@@ -0,0 +1,939 @@
+# CollectiveX — Plan
+
+> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References.
+
+---
+
+# Part 1 — Background
+
+## What it is
+
+CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it.
+
+## Why
+
+Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results.
+
+## Current state
+
+- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA.
+- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project.
+- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart.
+
+---
+
+# Part 2 — Implementation plan
+
+## Implementation status (built)
+
+The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that:
+
+- **Multi-arch, digest-pinned container** for all NVIDIA SKUs: `lmsysorg/sglang:v0.5.12-cu130@sha256:4219…f356` (amd64 + arm64) — one reference for both arches; DeepEP via `rebuild-deepep`. See `CONTAINERS.md`.
+- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`.
+- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
+
+This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
+
+## Scope and placement
+
+CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`:
+
+```bash
+git switch main
+git pull --ff-only
+git switch -c collectivex
+mkdir -p experimental/CollectiveX
+```
+
+This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results").
+
+For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. Do **not** modify:
+
+```text
+benchmarks/
+runners/
+utils/
+.github/configs/
+perf-changelog.yaml
+InferenceX-app
+```
+
+The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`.
+
+**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware.
+
+### What InferenceX already gives us
+
+InferenceX's existing execution model is almost exactly the control plane CollectiveX needs:
+
+1. Generate and strictly validate a matrix on a GitHub-hosted runner.
+2. Fan jobs out to named or labelled self-hosted runners.
+3. Those listeners submit work to Slurm (or launch Docker locally).
+4. Normalize outputs.
+5. Upload artifacts.
+6. Aggregate and dispatch ingestion to the dashboard.
+
+`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling.
+
+CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse).
+
+## Architecture
+
+Four planes, cleanly separated:
+
+- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX).
+- **Benchmark plane:** collective semantics, backend invocation, correctness, timing.
+- **Data plane:** canonical result records, raw per-rank samples, topology and provenance.
+- **Presentation plane:** comparable subsets, charts, history, diagnostics.
+
+Data flow within the experimental directory:
+
+```text
+Portable shape definitions
+          +
+Backend definitions
+          +
+Target/cluster definitions
+          ↓
+CollectiveX matrix resolver
+          ↓
+Resolved shards
+          ↓
+Existing InferenceX self-hosted runner
+          ↓
+experimental/CollectiveX/launchers/*
+          ↓
+Backend adapter  (NCCL / RCCL / DeepEP / AITER / MoRI / …)
+          ↓
+Versioned result bundle
+          ↓
+Aggregator + regression checker
+          ↓
+Static experimental report   →  (later) InferenceX-app ingestion → Postgres → /collectives
+```
+
+### Target structure at promotion (Milestone 4)
+
+This packaged layout is the **promotion target**, not the spike. Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `run_deepep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`.
+
+```text
+InferenceX/
+├── experimental/
+│   ├── README.md
+│   └── CollectiveX/
+│       ├── README.md
+│       ├── DESIGN.md
+│       ├── ROADMAP.md
+│       ├── pyproject.toml
+│       ├── Makefile
+│       │
+│       ├── src/
+│       │   └── collectivex/
+│       │       ├── __init__.py
+│       │       ├── cli.py
+│       │       ├── config/
+│       │       │   ├── models.py
+│       │       │   ├── loader.py
+│       │       │   ├── resolver.py
+│       │       │   └── matrix.py
+│       │       ├── benchmark/
+│       │       │   ├── harness.py
+│       │       │   ├── timing.py
+│       │       │   ├── correctness.py
+│       │       │   ├── routing.py
+│       │       │   └── metrics.py
+│       │       ├── backends/
+│       │       │   ├── base.py
+│       │       │   ├── fake.py
+│       │       │   ├── nccl_tests.py
+│       │       │   ├── rccl_tests.py
+│       │       │   ├── deepep.py
+│       │       │   └── framework_ep.py
+│       │       ├── cluster/
+│       │       │   ├── inventory.py
+│       │       │   ├── capabilities.py
+│       │       │   ├── environment.py
+│       │       │   └── launcher.py
+│       │       ├── results/
+│       │       │   ├── models.py
+│       │       │   ├── writer.py
+│       │       │   ├── aggregate.py
+│       │       │   ├── compare.py
+│       │       │   └── redact.py
+│       │       └── report/
+│       │           ├── build.py
+│       │           └── templates/
+│       │
+│       ├── configs/
+│       │   ├── suites/
+│       │   │   ├── smoke.yaml
+│       │   │   ├── primitives.yaml
+│       │   │   ├── moe-decode.yaml
+│       │   │   ├── moe-prefill.yaml
+│       │   │   └── full.yaml
+│       │   ├── shapes/
+│       │   │   ├── synthetic/
+│       │   │   └── traced/
+│       │   ├── backends/
+│       │   ├── targets/
+│       │   └── clusters.yaml
+│       │
+│       ├── launchers/
+│       │   ├── common.sh
+│       │   ├── launch_b200-dgxc.sh         # B200 single node
+│       │   ├── launch_b200-dgxc-slurm.sh   # B200 multinode
+│       │   └── launch_gb200-nv.sh          # GB200 NVL72
+│       │
+│       ├── schemas/
+│       │   ├── case-v1.schema.json
+│       │   ├── result-v1.schema.json
+│       │   ├── manifest-v1.schema.json
+│       │   └── environment-v1.schema.json
+│       │
+│       ├── scripts/
+│       │   ├── bootstrap.sh
+│       │   ├── run_suite.sh
+│       │   ├── run_shard.sh
+│       │   └── build_report.sh
+│       │
+│       ├── tests/
+│       │   ├── fixtures/
+│       │   ├── test_config.py
+│       │   ├── test_matrix.py
+│       │   ├── test_parsers.py
+│       │   ├── test_correctness.py
+│       │   └── test_comparability.py
+│       │
+│       └── docs/
+│           ├── BENCHMARK_CONTRACT.md
+│           ├── BACKEND_ADAPTER.md
+│           ├── SHAPE_REGISTRY.md
+│           ├── RESULT_FORMAT.md
+│           ├── FRONTEND.md
+│           └── PROMOTION_CRITERIA.md
+│
+└── .github/workflows/
+    └── collectivex-experimental.yml   # Added only when cluster CI begins (Milestone 2)
+```
+
+> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow.
+
+## Benchmark model — keep four concepts separate
+
+CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`.
+
+The model keeps four concepts independent:
+
+**Shape** — the logical communication workload:
+
+```text
+operation, message size, tokens per rank, hidden size, top-k,
+expert count, routing distribution, dtype, phase
+```
+
+**Backend** — the implementation under test:
+
+```text
+NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation
+```
+
+**Target** — where and how it runs:
+
+```text
+runner type, cluster, nodes, GPUs per node, rank placement,
+fabric, container image, transport capabilities
+```
+
+**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration.
+
+### Portable definitions
+
+Shape:
+
+```yaml
+schema-version: 1
+shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1
+
+kind: moe
+phase: decode
+operation: dispatch-combine
+
+shape:
+  tokens-per-rank: 64
+  hidden-size: 7168
+  top-k: 8
+  num-experts: 256
+  dispatch-dtype: fp8
+  combine-dtype: bf16
+  routing:
+    distribution: uniform
+    seed: 67
+  expert-alignment: 16
+```
+
+Backend:
+
+```yaml
+backend-id: deepep-normal
+backend: deepep
+mode: normal
+
+source:
+  repository: deepseek-ai/DeepEP
+  ref: pinned-commit
+
+settings:
+  async-overlap: false
+  num-comm-sms: standardized
+  qp-count: auto
+```
+
+Target:
+
+```yaml
+target-id: b200-dgxc-4n
+runner-type: b200-multinode
+cluster-id: b200-dgxc
+
+resources:
+  nodes: 4
+  gpus-per-node: 8
+  exclusive: true
+
+placement:
+  ranks-per-node: 8
+  rank-order: contiguous
+
+capabilities:
+  rdma: true
+  ibgda: experimental
+  nvshmem: true
+```
+
+Suite:
+
+```yaml
+suite-id: moe-decode-smoke
+
+shapes:
+  - moe.decode.h7168.top8.e256.t64.uniform.v1
+
+backends:
+  - deepep-normal
+  - deepep-low-latency
+
+targets:
+  - b200-dgxc-2n
+
+measurement:
+  warmup-iterations: 20
+  measured-iterations: 200
+  trials: 3
+  correctness: full
+```
+
+### Case identity
+
+A **case** is one immutable, versioned point: the natural key composes the three concepts —
+
+```text
+case-id = <backend-id> __ <shape-id> __ <target-id>
+e.g.  deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n
+      nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n
+```
+
+A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`.
+
+**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics.
+
+**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine.
+
+### Shape registry
+
+Two independent shape sources:
+
+**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`).
+
+**Trace-derived** — extracted from real InferenceX runs/profiles:
+
+```text
+models/deepseek-v4/decode/<shape-id>
+models/minimax-m3/decode/<shape-id>
+models/kimi-k2.7/prefill/<shape-id>
+```
+
+Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version. InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs.
+
+## Benchmark layers and comparison classes
+
+| Layer | Purpose | Examples |
+|---|---|---|
+| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture |
+| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv |
+| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes |
+| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute |
+| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile |
+
+The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving.
+
+**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures.
+
+**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB).
+
+**L2 — MoE collectives:** dispatch, combine, dispatch+combine. Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count.
+
+### Three comparison classes
+
+Every result is tagged with exactly one, and they must never be silently mixed on one chart:
+
+| Class | Meaning |
+|---|---|
+| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. |
+| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" |
+| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. |
+
+### Comparability key
+
+Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default:
+
+```text
+operation, shape ID, dtype, world size, node count, rank placement,
+routing distribution, comparison class, measurement contract version, topology class
+```
+
+## Measurement and correctness
+
+### Timing boundaries
+
+Record separately — never report one latency that sometimes includes JIT and sometimes doesn't:
+
+```text
+1. communicator creation
+2. buffer allocation and registration
+3. first invocation / JIT
+4. warmed steady-state invocation
+5. host launch time
+6. GPU completion time
+7. optional end-to-end framework-visible time
+```
+
+Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average.
+
+### Correctness as a hard gate
+
+A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped.
+
+Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances.
+
+MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash.
+
+Failed results remain in artifacts, e.g.:
+
+```json
+{
+  "status": "invalid",
+  "correctness_passed": false,
+  "error": "combine result exceeded bf16 tolerance"
+}
+```
+
+### Routing distributions
+
+At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash.
+
+### Metrics
+
+| Category | Metrics |
+|---|---|
+| Latency | p50, p90, p95, p99, min, max |
+| Rank behavior | slowest-rank latency, rank spread, coefficient of variation |
+| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s |
+| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately |
+| Efficiency | bandwidth relative to declared topology bottleneck |
+| Host overhead | API launch time, CPU utilization where available |
+| GPU overhead | communication SM count, GPU active time, optional power |
+| Memory | persistent buffer bytes, peak temporary bytes |
+| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency |
+| Reliability | initialization failures, hangs, retries, correctness failures |
+| Provenance | all software, image, driver, firmware and topology identifiers |
+
+### Bandwidth definitions
+
+NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes:
+
+```text
+logical_payload_bytes
+allocated_payload_bytes
+estimated_link_bytes
+metadata_bytes
+padding_bytes
+```
+
+## Result and artifact format
+
+Each shard emits a versioned bundle:
+
+```text
+output/
+├── manifest.json
+├── cases.json
+├── results.jsonl
+├── rank-samples.jsonl.gz
+├── summary.json
+├── environment/
+│   ├── gpu.json
+│   ├── network.json
+│   ├── topology.json
+│   └── software.json
+├── raw/
+│   ├── stdout.log
+│   ├── stderr.log
+│   └── backend-output/
+├── commands/
+│   └── reproduce.sh
+└── profiles/
+```
+
+**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version.
+
+**Result row:**
+
+```json
+{
+  "schema_version": 1,
+  "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n",
+  "status": "valid",
+  "trial": 1,
+  "backend": "deepep",
+  "mode": "normal",
+  "comparison_class": "standardized",
+  "metrics": {
+    "latency_us_p50": 0,
+    "latency_us_p99": 0,
+    "slowest_rank_us_p50": 0,
+    "logical_bandwidth_gbps": 0,
+    "tokens_per_second": 0,
+    "rank_spread_pct": 0,
+    "persistent_buffer_bytes": 0
+  },
+  "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 }
+}
+```
+
+Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist.
+
+## Backend adapters
+
+Each adapter implements a small contract:
+
+```python
+class CollectiveBackend:
+    def probe(self, environment) -> CapabilityReport: ...
+    def prepare(self, case, workdir) -> PreparedCommand: ...
+    def run(self, prepared, launcher) -> RawRun: ...
+    def parse(self, raw_run) -> list[RankSample]: ...
+    def validate(self, case, raw_run) -> CorrectnessReport: ...
+    def describe(self) -> BackendProvenance: ...
+```
+
+**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)*
+
+**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port has different maturity, NIC variants, rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store:
+
+```text
+backend name, upstream/fork, git commit, API generation,
+transport backend, build flags, runtime library versions, container digest
+```
+
+**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives.
+
+## Rollout — spike-first
+
+**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow.
+
+### Milestone 0 — NVIDIA B200 + GB200 spike
+
+One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric.
+
+Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze):
+
+```text
+experimental/CollectiveX/
+  README.md
+  run_nccl.py        # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON)
+  run_deepep.py      # one dispatch+combine shape, normal mode
+  env_capture.py     # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json
+  plot.py            # matplotlib, like token_position_decode_slo/*/plot_*.py
+  launchers/
+    common.sh
+    launch_b200-dgxc.sh         # B200 single node  (b200-dgxc runner → 8-GPU NVLink island, x86_64)
+    launch_b200-dgxc-slurm.sh   # B200 multinode    (b200-multinode runner → CX-7 IB spine)
+    launch_gb200-nv.sh          # GB200             (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node)
+  results/*.json     # flat, hand-verifiable
+```
+
+Reuse existing patterns rather than reinventing: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher as the serving path does.
+
+**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it).
+
+What it measures:
+
+```text
+Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200:
+  all-reduce, all-gather, reduce-scatter, all-to-all
+  latency regime (bytes→KiB) and bandwidth regime (MiB→GiB)
+  B200  : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB)
+  GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain
+
+MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope):
+  one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168,
+  top-k=8, experts=256, dispatch fp8
+  correctness: token conservation + combine vs a reference implementation
+  B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven
+```
+
+The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.)
+
+Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible:
+
+```text
+topology-class       b200-nvlink-island(+cx7-ib)  |  gb200-nvl72-mnnvl
+transport actually used   (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior
+transport env set/recorded:
+  B200  : NCCL_CUMEM_ENABLE=1
+  GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1
+  (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK)
+comm-SM count, QP count where applicable
+backend commit + API generation + build flags
+container digest, CUDA / driver / NCCL versions
+comparison-class tag (standardized where shape, dtype and SM budget match)
+```
+
+These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200.
+
+Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet.
+
+Exit criteria:
+
+* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline)
+* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow
+* every row carries topology-class, transport, comparison-class and full provenance
+* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident
+* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts
+
+Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests).
+
+### Milestone 1 — AMD parity
+
+Bring the AMD side up against the schema the spike froze — not in parallel with it:
+
+```text
+RCCL-tests adapter (mirror the nccl-tests text-table parser)
+one AMD launcher (launch_mi355x-amds.sh)
+one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI)
+equivalent shapes + identical result contract
+first cross-vendor (NVIDIA vs AMD) comparison
+```
+
+Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless.
+
+### Milestone 2 — GitHub workflow
+
+Add (orchestration only; see GitHub workflow design below):
+
+```text
+collectivex-experimental.yml
+preflight
+canary
+matrix sharding
+artifact collection
+regression comparison
+static report artifact
+```
+
+Do not connect it to `perf-changelog.yaml`.
+
+### Milestone 3 — Trace-derived shapes
+
+Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash.
+
+### Milestone 4 — Promotion decision
+
+Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend).
+
+### First PRs (the spike)
+
+The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR:
+
+```text
+1. Scaffold + NCCL on B200 single node
+   run_nccl.py (text-table parser), env_capture.py, plot.py,
+   launchers/launch_b200-dgxc.sh, results/*.json
+   → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200
+
+2. B200 multinode + GB200
+   launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh
+   → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL),
+     each tagged with topology-class and transport (aarch64 build for GB200)
+
+3. DeepEP dispatch+combine — B200 first
+   run_deepep.py, routing generator + reference combine for correctness,
+   reusing rebuild-deepep at job setup
+   → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow
+
+4. Freeze the contract
+   extract the case / result / manifest schema from the bundles produced in 1–3;
+   add fixtures captured from real output — this is where the packaged structure begins
+```
+
+The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation.
+
+## Cluster reuse and capability inventory
+
+### What to reuse
+
+Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names.
+
+### What not to reuse directly
+
+Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead:
+
+```bash
+bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh
+```
+
+Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<enroot squash>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.)
+
+> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels.
+
+### Capability overlay
+
+`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated:
+
+```yaml
+b200-multinode:
+  launcher: b200-dgxc-slurm
+  vendor: nvidia
+  hardware: b200
+  topology-class: b200-nvlink-cx7
+  fabric-domain: b200-dgxc-main
+  gpus-per-node: 8
+  arch: x86_64
+  max-nodes: 16
+  scheduler: slurm
+  container-runtime: enroot-pyxis
+  capabilities:
+    nccl: true
+    deepep: true                # built at job setup via rebuild-deepep, not prebuilt
+    rdma: true
+    nvshmem: true
+    ibgda: experimental         # capability present ≠ currently validated
+  scheduling:
+    exclusive-nodes: true
+    max-parallel-shards: 1
+
+gb200:
+  launcher: gb200-nv
+  vendor: nvidia
+  hardware: gb200
+  topology-class: gb200-nvl72-mnnvl
+  gpus-per-node: 4              # NVL72 compute tray
+  arch: aarch64                 # nccl-tests + DeepEP must build for aarch64
+  scheduler: srt-slurm
+  transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 }
+  capabilities:
+    nccl: true
+    deepep: true                # rebuilt at setup; aarch64 path is a tracked risk
+    mnnvl: true                 # GB200/GB300 only
+    ibgda: experimental
+```
+
+`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties.
+
+**Operational coexistence with the serving sweep.** `b200-multinode` is only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization means CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI.
+
+## GitHub workflow design (Milestone 2)
+
+When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic:
+
+```text
+validate → resolve matrix → preflight canaries → benchmark shards
+→ aggregate → compare against baseline → build static report → upload artifacts
+```
+
+Triggers while on the branch:
+
+```yaml
+on:
+  push:
+    branches: [ collectivex ]
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+  pull_request:
+    paths:
+      - experimental/CollectiveX/**
+      - .github/workflows/collectivex-experimental.yml
+```
+
+Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (and comparison class / normal-LL-both / dry-run).
+
+Jobs:
+
+1. **Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.)
+2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy.
+3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures.
+4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep).
+5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation.
+6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle.
+7. **Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`.
+
+Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite.
+
+## Regression policy (Milestone 2+)
+
+A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.**
+
+```text
+regression if:
+  correctness changed pass → fail
+  OR median latency degradation exceeds max(fixed floor, cluster noise threshold)
+  OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold)
+```
+
+Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark.
+
+## Reporting, database, and frontend
+
+**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app.
+
+```bash
+python -m collectivex.report --results output/aggregate.json --output output/report/
+```
+
+```text
+report/
+├── index.html
+├── data.json
+├── assets/
+└── runs/
+    └── <case-id>.html
+```
+
+Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons:
+
+```text
+Not directly comparable:
+- different routing distribution
+- different topology class
+- different communication-SM budget
+- standardized versus backend-optimized mode
+```
+
+**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.**
+
+API routes (app):
+
+```text
+/api/v1/collectives
+/api/v1/collectives/availability
+/api/v1/collectives/history
+/api/v1/collectives/runs/:id
+/api/v1/collectives/artifacts/:id
+```
+
+Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms.
+
+**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency). Reuse `workflow_runs`, then add:
+
+```sql
+collective_workloads(id, case_id, schema_version, family, operation, shape jsonb)
+collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb)
+collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb)
+collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb,
+                   latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps,
+                   tokens_per_second, rank_spread_pct, error)
+collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb)
+collective_availability(date, hardware, cluster_id, backend, family, operation, mode)
+```
+
+Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres.
+
+## Future expansions
+
+The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies:
+
+**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain).
+
+**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD).
+
+**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. Highest narrative value — add as an experimental suite the moment it unblocks.
+
+**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins).
+
+**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX.
+
+**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product.
+
+**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads.
+
+**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends.
+
+**Productization (north star).** Static report (Milestone 2) → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) is the Milestone 4 decision.
+
+## Continuous benchmark — vision & scope
+
+Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD.
+
+### EP library landscape
+- MoRI (AMD) — https://github.com/ROCm/mori
+- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep)
+- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep
+- UCCL — https://github.com/uccl-project/uccl
+- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink)
+- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep
+
+### Shapes & axes
+- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts.
+- **Prefill vs decode** (# tokens).
+- **Normal EP vs low-latency (LL) EP.**
+- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16.
+- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376.
+- **Balanced vs unbalanced vs EPLB.**
+- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k).
+
+### Other inference collectives (later)
+- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`.
+- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`.
+
+### Reference benchmark scripts to draw from
+- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`.
+
+### Learning resources
+- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437.
+
+## Things not to do
+
+* Do not add collective fields to the existing serving matrix.
+* Do not make one GitHub Actions job per payload size.
+* Do not call all logical-bandwidth figures "bus bandwidth."
+* Do not compare different topology fingerprints as though GPU SKU were sufficient.
+* Do not silently discard failed or incorrect results.
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode).
+* Do not make low-latency mode the only reported result.
+* Do not publish one overall ranking before coverage and comparison contracts are stable.
+* Do not start with every EP library, TPU, UEC, and every model shape.
+* Do not store full raw rank samples indefinitely in Postgres.
+* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts.
+* Do not freeze the schema before the spike has produced a real artifact to freeze it from.
+
+## References (verified against the live InferenceX repo)
+
+- `experimental/README.md` — the non-core / "not official results" charter this project lives under.
+- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx).
+- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention.
+- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …).
+- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend.
+- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused.
+- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger.
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/`
diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py
new file mode 100644
index 000000000..0106c61c9
--- /dev/null
+++ b/experimental/CollectiveX/plot.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200.
+
+Loads run_nccl.py result JSONs from results/, and for each operation draws two
+panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per
+(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at
+a matched shape is the intended overlay and the spike's headline.
+
+Comparison guard (plan §Comparability): curves are only overlaid when they
+share op + dtype + comparison-class + measurement-contract. Anything else is
+reported as "not directly comparable" and skipped rather than silently mixed.
+
+    python plot.py --results-dir results --out-dir results/plots
+
+matplotlib + (optional) numpy. Run on a workstation/laptop over the JSON
+artifacts; no GPU needed.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+from collections import defaultdict
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+def _human(nbytes: int) -> str:
+    """Format a byte count as a rounded binary-unit string (B, KiB, MiB; GiB is the largest unit used)."""
+    units = ["B", "KiB", "MiB", "GiB"]
+    while len(units) > 1 and not nbytes < 1024:
+        nbytes, units = nbytes / 1024, units[1:]
+    return f"{nbytes:.0f}{units[0]}"
+
+
+def load_nccl_results(results_dir: str) -> list[dict]:  # run_nccl.py docs in results_dir/*.json, sorted by path
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        try:
+            with open(path) as fh:  # context manager: the handle is closed even if parsing fails
+                d = json.load(fh)
+        except (ValueError, OSError):  # malformed JSON (JSONDecodeError), undecodable bytes, unreadable file
+            continue
+        if isinstance(d, dict) and d.get("family") == "nccl" and d.get("rows"):  # ignore non-object JSON
+            docs.append({**d, "_path": path})  # remember which file each kept doc came from
+    return docs
+
+
+def curve_label(d: dict) -> str:  # legend text: runner · topology class · world size
+    return " · ".join((str(d["runner"]), str(d["topology_class"]), f"ws{d['world_size']}"))
+
+
+def overlay_signature(d: dict) -> tuple:
+    """Key a doc by op, dtype, comparison class and measurement contract; curves share a chart only when
+    these match. Topology and world size are deliberately left out because they are the comparison axis."""
+    return (d["op"],) + tuple(d.get(k) for k in ("dtype", "comparison_class", "measurement_contract"))
+
+
+def plot_op(op: str, docs: list[dict], out_dir: str) -> str | None:
+    """Draw latency and bus-bandwidth panels for one op; return the PNG path, or None when docs is empty."""
+    if not docs:
+        return None
+    # Comparison guard: bucket docs by overlay signature, plot the largest bucket, report the rest.
+    buckets = defaultdict(list)
+    for doc in docs:
+        buckets[overlay_signature(doc)].append(doc)
+    main_sig = max(buckets, key=lambda sig: len(buckets[sig]))
+    for sig, members in buckets.items():
+        if sig != main_sig:
+            for doc in members:
+                print(f"  [guard] skipping {curve_label(doc)} for op={op}: not directly "
+                      f"comparable (dtype/class/contract differs: {sig} vs {main_sig})")
+
+    fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5))
+    for doc in sorted(buckets[main_sig], key=curve_label):
+        points = sorted(doc["rows"], key=lambda row: row["size_bytes"])
+        xs = [row["size_bytes"] for row in points]
+        latency_us = [row["out_of_place"]["time_us"] for row in points]
+        busbw = [row["busbw_gbps"] for row in points]
+        legend = curve_label(doc)
+        ax_lat.plot(xs, latency_us, "o-", linewidth=2, markersize=4, label=legend)
+        ax_bw.plot(xs, busbw, "o-", linewidth=2, markersize=4, label=legend)
+
+    # Both panels share a log2 size axis; only the latency panel uses a log y-axis.
+    for ax in (ax_lat, ax_bw):
+        ax.set_xscale("log", base=2)
+        ax.set_xlabel("Message size (bytes)")
+        ax.grid(True, alpha=0.3)
+        ax.legend(fontsize=9)
+    ax_lat.set_yscale("log")
+    ax_lat.set_ylabel("Latency (µs, out-of-place)")
+    ax_lat.set_title(f"{op}: latency vs size")
+    ax_bw.set_ylabel("Bus bandwidth (GB/s)")
+    ax_bw.set_title(f"{op}: bus bandwidth vs size")
+    headline = (
+        f"CollectiveX · {op} · dtype={main_sig[1]} · class={main_sig[2]}  "
+        f"(topology is the comparison axis)"
+    )
+    fig.suptitle(headline, fontsize=11)
+    fig.tight_layout()
+    os.makedirs(out_dir, exist_ok=True)
+    out = os.path.join(out_dir, f"nccl_{op}.png")
+    fig.savefig(out, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    return out
+
+
+def main() -> int:
+    """CLI entry point: plot every op (or only --op) found in --results-dir; return 0 only if a plot was written."""
+    parser = argparse.ArgumentParser(description="CollectiveX primitive plots")
+    parser.add_argument("--results-dir", default="results")
+    parser.add_argument("--out-dir", default="results/plots")
+    parser.add_argument("--op", help="only plot this op")
+    args = parser.parse_args()
+
+    results = load_nccl_results(args.results_dir)
+    if not results:
+        print(f"no nccl result JSONs found in {args.results_dir}/")
+        return 1
+
+    grouped = defaultdict(list)
+    for doc in results:
+        grouped[doc["op"]].append(doc)
+
+    wrote_any = False
+    for op in ([args.op] if args.op else sorted(grouped)):
+        out = plot_op(op, grouped.get(op, []), args.out_dir)
+        if out:
+            wrote_any = True
+            print(f"wrote {out}  ({len(grouped[op])} curve(s))")
+    if wrote_any:
+        return 0
+    print("nothing plotted")
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 000000000..574afb1f0
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,9 @@
+# CollectiveX spike dependencies.
+#
+# run_nccl.py + env_capture.py : Python standard library only (run anywhere).
+# run_deepep.py                : torch + deep_ep — provided by the benchmark
+#                                container; DeepEP is built at job setup
+#                                (rebuild-deepep), NOT pinned here.
+# plot.py                      : the only thing worth a local venv:
+matplotlib
+numpy
diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep
new file mode 100644
index 000000000..8940934a2
--- /dev/null
+++ b/experimental/CollectiveX/results/.gitkeep
@@ -0,0 +1,3 @@
+# CollectiveX result bundles land here as flat *.json (one per runner×op),
+# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir
+# exists before the first run.
diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py
new file mode 100644
index 000000000..44a3ae3e0
--- /dev/null
+++ b/experimental/CollectiveX/run_deepep.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — DeepEP MoE dispatch+combine (normal mode), B200 first.
+
+One decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
+emitting the same flat-JSON provenance shape as run_nccl.py.
+
+Scope (plan §Milestone 0): normal mode only — low-latency (LL) mode is the
+known-broken/blocked IBGDA path and is out of scope for the spike. B200
+(x86_64) first; GB200 is the fast-follow once the aarch64 rebuild-deepep path
+is proven.
+
+  !!! DeepEP's Python API is VERSION-SENSITIVE (the plan notes V2 changed
+  NVSHMEM->NCCL, unified the APIs, and removed zero-SM LL mode). The
+  dispatch/combine block below follows the documented normal-mode intranode
+  API and is marked "ADAPT HERE" — validate the call signatures against the
+  DeepEP commit actually built by rebuild-deepep at job time, and record that
+  commit in provenance. Build is done at job setup, not shipped in the image.
+
+Launch (one process per GPU), e.g. single-node 8x B200:
+    torchrun --nproc_per_node=8 run_deepep.py \\
+        --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\
+        --env-json results/env.json --out results/b200_deepep.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1  # written to the result doc as "schema_version"
+MEASUREMENT_CONTRACT = "deepep-normal-v1"  # written as "measurement_contract" and hashed into comparison_key
+
+
+def _percentile(xs: list[float], q: float) -> float:
+    """Nearest-rank percentile q (0-100) of xs: a rounded, clamped index into the sorted data; NaN if xs is empty."""
+    if not xs:
+        return float("nan")
+    ordered, last = sorted(xs), len(xs) - 1
+    return ordered[min(last, max(0, int(round(q / 100.0 * last))))]
+
+
+def comparison_key(meta: dict) -> str:
+    """Hash the fields that make two DeepEP results comparable into a 16-hex-character SHA-256 prefix."""
+    order = ("op", "backend", "mode", "world_size", "nodes", "topology_class",
+             "comparison_class", "measurement_contract", "shape")
+    stringified = {"world_size", "nodes", "shape"}  # the only fields the key passes through str()
+    parts = [str(meta[k]) if k in stringified else meta[k] for k in order]
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def main() -> int:
+    """Run one DeepEP normal-mode dispatch+combine benchmark; rank 0 writes the JSON result to --out.
+    Returns 0 if every rank passes the correctness gate, 1 if any rank fails it, 3 if torch/deep_ep is missing."""
+    ap = argparse.ArgumentParser(description="CollectiveX DeepEP dispatch+combine (normal mode)")
+    # shape (decode-ish default from the plan)
+    ap.add_argument("--tokens-per-rank", type=int, default=64)
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256)
+    ap.add_argument("--dispatch-dtype", default="fp8", choices=["fp8", "bf16"])
+    ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"])
+    ap.add_argument("--seed", type=int, default=67)
+    # measurement
+    ap.add_argument("--warmup", type=int, default=20)
+    ap.add_argument("--iters", type=int, default=200)
+    ap.add_argument("--trials", type=int, default=3)
+    ap.add_argument("--num-sms", type=int, default=24, help="communication SMs (standardized budget)")
+    # provenance
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--comparison-class", default="standardized")
+    ap.add_argument("--deepep-commit", default=os.environ.get("DEEPEP_COMMIT", "unknown"))
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    # ---- imports guarded so a missing build fails loudly, not cryptically ----
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+    try:
+        from deep_ep import Buffer  # type: ignore
+    except Exception as exc:  # pragma: no cover
+        print("ERROR: deep_ep import failed — DeepEP must be built at job setup "
+              f"(rebuild-deepep). {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    # GPUs per node as exported by the launcher (torchrun sets LOCAL_WORLD_SIZE); 8 is the previous fixed value.
+    gpus_per_node = max(1, int(os.environ.get("LOCAL_WORLD_SIZE") or "8"))
+    torch.cuda.set_device(local_rank)
+    if not dist.is_initialized():
+        dist.init_process_group("nccl")
+    group = dist.group.WORLD
+    device = torch.device(f"cuda:{local_rank}")
+    torch.manual_seed(args.seed + rank)
+
+    n, H, topk, E = args.tokens_per_rank, args.hidden, args.topk, args.experts
+
+    # Input tokens + routing. Weights sum to 1 per token so that a pure
+    # dispatch->combine round trip (no expert compute) reconstructs x.
+    # NOTE(review): x is always dispatched as bf16 and --num-sms is never handed to DeepEP, yet both
+    # --dispatch-dtype and --num-sms are recorded in "shape" below — confirm that is intended.
+    x = torch.randn((n, H), dtype=torch.bfloat16, device=device)
+    if args.routing == "uniform":
+        topk_idx = torch.stack([
+            torch.randperm(E, device=device)[:topk] for _ in range(n)
+        ]).to(torch.int64)
+    else:  # zipf-ish skew toward low expert ids
+        probs = (1.0 / torch.arange(1, E + 1, device=device).float())
+        topk_idx = torch.multinomial(probs.expand(n, E), topk, replacement=False).to(torch.int64)
+    topk_weights = torch.softmax(torch.randn((n, topk), device=device, dtype=torch.float32), dim=-1)
+
+    # Buffer sizing: intranode uses NVLink buffer only (no RDMA for single node).
+    # Numbers follow DeepEP's intranode test guidance; tune per build.
+    num_nvl_bytes = 1024 * 1024 * 1024
+    num_rdma_bytes = 0
+    buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
+
+    def run_once():
+        # ===================== ADAPT HERE (DeepEP API) =======================
+        # Normal-mode intranode dispatch/combine. Signatures below match the
+        # documented DeepEP normal API; confirm against the built commit.
+        (num_tokens_per_rank, _, num_tokens_per_expert,
+         is_token_in_rank, _) = buffer.get_dispatch_layout(topk_idx, E)
+        recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = buffer.dispatch(
+            x, topk_idx=topk_idx, topk_weights=topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+        combined_x, _, _ = buffer.combine(recv_x, handle, topk_weights=recv_topk_weights)
+        # =====================================================================
+        return combined_x, num_tokens_per_expert, is_token_in_rank
+
+    # ---- correctness gate (run before timing; a fast wrong answer is invalid) ----
+    combined_x, num_tokens_per_expert, is_token_in_rank = run_once()
+    torch.cuda.synchronize()
+    expected_routed = n * topk
+    routed = int(torch.as_tensor(num_tokens_per_expert).sum().item())
+    token_conservation = (routed == expected_routed)
+    # DeepEP combine sums one copy of each token per destination RANK, so the
+    # dispatch->combine round trip reconstructs x only after dividing by the
+    # number of ranks each token was sent to (per DeepEP's own check in
+    # tests/legacy/test_intranode.py: combined_x / is_token_in_rank.sum(dim=1)).
+    ranks_per_token = is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+    check_x = combined_x.float() / ranks_per_token
+    max_abs = (check_x - x.float()).abs().max().item()
+    max_rel = (max_abs / (x.float().abs().max().item() + 1e-6))
+    combine_ok = max_rel < 2e-2  # bf16 dispatch/combine round-trip tolerance
+    correct = bool(token_conservation and combine_ok)
+    # The gate is global: MIN-reduce each rank's verdict so the status rank 0 writes and every exit code agree.
+    verdict = torch.tensor([int(correct)], device=device, dtype=torch.int32)
+    dist.all_reduce(verdict, op=dist.ReduceOp.MIN)
+    correct = bool(verdict.item())
+
+    # ---- timing (CUDA events; per-rank; reduce for slowest rank) ----
+    def time_ms(fn, warmup, iters) -> list[float]:  # despite the name, samples are returned in microseconds
+        for _ in range(warmup):
+            fn()
+        torch.cuda.synchronize()
+        out = []
+        for _ in range(iters):
+            s = torch.cuda.Event(enable_timing=True)
+            e = torch.cuda.Event(enable_timing=True)
+            s.record()
+            fn()
+            e.record()
+            torch.cuda.synchronize()
+            out.append(s.elapsed_time(e) * 1000.0)  # ms -> us
+        return out
+
+    def dispatch_only():
+        (npr, _, npe, itir, _) = buffer.get_dispatch_layout(topk_idx, E)
+        buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
+                        num_tokens_per_rank=npr, is_token_in_rank=itir,
+                        num_tokens_per_expert=npe)
+
+    trials = []
+    for _ in range(args.trials):
+        rt = time_ms(run_once, args.warmup, args.iters)      # dispatch+combine round trip
+        dp = time_ms(dispatch_only, args.warmup, args.iters)  # dispatch only
+        trials.append({
+            "roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99),
+            "dispatch_us_p50": _percentile(dp, 50),
+        })
+
+    local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
+    # slowest rank across the world
+    t = torch.tensor([local_rt_p50], device=device)
+    dist.all_reduce(t, op=dist.ReduceOp.MAX)
+    slowest_rank_us = float(t.item())
+
+    if rank == 0:
+        shape = {
+            "tokens_per_rank": n, "hidden": H, "topk": topk, "experts": E,
+            "dispatch_dtype": args.dispatch_dtype, "routing": args.routing,
+            "num_comm_sms": args.num_sms,
+        }
+        meta = {
+            "op": "dispatch-combine", "backend": "deepep", "mode": "normal",
+            "world_size": world_size, "nodes": max(1, world_size // gpus_per_node),
+            "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+            "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape,
+        }
+        env = None
+        if args.env_json and os.path.exists(args.env_json):
+            with open(args.env_json) as fh:  # close the handle instead of leaking it
+                env = json.load(fh)
+        doc = {
+            "schema_version": SCHEMA_VERSION,
+            "family": "moe",
+            "generated_by": "run_deepep.py",
+            "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+            "runner": args.runner,
+            "transport": args.transport,
+            "status": "valid" if correct else "invalid",
+            "comparison_key": comparison_key(meta),
+            "backend_provenance": {"deepep_commit": args.deepep_commit},
+            **meta,
+            "correctness": {  # "passed" is the all-rank verdict; the other fields are rank 0's local values
+                "passed": correct, "token_conservation": token_conservation,
+                "combine_within_tol": combine_ok, "max_abs_error": max_abs, "max_rel_error": max_rel,
+            },
+            "metrics": {
+                "roundtrip_us_p50": local_rt_p50,
+                "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials),
+                "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials),
+                "slowest_rank_roundtrip_us": slowest_rank_us,
+                "tokens_per_second": (n * world_size / (local_rt_p50 * 1e-6)) if local_rt_p50 else None,
+            },
+            "trials": trials,
+            "environment": env,
+        }
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(
+            f"deepep dispatch-combine: status={doc['status']} "
+            f"rt_p50={local_rt_p50:.1f}us slowest_rank={slowest_rank_us:.1f}us "
+            f"correct={correct} -> {args.out}"
+        )
+
+    dist.barrier()
+    dist.destroy_process_group()
+    return 0 if correct else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py
new file mode 100644
index 000000000..d32de9f23
--- /dev/null
+++ b/experimental/CollectiveX/run_nccl.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — NCCL primitive benchmark wrapper.
+
+Runs stock `nccl-tests` binaries (built in-container at job time — the login
+nodes have no nvcc), parses the text table (NOT JSON — we do not assume the
+build emits JSON), and writes a flat, provenance-tagged JSON result the plot
+script and the eventual schema-freeze can consume.
+
+Standard library only, so it runs in any minimal container.
+
+Run (inside the container, after building nccl-tests):
+    python run_nccl.py --op all_reduce \\
+        --nccl-tests-dir /tmp/nccl-tests/build \\
+        --world-size 8 --min-bytes 8 --max-bytes 8G \\
+        --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\
+        --env-json results/env.json --out results/b200_all_reduce.json
+
+Verify the parser offline (no GPU needed):
+    python run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \\
+        --world-size 8 --runner b200-dgxc --topology-class b200-nvlink-island \\
+        --out /tmp/parsed.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import subprocess
+import sys
+
+SCHEMA_VERSION = 1  # written to every result doc as "schema_version"
+MEASUREMENT_CONTRACT = "nccl-tests-v1"  # written as "measurement_contract" and hashed into comparison_key
+
+# op -> nccl-tests binary name; "alltoall" and "all_to_all" are aliases for the same binary.
+OP_BINARY = {
+    "all_reduce": "all_reduce_perf",
+    "all_gather": "all_gather_perf",
+    "reduce_scatter": "reduce_scatter_perf",
+    "alltoall": "alltoall_perf",
+    "all_to_all": "alltoall_perf",
+    "broadcast": "broadcast_perf",
+    "sendrecv": "sendrecv_perf",
+}
+
+
+def _f(tok: str):
+    """Convert one table cell to float; None for the placeholders 'N/A', 'n/a', '-' and for non-numeric text."""
+    if tok not in ("N/A", "n/a", "-"):
+        try:
+            return float(tok)
+        except ValueError:
+            pass
+    return None
+
+
+def parse_nccl_table(text: str) -> tuple[list[dict], dict]:
+    """Turn nccl-tests stdout into per-size rows plus a run summary.
+
+    A data line is one whose first token is all digits (the byte size) and
+    that has at least 11 tokens. Ops print different column counts, so only
+    fixed positions are read: token 1 (size), token 3 (element type) and the
+    last 8 — out-of-place then in-place (time, algbw, busbw, #wrong).
+
+    Args:
+        text: captured stdout of an nccl-tests binary.
+
+    Returns:
+        (rows, summary). Each row holds size_bytes, dtype, the out_of_place
+        and in_place metric dicts, the larger busbw_gbps of the two, and
+        correct (None when neither #wrong cell is numeric). summary holds
+        avg_busbw_gbps, out_of_bounds and check_passed read from the '#'
+        trailer lines; all three default to None.
+    """
+    rows: list[dict] = []
+    summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None}
+
+    def placement(cells: list[str]) -> dict:
+        """Map one (time, algbw, busbw, #wrong) cell group to its metric dict."""
+        return {
+            "time_us": _f(cells[0]),
+            "algbw_gbps": _f(cells[1]),
+            "busbw_gbps": _f(cells[2]),
+            "wrong": _f(cells[3]),
+        }
+
+    for raw in text.splitlines():
+        line = raw.strip()
+        if not line:
+            continue
+        if line.startswith("#"):
+            # Comment lines only feed the summary.
+            value = line.split(":")[-1].strip()
+            if "Avg bus bandwidth" in line:
+                summary["avg_busbw_gbps"] = _f(value)
+            elif "Out of bounds values" in line:
+                summary["out_of_bounds"] = value
+                summary["check_passed"] = value.endswith("OK")
+            continue
+        toks = line.split()
+        # Anything that is not size + count + type + 8 metric cells is not a data line.
+        if len(toks) < 11 or not toks[0].isdigit():
+            continue
+        oop, inp = placement(toks[-8:-4]), placement(toks[-4:])
+        bws = [b for b in (oop["busbw_gbps"], inp["busbw_gbps"]) if b is not None]
+        wrongs = (oop["wrong"], inp["wrong"])
+        rows.append(
+            {
+                "size_bytes": int(toks[0]),
+                "dtype": toks[2],
+                "out_of_place": oop,
+                "in_place": inp,
+                # convenience: best (max) busbw across the two placements
+                "busbw_gbps": max(bws, default=None),
+                "correct": None if wrongs == (None, None) else all((w or 0) == 0 for w in wrongs),
+            }
+        )
+    return rows, summary
+
+
+def comparison_key(meta: dict) -> str:
+    """Machine key gating which rows may share a curve (see plan §Comparability).
+
+    Topology-class is intentionally part of the key, so B200(IB) and
+    GB200(MNNVL) are labelled distinct rather than silently overlaid.
+
+    Every field is passed through str() so a failed run whose dtype is None
+    (no rows parsed) still gets a key instead of raising TypeError in join();
+    keys for runs with string fields are unchanged. Returns the first 16 hex
+    characters of the SHA-256 of the '|'-joined fields.
+    """
+    fields = ("op", "dtype", "world_size", "nodes", "topology_class",
+              "comparison_class", "measurement_contract")
+    joined = "|".join(str(meta[name]) for name in fields)
+    return hashlib.sha256(joined.encode()).hexdigest()[:16]
+
+
+def build_command(args, binary_path: str) -> list[str]:
+    """Build the argv: optional launch prefix, the binary, the nccl-tests flags, then any extra args."""
+    import shlex  # local import; shlex keeps a quoted prefix/extra argument containing spaces as one token
+    cmd: list[str] = shlex.split(args.launch_prefix) if args.launch_prefix else []
+    cmd += [
+        binary_path,
+        "-b", str(args.min_bytes),
+        "-e", str(args.max_bytes),
+        "-f", str(args.factor),
+        "-g", str(args.gpus_per_proc),
+        "-c", str(args.check),
+        "-w", str(args.warmup),
+        "-n", str(args.iters),
+    ]
+    if args.extra_args:
+        cmd += shlex.split(args.extra_args)
+    return cmd
+
+
+def main() -> int:
+    """CLI entry point: run one nccl-tests binary (or parse a capture via --parse-only) and write the result JSON.
+    Returns 0 when the result is valid, 1 when it is invalid, and 2 when the requested binary does not exist."""
+    parser = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner")
+    add = parser.add_argument
+    add("--op", required=True, choices=sorted(OP_BINARY))
+    add("--nccl-tests-dir", help="dir containing <op>_perf binaries (build/)")
+    add("--parse-only", help="parse this captured stdout file instead of running")
+    # nccl-tests knobs
+    add("--min-bytes", default="8")
+    add("--max-bytes", default="8G")
+    add("--factor", type=int, default=2, help="size step factor")
+    add("--gpus-per-proc", type=int, default=8,
+        help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.")
+    add("--check", type=int, default=1, help="-c: 1 enables correctness check")
+    add("--warmup", type=int, default=5)
+    add("--iters", type=int, default=20)
+    add("--extra-args", default="", help="extra args appended to the binary")
+    add("--launch-prefix", default="",
+        help="e.g. 'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode")
+    # provenance
+    add("--runner", required=True, help="runner label, e.g. b200-dgxc")
+    add("--world-size", type=int, required=True, help="total ranks/GPUs in the run")
+    add("--nodes", type=int, default=1)
+    add("--topology-class", required=True,
+        help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl")
+    add("--transport", default="", help="observed transport label: nvlink | ib | mnnvl")
+    add("--comparison-class", default="standardized",
+        choices=["standardized", "backend-optimized", "framework-integrated"])
+    add("--env-json", help="path to env_capture.py output to embed")
+    add("--timestamp", help="ISO timestamp (default now)")
+    add("--out", required=True)
+    args = parser.parse_args()
+
+    binary, command, ran_ok = OP_BINARY[args.op], None, True
+    if args.parse_only:
+        with open(args.parse_only) as fh:
+            stdout = fh.read()
+    else:
+        if not args.nccl_tests_dir:
+            parser.error("--nccl-tests-dir is required unless --parse-only is given")
+        binary_path = os.path.join(args.nccl_tests_dir, binary)
+        if not os.path.exists(binary_path):
+            print(f"ERROR: binary not found: {binary_path}", file=sys.stderr)
+            return 2
+        command = build_command(args, binary_path)
+        print("running:", " ".join(command), file=sys.stderr)
+        proc = subprocess.run(command, capture_output=True, text=True, check=False)
+        stdout, ran_ok = proc.stdout, proc.returncode == 0
+        if not ran_ok:
+            for chunk in (stdout, proc.stderr, f"ERROR: {binary} exited {proc.returncode}"):
+                print(chunk, file=sys.stderr)
+
+    rows, summary = parse_nccl_table(stdout)
+
+    meta = {
+        "op": args.op,
+        "dtype": rows[0]["dtype"] if rows else None,
+        "world_size": args.world_size,
+        "nodes": args.nodes,
+        "topology_class": args.topology_class,
+        "comparison_class": args.comparison_class,
+        "measurement_contract": MEASUREMENT_CONTRACT,
+    }
+
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+
+    # Valid only if the run succeeded, produced rows, and the nccl-tests check did not report a failure.
+    is_valid = ran_ok and bool(rows) and summary.get("check_passed") is not False
+    doc = {
+        "schema_version": SCHEMA_VERSION,
+        "family": "nccl",
+        "generated_by": "run_nccl.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner,
+        "binary": binary,
+        "command": " ".join(command) if command else f"<parse-only {args.parse_only}>",
+        "transport": args.transport,
+        "status": "valid" if is_valid else "invalid",
+        "comparison_key": comparison_key(meta),
+        **meta,
+        "summary": summary,
+        "num_rows": len(rows),
+        "rows": rows,
+        "environment": env,
+    }
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+
+    print(
+        f"{args.op}: parsed {len(rows)} sizes -> {args.out} "
+        f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, "
+        f"key={doc['comparison_key']})"
+    )
+    return 0 if is_valid else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
new file mode 100644
index 000000000..c8825164e
--- /dev/null
+++ b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt
@@ -0,0 +1,50 @@
+# nThread 1 nGpus 8 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0
+#
+# Using devices
+#  Rank  0 Group  0 Pid  12345 on    b200-node device  0 [0x1b] NVIDIA B200
+#  Rank  1 Group  0 Pid  12345 on    b200-node device  1 [0x43] NVIDIA B200
+#  Rank  2 Group  0 Pid  12345 on    b200-node device  2 [0x52] NVIDIA B200
+#  Rank  3 Group  0 Pid  12345 on    b200-node device  3 [0x61] NVIDIA B200
+#  Rank  4 Group  0 Pid  12345 on    b200-node device  4 [0x9d] NVIDIA B200
+#  Rank  5 Group  0 Pid  12345 on    b200-node device  5 [0xc3] NVIDIA B200
+#  Rank  6 Group  0 Pid  12345 on    b200-node device  6 [0xd1] NVIDIA B200
+#  Rank  7 Group  0 Pid  12345 on    b200-node device  7 [0xdf] NVIDIA B200
+#
+#                                                              out-of-place                       in-place
+#       size         count      type   redop    root     time   algbw   busbw #wrong     time   algbw   busbw #wrong
+#        (B)    (elements)                               (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+           8             2     float     sum      -1     9.62    0.00    0.00      0     9.60    0.00    0.00      0
+          16             4     float     sum      -1     9.61    0.00    0.00      0     9.59    0.00    0.00      0
+          32             8     float     sum      -1     9.63    0.00    0.00      0     9.62    0.00    0.00      0
+          64            16     float     sum      -1     9.60    0.00    0.00      0     9.58    0.00    0.00      0
+         128            32     float     sum      -1     9.64    0.01    0.02      0     9.63    0.01    0.02      0
+         256            64     float     sum      -1     9.66    0.03    0.05      0     9.64    0.03    0.05      0
+         512           128     float     sum      -1     9.69    0.05    0.09      0     9.67    0.05    0.09      0
+        1024           256     float     sum      -1     9.74    0.11    0.18      0     9.72    0.11    0.18      0
+        2048           512     float     sum      -1     9.82    0.21    0.37      0     9.80    0.21    0.37      0
+        4096          1024     float     sum      -1     9.97    0.41    0.72      0     9.95    0.41    0.72      0
+        8192          2048     float     sum      -1    10.22    0.80    1.40      0    10.20    0.80    1.40      0
+       16384          4096     float     sum      -1    10.81    1.52    2.65      0    10.79    1.52    2.65      0
+       32768          8192     float     sum      -1    11.93    2.75    4.81      0    11.90    2.75    4.81      0
+       65536         16384     float     sum      -1    13.62    4.81    8.42      0    13.59    4.82    8.43      0
+      131072         32768     float     sum      -1    16.94    7.74   13.54      0    16.90    7.76   13.57      0
+      262144         65536     float     sum      -1    23.14   11.33   19.83      0    23.10   11.35   19.86      0
+      524288        131072     float     sum      -1    35.62   14.72   25.76      0    35.55   14.75   25.81      0
+     1048576        262144     float     sum      -1    60.40   17.36   30.38      0    60.30   17.39   30.43      0
+     2097152        524288     float     sum      -1    76.50   27.41   47.97      0    76.40   27.45   48.04      0
+     4194304       1048576     float     sum      -1   110.20   38.06   66.61      0   110.05   38.11   66.70      0
+     8388608       2097152     float     sum      -1   165.80   50.60   88.55      0   165.60   50.66   88.65      0
+    16777216       4194304     float     sum      -1   250.10   67.08  117.40      0   249.80   67.16  117.54      0
+    33554432       8388608     float     sum      -1   360.50   93.08  162.90      0   360.10   93.18  163.07      0
+    67108864      16777216     float     sum      -1   520.80  128.85  225.50      0   520.20  129.00  225.75      0
+   134217728      33554432     float     sum      -1   720.30  186.34  326.10      0   719.50  186.55  326.46      0
+   268435456      67108864     float     sum      -1  1080.50  248.43  434.80      0  1079.20  248.73  435.27      0
+   536870912     134217728     float     sum      -1  1990.20  269.76  472.10      0  1988.50  269.99  472.49      0
+  1073741824     268435456     float     sum      -1  3940.60  272.48  476.84      0  3938.10  272.65  477.14      0
+  2147483648     536870912     float     sum      -1  7850.10  273.56  478.73      0  7846.20  273.69  478.96      0
+  4294967296    1073741824     float     sum      -1 15680.50  273.91  479.34      0 15673.80  274.03  479.55      0
+  8589934592    2147483648     float     sum      -1 31250.80  274.87  481.02      0 31238.10  274.98  481.22      0
+#
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 168.42
+#

From b7ed913b66905c0e380fa82495b7741ad3280473 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 15:55:30 +0800
Subject: [PATCH 02/17] CollectiveX: import container by multi-arch tag, fix CI
 import hang
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GB200 on:push smoke hung 25 min in enroot import: a bare digest ref (repo@sha256:) can't form an anonymous Docker Hub token scope, so enroot prompted for a password and blocked in non-interactive CI. Import by the multi-arch TAG instead (anonymous auth works, same as the serving launchers) and add </dev/null so a missing token fails fast rather than hanging.

Use v0.5.11-cu130 (multi-arch amd64+arm64, index sha256:061fb71f…): v0.5.12-cu130's 62 layers overflow enroot's overlay-based squash creation on these nodes (failed to mount overlay … Invalid argument). v0.5.11-cu130 imports cleanly and is pre-staged on GB200.
---
 experimental/CollectiveX/CONTAINERS.md       |  9 +++---
 experimental/CollectiveX/README.md           |  8 +++--
 experimental/CollectiveX/launchers/common.sh | 31 ++++++++++++--------
 experimental/CollectiveX/plan.md             |  2 +-
 4 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 94ab7377f..3aff25194 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -6,15 +6,16 @@ comparison is truly same-image. Set in `launchers/common.sh` (`cx_default_image`
 
 ## Default container (all NVIDIA SKUs)
 
-- **Image (pin by digest):** `lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356` — the OCI image index for tag `v0.5.12-cu130`.
-- **Multi-arch manifest list:** linux/amd64 (`sha256:015f39a4…`) + linux/arm64 (`sha256:7a76819e…`). One digest; `enroot import` on each host pulls the matching arch. **Use the digest-only ref** (`repo@sha256:`) in `common.sh` — enroot 400s on a combined `tag@sha256:` reference.
-- **Importing needs registry creds:** anonymous Docker Hub pulls return 401 in ad-hoc SSH sessions; the CI runners import with their configured credentials (the serving sweeps pull images routinely), and already-staged squashes need no import. The refactored launcher path was validated on the already-staged `v0.5.11-cu130` (same multi-arch cu130 line).
+- **Image:** import by tag **`lmsysorg/sglang:v0.5.11-cu130`** (multi-arch OCI index). Expected index digest, recorded for provenance/verification: `sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975`.
+- **Multi-arch manifest list:** linux/amd64 + linux/arm64; `enroot import` on each host pulls the matching arch.
+- **Import by TAG, not digest.** enroot builds its anonymous Docker Hub token scope from the *tag* and succeeds (no creds needed — same as the serving launchers). A bare `repo@sha256:` ref makes enroot prompt for a password and **hang** in non-interactive CI; a combined `tag@sha256:` ref 400s. `cx_ensure_squash` therefore imports by tag with `</dev/null` (a missing token fails fast instead of hanging). First import is multi-GB (~minutes); subsequent runs reuse the staged squash.
+- **Why v0.5.11-cu130 (chosen):** it's the newest cu130 release **pre-staged on BOTH clusters** — B200 `/home/sa-shared/containers/` (amd64 squash) and GB200 `/mnt/lustre01/users-public/sa-shared/` (arm64 squash), same filename — so neither side imports at all. (Shared cu130 multi-arch squashes across both clusters: v0.5.8.post1, v0.5.9, v0.5.11 — v0.5.11 is newest.) `v0.5.12-cu130` is staged on B200 but **not** GB200: its 62 layers overflow enroot's overlay-based squash creation on the GB200 kernel (`enroot-mksquashovlfs: failed to mount overlay … Invalid argument`), so it can't be the shared default.
 - **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP.
 - **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below.
 
 ## Audited reference (cu130 lineage)
 
-Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.12-cu130` should match closely (same cu130 base); reconfirm on first run:
+Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.11-cu130` should match closely (same cu130 base); reconfirm on first run:
 
 | Component | Version |
 |---|---|
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 3b18c048d..4fb871bf1 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -63,9 +63,11 @@ python3 plot.py --results-dir results --out-dir results/plots   # needs matplotl
 
 ## Container
 
-One **multi-arch, digest-pinned** image for all NVIDIA SKUs:
-`lmsysorg/sglang:v0.5.12-cu130@sha256:4219…f356` (amd64 + arm64). See
-`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the digest-pinned
+One **multi-arch** image for all NVIDIA SKUs, imported by tag
+`lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…`
+recorded for provenance). Imported by tag, not digest — enroot's anonymous
+Docker Hub auth needs a tag, and a bare digest ref hangs in CI. See
+`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the bundled-DeepEP
 DeepSeek-V4 fallback images.
 
 ## How it runs (confirmed against the live clusters)
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index 445cdb5ca..f3997cf9e 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -8,17 +8,21 @@
 cx_log() { printf '[collectivex] %s\n' "$*" >&2; }
 cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; }
 
-# Single multi-arch, digest-pinned container for ALL NVIDIA SKUs.
-# This is the OCI image index for tag `v0.5.12-cu130`, covering BOTH linux/amd64
-# (B200) and linux/arm64 (GB200); enroot import on each host pulls the matching
-# arch from the index. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
-# Pinned by DIGEST ONLY (no tag): enroot mis-parses a combined `tag@sha256` ref
-# and 400s at auth, so we use `repo@sha256:` — also the stricter pin.
-# NOTE: DeepEP is NOT bundled here -> run_in_container.sh builds it via
-# rebuild-deepep at job setup. (The arch-specific deepseek-v4-{blackwell,
-# grace-blackwell} images DO bundle DeepEP — see CONTAINERS.md — but are not
-# multi-arch and are not used by default.)
-CX_IMAGE_MULTIARCH="lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356"
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built
+# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and
+# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected
+# multi-arch index digest is recorded for provenance/verification:
+CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.)
+# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep.
+# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle
+# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.)
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
 
 cx_default_image() {
   case "$1" in
@@ -44,7 +48,10 @@ cx_ensure_squash() {
     else
       cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)"
       rm -f "$sq"
-      enroot import -o "$sq" "docker://$image" >&2 || cx_die "enroot import failed for $image"
+      # </dev/null: never block on enroot's interactive password prompt (a missing
+      # anonymous token must fail fast, not hang the CI job).
+      enroot import -o "$sq" "docker://$image" </dev/null >&2 \
+        || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)"
       unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq"
     fi
   ) 9>"$locks/${key}.lock"
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
index 365b23455..6ceb512ef 100644
--- a/experimental/CollectiveX/plan.md
+++ b/experimental/CollectiveX/plan.md
@@ -28,7 +28,7 @@ Existing public benchmarks don't offer trustworthy, like-for-like collective/EP
 
 The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that:
 
-- **Multi-arch, digest-pinned container** for all NVIDIA SKUs: `lmsysorg/sglang:v0.5.12-cu130@sha256:4219…f356` (amd64 + arm64) — one reference both arches; DeepEP via `rebuild-deepep`. See `CONTAINERS.md`.
+- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
 - **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`.
 - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
 

From ccfae8edc8a027516742603f464ffd00731fbebc Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:03:48 +0800
Subject: [PATCH 03/17] CollectiveX: copy staged results back to checkout for
 artifact upload

On the GB200 Actions path, CX_STAGE_DIR makes the launcher rsync the tree to compute-visible Lustre and the container writes results/ there; upload-artifact reads the checkout's results/ (empty), so the green smoke produced no artifact. Add cx_collect_results to copy result JSONs from the stage dir back to the checkout after the run (no-op when no staging was used).
---
 experimental/CollectiveX/launchers/common.sh        | 13 +++++++++++++
 .../CollectiveX/launchers/launch_b200-dgxc.sh       |  1 +
 .../CollectiveX/launchers/launch_gb200-nv.sh        |  1 +
 3 files changed, 15 insertions(+)

diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index f3997cf9e..d8d5749eb 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -77,6 +77,19 @@ cx_stage_repo() {
   echo "$stage_dir"
 }
 
+# cx_collect_results <mount_src> <repo_root>
+# When the run used a staged (compute-visible) mount, copy result JSONs back to
+# the original checkout's results/ so the workflow's upload-artifact (which reads
+# the checkout, not the stage dir) finds them. No-op when no staging was used.
+cx_collect_results() {
+  local mount_src="$1" repo_root="$2" rel="experimental/CollectiveX/results" dst
+  [ "$mount_src" = "$repo_root" ] && return 0   # same tree: nothing was staged, nothing to copy
+  dst="$repo_root/$rel"; mkdir -p "$dst"
+  # Still best-effort (always returns 0), but only claim success when cp actually copied files.
+  if cp "$mount_src/$rel/"*.json "$dst/" 2>/dev/null; then cx_log "copied results from stage dir -> $dst (for artifact upload)"
+  else cx_log "WARN: copying result JSONs from $mount_src/$rel failed or found none (artifact may be empty)"; fi
+}
+
 # cx_build_nccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
 # Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built.
 # CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
index a1b5c0135..29e4eea56 100644
--- a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -61,4 +61,5 @@ srun --jobid="$JOB_ID" \
   --no-container-entrypoint --export=ALL \
   bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
 
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
 cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
index 35cdb8e28..8b24a710d 100644
--- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -64,4 +64,5 @@ srun --jobid="$JOB_ID" \
   --no-container-entrypoint --export=ALL \
   bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
 
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
 cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"

From b3841719bd6e9fec538059d701da16011c29c5e5 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:23:27 +0800
Subject: [PATCH 04/17] CollectiveX: per-job summary table + address PR review
 findings

Add summarize.py (compact NCCL/DeepEP results table, printed at end of every job) and make it the result gate. Fix review findings: benchmark failures/skipped-deepep now fail the job instead of reporting green (#1); DeepEP nodes from SLURM_NNODES not world_size//8 (#3); apply Buffer.set_num_sms so num_comm_sms is real (#8); nccl-tests -c 1 with a missing check footer is now invalid (#7); use context managers for file reads (#4,#5); launchers export COLLECTIVEX_IMAGE/_DIGEST for provenance (#9); trim workflow_dispatch sku options to launcher-backed pools (#2). Artifact-path finding (#6) already fixed via cx_collect_results.
---
 .../workflows/collectivex-experimental.yml    |   6 +-
 .../launchers/launch_b200-dgxc-slurm.sh       |   2 +
 .../CollectiveX/launchers/launch_b200-dgxc.sh |   2 +
 .../CollectiveX/launchers/launch_gb200-nv.sh  |   2 +
 .../CollectiveX/launchers/run_in_container.sh |  42 ++++---
 experimental/CollectiveX/plot.py              |   3 +-
 experimental/CollectiveX/run_deepep.py        |  12 +-
 experimental/CollectiveX/run_nccl.py          |   3 +-
 experimental/CollectiveX/summarize.py         | 119 ++++++++++++++++++
 9 files changed, 167 insertions(+), 24 deletions(-)
 create mode 100644 experimental/CollectiveX/summarize.py

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index 6b07c2d56..4446473e9 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -17,10 +17,12 @@ on:
   workflow_dispatch:
     inputs:
       sku:
-        description: Self-hosted runner pool (label from .github/configs/runners.yaml)
+        # Only SKUs with a matching launchers/launch_<prefix>.sh are offered —
+        # runner.name's prefix selects the script, so an SKU without one fails.
+        description: Self-hosted runner pool (must have a CollectiveX launcher)
         type: choice
         default: gb200
-        options: [gb200, b200, b200-multinode, b300, gb300]
+        options: [gb200, b200-dgxc, b200-multinode]
       benchmark:
         description: Which benchmark to run
         type: choice
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
index a58411343..e5add9189 100644
--- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -40,6 +40,8 @@ TOPO="b200-nvlink-island+cx7-ib"
 WORLD=$((NODES * GPUS_PER_NODE))
 MPI_FLAG="${CX_SRUN_MPI:-pmix}"
 export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance (propagated via --export=ALL).
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
 
 declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
                  [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
index 29e4eea56..42d860975 100644
--- a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh
@@ -35,6 +35,8 @@ export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
 export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink"
 export CX_BENCH="${CX_BENCH:-nccl}"
 export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
 export NCCL_CUMEM_ENABLE=1
 
 cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH"
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
index 8b24a710d..60d5b297d 100644
--- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -37,6 +37,8 @@ export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
 export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
 export CX_BENCH="${CX_BENCH:-nccl}"
 export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
+# Record container identity in env_capture provenance.
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
 # Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded.
 export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
 
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
index 7729528b2..cde27ac1c 100644
--- a/experimental/CollectiveX/launchers/run_in_container.sh
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -30,45 +30,51 @@ cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX
 python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"
 
 run_nccl_suite() {
-  local build ops op
-  build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)"   # single-node: MPI=0, -g N
+  local build ops op sfail=0
+  build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1   # single-node: MPI=0, -g N
   ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
   for op in $ops; do
-    python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \
-      --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \
-      --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
-      --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \
-      --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1 \
-      || cx_log "WARN: nccl $op failed"
+    if ! python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \
+        --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \
+        --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+        --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \
+        --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then
+      cx_log "WARN: nccl $op failed or invalid"; sfail=1
+    fi
   done
+  return "$sfail"
 }
 
 run_deepep_suite() {
   # DeepEP is not bundled in the multi-arch image. Try to import; if absent,
-  # attempt rebuild-deepep (srt-slurm setup script) when available, else skip.
+  # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a
+  # failure, not a silent skip — the caller asked for deepep.
   if ! python3 -c "import deep_ep" 2>/dev/null; then
     if command -v rebuild-deepep.sh >/dev/null 2>&1; then
       cx_log "building DeepEP via rebuild-deepep.sh"
-      rebuild-deepep.sh >&2 || cx_log "WARN: rebuild-deepep.sh failed"
+      rebuild-deepep.sh >&2 || { cx_log "WARN: rebuild-deepep.sh failed"; return 1; }
     else
-      cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; skipping deepep"
-      return 0
+      cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep"
+      return 1
     fi
   fi
   torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \
     --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
     --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \
     --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
-    --dispatch-dtype "${CX_DISPATCH_DTYPE:-fp8}" \
+    --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" \
     --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \
-    || cx_log "WARN: deepep run failed"
+    || { cx_log "WARN: deepep run failed"; return 1; }
 }
 
+rc=0
 case "$CX_BENCH" in
-  nccl)   run_nccl_suite ;;
-  deepep) run_deepep_suite ;;
-  all)    run_nccl_suite; run_deepep_suite ;;
+  nccl)   run_nccl_suite || rc=1 ;;
+  deepep) run_deepep_suite || rc=1 ;;
+  all)    run_nccl_suite || rc=1; run_deepep_suite || rc=1 ;;
   *)      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;;
 esac
 
-echo "=== results ==="; ls -1 results/*.json
+# Summary table for the log; also fails the job if no valid results were produced.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1
+exit "$rc"
diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py
index 0106c61c9..c24136ebc 100644
--- a/experimental/CollectiveX/plot.py
+++ b/experimental/CollectiveX/plot.py
@@ -40,7 +40,8 @@ def load_nccl_results(results_dir: str) -> list[dict]:
     docs = []
     for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
         try:
-            d = json.load(open(path))
+            with open(path) as _f:
+                d = json.load(_f)
         except (json.JSONDecodeError, OSError):
             continue
         if d.get("family") == "nccl" and d.get("rows"):
diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py
index 44a3ae3e0..3d61c69e4 100644
--- a/experimental/CollectiveX/run_deepep.py
+++ b/experimental/CollectiveX/run_deepep.py
@@ -126,6 +126,13 @@ def main() -> int:
     num_nvl_bytes = 1024 * 1024 * 1024
     num_rdma_bytes = 0
     buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
+    # Apply the standardized communication-SM budget so the recorded
+    # num_comm_sms reflects the actual run (best-effort across DeepEP versions).
+    try:
+        Buffer.set_num_sms(args.num_sms)
+    except Exception as exc:  # pragma: no cover - API/version dependent
+        if rank == 0:
+            print(f"WARN: could not set num_sms={args.num_sms}: {exc!r}", file=sys.stderr)
 
     def run_once():
         # ===================== ADAPT HERE (DeepEP API) =======================
@@ -207,7 +214,7 @@ def dispatch_only():
         }
         meta = {
             "op": "dispatch-combine", "backend": "deepep", "mode": "normal",
-            "world_size": world_size, "nodes": max(1, world_size // 8),
+            "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
             "topology_class": args.topology_class, "comparison_class": args.comparison_class,
             "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape,
         }
@@ -215,7 +222,8 @@ def dispatch_only():
         rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
         env = None
         if args.env_json and os.path.exists(args.env_json):
-            env = json.load(open(args.env_json))
+            with open(args.env_json) as _fh:
+                env = json.load(_fh)
         doc = {
             "schema_version": SCHEMA_VERSION,
             "family": "moe",
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py
index d32de9f23..993c0c06d 100644
--- a/experimental/CollectiveX/run_nccl.py
+++ b/experimental/CollectiveX/run_nccl.py
@@ -236,7 +236,8 @@ def main() -> int:
         "binary": binary,
         "command": " ".join(command) if command else f"<parse-only {args.parse_only}>",
         "transport": args.transport,
-        "status": "valid" if (summary.get("check_passed") in (True, None) and ran_ok and rows) else "invalid",
+        "status": ("valid" if (rows and ran_ok and (summary.get("check_passed") is True
+                   or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"),
         "comparison_key": comparison_key(meta),
         **meta,
         "summary": summary,
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 000000000..bb439dcb4
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""CollectiveX — print a compact summary table of a run's results.
+
+Reads the result JSONs a job produced (filtered by runner + timestamp when
+given) and prints one table per family (NCCL primitives, MoE/DeepEP). Runs at
+the end of every job (from run_in_container.sh) so the Slurm/Actions log shows a
+digestible table, not just file paths.
+
+Doubles as a result gate: exits non-zero if no valid results were produced (so a
+benchmark that failed/skipped doesn't get reported as a green job).
+
+    python summarize.py --results-dir results --runner gb200-nv_1 --ts <ts>
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+
+
+def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]:
+    docs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))):
+        base = os.path.basename(path)
+        if base.startswith("env_"):
+            continue
+        if runner and not base.startswith(f"{runner}_"):
+            continue
+        if ts and ts not in base:
+            continue
+        try:
+            with open(path) as fh:
+                d = json.load(fh)
+        except (json.JSONDecodeError, OSError):
+            continue
+        if d.get("family") in ("nccl", "moe"):
+            d["_base"] = base
+            docs.append(d)
+    return docs
+
+
+def _peak_busbw(rows: list[dict]) -> float:
+    return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
+
+
+def _min_lat(rows: list[dict]) -> float:
+    vals = [r["out_of_place"]["time_us"] for r in rows
+            if r.get("out_of_place", {}).get("time_us") is not None]
+    return min(vals) if vals else float("nan")
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX result summary table")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--runner", default=None)
+    ap.add_argument("--ts", default=None)
+    args = ap.parse_args()
+
+    docs = load_results(args.results_dir, args.runner, args.ts)
+    nccl = [d for d in docs if d["family"] == "nccl"]
+    moe = [d for d in docs if d["family"] == "moe"]
+
+    hdr = "CollectiveX results"
+    if docs:
+        d0 = docs[0]
+        hdr += (f" — runner={d0.get('runner')} topology={d0.get('topology_class')}"
+                f" transport={d0.get('transport')}")
+    print("\n" + "=" * len(hdr))
+    print(hdr)
+    print("=" * len(hdr))
+
+    n_valid = 0
+
+    if nccl:
+        ws = nccl[0].get("world_size")
+        print(f"\nNCCL primitives (world={ws}, dtype={nccl[0].get('dtype')}):")
+        print(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}")
+        print(f"  {'':<16}{'':<9}{'(GB/s)':>12}{'(us)':>10}{'(GB/s)':>11}")
+        for d in sorted(nccl, key=lambda x: x["op"]):
+            rows = d.get("rows", [])
+            n_valid += d.get("status") == "valid"
+            avg = (d.get("summary") or {}).get("avg_busbw_gbps")
+            print(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
+                  f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
+
+    if moe:
+        print("\nMoE / DeepEP dispatch+combine:")
+        print(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}"
+              f"{'disp_p50':>10}{'tokens/s':>13}{'  correct'}")
+        print(f"  {'':<10}{'':<8}{'':<9}{'(us)':>9}{'(us)':>9}{'(us)':>10}{'':>13}")
+        for d in sorted(moe, key=lambda x: x.get("backend", "")):
+            m = d.get("metrics", {})
+            c = d.get("correctness", {})
+            n_valid += d.get("status") == "valid"
+            tps = m.get("tokens_per_second")
+            print(f"  {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}"
+                  f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}"
+                  f"{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}"
+                  f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}"
+                  f"{(tps if tps is not None else float('nan')):>13.3e}"
+                  f"   {c.get('passed')}")
+
+    total = len(docs)
+    print(f"\n{n_valid}/{total} results valid.\n")
+    if total == 0:
+        print("ERROR: no result files found to summarize — benchmark produced nothing.")
+        return 1
+    if n_valid == 0:
+        print("ERROR: no valid results — failing the job.")
+        return 1
+    if n_valid < total:
+        print(f"WARNING: {total - n_valid} result(s) invalid.")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From f48daed804fc07174f7b5fc153ac6da21708833d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:50:12 +0800
Subject: [PATCH 05/17] CollectiveX: render results as a GitHub Actions job
 summary

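summarize.py --markdown emits GitHub-flavored markdown tables (NCCL +
DeepEP); a per-job 'Results summary' workflow step appends them to
$GITHUB_STEP_SUMMARY so the run page shows rendered tables (GitHub's
job-summaries feature). Markdown mode is reporting-only and always
exits 0.

Plain-text mode still drives the in-container result gate. Its exit
codes are unchanged, but a run with any invalid result is now reported
as ERROR rather than WARNING, matching the non-zero exit it already
produced.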
---
 .../workflows/collectivex-experimental.yml    |   6 +
 experimental/CollectiveX/README.md            |   6 +-
 experimental/CollectiveX/summarize.py         | 145 +++++++++++-------
 3 files changed, 99 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index 4446473e9..c63b56635 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -73,6 +73,9 @@ jobs:
         env:
           RUNNER_NAME: ${{ runner.name }}
         run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
@@ -101,6 +104,9 @@ jobs:
         env:
           RUNNER_NAME: ${{ runner.name }}
         run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+      - name: Results summary
+        if: always()
+        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 4fb871bf1..606eeb395 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -31,11 +31,13 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 
 - **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle
   capacity; never auto-contends with the B200 serving sweep).
-- **workflow_dispatch** → pick `sku` (gb200 / b200 / b200-multinode / …),
+- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode),
   `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's
   self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`.
 
-(The workflow only fires once the branch is pushed to GitHub.)
+Each job renders a results table to the **GitHub Actions job summary** (via
+`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
+as an artifact. (The workflow only fires once the branch is pushed to GitHub.)
 
 ### Directly on a cluster login node
 
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
index bb439dcb4..8d81b13ee 100644
--- a/experimental/CollectiveX/summarize.py
+++ b/experimental/CollectiveX/summarize.py
@@ -1,15 +1,17 @@
 #!/usr/bin/env python3
-"""CollectiveX — print a compact summary table of a run's results.
+"""CollectiveX — summarize a run's results.
 
-Reads the result JSONs a job produced (filtered by runner + timestamp when
-given) and prints one table per family (NCCL primitives, MoE/DeepEP). Runs at
-the end of every job (from run_in_container.sh) so the Slurm/Actions log shows a
-digestible table, not just file paths.
-
-Doubles as a result gate: exits non-zero if no valid results were produced (so a
-benchmark that failed/skipped doesn't get reported as a green job).
+Two output modes over the same data:
+  (default)    a plain-text table for the Slurm/container log; ALSO the result
+               gate — exits non-zero if no valid results were produced, so a
+               failed/skipped benchmark doesn't get reported as a green job.
+  --markdown   GitHub-flavored markdown for a GitHub Actions job summary
+               (https://github.blog/.../supercharging-github-actions-with-job-summaries/);
+               reporting only, always exits 0. A workflow step appends this to
+               $GITHUB_STEP_SUMMARY so the run page shows a rendered table.
 
     python summarize.py --results-dir results --runner gb200-nv_1 --ts <ts>
+    python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY"
 """
 from __future__ import annotations
 
@@ -35,82 +37,113 @@ def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[d
         except (json.JSONDecodeError, OSError):
             continue
         if d.get("family") in ("nccl", "moe"):
-            d["_base"] = base
             docs.append(d)
     return docs
 
 
-def _peak_busbw(rows: list[dict]) -> float:
+def _peak_busbw(rows):
     return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
 
 
-def _min_lat(rows: list[dict]) -> float:
+def _min_lat(rows):
     vals = [r["out_of_place"]["time_us"] for r in rows
             if r.get("out_of_place", {}).get("time_us") is not None]
     return min(vals) if vals else float("nan")
 
 
-def main() -> int:
-    ap = argparse.ArgumentParser(description="CollectiveX result summary table")
-    ap.add_argument("--results-dir", default="results")
-    ap.add_argument("--runner", default=None)
-    ap.add_argument("--ts", default=None)
-    args = ap.parse_args()
+def _fnum(x, fmt):
+    return format(x, fmt) if isinstance(x, (int, float)) else "—"
 
-    docs = load_results(args.results_dir, args.runner, args.ts)
-    nccl = [d for d in docs if d["family"] == "nccl"]
-    moe = [d for d in docs if d["family"] == "moe"]
 
+def render_plain(nccl, moe, n_valid, total) -> str:
+    out = []
     hdr = "CollectiveX results"
-    if docs:
-        d0 = docs[0]
-        hdr += (f" — runner={d0.get('runner')} topology={d0.get('topology_class')}"
-                f" transport={d0.get('transport')}")
-    print("\n" + "=" * len(hdr))
-    print(hdr)
-    print("=" * len(hdr))
+    if nccl or moe:
+        d0 = (nccl + moe)[0]
+        hdr += f" — runner={d0.get('runner')} topology={d0.get('topology_class')} transport={d0.get('transport')}"
+    out += ["=" * len(hdr), hdr, "=" * len(hdr)]
+    if nccl:
+        out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):")
+        out.append(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}")
+        for d in sorted(nccl, key=lambda x: x["op"]):
+            rows = d.get("rows", [])
+            avg = (d.get("summary") or {}).get("avg_busbw_gbps")
+            out.append(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
+                       f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
+    if moe:
+        out.append("\nMoE / DeepEP dispatch+combine:")
+        out.append(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13}  correct")
+        for d in sorted(moe, key=lambda x: x.get("backend", "")):
+            m, c = d.get("metrics", {}), d.get("correctness", {})
+            tps = m.get("tokens_per_second")
+            out.append(f"  {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}"
+                       f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}"
+                       f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}"
+                       f"{(tps if tps is not None else float('nan')):>13.3e}   {c.get('passed')}")
+    out.append(f"\n{n_valid}/{total} results valid.")
+    return "\n".join(out)
+
 
-    n_valid = 0
+def _emoji(status) -> str:
+    return "✅ valid" if status == "valid" else f"❌ {status}"
 
+
+def render_markdown(nccl, moe, n_valid, total) -> str:
+    out = []
+    if nccl or moe:
+        d0 = (nccl + moe)[0]
+        out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}")
     if nccl:
-        ws = nccl[0].get("world_size")
-        print(f"\nNCCL primitives (world={ws}, dtype={nccl[0].get('dtype')}):")
-        print(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}")
-        print(f"  {'':<16}{'':<9}{'(GB/s)':>12}{'(us)':>10}{'(GB/s)':>11}")
+        out.append(f"\n### NCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
+        out.append("| op | status | peak busbw (GB/s) | min lat (µs) | avg busbw (GB/s) |")
+        out.append("|---|---|--:|--:|--:|")
         for d in sorted(nccl, key=lambda x: x["op"]):
             rows = d.get("rows", [])
-            n_valid += d.get("status") == "valid"
             avg = (d.get("summary") or {}).get("avg_busbw_gbps")
-            print(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
-                  f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
-
+            out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | "
+                       f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |")
     if moe:
-        print("\nMoE / DeepEP dispatch+combine:")
-        print(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}"
-              f"{'disp_p50':>10}{'tokens/s':>13}{'  correct'}")
-        print(f"  {'':<10}{'':<8}{'':<9}{'(us)':>9}{'(us)':>9}{'(us)':>10}{'':>13}")
+        out.append("\n### MoE / DeepEP dispatch+combine\n")
+        out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |")
+        out.append("|---|---|---|--:|--:|--:|--:|:--:|")
         for d in sorted(moe, key=lambda x: x.get("backend", "")):
-            m = d.get("metrics", {})
-            c = d.get("correctness", {})
-            n_valid += d.get("status") == "valid"
-            tps = m.get("tokens_per_second")
-            print(f"  {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}"
-                  f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}"
-                  f"{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}"
-                  f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}"
-                  f"{(tps if tps is not None else float('nan')):>13.3e}"
-                  f"   {c.get('passed')}")
+            m, c = d.get("metrics", {}), d.get("correctness", {})
+            out.append(f"| `{d.get('backend')}` | {d.get('mode')} | {_emoji(d.get('status'))} | "
+                       f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | "
+                       f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | "
+                       f"{'✅' if c.get('passed') else '❌'} |")
+    badge = "✅" if (total and n_valid == total) else "⚠️"
+    out.append(f"\n{badge} **{n_valid}/{total} results valid.**")
+    if not total:
+        out.append("\n> No result files found — the benchmark produced nothing.")
+    return "\n".join(out)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX result summary")
+    ap.add_argument("--results-dir", default="results")
+    ap.add_argument("--runner", default=None)
+    ap.add_argument("--ts", default=None)
+    ap.add_argument("--markdown", action="store_true",
+                    help="emit GitHub job-summary markdown (reporting only; always exits 0)")
+    args = ap.parse_args()
 
+    docs = load_results(args.results_dir, args.runner, args.ts)
+    nccl = [d for d in docs if d["family"] == "nccl"]
+    moe = [d for d in docs if d["family"] == "moe"]
     total = len(docs)
-    print(f"\n{n_valid}/{total} results valid.\n")
+    n_valid = sum(d.get("status") == "valid" for d in docs)
+
+    if args.markdown:
+        print(render_markdown(nccl, moe, n_valid, total))
+        return 0  # reporting step — never fail the job here
+
+    print(render_plain(nccl, moe, n_valid, total))
     if total == 0:
-        print("ERROR: no result files found to summarize — benchmark produced nothing.")
-        return 1
-    if n_valid == 0:
-        print("ERROR: no valid results — failing the job.")
+        print("ERROR: no result files found — benchmark produced nothing.")
         return 1
     if n_valid < total:
-        print(f"WARNING: {total - n_valid} result(s) invalid.")
+        print(f"ERROR: {total - n_valid} result(s) invalid — failing the job.")
         return 1
     return 0
 

From be9cc91cd4e083189afcf1493e6d4975c59121c8 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:13:56 +0800
Subject: [PATCH 06/17] CollectiveX: add MI355X / MoRI EP path
 (dispatch+combine)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First AMD / cross-vendor reach, scaffolded ahead of Milestone 1:

- run_mori.py: MoRI dispatch+combine (normal mode), correctness-gated,
  mirroring ROCm/mori's dispatch_combine example — int32 routing indices,
  (n,0) fp8 scales, the zero-copy registered-combine-input-buffer staging
  step, and expected = input x (#unique destination ranks). Emits the same
  flat JSON shape (family=moe, backend=mori) with CUDA-event timing.
- launchers/launch_mi355x-amds.sh: AMD adapter — partition compute, no
  account, --cpus-per-task=128, node-local /var/lib/squash imported via srun
  on the allocated node, --container-writable --container-remap-root, forces
  CX_BENCH=mori, mounts the (compute-visible) checkout at /ix.
- launchers/run_in_container.sh: run_mori_suite + mori case (nccl|deepep|mori|all).
- launchers/common.sh: ROCm MoRI image (rocm/sgl-dev:...-mori-0227-2) in
  cx_default_image for mi355x*/mi350x*/mi325x*/mi300x*.
- workflow: mi355x sku + mori benchmark options for workflow_dispatch.
- docs: CONTAINERS.md AMD section, README files/run/risks, plan.md status.

Not yet hardware-validated (no MI355X access) — MoRI's Python API is
version-sensitive (marked ADAPT HERE); the first runner job is the
validation, as GB200 was for DeepEP. The ROCm image isn't digest-pinned yet.
---
 .../workflows/collectivex-experimental.yml    |   5 +-
 experimental/CollectiveX/CONTAINERS.md        |  12 +
 experimental/CollectiveX/README.md            |  25 +-
 experimental/CollectiveX/launchers/common.sh  |   7 +
 .../launchers/launch_mi355x-amds.sh           |  91 +++++++
 .../CollectiveX/launchers/run_in_container.sh |  24 +-
 experimental/CollectiveX/plan.md              |   3 +-
 experimental/CollectiveX/run_mori.py          | 254 ++++++++++++++++++
 8 files changed, 409 insertions(+), 12 deletions(-)
 create mode 100644 experimental/CollectiveX/launchers/launch_mi355x-amds.sh
 create mode 100644 experimental/CollectiveX/run_mori.py

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index c63b56635..c98646efe 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -22,12 +22,13 @@ on:
         description: Self-hosted runner pool (must have a CollectiveX launcher)
         type: choice
         default: gb200
-        options: [gb200, b200-dgxc, b200-multinode]
+        options: [gb200, b200-dgxc, b200-multinode, mi355x]
       benchmark:
+        # mori runs only on mi355x; nccl/deepep/all on the NVIDIA SKUs.
         description: Which benchmark to run
         type: choice
         default: nccl
-        options: [nccl, deepep, all]
+        options: [nccl, deepep, mori, all]
       ops:
         description: NCCL ops (space-separated); blank = default set
         type: string
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 3aff25194..1c82e0f66 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -39,6 +39,18 @@ If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch
 
 Select via `CX_IMAGE=…@sha256:…` on the launch script.
 
+## AMD container (MI355X) — MoRI EP
+
+AMD CDNA4 cannot run the CUDA multi-arch image; MI355X uses a ROCm image that
+bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image`
+for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
+
+- **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image.
+- **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
+- **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
+- **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up.
+- **NOT yet validated on hardware** (no MI355X access at authoring). Treat the first runner job as the validation, exactly as `run_deepep.py` was on GB200. Likely first-run touch-ups: MoRI Python API signatures (`EpDispatchCombineConfig` kwargs, `dispatch`/`combine`/`get_registered_combine_input_buffer`), then fill a version table here (ROCm, torch, RCCL, MoRI commit).
+
 ## Cluster access / QOS
 
 - **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account.
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 606eeb395..ac489f541 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -17,10 +17,11 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 | `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
 | `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
 | `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) |
+| `run_mori.py` | MoRI (AMD) dispatch+combine, normal mode, correctness-gated (torch + MoRI) |
 | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
 | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
-| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/all) |
-| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL) |
+| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI) |
 | `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
 | `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
 | `tests/fixtures/` | captured nccl-tests output for offline parser checks |
@@ -31,9 +32,10 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 
 - **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle
   capacity; never auto-contends with the B200 serving sweep).
-- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode),
-  `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's
-  self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`.
+- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode /
+  mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops,
+  sizes, ngpus. Lands on that SKU's self-hosted runner and runs
+  `launch_${RUNNER_NAME%%_*}.sh`.
 
 Each job renders a results table to the **GitHub Actions job summary** (via
 `summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
@@ -47,9 +49,10 @@ bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB2
 CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
 bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
 bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
+bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh             # MI355X 8× XGMI, MoRI EP (AMD; forces CX_BENCH=mori)
 ```
 
-Knobs: `CX_BENCH` (nccl|deepep|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
+Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
 `CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
 staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
 nothing). Results land in `experimental/CollectiveX/results/`.
@@ -78,6 +81,10 @@ DeepSeek-V4 fallback images.
   missing) → `srun --container-image=… --container-mounts=<repo>:/ix` → in-container
   `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account
   `benchmark`.
+- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition
+  `compute`, no account, pyxis `--container-writable --container-remap-root`, and a
+  **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node
+  (not the login node). Workspace is bind-mounted directly (no `CX_STAGE_DIR`).
 - Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in
   `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node
   adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`).
@@ -97,6 +104,12 @@ DeepSeek-V4 fallback images.
   it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
   `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against
   the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
+- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **scaffolded, not yet
+  run on hardware** (no MI355X access). It mirrors `ROCm/mori`'s dispatch/combine
+  example — config + the `get_registered_combine_input_buffer` zero-copy path,
+  correctness `expected = input × (#unique destination ranks)`. The API is
+  version-sensitive (`ADAPT HERE`), so the first runner job is the validation, like
+  GB200 was for DeepEP; the AMD ROCm image isn't digest-pinned yet.
 - **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
   compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
   or srt-slurm. CX_BENCH=nccl only for now.
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index d8d5749eb..7d63dfdc8 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -24,8 +24,15 @@ CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca5
 # DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.)
 CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
 
+# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and
+# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the
+# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest-
+# pinned yet — pin once validated on the runner. See CONTAINERS.md.
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+
 cx_default_image() {
   case "$1" in
+    mi355x*|mi350x*|mi325x*|mi300x*) echo "$CX_IMAGE_AMD_MORI" ;;
     b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
     *) cx_die "no default image for runner prefix: $1" ;;
   esac
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
new file mode 100644
index 000000000..f6901f7d4
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine.
+#
+# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from
+# the real runners/launch_mi355x-amds.sh:
+#   * partition `compute`, no --account (cluster default), --cpus-per-task=128,
+#     and known-bad nodes excluded;
+#   * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on
+#     the allocated node (not on the login node like the shared-FS NVIDIA path);
+#   * pyxis flags --container-writable --container-remap-root for the ROCm image.
+# MoRI is the only AMD backend wired (CX_BENCH=mori); rccl-tests primitives are a
+# follow-up.
+#
+# !!! NOT yet validated on hardware (no MI355X cluster access at authoring time).
+# Treat the first on-runner run as validation — like run_deepep.py was on GB200.
+#
+# Run from inside the InferenceX checkout on the MI355X login node:
+#     bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+#
+# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(30) CX_IMAGE
+#   CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_DRYRUN(0)
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=common.sh
+source "$HERE/common.sh"
+
+RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}"
+PARTITION="${CX_PARTITION:-compute}"
+NGPUS="${CX_NGPUS:-8}"
+TIME_MIN="${CX_TIME:-30}"
+IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}"
+SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}"   # node-local on MI355X
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+
+# MoRI is the only AMD backend wired today; force it.
+if [ "${CX_BENCH:-mori}" != "mori" ]; then
+  cx_log "mi355x: CX_BENCH='${CX_BENCH}' not supported on AMD yet; using mori"
+fi
+export CX_BENCH=mori
+export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi"
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
+
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=mori image=$IMAGE"
+# AMD workspace is compute-visible (the serving launcher bind-mounts it directly),
+# so no staging; the node-local squash is handled via srun below.
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+SQUASH_FILE="$SQUASH_DIR/$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g').sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+cx_log "squash(node-local)=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+
+if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
+command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
+
+salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \
+       --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+cx_log "JOB_ID=$JOB_ID"
+trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+
+# Clear stray containers, then enroot-import to the node-local squash (flock,
+# </dev/null so a missing token can't hang). Both run on the allocated node.
+# shellcheck disable=SC2016  # $(...) must expand on the remote node, not here
+srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
+srun --jobid="$JOB_ID" bash -c "
+  exec 9>\"$LOCK_FILE\"
+  flock -w 900 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
+  if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
+    echo 'squash present: $SQUASH_FILE'
+  else
+    rm -f \"$SQUASH_FILE\"
+    enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" </dev/null
+  fi
+"
+
+srun --jobid="$JOB_ID" \
+  --container-image="$SQUASH_FILE" \
+  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+  --container-writable --container-remap-root --no-container-mount-home \
+  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+  --no-container-entrypoint --export=ALL \
+  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
index cde27ac1c..c1cf532e9 100644
--- a/experimental/CollectiveX/launchers/run_in_container.sh
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -8,9 +8,10 @@
 # results/.
 #
 # Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
-# Selector:        CX_BENCH = nccl | deepep | all          (default nccl)
+# Selector:        CX_BENCH = nccl | deepep | mori | all    (default nccl)
+#                  (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.)
 # NCCL knobs:      CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
-# DeepEP knobs:    CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE
+# EP knobs (DeepEP/MoRI): CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS; DeepEP-only: CX_DISPATCH_DTYPE
 set -euo pipefail
 
 cd /ix/experimental/CollectiveX
@@ -67,12 +68,29 @@ run_deepep_suite() {
     || { cx_log "WARN: deepep run failed"; return 1; }
 }
 
+run_mori_suite() {
+  # MoRI (AMD ROCm EP), bundled in the AMD MoRI image. If absent this is a
+  # failure (MoRI is not rebuildable here), not a silent skip. Single-node
+  # 8x MI355X over XGMI; torch.cuda maps onto ROCm/HIP.
+  if ! python3 -c "import mori" 2>/dev/null; then
+    cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori"
+    return 1
+  fi
+  torchrun --nproc_per_node="$CX_NGPUS" run_mori.py \
+    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \
+    --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
+    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_mori_${CX_TS}.json" \
+    || { cx_log "WARN: mori run failed"; return 1; }
+}
+
 rc=0
 case "$CX_BENCH" in
   nccl)   run_nccl_suite || rc=1 ;;
   deepep) run_deepep_suite || rc=1 ;;
+  mori)   run_mori_suite || rc=1 ;;
   all)    run_nccl_suite || rc=1; run_deepep_suite || rc=1 ;;
-  *)      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;;
+  *)      cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|all)" ;;
 esac
 
 # Summary table for the log; also fails the job if no valid results were produced.
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
index 6ceb512ef..ced877dd8 100644
--- a/experimental/CollectiveX/plan.md
+++ b/experimental/CollectiveX/plan.md
@@ -29,8 +29,9 @@ Existing public benchmarks don't offer trustworthy, like-for-like collective/EP
 The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that:
 
 - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
-- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`.
+- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`.
 - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
+- **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP.
 
 This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
 
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
new file mode 100644
index 000000000..d4d0297ef
--- /dev/null
+++ b/experimental/CollectiveX/run_mori.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — MoRI (AMD) MoE dispatch+combine, normal mode.
+
+AMD counterpart to run_deepep.py, using ROCm MoRI's EpDispatchCombine op. One
+decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
+emitting the same flat-JSON shape (family=moe, backend=mori).
+
+  !!! MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block
+  below follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py
+  and is marked "ADAPT HERE" — validate the signatures against the MoRI build in
+  the image (rocm/sgl-dev:...-mori-...) and record its commit. This file has NOT
+  been run on MI355X yet (no cluster access at authoring time); treat the first
+  on-runner run as the validation, exactly as run_deepep.py was for GB200.
+
+Launch (one process per GPU), e.g. single-node 8x MI355X:
+    torchrun --nproc_per_node=8 run_mori.py \\
+        --runner mi355x-amds --topology-class mi355x-xgmi --transport xgmi \\
+        --env-json results/env.json --out results/mi355x_mori.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1
+MEASUREMENT_CONTRACT = "mori-normal-v1"
+
+
+def _percentile(xs: list[float], q: float) -> float:
+    if not xs:
+        return float("nan")
+    s = sorted(xs)
+    i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1)))))
+    return s[i]
+
+
+def comparison_key(meta: dict) -> str:
+    parts = [
+        meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]),
+        str(meta["nodes"]), meta["topology_class"], meta["comparison_class"],
+        meta["measurement_contract"], str(meta["shape"]),
+    ]
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX MoRI dispatch+combine (normal mode)")
+    ap.add_argument("--tokens-per-rank", type=int, default=64)
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256)
+    ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"])
+    ap.add_argument("--seed", type=int, default=67)
+    ap.add_argument("--warmup", type=int, default=20)
+    ap.add_argument("--iters", type=int, default=200)
+    ap.add_argument("--trials", type=int, default=3)
+    ap.add_argument("--block-num", type=int, default=int(os.environ.get("CX_MORI_BLOCK_NUM", "80")))
+    ap.add_argument("--dispatch-warps", type=int, default=int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16")))
+    ap.add_argument("--combine-warps", type=int, default=int(os.environ.get("CX_MORI_COMBINE_WARPS", "8")))
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--comparison-class", default="standardized")
+    ap.add_argument("--mori-commit", default=os.environ.get("MORI_COMMIT", "unknown"))
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+    try:
+        import mori  # type: ignore
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: mori import failed — needs the AMD MoRI image. {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    if world_size < 1 or args.experts % world_size != 0:  # experts are split evenly across ranks (experts_per_rank below)
+        if rank == 0:  # report once instead of once per rank
+            print(f"ERROR: experts ({args.experts}) must be divisible by world_size ({world_size})", file=sys.stderr)
+        return 2
+    experts_per_rank = args.experts // world_size
+    torch.manual_seed(args.seed + rank)
+
+    # ===================== ADAPT HERE (MoRI API) =========================
+    # init torch.distributed + MoRI shmem (per the MoRI dispatch/combine test).
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+    if not dist.is_initialized():
+        dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
+                                world_size=world_size, device_id=device)
+    world_group = torch.distributed.group.WORLD
+    torch._C._distributed_c10d._register_process_group("default", world_group)
+    mori.shmem.shmem_torch_process_group_init("default")
+
+    n = args.tokens_per_rank
+    H = args.hidden
+    topk = args.topk
+    config = mori.ops.EpDispatchCombineConfig(
+        data_type=torch.bfloat16,
+        rank=rank,
+        world_size=world_size,
+        hidden_dim=H,
+        scale_dim=0,
+        scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+        max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+        max_num_inp_token_per_rank=max(4096, n),
+        num_experts_per_rank=experts_per_rank,
+        num_experts_per_token=topk,
+        use_external_inp_buf=False,
+        quant_type="none",
+    )
+    op = mori.ops.EpDispatchCombineOp(config)
+
+    # Routing: each token -> topk distinct experts in [0, experts). MoRI expects
+    # INT32 expert indices, and a real (n, scale_dim) fp8 scales tensor even when
+    # scale_dim==0 (an (n,0) tensor) — not None (see the reference test).
+    x = torch.randn((n, H), dtype=torch.bfloat16, device=device)
+    indices = torch.stack([torch.randperm(args.experts, device=device)[:topk] for _ in range(n)]).to(torch.int32)
+    weights = torch.rand((n, topk), dtype=torch.float32, device=device)
+    scales = torch.empty((n, 0), dtype=torch.float8_e4m3fnuz, device=device)
+
+    def run_once():
+        (dispatch_output, dispatch_weights, _dispatch_scales,
+         dispatch_indices, recv_num) = op.dispatch(
+            x, weights, scales, indices,
+            block_num=args.block_num, warp_per_block=args.dispatch_warps)
+        # Zero-copy mode (use_external_inp_buf=False): combine reads from MoRI's
+        # registered combine-input buffer, so stage the dispatched rows into it
+        # first. (In a real MoE the expert FFN writes its outputs here; with no
+        # expert compute we copy the dispatched activations straight through.)
+        total_recv = int(recv_num[0].item())
+        combine_input = dispatch_output.to(torch.bfloat16)
+        combine_buf = op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=combine_input.size(1))
+        combine_buf[:total_recv, :].copy_(combine_input[:total_recv, :])
+        combined, _combined_w = op.combine(
+            combine_input, dispatch_weights, dispatch_indices,
+            block_num=args.block_num, warp_per_block=args.combine_warps)
+        return combined, recv_num
+    # =====================================================================
+
+    # ---- correctness gate ----
+    combined, recv_num = run_once()
+    torch.cuda.synchronize()
+    # MoRI combine sums one copy per destination RANK, so combined[i] ≈
+    # input[i] * (#unique destination ranks among the token's topk experts)
+    # (see ROCm/mori .../test_dispatch_combine.py).
+    pes = indices.long() // experts_per_rank
+    unique_pes = torch.tensor(
+        [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32
+    ).unsqueeze(1)
+    expected = x.float() * unique_pes
+    max_abs = (combined.float() - expected).abs().max().item()
+    max_rel = max_abs / (expected.abs().max().item() + 1e-6)
+    # Validated tolerance from the reference test (bf16 + up-to-topk summation).
+    combine_ok = bool(torch.allclose(combined.float(), expected.float(), atol=1e-2, rtol=1e-2))
+    recv_ok = bool(int(recv_num[0].item()) > 0) if recv_num is not None else True
+    correct = bool(combine_ok and recv_ok)
+
+    def time_us(fn, warmup, iters) -> list[float]:
+        for _ in range(warmup):
+            fn()
+        torch.cuda.synchronize()
+        out = []
+        for _ in range(iters):
+            s = torch.cuda.Event(enable_timing=True)
+            e = torch.cuda.Event(enable_timing=True)
+            s.record(); fn(); e.record(); torch.cuda.synchronize()
+            out.append(s.elapsed_time(e) * 1000.0)
+        return out
+
+    def dispatch_only():
+        op.dispatch(x, weights, scales, indices,
+                    block_num=args.block_num, warp_per_block=args.dispatch_warps)
+
+    trials = []
+    for _ in range(args.trials):
+        rt = time_us(run_once, args.warmup, args.iters)
+        dp = time_us(dispatch_only, args.warmup, args.iters)
+        trials.append({"roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99),
+                       "dispatch_us_p50": _percentile(dp, 50)})
+
+    local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
+    t = torch.tensor([local_rt_p50], device=device)
+    dist.all_reduce(t, op=dist.ReduceOp.MAX)
+    slowest_rank_us = float(t.item())
+
+    if rank == 0:
+        shape = {"tokens_per_rank": n, "hidden": H, "topk": topk, "experts": args.experts,
+                 "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype}
+        meta = {"op": "dispatch-combine", "backend": "mori", "mode": "normal",
+                "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+                "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+                "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape}
+        rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
+        tokens_total = n * world_size
+        env = None
+        if args.env_json and os.path.exists(args.env_json):
+            with open(args.env_json) as fh:
+                env = json.load(fh)
+        doc = {
+            "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "run_mori.py",
+            "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+            "runner": args.runner, "transport": args.transport,
+            "status": "valid" if correct else "invalid",
+            "comparison_key": comparison_key(meta),
+            "backend_provenance": {"mori_commit": args.mori_commit,
+                                   "block_num": args.block_num,
+                                   "dispatch_warps": args.dispatch_warps,
+                                   "combine_warps": args.combine_warps},
+            **meta,
+            "correctness": {"passed": correct, "combine_within_tol": combine_ok,
+                            "recv_nonzero": recv_ok, "max_abs_error": max_abs, "max_rel_error": max_rel},
+            "metrics": {
+                "roundtrip_us_p50": rt_p50,
+                "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials),
+                "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials),
+                "slowest_rank_roundtrip_us": slowest_rank_us,
+                "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None,
+            },
+            "trials": trials, "environment": env,
+        }
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us "
+              f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}")
+
+    try:
+        mori.shmem.shmem_finalize()
+    except Exception:
+        pass
+    dist.barrier()
+    dist.destroy_process_group()
+    return 0 if correct else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From d8ee9bf858a3471f2899276fa1a22aedfce8f32a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:25:10 +0800
Subject: [PATCH 07/17] CollectiveX: run MI355X MoRI on push; align launcher
 with serving script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- workflow: replace the on:push GB200 NCCL smoke with the MI355X MoRI
  dispatch/combine run (runs-on: mi355x, CX_BENCH=mori), and name the job
  "CollectiveX Experimental" (no longer "smoke"). GB200/B200 NCCL + DeepEP
  remain on workflow_dispatch.
- launch_mi355x-amds.sh: adapt more faithfully to runners/launch_mi355x-amds.sh
  — squeue by job-name only (no -u), flock -w 600, and clear ROCm gpucore.*
  dumps after the run so the next checkout is clean. Bump default CX_TIME to 60
  for a cold ROCm-image import.
- summarize.py: drop the "N/N results valid." footer from both the job-summary
  (markdown) and plain output; the failure gate still reports invalid results.
  Relabel the MoE section "MoE dispatch+combine (DeepEP / MoRI)".
- docs: README/plan describe push -> MI355X MoRI.
---
 .../workflows/collectivex-experimental.yml    | 33 +++++++++----------
 experimental/CollectiveX/README.md            |  4 +--
 .../launchers/launch_mi355x-amds.sh           | 11 ++++---
 experimental/CollectiveX/plan.md              |  2 +-
 experimental/CollectiveX/summarize.py         |  7 ++--
 5 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index c98646efe..fcfdcb88e 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -1,11 +1,11 @@
 name: CollectiveX Experimental
 
 # Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
-# Push to the feature branch runs a small GB200 NCCL smoke (no merge to main
-# needed); workflow_dispatch runs a chosen SKU + benchmark (the lane for B200,
-# DeepEP, and larger sweeps). Each job lands on the SKU's self-hosted runner and
-# invokes that SKU's launch script — the same launch_${RUNNER_NAME%%_*}.sh
-# convention the serving benchmarks use.
+# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no
+# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane
+# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's
+# self-hosted runner and invokes that SKU's launch script — the same
+# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use.
 
 on:
   push:
@@ -54,23 +54,20 @@ permissions:
   contents: read
 
 jobs:
-  # Push -> short GB200 NCCL smoke (idle capacity; never auto-contends with the
-  # B200 serving sweep). GB200 runner workspace is staged to compute-visible
-  # Lustre via CX_STAGE_DIR.
-  smoke:
+  # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and
+  # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute-
+  # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs.
+  experimental:
+    name: CollectiveX Experimental
     if: github.event_name == 'push'
-    runs-on: gb200
-    timeout-minutes: 60
+    runs-on: mi355x
+    timeout-minutes: 90
     env:
-      CX_BENCH: nccl
-      CX_NGPUS: '4'
-      CX_MAX_BYTES: 1G
-      CX_TIME: '20'
-      CX_STAGE_DIR: /mnt/lustre01/users-public/sa-shared/cx-stage
+      CX_BENCH: mori
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
-      - name: Launch GB200 NCCL smoke
+      - name: Launch MI355X MoRI
         env:
           RUNNER_NAME: ${{ runner.name }}
         run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
@@ -81,7 +78,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: collectivex_smoke_gb200_${{ github.run_id }}
+          name: collectivex_mi355x_mori_${{ github.run_id }}
           path: experimental/CollectiveX/results/*.json
           if-no-files-found: warn
 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index ac489f541..11bbd8aaa 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -30,8 +30,8 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 
 ### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
 
-- **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle
-  capacity; never auto-contends with the B200 serving sweep).
+- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine
+  run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner).
 - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode /
   mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops,
   sizes, ngpus. Lands on that SKU's self-hosted runner and runs
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
index f6901f7d4..f1117229c 100644
--- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -17,7 +17,7 @@
 # Run from inside the InferenceX checkout on the MI355X login node:
 #     bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh
 #
-# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(30) CX_IMAGE
+# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE
 #   CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_DRYRUN(0)
 set -euo pipefail
 
@@ -30,7 +30,7 @@ source "$HERE/common.sh"
 RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}"
 PARTITION="${CX_PARTITION:-compute}"
 NGPUS="${CX_NGPUS:-8}"
-TIME_MIN="${CX_TIME:-30}"
+TIME_MIN="${CX_TIME:-60}"   # generous: a cold enroot import of the large ROCm image
 IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}"
 SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}"   # node-local on MI355X
 EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"
@@ -59,7 +59,7 @@ command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm lo
 
 salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \
        --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
-JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+JOB_ID="$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)"
 [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
 cx_log "JOB_ID=$JOB_ID"
 trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
@@ -70,7 +70,7 @@ trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
 srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
 srun --jobid="$JOB_ID" bash -c "
   exec 9>\"$LOCK_FILE\"
-  flock -w 900 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
+  flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
   if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
     echo 'squash present: $SQUASH_FILE'
   else
@@ -88,4 +88,7 @@ srun --jobid="$JOB_ID" \
   bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
 
 cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the
+# next checkout on this runner is clean (mirrors the serving launcher).
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
 cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
index ced877dd8..7f1e19d64 100644
--- a/experimental/CollectiveX/plan.md
+++ b/experimental/CollectiveX/plan.md
@@ -30,7 +30,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64)
 
 - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
 - **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`.
-- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
+- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
 - **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP.
 
 This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
index 8d81b13ee..dd51f7c73 100644
--- a/experimental/CollectiveX/summarize.py
+++ b/experimental/CollectiveX/summarize.py
@@ -71,7 +71,7 @@ def render_plain(nccl, moe, n_valid, total) -> str:
             out.append(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
                        f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
     if moe:
-        out.append("\nMoE / DeepEP dispatch+combine:")
+        out.append("\nMoE dispatch+combine (DeepEP / MoRI):")
         out.append(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13}  correct")
         for d in sorted(moe, key=lambda x: x.get("backend", "")):
             m, c = d.get("metrics", {}), d.get("correctness", {})
@@ -80,7 +80,6 @@ def render_plain(nccl, moe, n_valid, total) -> str:
                        f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}"
                        f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}"
                        f"{(tps if tps is not None else float('nan')):>13.3e}   {c.get('passed')}")
-    out.append(f"\n{n_valid}/{total} results valid.")
     return "\n".join(out)
 
 
@@ -103,7 +102,7 @@ def render_markdown(nccl, moe, n_valid, total) -> str:
             out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | "
                        f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |")
     if moe:
-        out.append("\n### MoE / DeepEP dispatch+combine\n")
+        out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n")
         out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |")
         out.append("|---|---|---|--:|--:|--:|--:|:--:|")
         for d in sorted(moe, key=lambda x: x.get("backend", "")):
@@ -112,8 +111,6 @@ def render_markdown(nccl, moe, n_valid, total) -> str:
                        f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | "
                        f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | "
                        f"{'✅' if c.get('passed') else '❌'} |")
-    badge = "✅" if (total and n_valid == total) else "⚠️"
-    out.append(f"\n{badge} **{n_valid}/{total} results valid.**")
     if not total:
         out.append("\n> No result files found — the benchmark produced nothing.")
     return "\n".join(out)

From ac3f1b9df26072a81dfe397c13edae75bce652a2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:37:58 +0800
Subject: [PATCH 08/17] CollectiveX: size MoRI symmetric heap (first MI355X run
 hit the 2 GiB default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First MI355X run reached the MoRI dispatch kernel — salloc, ROCm-image import,
mount, torchrun, 8-rank Gloo + shmem init, and EpDispatchCombineConfig/op/dispatch
all worked, confirming the API signatures. It OOM'd MoRI's default 2 GiB static
symmetric heap (hidden=7168 dispatch/combine buffers across 8 ranks request
~0.9 GiB each).

run_mori.py now sets MORI_SHMEM_HEAP_SIZE before `import mori` (default 16 GiB,
override CX_MORI_HEAP_BYTES). Docstring + CONTAINERS.md record the finding;
correctness/timing validated by the heap-sized re-run.
---
 experimental/CollectiveX/CONTAINERS.md |  2 +-
 experimental/CollectiveX/run_mori.py   | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 1c82e0f66..ee4114cff 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -49,7 +49,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
 - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
 - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
 - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up.
-- **NOT yet validated on hardware** (no MI355X access at authoring). Treat the first runner job as the validation, exactly as `run_deepep.py` was on GB200. Likely first-run touch-ups: MoRI Python API signatures (`EpDispatchCombineConfig` kwargs, `dispatch`/`combine`/`get_registered_combine_input_buffer`), then fill a version table here (ROCm, torch, RCCL, MoRI commit).
+- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default 16 GiB; override `CX_MORI_HEAP_BYTES`). Correctness + timing are validated by the heap-sized re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit).
 
 ## Cluster access / QOS
 
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
index d4d0297ef..dc724d398 100644
--- a/experimental/CollectiveX/run_mori.py
+++ b/experimental/CollectiveX/run_mori.py
@@ -5,12 +5,12 @@
 decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
 emitting the same flat-JSON shape (family=moe, backend=mori).
 
-  !!! MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block
-  below follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py
-  and is marked "ADAPT HERE" — validate the signatures against the MoRI build in
-  the image (rocm/sgl-dev:...-mori-...) and record its commit. This file has NOT
-  been run on MI355X yet (no cluster access at authoring time); treat the first
-  on-runner run as the validation, exactly as run_deepep.py was for GB200.
+  MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block below
+  follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py. The
+  first MI355X run (image rocm/sgl-dev:...-mori-0227-2) confirmed the setup +
+  config + dispatch path reach the MoRI kernel; it OOM'd the default 2 GiB
+  symmetric heap, now sized up via MORI_SHMEM_HEAP_SIZE above. The correctness
+  gate and timing are validated by the heap-sized re-run.
 
 Launch (one process per GPU), e.g. single-node 8x MI355X:
     torchrun --nproc_per_node=8 run_mori.py \\
@@ -26,6 +26,15 @@
 import os
 import sys
 
+# MoRI's symmetric-memory heap defaults to 2 GiB (static) — too small for the
+# DeepSeek hidden size (7168) across 8 ranks: the dispatch/combine buffers
+# overflow it ("Out of static heap memory ... Increase via MORI_SHMEM_HEAP_SIZE",
+# observed on the first MI355X run). Size it generously here, BEFORE `import mori`
+# (the heap is created at shmem init); MI355X HBM is ample. Layered override:
+# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_BYTES > 16 GiB default.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_BYTES", str(16 * 1024**3)))
+
 SCHEMA_VERSION = 1
 MEASUREMENT_CONTRACT = "mori-normal-v1"
 

From 46208f23b281c4c7e3bf8e91636ef845bca4b4cf Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:48:08 +0800
Subject: [PATCH 09/17] CollectiveX: set MoRI heap to 6G (16 GiB failed RDMA MR
 registration)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The heap-bump run cleared the 2 GiB OOM but then failed registering the 16 GiB
symmetric heap as an RDMA memory region (errno 22 EINVAL, size=17179869184).
ROCm/mori's reference test uses MORI_SHMEM_HEAP_SIZE="6G" single-node — big
enough for the hidden=7168 dispatch/combine buffers, small enough to register.

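Match it: default "6G" (override CX_MORI_HEAP_SIZE; this replaces
CX_MORI_HEAP_BYTES, since the value is now a size string such as "6G" rather
than a raw byte count). The rest of the config already matches the reference
(max_num_inp_token_per_rank=4096, hidden=7168, backend cpu:gloo,cuda:nccl), so
this lands on the proven single-node setup.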
---
 experimental/CollectiveX/CONTAINERS.md |  2 +-
 experimental/CollectiveX/run_mori.py   | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index ee4114cff..701656ce7 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -49,7 +49,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
 - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
 - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
 - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up.
-- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default 16 GiB; override `CX_MORI_HEAP_BYTES`). Correctness + timing are validated by the heap-sized re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit).
+- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default **`6G`**, matching MoRI's reference test; override `CX_MORI_HEAP_SIZE`). A 16 GiB heap allocated but then failed RDMA MR registration (`errno 22 EINVAL`) — 6 GiB is large enough for the hidden=7168 buffers and registers cleanly. Correctness + timing are validated by the re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit).
 
 ## Cluster access / QOS
 
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
index dc724d398..b5aaff3b8 100644
--- a/experimental/CollectiveX/run_mori.py
+++ b/experimental/CollectiveX/run_mori.py
@@ -26,14 +26,15 @@
 import os
 import sys
 
-# MoRI's symmetric-memory heap defaults to 2 GiB (static) — too small for the
-# DeepSeek hidden size (7168) across 8 ranks: the dispatch/combine buffers
-# overflow it ("Out of static heap memory ... Increase via MORI_SHMEM_HEAP_SIZE",
-# observed on the first MI355X run). Size it generously here, BEFORE `import mori`
-# (the heap is created at shmem init); MI355X HBM is ample. Layered override:
-# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_BYTES > 16 GiB default.
+# MoRI's symmetric-memory heap defaults to 2 GiB (static), too small for the
+# DeepSeek hidden size (7168) across 8 ranks (dispatch/combine buffers overflow
+# it). Set it BEFORE `import mori` (the heap is created at shmem init). Use the
+# reference test's "6G": big enough for the buffers, and small enough to
+# RDMA-register — a 16 GiB heap allocated fine but failed RDMA MR registration
+# (errno 22 EINVAL) on the first heap-bumped MI355X run. Layered override:
+# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "6G".
 os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
-                      os.environ.get("CX_MORI_HEAP_BYTES", str(16 * 1024**3)))
+                      os.environ.get("CX_MORI_HEAP_SIZE", "6G"))
 
 SCHEMA_VERSION = 1
 MEASUREMENT_CONTRACT = "mori-normal-v1"

From b62de9949d9348af732037bce2c0c51169d21f91 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 18:34:27 +0800
Subject: [PATCH 10/17] CollectiveX: MoRI MI355X validated on hardware; fix
 heap/buffer/teardown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drove run_mori.py to a correct run on 8x MI355X (on-node via salloc+srun):
dispatch+combine numerically correct (combine within tol, max_rel ~2e-3),
~85us round-trip at the decode shape. The first runs surfaced five issues,
all fixed and re-validated:

- RDMA MR ceiling: MoRI registers the WHOLE symmetric heap as one RDMA MR at
  init (even single-node; no disable-RDMA knob). The ionic_rdma NICs cap GPU
  MRs at ~4 GiB — a 6 GiB heap fails (RegisterRdmaMemoryRegion errno 22), 2 GiB
  registers. Hold heap at MORI_SHMEM_HEAP_SIZE=2G (override CX_MORI_HEAP_SIZE).
- Buffer sizing: max_num_inp_token_per_rank 4096 -> max(512, n) so the buffers
  fit the 2 GiB heap (4096 was inherited from the reference test).
- Correctness shape: combine returns the full max-token buffer; compare only
  combined[:n] against expected.
- recv count: read total_recv BEFORE combine (combine resets recv_num, which
  made recv_nonzero a false negative).
- Teardown: MoRI's shmem teardown asserts (CheckStatusValid -> SIGABRT) when the
  op is destroyed after shmem_finalize(); hard-exit after writing results.

Docs (README/plan/CONTAINERS) updated from "scaffolded" to validated, with the
fabric constraints recorded.
---
 experimental/CollectiveX/CONTAINERS.md |  7 ++-
 experimental/CollectiveX/README.md     | 13 ++---
 experimental/CollectiveX/plan.md       |  2 +-
 experimental/CollectiveX/run_mori.py   | 66 ++++++++++++++++----------
 4 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 701656ce7..52dfc3b80 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -49,7 +49,12 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
 - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
 - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
 - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up.
-- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default **`6G`**, matching MoRI's reference test; override `CX_MORI_HEAP_SIZE`). A 16 GiB heap allocated but then failed RDMA MR registration (`errno 22 EINVAL`) — 6 GiB is large enough for the hidden=7168 buffers and registers cleanly. Correctness + timing are validated by the re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit).
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`:
+  - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here.
+  - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now.
+  - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `run_mori.py` hard-exits after writing results to avoid it.
+
+  Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image.
 
 ## Cluster access / QOS
 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 11bbd8aaa..4540033b4 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -104,12 +104,13 @@ DeepSeek-V4 fallback images.
   it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
   `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against
   the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
-- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **scaffolded, not yet
-  run on hardware** (no MI355X access). It mirrors `ROCm/mori`'s dispatch/combine
-  example — config + the `get_registered_combine_input_buffer` zero-copy path,
-  correctness `expected = input × (#unique destination ranks)`. The API is
-  version-sensitive (`ADAPT HERE`), so the first runner job is the validation, like
-  GB200 was for DeepEP; the AMD ROCm image isn't digest-pinned yet.
+- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **validated on
+  hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip).
+  It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer`
+  zero-copy path, `expected = input × #unique-destination-ranks`). Three
+  ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap
+  (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a
+  hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet.
 - **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a
   compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container
   or srt-slurm. CX_BENCH=nccl only for now.
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
index 7f1e19d64..d39f96967 100644
--- a/experimental/CollectiveX/plan.md
+++ b/experimental/CollectiveX/plan.md
@@ -31,7 +31,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64)
 - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
 - **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`.
 - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
-- **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP.
+- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `run_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`).
 
 This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
 
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
index b5aaff3b8..f99775427 100644
--- a/experimental/CollectiveX/run_mori.py
+++ b/experimental/CollectiveX/run_mori.py
@@ -5,12 +5,14 @@
 decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
 emitting the same flat-JSON shape (family=moe, backend=mori).
 
-  MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block below
-  follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py. The
-  first MI355X run (image rocm/sgl-dev:...-mori-0227-2) confirmed the setup +
-  config + dispatch path reach the MoRI kernel; it OOM'd the default 2 GiB
-  symmetric heap, now sized up via MORI_SHMEM_HEAP_SIZE above. The correctness
-  gate and timing are validated by the heap-sized re-run.
+  VALIDATED on MI355X (8x, image rocm/sgl-dev:...-mori-0227-2): dispatch+combine
+  numerically correct (combine within tol, max_rel ~2e-3), ~85 us round-trip at
+  the decode shape. The config/dispatch/combine API follows ROCm/mori's reference
+  test. Three constraints on this ionic_rdma fabric are handled here: (1) MoRI
+  registers the whole symmetric heap as ONE RDMA MR and these NICs cap GPU-memory
+  MRs at ~4 GiB, so the heap is held at 2 GiB (above); (2) max_num_inp_token_per_rank
+  is bounded so the buffers fit that heap (below); (3) MoRI's shmem teardown
+  asserts after finalize, so we hard-exit after writing results (end of main).
 
 Launch (one process per GPU), e.g. single-node 8x MI355X:
     torchrun --nproc_per_node=8 run_mori.py \\
@@ -26,15 +28,15 @@
 import os
 import sys
 
-# MoRI's symmetric-memory heap defaults to 2 GiB (static), too small for the
-# DeepSeek hidden size (7168) across 8 ranks (dispatch/combine buffers overflow
-# it). Set it BEFORE `import mori` (the heap is created at shmem init). Use the
-# reference test's "6G": big enough for the buffers, and small enough to
-# RDMA-register — a 16 GiB heap allocated fine but failed RDMA MR registration
-# (errno 22 EINVAL) on the first heap-bumped MI355X run. Layered override:
-# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "6G".
+# MoRI registers the WHOLE symmetric heap as one RDMA memory region at shmem
+# init (set this BEFORE `import mori`). On the MI355X ionic_rdma NICs the GPU-
+# memory MR registration has a hard size ceiling (~4 GiB): a 6 GiB heap fails
+# (`RegisterRdmaMemoryRegion ... errno 22 EINVAL`, validated on-node), while
+# 2 GiB registers cleanly. So keep the heap at 2 GiB and instead bound the
+# buffers via max_num_inp_token_per_rank below. Layered override:
+# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "2G".
 os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
-                      os.environ.get("CX_MORI_HEAP_SIZE", "6G"))
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
 
 SCHEMA_VERSION = 1
 MEASUREMENT_CONTRACT = "mori-normal-v1"
@@ -127,7 +129,12 @@ def main() -> int:
         scale_dim=0,
         scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
         max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
-        max_num_inp_token_per_rank=max(4096, n),
+        # Sizes MoRI's symmetric buffers. The reference test uses 4096, but at
+        # hidden=7168 that overflows the registerable 2 GiB heap (see top). Bound
+        # it to the workload (decode shapes are tens of tokens/rank); 512 fits the
+        # 2 GiB heap and was validated on-node. Larger token counts may need a
+        # heap above the NIC's MR ceiling — out of reach on this fabric for now.
+        max_num_inp_token_per_rank=max(512, n),
         num_experts_per_rank=experts_per_rank,
         num_experts_per_token=topk,
         use_external_inp_buf=False,
@@ -160,25 +167,30 @@ def run_once():
         combined, _combined_w = op.combine(
             combine_input, dispatch_weights, dispatch_indices,
             block_num=args.block_num, warp_per_block=args.combine_warps)
-        return combined, recv_num
+        # Return total_recv (read BEFORE combine — combine resets recv_num), not
+        # the tensor: reading recv_num[0] after combine yields 0 (false negative).
+        return combined, total_recv
     # =====================================================================
 
     # ---- correctness gate ----
-    combined, recv_num = run_once()
+    combined, total_recv = run_once()
     torch.cuda.synchronize()
     # MoRI combine sums one copy per destination RANK, so combined[i] ≈
     # input[i] * (#unique destination ranks among the token's topk experts)
-    # (see ROCm/mori .../test_dispatch_combine.py).
+    # (see ROCm/mori .../test_dispatch_combine.py). combine returns the full
+    # max_num_inp_token_per_rank-sized buffer; only the first n rows are our
+    # local input tokens, so slice to [:n] before comparing.
+    combined_valid = combined[:n].float()
     pes = indices.long() // experts_per_rank
     unique_pes = torch.tensor(
         [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32
     ).unsqueeze(1)
     expected = x.float() * unique_pes
-    max_abs = (combined.float() - expected).abs().max().item()
+    max_abs = (combined_valid - expected).abs().max().item()
     max_rel = max_abs / (expected.abs().max().item() + 1e-6)
     # Validated tolerance from the reference test (bf16 + up-to-topk summation).
-    combine_ok = bool(torch.allclose(combined.float(), expected.float(), atol=1e-2, rtol=1e-2))
-    recv_ok = bool(int(recv_num[0].item()) > 0) if recv_num is not None else True
+    combine_ok = bool(torch.allclose(combined_valid, expected.float(), atol=1e-2, rtol=1e-2))
+    recv_ok = total_recv > 0
     correct = bool(combine_ok and recv_ok)
 
     def time_us(fn, warmup, iters) -> list[float]:
@@ -251,13 +263,17 @@ def dispatch_only():
         print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us "
               f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}")
 
+    # MoRI's shmem teardown asserts when the EpDispatchCombineOp is destroyed
+    # after shmem_finalize() (CheckStatusValid abort -> SIGABRT on this build,
+    # validated on-node). The result JSON is already written above, so just sync
+    # the ranks and hard-exit, skipping the buggy finalize/destructor path.
     try:
-        mori.shmem.shmem_finalize()
+        dist.barrier()
     except Exception:
         pass
-    dist.barrier()
-    dist.destroy_process_group()
-    return 0 if correct else 1
+    sys.stdout.flush()
+    sys.stderr.flush()
+    os._exit(0 if correct else 1)
 
 
 if __name__ == "__main__":

From 481ef595a59ae616062c82dcd7ffc6d1e654dd38 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 19:15:17 +0800
Subject: [PATCH 11/17] CollectiveX: wire rccl-tests collective primitives for
 MI355X (CX_BENCH=nccl)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the AMD collective-primitive path so all_reduce/reduce_scatter/all_gather/
alltoall run on MI355X, not just MoRI:

- common.sh: cx_build_rccl_tests — clones ROCm/rccl-tests and builds with `make`
  against /opt/rocm (amdclang++/librccl). It's a nccl-tests fork producing the
  same <op>_perf binaries and output format, so run_nccl.py parses it unchanged.
  Validated building + running all 4 ops in-container on MI355X (correctness OK).
- run_in_container.sh: run_nccl_suite picks rccl-tests on ROCm (/opt/rocm or
  hipcc), nccl-tests otherwise; identical op loop + run_nccl.py invocation.
- launch_mi355x-amds.sh: honor CX_BENCH (mori default | nccl) instead of forcing
  mori; same -g N single-node 8-GPU launch.
- docs: README/CONTAINERS note the rccl path.

B200 and GB200 already have the nccl path; this makes primitives available on
all three SKUs (B200, GB200, MI355X) via workflow_dispatch.
---
 experimental/CollectiveX/CONTAINERS.md        |  2 +-
 experimental/CollectiveX/README.md            |  8 +++--
 experimental/CollectiveX/launchers/common.sh  | 30 +++++++++++++++++++
 .../launchers/launch_mi355x-amds.sh           | 18 ++++++-----
 .../CollectiveX/launchers/run_in_container.sh | 14 +++++++--
 5 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 52dfc3b80..1d84bffd5 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -48,7 +48,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
 - **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image.
 - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
 - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
-- **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up.
+- **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `<op>_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged).
 - **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`:
   - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here.
   - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now.
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 4540033b4..5cea3b15b 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -21,7 +21,7 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
 | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
 | `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) |
-| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI) |
+| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) |
 | `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
 | `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
 | `tests/fixtures/` | captured nccl-tests output for offline parser checks |
@@ -33,7 +33,8 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 - **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine
   run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner).
 - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode /
-  mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops,
+  mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl`
+  on MI355X runs rccl-tests), ops,
   sizes, ngpus. Lands on that SKU's self-hosted runner and runs
   `launch_${RUNNER_NAME%%_*}.sh`.
 
@@ -49,7 +50,8 @@ bash experimental/CollectiveX/launchers/launch_gb200-nv.sh                 # GB2
 CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild)
 bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh               # B200 8× NVLink
 bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh         # B200 2-node, cross-IB
-bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh             # MI355X 8× XGMI, MoRI EP (AMD; forces CX_BENCH=mori)
+bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh                # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default)
+CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh   # MI355X primitives via rccl-tests
 ```
 
 Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index 7d63dfdc8..10b46eb31 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -124,3 +124,33 @@ cx_build_nccl_tests() {
   [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
   echo "$dir/build"
 }
+
+# cx_build_rccl_tests <parent_dir> <mpi 0|1>  ->  echoes the build/ dir.
+# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a nccl-tests fork
+# with the SAME binary names (<op>_perf) and output format, so run_nccl.py parses it
+# unchanged. `make` defaults to ROCm at /opt/rocm (amdclang++ + librccl); validated
+# building in-container on MI355X. Override CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME
+# if the toolchain lives elsewhere. MPI=0 and MPI=1 builds use separate cache dirs.
+cx_build_rccl_tests() {
+  local parent="$1" mpi="${2:-0}" dir bin
+  case "$mpi" in 1) dir="$parent/rccl-tests-mpi" ;; *) dir="$parent/rccl-tests" ;; esac  # never reuse an MPI=0 build for MPI=1
+  bin="$dir/build/all_reduce_perf"
+  if [ -x "$bin" ]; then
+    cx_log "rccl-tests already built: $dir/build"
+    echo "$dir/build"; return 0
+  fi
+  mkdir -p "$parent"
+  if [ ! -d "$dir/.git" ]; then
+    cx_log "cloning rccl-tests -> $dir"
+    git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \
+      || cx_die "git clone rccl-tests failed"
+  fi
+  cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})"
+  make -C "$dir" -j MPI="$mpi" \
+       ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \
+       ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \
+       ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
+    || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)"
+  [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin"
+  echo "$dir/build"
+}
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
index f1117229c..5d76ee667 100644
--- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -8,8 +8,8 @@
 #   * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on
 #     the allocated node (not on the login node like the shared-FS NVIDIA path);
 #   * pyxis flags --container-writable --container-remap-root for the ROCm image.
-# MoRI is the only AMD backend wired (CX_BENCH=mori); rccl-tests primitives are a
-# follow-up.
+# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default) or nccl
+# (collective primitives via rccl-tests, the ROCm nccl-tests fork).
 #
 # !!! NOT yet validated on hardware (no MI355X cluster access at authoring time).
 # Treat the first on-runner run as validation — like run_deepep.py was on GB200.
@@ -37,16 +37,18 @@ EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"
 MOUNT_DIR=/ix
 TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
 
-# MoRI is the only AMD backend wired today; force it.
-if [ "${CX_BENCH:-mori}" != "mori" ]; then
-  cx_log "mi355x: CX_BENCH='${CX_BENCH}' not supported on AMD yet; using mori"
-fi
-export CX_BENCH=mori
+# AMD backends wired: mori (MoRI EP dispatch/combine) and nccl (collective
+# primitives via rccl-tests). Default mori; honor an explicit CX_BENCH.
+export CX_BENCH="${CX_BENCH:-mori}"
+case "$CX_BENCH" in
+  mori|nccl) ;;
+  *) cx_log "mi355x: CX_BENCH='$CX_BENCH' unsupported on AMD (want mori|nccl); using mori"; export CX_BENCH=mori ;;
+esac
 export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
 export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi"
 export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
 
-cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=mori image=$IMAGE"
+cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE"
 # AMD workspace is compute-visible (the serving launcher bind-mounts it directly),
 # so no staging; the node-local squash is handled via srun below.
 MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
index c1cf532e9..f2bb60513 100644
--- a/experimental/CollectiveX/launchers/run_in_container.sh
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -31,8 +31,16 @@ cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX
 python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS"
 
 run_nccl_suite() {
-  local build ops op sfail=0
-  build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1   # single-node: MPI=0, -g N
+  local build ops op sfail=0 impl=nccl
+  # AMD/ROCm -> rccl-tests (fork; same binaries + output, parsed by run_nccl.py);
+  # NVIDIA/CUDA -> nccl-tests. Both single-node: MPI=0, -g N.
+  if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then
+    impl=rccl
+    build="$(cx_build_rccl_tests "$PWD/.nccl-tests" 0)" || return 1
+  else
+    build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1
+  fi
+  cx_log "collective impl=$impl build=$build"
   ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
   for op in $ops; do
     if ! python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \
@@ -40,7 +48,7 @@ run_nccl_suite() {
         --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
         --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \
         --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then
-      cx_log "WARN: nccl $op failed or invalid"; sfail=1
+      cx_log "WARN: $impl $op failed or invalid"; sfail=1
     fi
   done
   return "$sfail"

From 78322de627833673d1ca65d5d039e0e5a2240e8b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 19:16:35 +0800
Subject: [PATCH 12/17] CollectiveX: key dispatch concurrency by SKU so
 B200/MI355X runs don't cancel each other

---
 .github/workflows/collectivex-experimental.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index fcfdcb88e..451c3e676 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -47,7 +47,9 @@ on:
         default: ''
 
 concurrency:
-  group: collectivex-${{ github.ref }}-${{ github.event_name }}
+  # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do
+  # not cancel each other; push has no sku input -> shares one 'push' group.
+  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}
   cancel-in-progress: true
 
 permissions:

From 2b2357322bfd9a8979272a31825b2f1fb5ce73bb Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 19:58:59 +0800
Subject: [PATCH 13/17] CollectiveX: render busbw & latency vs bytes/rank sweep
 tables in the job summary

---
 experimental/CollectiveX/summarize.py | 91 +++++++++++++++++++++++----
 1 file changed, 78 insertions(+), 13 deletions(-)

diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
index dd51f7c73..013ce3151 100644
--- a/experimental/CollectiveX/summarize.py
+++ b/experimental/CollectiveX/summarize.py
@@ -45,10 +45,72 @@ def _peak_busbw(rows):
     return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
 
 
-def _min_lat(rows):
-    vals = [r["out_of_place"]["time_us"] for r in rows
-            if r.get("out_of_place", {}).get("time_us") is not None]
-    return min(vals) if vals else float("nan")
+_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"]
+
+
+def _row_lat(r):
+    """Fastest time_us across the out-of-place/in-place results of one row, else None."""
+    timings = ((r.get(mode) or {}).get("time_us") for mode in ("out_of_place", "in_place"))
+    return min((t for t in timings if t is not None), default=None)
+
+
+def _lat_floor(rows):
+    """Latency floor: time_us at the smallest message with size_bytes > 0, else NaN.
+
+    Rows whose size_bytes is missing or 0 are ignored (0-byte rows are a ~1 us no-op).
+    """
+    sized = [row for row in rows if (row.get("size_bytes") or 0) > 0]
+    lat = _row_lat(min(sized, key=lambda row: row["size_bytes"])) if sized else None
+    return float("nan") if lat is None else lat
+
+
+def _at_size(rows, size, fn):
+    """Apply fn to the first row whose size_bytes equals size; None when no row matches."""
+    hits = (row for row in rows if row.get("size_bytes") == size)
+    match = next(hits, None)
+    return None if match is None else fn(match)
+
+
+def _fmt_bytes(b):
+    """Render b in whole GiB/MiB/KiB when it divides evenly, otherwise as plain bytes."""
+    units = ((1 << 30, "GiB"), (1 << 20, "MiB"), (1 << 10, "KiB"))
+    fits = [f"{b // unit} {tag}" for unit, tag in units if b >= unit and b % unit == 0]
+    return fits[0] if fits else f"{b} B"
+
+
+def _ops_sorted(nccl):
+    """Ops present in nccl: known ops in _OP_ORDER order first, then the rest sorted."""
+    seen = {doc.get("op") for doc in nccl}
+    return [op for op in _OP_ORDER if op in seen] + sorted(seen.difference(_OP_ORDER))
+
+
+def _ladder(nccl):
+    """Sweep-table sizes: preset rungs measured below the largest size, then that size."""
+    present = {r["size_bytes"] for d in nccl for r in d.get("rows", [])
+               if (r.get("size_bytes") or 0) > 0}
+    if not present:
+        return []
+    top = max(present)  # largest measured size always closes the ladder
+    rungs = (16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296)
+    return [s for s in rungs if s in present and s < top] + [top]
+
+
+def _sweep_table(nccl, title, rowfn, fmt):
+    """Markdown lines for a size-by-op table of rowfn values ([] when no sizes exist)."""
+    sizes = _ladder(nccl)
+    if not sizes:
+        return []
+    ops = _ops_sorted(nccl)
+    rows_for = {doc.get("op"): doc.get("rows", []) for doc in nccl}
+    header = "| bytes/rank | " + " | ".join(f"`{op}`" for op in ops) + " |"
+    lines = [f"\n**{title}**\n", header,
+             "|---" + "|--:" * len(ops) + "|"]
+    for size in sizes:
+        vals = [_at_size(rows_for.get(op, []), size, rowfn) for op in ops]
+        # Non-numeric values (no matching row, or rowfn returned None) render as "—".
+        cells = [format(v, fmt) if isinstance(v, (int, float)) else "—" for v in vals]
+        lines.append(f"| {_fmt_bytes(size)} | " + " | ".join(cells) + " |")
+    return lines
 
 
 def _fnum(x, fmt):
@@ -64,12 +126,12 @@ def render_plain(nccl, moe, n_valid, total) -> str:
     out += ["=" * len(hdr), hdr, "=" * len(hdr)]
     if nccl:
         out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):")
-        out.append(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}")
+        out.append(f"  {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}")
         for d in sorted(nccl, key=lambda x: x["op"]):
             rows = d.get("rows", [])
             avg = (d.get("summary") or {}).get("avg_busbw_gbps")
             out.append(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
-                       f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
+                       f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
     if moe:
         out.append("\nMoE dispatch+combine (DeepEP / MoRI):")
         out.append(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13}  correct")
@@ -93,14 +155,17 @@ def render_markdown(nccl, moe, n_valid, total) -> str:
         d0 = (nccl + moe)[0]
         out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}")
     if nccl:
-        out.append(f"\n### NCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
-        out.append("| op | status | peak busbw (GB/s) | min lat (µs) | avg busbw (GB/s) |")
-        out.append("|---|---|--:|--:|--:|")
-        for d in sorted(nccl, key=lambda x: x["op"]):
+        out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n")
+        out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |")
+        out.append("|---|---|--:|--:|")
+        for d in sorted(nccl, key=lambda x: _OP_ORDER.index(x["op"]) if x["op"] in _OP_ORDER else 99):
             rows = d.get("rows", [])
-            avg = (d.get("summary") or {}).get("avg_busbw_gbps")
-            out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | "
-                       f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |")
+            out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |")
+        out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f")
+        out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f")
+        out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / "
+                   "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). Small "
+                   "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.")
     if moe:
         out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n")
         out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |")

From a3a492c56353c710dad493176b7f664d58393c16 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 20:23:19 +0800
Subject: [PATCH 14/17] CollectiveX: GB200 8-GPU multi-node MNNVL path
 (CX_NODES), validated on-node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

launch_gb200-nv.sh now branches on CX_NODES: 1 (default) keeps the single-tray
4-GPU dispatcher path; >1 runs across the NVL72 NVLink fabric (e.g. CX_NODES=2
= 8 GPU) by building nccl-tests MPI=1, running each op across WORLD ranks via
`srun --mpi=pmix` (1 GPU/rank) with the MNNVL env, and parsing on the login node
— mirroring launch_b200-dgxc-slurm but staying on NVLink instead of IB.

Validated on GB200 (2x watchtower-navy trays, 8 GPU): all 4 ops valid, peak
busbw all_reduce 822.8 / reduce_scatter 670.6 / all_gather 651.2 / alltoall
625.0 GB/s — ~30% over single-tray and on par with B200 8-GPU NVLink, i.e.
MNNVL engaged (not an IB fallback).

- common.sh: cx_build_nccl_tests auto-detects MPI_HOME for MPI=1 (Debian OpenMPI
  headers live under /usr/lib/<arch>/openmpi/include; MPI_HOME=/usr fails). Works
  x86_64 + aarch64.
- launch_b200-dgxc-slurm.sh: fix BUILD_IN_CTR path (.nccl-tests/nccl-tests/build).
- workflow: add `nodes` dispatch input -> CX_NODES.
---
 .../workflows/collectivex-experimental.yml    |   5 +
 experimental/CollectiveX/launchers/common.sh  |  14 ++-
 .../launchers/launch_b200-dgxc-slurm.sh       |   2 +-
 .../CollectiveX/launchers/launch_gb200-nv.sh  | 117 ++++++++++++++----
 4 files changed, 108 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index 451c3e676..19f48fc30 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -45,6 +45,10 @@ on:
         description: GPUs per node (blank = SKU default)
         type: string
         default: ''
+      nodes:
+        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
+        type: string
+        default: ''
 
 concurrency:
   # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do
@@ -95,6 +99,7 @@ jobs:
       CX_MIN_BYTES: ${{ inputs.min_bytes }}
       CX_MAX_BYTES: ${{ inputs.max_bytes }}
       CX_NGPUS: ${{ inputs.ngpus }}
+      CX_NODES: ${{ inputs.nodes }}
       # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
       CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
     steps:
diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index 10b46eb31..259f1cfa6 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -115,12 +115,20 @@ cx_build_nccl_tests() {
     git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \
       || cx_die "git clone nccl-tests failed"
   fi
-  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr})"
+  # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under
+  # /usr/lib/<arch>/openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails;
+  # point it at that openmpi dir (libmpi resolves via the default linker path).
+  # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME.
+  local mpi_home="${CX_MPI_HOME:-}"
+  if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then
+    mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)"
+  fi
+  cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})"
   make -C "$dir" -j MPI="$mpi" \
        CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \
        NCCL_HOME="${CX_NCCL_HOME:-/usr}" \
-       ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \
-    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME; need nccl.h + libnccl)"
+       ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \
+    || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)"
   [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin"
   echo "$dir/build"
 }
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
index e5add9189..312a7b33a 100644
--- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -78,7 +78,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX
     python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
   '
 
-BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/build"
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build"
 OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
 
 # 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
index 60d5b297d..30b336d5b 100644
--- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -1,19 +1,23 @@
 #!/usr/bin/env bash
 # CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray.
 #
-# Thin adapter: handles GB200-specific allocation/container/transport-env, then
-# hands off to launchers/run_in_container.sh which runs whichever benchmark
-# CX_BENCH selects (nccl | deepep | all). The same NCCL primitive shape that
-# runs on B200 (NVLink island + CX-7 IB across nodes) runs here entirely inside
-# the NVL72 NVLink (MNNVL) domain — that contrast is the headline.
+# Two paths, selected by CX_NODES:
+#   * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to
+#     run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4.
+#   * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2
+#     = 8 GPU. nccl only — builds nccl-tests (MPI=1), runs each op across all ranks
+#     via `srun --mpi=pmix` (1 GPU/rank), parses on the login node. Same shape that
+#     runs single-node B200 (NVLink island) and multi-node B200 (CX-7 IB) — here it
+#     stays entirely on NVL72 NVLink. Validated 8-GPU (2 trays) on-node.
 #
 # Run from inside the InferenceX checkout on the GB200 login node:
-#     bash experimental/CollectiveX/launchers/launch_gb200-nv.sh            # nccl (default)
-#     CX_BENCH=deepep bash .../launch_gb200-nv.sh                           # DeepEP (rebuild)
+#     bash experimental/CollectiveX/launchers/launch_gb200-nv.sh             # 4 GPU, nccl
+#     CX_NODES=2 bash .../launch_gb200-nv.sh                                  # 8 GPU MNNVL
+#     CX_BENCH=deepep bash .../launch_gb200-nv.sh                             # 4 GPU, DeepEP
 #
-# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NGPUS(4) CX_TIME(30)
-#   CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES
-#   CX_DRYRUN(0)
+# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1)
+#   CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH
+#   CX_OPS CX_MIN_BYTES CX_MAX_BYTES CX_SRUN_MPI(pmix) CX_DRYRUN(0)
 set -euo pipefail
 
 HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -25,24 +29,24 @@ source "$HERE/common.sh"
 RUNNER_NAME="${RUNNER_NAME:-gb200-nv}"
 PARTITION="${CX_PARTITION:-batch}"
 ACCOUNT="${CX_ACCOUNT:-benchmark}"
-NGPUS="${CX_NGPUS:-4}"                          # NVL72 compute tray = 4 GPU/node
+GPUS_PER_NODE="${CX_GPUS_PER_NODE:-${CX_NGPUS:-4}}"  # NVL72 tray = 4 GPU/node; CX_NGPUS (workflow `ngpus` input) kept as fallback
+NODES="${CX_NODES:-1}"
 TIME_MIN="${CX_TIME:-30}"
 IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}"
 SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}"
 MOUNT_DIR=/ix
 TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+WORLD=$((NODES * GPUS_PER_NODE))
 
-# Exported so srun --export=ALL carries them into run_in_container.sh.
-export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS"
+export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS"
 export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl"
 export CX_BENCH="${CX_BENCH:-nccl}"
 export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}"
-# Record container identity in env_capture provenance.
 export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}"
 # Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded.
 export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
 
-cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS (aarch64) bench=$CX_BENCH"
+cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)"
 cx_log "image=$IMAGE"
 SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")"
 MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
@@ -51,20 +55,81 @@ cx_log "squash=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
 if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
 command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
 
-salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \
-       --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+# ----------------------------------------------------------------------------
+if [ "$NODES" -le 1 ]; then
+  # Single tray (4 GPU): generic dispatcher, -g N single process.
+  export CX_NGPUS="$GPUS_PER_NODE"
+  salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \
+         --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+  JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
+  [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
+  cx_log "JOB_ID=$JOB_ID"
+  trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
+  srun --jobid="$JOB_ID" \
+    --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
+    --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+    --no-container-entrypoint --export=ALL \
+    bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+  cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
+  cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
+  exit 0
+fi
+
+# ----------------------------------------------------------------------------
+# Multi-node MNNVL (nccl only): mirrors launch_b200-dgxc-slurm but stays on the
+# NVL72 NVLink fabric. Build nccl-tests MPI=1, run each op across WORLD ranks
+# (1 GPU/rank) via srun --mpi=pmix, parse on the login node.
+[ "$CX_BENCH" = "nccl" ] || cx_die "GB200 multi-node supports CX_BENCH=nccl only (got '$CX_BENCH')"
+MPI_FLAG="${CX_SRUN_MPI:-pmix}"
+declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf
+                 [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf )
+
+salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+       --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \
+       --no-shell --job-name="$RUNNER_NAME"
 JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)"
 [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
-cx_log "JOB_ID=$JOB_ID"
+cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]"
 trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
 
-srun --jobid="$JOB_ID" \
-  --container-image="$SQUASH_FILE" \
-  --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \
-  --no-container-mount-home \
-  --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
-  --no-container-entrypoint --export=ALL \
-  bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh"
+COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR"
+              --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX"
+              --no-container-entrypoint)
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json"
+
+# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node).
+srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \
+     --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" </dev/null \
+  bash -c '
+    set -euo pipefail
+    cd /ix/experimental/CollectiveX
+    source launchers/common.sh
+    mkdir -p results
+    cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null
+    python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
+  '
+
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build"
+OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
+
+# 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS.
+for op in $OPS; do
+  raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt"
+  cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw"
+  srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \
+       --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \
+       --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 </dev/null \
+       "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-2G}" -f 2 -g 1 -c 1 -w 5 -n 20 \
+       > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)"
+
+  # 3) Parse on the login node (pure stdlib; no container needed).
+  python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \
+    --world-size "$WORLD" --nodes "$NODES" \
+    --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+    --env-json "$ENVJSON" \
+    --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \
+    --timestamp "$TS" || cx_log "WARN: parse $op failed"
+done
 
 cx_collect_results "$MOUNT_SRC" "$REPO_ROOT"
-cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/"
+cx_log "done — JSON artifacts under $CX_DIR/results/"

From 871086dd0b648180447e4dd0bac3556370f51686 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 20:37:03 +0800
Subject: [PATCH 15/17] CollectiveX: fix multi-node build cache (MPI=0 vs
 MPI=1) + gate all-zero busbw

The first GB200 8-GPU CI run came back green but with all-zero busbw. It reused
a cached MPI=0 nccl-tests build from the staging dir, and an MPI=0 binary under
`srun --mpi=pmix` runs as N standalone world=1 procs (busbw formula -> 0). As a
result every rank printed its own table (232 rows) and check still "passed".

- common.sh: cache MPI=0 and MPI=1 builds in separate dirs (nccl-tests vs
  nccl-tests-mpi) so they never cross-contaminate.
- launch_gb200-nv.sh / launch_b200-dgxc-slurm.sh: read the -mpi build dir.
- run_nccl.py: a result with peak busbw == 0 is now `invalid` (fails the gate),
  so a non-communicating run goes red instead of green-zero.
---
 experimental/CollectiveX/launchers/common.sh              | 8 ++++++--
 .../CollectiveX/launchers/launch_b200-dgxc-slurm.sh       | 2 +-
 experimental/CollectiveX/launchers/launch_gb200-nv.sh     | 2 +-
 experimental/CollectiveX/run_nccl.py                      | 8 +++++++-
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh
index 259f1cfa6..e560fc987 100644
--- a/experimental/CollectiveX/launchers/common.sh
+++ b/experimental/CollectiveX/launchers/common.sh
@@ -102,8 +102,12 @@ cx_collect_results() {
 # CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang
 # cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed.
 cx_build_nccl_tests() {
-  local parent="$1" mpi="${2:-0}" dir bin
-  dir="$parent/nccl-tests"
+  local parent="$1" mpi="${2:-0}" dir bin sfx=""
+  # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary
+  # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0);
+  # keying the cache by flavor prevents that cross-contamination.
+  [ "$mpi" = "1" ] && sfx="-mpi"
+  dir="$parent/nccl-tests$sfx"
   bin="$dir/build/all_reduce_perf"
   if [ -x "$bin" ]; then
     cx_log "nccl-tests already built: $dir/build"
diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
index 312a7b33a..b7a03b2c1 100644
--- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
+++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh
@@ -78,7 +78,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX
     python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
   '
 
-BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build"
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
 OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
 
 # 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS.
diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
index 30b336d5b..4863b9c10 100644
--- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh
+++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh
@@ -109,7 +109,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \
     python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS"
   '
 
-BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build"
+BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build"
 OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}"
 
 # 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS.
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py
index 993c0c06d..c22654c59 100644
--- a/experimental/CollectiveX/run_nccl.py
+++ b/experimental/CollectiveX/run_nccl.py
@@ -227,6 +227,11 @@ def main() -> int:
         with open(args.env_json) as fh:
             env = json.load(fh)
 
+    # All-zero busbw means the benchmark didn't actually communicate — e.g. an
+    # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1
+    # procs (busbw formula -> 0). Don't let that pass the gate as "valid".
+    peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0)
+
     doc = {
         "schema_version": SCHEMA_VERSION,
         "family": "nccl",
@@ -236,7 +241,8 @@ def main() -> int:
         "binary": binary,
         "command": " ".join(command) if command else f"<parse-only {args.parse_only}>",
         "transport": args.transport,
-        "status": ("valid" if (rows and ran_ok and (summary.get("check_passed") is True
+        "status": ("valid" if (rows and ran_ok and peak_busbw > 0.0
+                   and (summary.get("check_passed") is True
                    or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"),
         "comparison_key": comparison_key(meta),
         **meta,

From 368cfbc6390cf69b864dedc121a79a12114b716b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:51:36 +0800
Subject: [PATCH 16/17] CollectiveX: EP dispatch/combine token sweep with
 separated timing (tests/)

Refactor the single-point DeepEP/MoRI drivers into a shared EP harness under
tests/ that sweeps source-tokens-per-rank and times dispatch and combine
SEPARATELY (combine's setup dispatch runs untimed; round-trip is a third
measurement). One line = one fully-specified config (backend, ep degree, phase,
dispatch precision, top-k/experts/hidden, routing); only T varies. Each row
records both tokens_per_rank and global_tokens (= T * ep_size) for the
weak/strong-scaling x-axis toggle, plus recv_tokens and an algbw estimate.
comparison_key is built from the fixed config only (T excluded).

- tests/ep_harness.py: phase-aware token ladder, CUDA-event timing (untimed
  `pre` hook isolates combine), fixed-config comparison_key, doc emission.
- tests/ep_deepep.py, tests/ep_mori.py: backend adapters (ported the validated
  call sequences). MoRI ramps its ladder gradually 1..max (a cold dispatch that
  jumps straight to a large T wedges; the gradual ramp is validated to avoid it).
- tests/run_ep.py: entrypoint; run_in_container.sh runs it per CX_PHASE.
- summarize.py: per-backend EP sweep tables (dispatch/combine/round-trip vs
  tokens/rank) + a combine column on the headline.
- workflow: phase matrix so decode + prefill land as separate jobs; EP inputs
  (phase, tokens_ladder, dispatch_dtype).
- Validated on hardware (decode + prefill): MI355X MoRI (EP8), B200 DeepEP
  (EP8), GB200 DeepEP (EP4).
- Replaces run_deepep.py / run_mori.py.
---
 .../workflows/collectivex-experimental.yml    |  41 ++-
 experimental/CollectiveX/.gitignore           |   2 +
 experimental/CollectiveX/CONTAINERS.md        |   6 +-
 experimental/CollectiveX/README.md            |  27 +-
 .../launchers/launch_mi355x-amds.sh           |   2 +-
 .../CollectiveX/launchers/run_in_container.sh |  52 ++-
 experimental/CollectiveX/plan.md              |   6 +-
 experimental/CollectiveX/run_deepep.py        | 268 --------------
 experimental/CollectiveX/run_mori.py          | 280 --------------
 experimental/CollectiveX/summarize.py         |  64 +++-
 experimental/CollectiveX/tests/ep_deepep.py   | 124 +++++++
 experimental/CollectiveX/tests/ep_harness.py  | 347 ++++++++++++++++++
 experimental/CollectiveX/tests/ep_mori.py     | 167 +++++++++
 experimental/CollectiveX/tests/run_ep.py      |  78 ++++
 14 files changed, 863 insertions(+), 601 deletions(-)
 delete mode 100644 experimental/CollectiveX/run_deepep.py
 delete mode 100644 experimental/CollectiveX/run_mori.py
 create mode 100644 experimental/CollectiveX/tests/ep_deepep.py
 create mode 100644 experimental/CollectiveX/tests/ep_harness.py
 create mode 100644 experimental/CollectiveX/tests/ep_mori.py
 create mode 100644 experimental/CollectiveX/tests/run_ep.py

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index 19f48fc30..e2a8e2ff2 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -49,6 +49,21 @@ on:
         description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
         type: string
         default: ''
+      phase:
+        # EP only. 'both' fans out to one job per phase (decode + prefill).
+        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
+        type: choice
+        default: both
+        options: [both, decode, prefill]
+      tokens_ladder:
+        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
+        type: string
+        default: ''
+      dispatch_dtype:
+        description: EP dispatch payload precision
+        type: choice
+        default: bf16
+        options: [bf16, fp8]
 
 concurrency:
   # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do
@@ -64,16 +79,23 @@ jobs:
   # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute-
   # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs.
   experimental:
-    name: CollectiveX Experimental
+    name: CollectiveX Experimental (${{ matrix.phase }})
     if: github.event_name == 'push'
     runs-on: mi355x
     timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        # MI355X MoRI EP dispatch/combine, one job per phase: decode (small T) +
+        # prefill (large T, clamped to the registerable heap).
+        phase: [decode, prefill]
     env:
       CX_BENCH: mori
+      CX_PHASE: ${{ matrix.phase }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
-      - name: Launch MI355X MoRI
+      - name: Launch MI355X MoRI (${{ matrix.phase }})
         env:
           RUNNER_NAME: ${{ runner.name }}
         run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
@@ -84,7 +106,7 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: collectivex_mi355x_mori_${{ github.run_id }}
+          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
           path: experimental/CollectiveX/results/*.json
           if-no-files-found: warn
 
@@ -93,6 +115,12 @@ jobs:
     if: github.event_name == 'workflow_dispatch'
     runs-on: ${{ inputs.sku }}
     timeout-minutes: 120
+    strategy:
+      fail-fast: false
+      matrix:
+        # 'both' -> one job per phase (decode + prefill); else a single job. Phase
+        # only affects EP (deepep/mori), so benchmark=nccl never fans out (no duplicate run).
+        phase: ${{ fromJSON(inputs.phase == 'both' && inputs.benchmark != 'nccl' && '["decode","prefill"]' || format('["{0}"]', inputs.phase)) }}
     env:
       CX_BENCH: ${{ inputs.benchmark }}
       CX_OPS: ${{ inputs.ops }}
@@ -100,12 +128,15 @@ jobs:
       CX_MAX_BYTES: ${{ inputs.max_bytes }}
       CX_NGPUS: ${{ inputs.ngpus }}
       CX_NODES: ${{ inputs.nodes }}
+      CX_PHASE: ${{ matrix.phase }}
+      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
+      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
       # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
       CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
-      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }}
+      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
         env:
           RUNNER_NAME: ${{ runner.name }}
         run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
@@ -116,6 +147,6 @@ jobs:
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ github.run_id }}
+          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
           path: experimental/CollectiveX/results/*.json
           if-no-files-found: warn
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
index 4235a8ce9..a4717f5ff 100644
--- a/experimental/CollectiveX/.gitignore
+++ b/experimental/CollectiveX/.gitignore
@@ -10,3 +10,5 @@ results/*.json
 results/plots/
 results/raw_*.txt
 results/raw_*.txt.stderr
+# running local-only reflection log (not a committed artifact)
+notes.md
diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md
index 1d84bffd5..6b409bac0 100644
--- a/experimental/CollectiveX/CONTAINERS.md
+++ b/experimental/CollectiveX/CONTAINERS.md
@@ -46,13 +46,13 @@ bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image`
 for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`).
 
 - **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image.
-- **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
+- **MoRI:** bundled in-image (build tag `mori-0227`). `tests/ep_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run.
 - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`).
 - **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `<op>_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged).
-- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`:
+- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`:
   - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here.
   - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now.
-  - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `run_mori.py` hard-exits after writing results to avoid it.
+  - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it.
 
   Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image.
 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
index 5cea3b15b..a7c479b86 100644
--- a/experimental/CollectiveX/README.md
+++ b/experimental/CollectiveX/README.md
@@ -16,11 +16,12 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 |---|---|
 | `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) |
 | `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) |
-| `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) |
-| `run_mori.py` | MoRI (AMD) dispatch+combine, normal mode, correctness-gated (torch + MoRI) |
+| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** |
+| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib-only top-level imports) |
+| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol |
 | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) |
 | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build |
-| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) |
+| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` |
 | `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) |
 | `CONTAINERS.md` | the pinned multi-arch container + audited library versions |
 | `results/` | flat JSON artifacts (+ `plots/`, raw captures) |
@@ -30,13 +31,15 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL
 
 ### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`)
 
-- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine
-  run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner).
+- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** EP dispatch/combine
+  sweep, **one job per phase** (decode + prefill) via a matrix (lands on free
+  `mi355x-amds` runners).
 - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode /
   mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl`
-  on MI355X runs rccl-tests), ops,
-  sizes, ngpus. Lands on that SKU's self-hosted runner and runs
-  `launch_${RUNNER_NAME%%_*}.sh`.
+  on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job each),
+  `tokens_ladder`, `dispatch_dtype`, ops, sizes, ngpus. Lands on that SKU's
+  self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results
+  across all SKUs, dispatch once per `sku` with `phase=both`.
 
 Each job renders a results table to the **GitHub Actions job summary** (via
 `summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs
@@ -57,7 +60,9 @@ CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh   #
 Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`,
 `CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible
 staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate
-nothing). Results land in `experimental/CollectiveX/results/`.
+nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER`
+(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`,
+`CX_DISPATCH_DTYPE`, `CX_ROUTING`, `CX_NUM_EP_GROUPS`, `CX_NUM_COMM_SMS`. Results land in `experimental/CollectiveX/results/`.
 
 ### Offline (no GPU) — verify the parser/JSON pipeline
 
@@ -104,9 +109,9 @@ DeepSeek-V4 fallback images.
   validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9).
 - **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds
   it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive;
-  `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against
+  `tests/ep_deepep.py` follows the documented normal-mode API — validate against
   the built commit. B200 (x86_64) first; GB200 (aarch64) follows.
-- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **validated on
+- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on
   hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip).
   It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer`
   zero-copy path, `expected = input × #unique-destination-ranks`). Three
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
index 5d76ee667..8092b84b4 100644
--- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -12,7 +12,7 @@
 # (collective primitives via rccl-tests, the ROCm nccl-tests fork).
 #
 # !!! NOT yet validated on hardware (no MI355X cluster access at authoring time).
-# Treat the first on-runner run as validation — like run_deepep.py was on GB200.
+# Treat the first on-runner run as validation — like the DeepEP path was on GB200.
 #
 # Run from inside the InferenceX checkout on the MI355X login node:
 #     bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh
diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh
index f2bb60513..3874cabea 100644
--- a/experimental/CollectiveX/launchers/run_in_container.sh
+++ b/experimental/CollectiveX/launchers/run_in_container.sh
@@ -11,7 +11,10 @@
 # Selector:        CX_BENCH = nccl | deepep | mori | all    (default nccl)
 #                  (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.)
 # NCCL knobs:      CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME
-# EP knobs (DeepEP/MoRI): CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE
+# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py:
+#   CX_PHASE = decode | prefill | both (default decode)   <- picks the token sweep
+#   CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point)
+#   CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_NUM_EP_GROUPS CX_NUM_COMM_SMS
 set -euo pipefail
 
 cd /ix/experimental/CollectiveX
@@ -54,6 +57,38 @@ run_nccl_suite() {
   return "$sfail"
 }
 
+# Token-sweep precedence: CX_TOKENS_LADDER if set and non-empty, otherwise the
+# legacy single-point CX_TOKENS_PER_RANK (a one-element ladder), otherwise an
+# empty string so tests/run_ep.py falls back to its per-phase default ladder.
+cx_ep_ladder() {
+  local ladder="${CX_TOKENS_LADDER:-}"
+  [ -n "$ladder" ] || ladder="${CX_TOKENS_PER_RANK:-}"
+  printf '%s' "$ladder"
+}
+
+# run_ep_suite <backend: deepep|mori>
+# One tests/run_ep.py invocation per phase in CX_PHASE (dispatch and combine are
+# timed separately inside it); one JSON per (backend, phase). Returns 1 on any failure.
+run_ep_suite() {
+  local backend="$1" phase phases ladder rc=0
+  ladder="$(cx_ep_ladder)"  # may be empty; passed through as-is to --tokens-ladder
+  phases="${CX_PHASE:-decode}"
+  [ "$phases" = "both" ] && phases="decode prefill"  # expand "both" into the two concrete phases
+  for phase in $phases; do  # intentionally unquoted so the phase list word-splits
+    cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-<phase-default>}'"
+    if ! torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py --backend "$backend" \
+        --phase "$phase" --tokens-ladder "$ladder" \
+        --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
+        --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-balanced}" \
+        --num-ep-groups "${CX_NUM_EP_GROUPS:-1}" --num-comm-sms "${CX_NUM_COMM_SMS:-24}" \
+        --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
+        --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"; then
+      cx_log "WARN: $backend $phase run failed or invalid"; rc=1  # record the failure, keep running remaining phases
+    fi
+  done
+  return "$rc"
+}
+
 run_deepep_suite() {
   # DeepEP is not bundled in the multi-arch image. Try to import; if absent,
   # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a
@@ -67,13 +102,7 @@ run_deepep_suite() {
       return 1
     fi
   fi
-  torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \
-    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
-    --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \
-    --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
-    --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" \
-    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \
-    || { cx_log "WARN: deepep run failed"; return 1; }
+  run_ep_suite deepep
 }
 
 run_mori_suite() {
@@ -84,12 +113,7 @@ run_mori_suite() {
     cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori"
     return 1
   fi
-  torchrun --nproc_per_node="$CX_NGPUS" run_mori.py \
-    --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \
-    --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \
-    --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \
-    --env-json "$ENVJSON" --out "results/${CX_RUNNER}_mori_${CX_TS}.json" \
-    || { cx_log "WARN: mori run failed"; return 1; }
+  run_ep_suite mori
 }
 
 rc=0
diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md
index d39f96967..d62bb7746 100644
--- a/experimental/CollectiveX/plan.md
+++ b/experimental/CollectiveX/plan.md
@@ -31,7 +31,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64)
 - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`.
 - **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`.
 - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub.
-- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `run_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`).
+- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`).
 
 This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
 
@@ -562,7 +562,7 @@ Scaffolding — deliberately light, matching `experimental/` convention (bare sc
 experimental/CollectiveX/
   README.md
   run_nccl.py        # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON)
-  run_deepep.py      # one dispatch+combine shape, normal mode
+  tests/run_ep.py    # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately
   env_capture.py     # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json
   plot.py            # matplotlib, like token_position_decode_slo/*/plot_*.py
   launchers/
@@ -678,7 +678,7 @@ The spike lands as a few small PRs, each producing something runnable — not a
      each tagged with topology-class and transport (aarch64 build for GB200)
 
 3. DeepEP dispatch+combine — B200 first
-   run_deepep.py, routing generator + reference combine for correctness,
+   tests/ep_deepep.py, routing generator + reference combine for correctness,
    reusing rebuild-deepep at job setup
    → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow
 
diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py
deleted file mode 100644
index 3d61c69e4..000000000
--- a/experimental/CollectiveX/run_deepep.py
+++ /dev/null
@@ -1,268 +0,0 @@
-#!/usr/bin/env python3
-"""CollectiveX spike — DeepEP MoE dispatch+combine (normal mode), B200 first.
-
-One decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
-emitting the same flat-JSON provenance shape as run_nccl.py.
-
-Scope (plan §Milestone 0): normal mode only — low-latency (LL) mode is the
-known-broken/blocked IBGDA path and is out of scope for the spike. B200
-(x86_64) first; GB200 is the fast-follow once the aarch64 rebuild-deepep path
-is proven.
-
-  !!! DeepEP's Python API is VERSION-SENSITIVE (the plan notes V2 changed
-  NVSHMEM->NCCL, unified the APIs, and removed zero-SM LL mode). The
-  dispatch/combine block below follows the documented normal-mode intranode
-  API and is marked "ADAPT HERE" — validate the call signatures against the
-  DeepEP commit actually built by rebuild-deepep at job time, and record that
-  commit in provenance. Build is done at job setup, not shipped in the image.
-
-Launch (one process per GPU), e.g. single-node 8x B200:
-    torchrun --nproc_per_node=8 run_deepep.py \\
-        --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\
-        --env-json results/env.json --out results/b200_deepep.json
-"""
-from __future__ import annotations
-
-import argparse
-import datetime as _dt
-import hashlib
-import json
-import os
-import sys
-
-SCHEMA_VERSION = 1
-MEASUREMENT_CONTRACT = "deepep-normal-v1"
-
-
-def _percentile(xs: list[float], q: float) -> float:
-    if not xs:
-        return float("nan")
-    s = sorted(xs)
-    i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1)))))
-    return s[i]
-
-
-def comparison_key(meta: dict) -> str:
-    parts = [
-        meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]),
-        str(meta["nodes"]), meta["topology_class"], meta["comparison_class"],
-        meta["measurement_contract"], str(meta["shape"]),
-    ]
-    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
-
-
-def main() -> int:
-    ap = argparse.ArgumentParser(description="CollectiveX DeepEP dispatch+combine (normal mode)")
-    # shape (decode-ish default from the plan)
-    ap.add_argument("--tokens-per-rank", type=int, default=64)
-    ap.add_argument("--hidden", type=int, default=7168)
-    ap.add_argument("--topk", type=int, default=8)
-    ap.add_argument("--experts", type=int, default=256)
-    ap.add_argument("--dispatch-dtype", default="fp8", choices=["fp8", "bf16"])
-    ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"])
-    ap.add_argument("--seed", type=int, default=67)
-    # measurement
-    ap.add_argument("--warmup", type=int, default=20)
-    ap.add_argument("--iters", type=int, default=200)
-    ap.add_argument("--trials", type=int, default=3)
-    ap.add_argument("--num-sms", type=int, default=24, help="communication SMs (standardized budget)")
-    # provenance
-    ap.add_argument("--runner", required=True)
-    ap.add_argument("--topology-class", required=True)
-    ap.add_argument("--transport", default="")
-    ap.add_argument("--comparison-class", default="standardized")
-    ap.add_argument("--deepep-commit", default=os.environ.get("DEEPEP_COMMIT", "unknown"))
-    ap.add_argument("--env-json")
-    ap.add_argument("--timestamp")
-    ap.add_argument("--out", required=True)
-    args = ap.parse_args()
-
-    # ---- imports guarded so a missing build fails loudly, not cryptically ----
-    try:
-        import torch
-        import torch.distributed as dist
-    except Exception as exc:  # pragma: no cover
-        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
-        return 3
-    try:
-        from deep_ep import Buffer  # type: ignore
-    except Exception as exc:  # pragma: no cover
-        print(
-            "ERROR: deep_ep import failed — DeepEP must be built at job setup "
-            f"(rebuild-deepep). {exc!r}",
-            file=sys.stderr,
-        )
-        return 3
-
-    rank = int(os.environ.get("RANK", "0"))
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
-    torch.cuda.set_device(local_rank)
-    if not dist.is_initialized():
-        dist.init_process_group("nccl")
-    group = dist.group.WORLD
-    device = torch.device(f"cuda:{local_rank}")
-    torch.manual_seed(args.seed + rank)
-
-    n = args.tokens_per_rank
-    H = args.hidden
-    topk = args.topk
-    E = args.experts
-
-    # Input tokens + routing. Weights sum to 1 per token so that a pure
-    # dispatch->combine round trip (no expert compute) reconstructs x.
-    x = torch.randn((n, H), dtype=torch.bfloat16, device=device)
-    if args.routing == "uniform":
-        topk_idx = torch.stack([
-            torch.randperm(E, device=device)[:topk] for _ in range(n)
-        ]).to(torch.int64)
-    else:  # zipf-ish skew toward low expert ids
-        probs = (1.0 / torch.arange(1, E + 1, device=device).float())
-        topk_idx = torch.multinomial(probs.expand(n, E), topk, replacement=False).to(torch.int64)
-    topk_weights = torch.softmax(torch.randn((n, topk), device=device, dtype=torch.float32), dim=-1)
-
-    # Buffer sizing: intranode uses NVLink buffer only (no RDMA for single node).
-    # Numbers follow DeepEP's intranode test guidance; tune per build.
-    num_nvl_bytes = 1024 * 1024 * 1024
-    num_rdma_bytes = 0
-    buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes)
-    # Apply the standardized communication-SM budget so the recorded
-    # num_comm_sms reflects the actual run (best-effort across DeepEP versions).
-    try:
-        Buffer.set_num_sms(args.num_sms)
-    except Exception as exc:  # pragma: no cover - API/version dependent
-        if rank == 0:
-            print(f"WARN: could not set num_sms={args.num_sms}: {exc!r}", file=sys.stderr)
-
-    def run_once():
-        # ===================== ADAPT HERE (DeepEP API) =======================
-        # Normal-mode intranode dispatch/combine. Signatures below match the
-        # documented DeepEP normal API; confirm against the built commit.
-        (num_tokens_per_rank, _, num_tokens_per_expert,
-         is_token_in_rank, _) = buffer.get_dispatch_layout(topk_idx, E)
-        recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = buffer.dispatch(
-            x,
-            topk_idx=topk_idx,
-            topk_weights=topk_weights,
-            num_tokens_per_rank=num_tokens_per_rank,
-            is_token_in_rank=is_token_in_rank,
-            num_tokens_per_expert=num_tokens_per_expert,
-        )
-        combined_x, _, _ = buffer.combine(recv_x, handle, topk_weights=recv_topk_weights)
-        # =====================================================================
-        return combined_x, num_tokens_per_expert, is_token_in_rank
-
-    # ---- correctness gate (run before timing; a fast wrong answer is invalid) ----
-    combined_x, num_tokens_per_expert, is_token_in_rank = run_once()
-    torch.cuda.synchronize()
-    expected_routed = n * topk
-    routed = int(torch.as_tensor(num_tokens_per_expert).sum().item())
-    token_conservation = (routed == expected_routed)
-    # DeepEP combine sums one copy of each token per destination RANK, so the
-    # dispatch->combine round trip reconstructs x only after dividing by the
-    # number of ranks each token was sent to (per DeepEP's own check in
-    # tests/legacy/test_intranode.py: combined_x / is_token_in_rank.sum(dim=1)).
-    ranks_per_token = is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
-    check_x = combined_x.float() / ranks_per_token
-    max_abs = (check_x - x.float()).abs().max().item()
-    max_rel = (max_abs / (x.float().abs().max().item() + 1e-6))
-    combine_ok = max_rel < 2e-2  # bf16 dispatch/combine round-trip tolerance
-    correct = bool(token_conservation and combine_ok)
-
-    # ---- timing (CUDA events; per-rank; reduce for slowest rank) ----
-    def time_ms(fn, warmup, iters) -> list[float]:
-        for _ in range(warmup):
-            fn()
-        torch.cuda.synchronize()
-        out = []
-        for _ in range(iters):
-            s = torch.cuda.Event(enable_timing=True)
-            e = torch.cuda.Event(enable_timing=True)
-            s.record()
-            fn()
-            e.record()
-            torch.cuda.synchronize()
-            out.append(s.elapsed_time(e) * 1000.0)  # ms -> us
-        return out
-
-    def dispatch_only():
-        (npr, _, npe, itir, _) = buffer.get_dispatch_layout(topk_idx, E)
-        buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
-                        num_tokens_per_rank=npr, is_token_in_rank=itir,
-                        num_tokens_per_expert=npe)
-
-    trials = []
-    for _ in range(args.trials):
-        rt = time_ms(run_once, args.warmup, args.iters)      # dispatch+combine round trip
-        dp = time_ms(dispatch_only, args.warmup, args.iters)  # dispatch only
-        trials.append({
-            "roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99),
-            "dispatch_us_p50": _percentile(dp, 50),
-        })
-
-    local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
-    # slowest rank across the world
-    t = torch.tensor([local_rt_p50], device=device)
-    dist.all_reduce(t, op=dist.ReduceOp.MAX)
-    slowest_rank_us = float(t.item())
-
-    if rank == 0:
-        shape = {
-            "tokens_per_rank": n, "hidden": H, "topk": topk, "experts": E,
-            "dispatch_dtype": args.dispatch_dtype, "routing": args.routing,
-            "num_comm_sms": args.num_sms,
-        }
-        meta = {
-            "op": "dispatch-combine", "backend": "deepep", "mode": "normal",
-            "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
-            "topology_class": args.topology_class, "comparison_class": args.comparison_class,
-            "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape,
-        }
-        tokens_total = n * world_size
-        rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
-        env = None
-        if args.env_json and os.path.exists(args.env_json):
-            with open(args.env_json) as _fh:
-                env = json.load(_fh)
-        doc = {
-            "schema_version": SCHEMA_VERSION,
-            "family": "moe",
-            "generated_by": "run_deepep.py",
-            "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
-            "runner": args.runner,
-            "transport": args.transport,
-            "status": "valid" if correct else "invalid",
-            "comparison_key": comparison_key(meta),
-            "backend_provenance": {"deepep_commit": args.deepep_commit},
-            **meta,
-            "correctness": {
-                "passed": correct, "token_conservation": token_conservation,
-                "combine_within_tol": combine_ok, "max_abs_error": max_abs, "max_rel_error": max_rel,
-            },
-            "metrics": {
-                "roundtrip_us_p50": rt_p50,
-                "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials),
-                "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials),
-                "slowest_rank_roundtrip_us": slowest_rank_us,
-                "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None,
-            },
-            "trials": trials,
-            "environment": env,
-        }
-        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
-        with open(args.out, "w") as fh:
-            json.dump(doc, fh, indent=2)
-            fh.write("\n")
-        print(
-            f"deepep dispatch-combine: status={doc['status']} "
-            f"rt_p50={rt_p50:.1f}us slowest_rank={slowest_rank_us:.1f}us "
-            f"correct={correct} -> {args.out}"
-        )
-
-    dist.barrier()
-    dist.destroy_process_group()
-    return 0 if correct else 1
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
deleted file mode 100644
index f99775427..000000000
--- a/experimental/CollectiveX/run_mori.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#!/usr/bin/env python3
-"""CollectiveX spike — MoRI (AMD) MoE dispatch+combine, normal mode.
-
-AMD counterpart to run_deepep.py, using ROCm MoRI's EpDispatchCombine op. One
-decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
-emitting the same flat-JSON shape (family=moe, backend=mori).
-
-  VALIDATED on MI355X (8x, image rocm/sgl-dev:...-mori-0227-2): dispatch+combine
-  numerically correct (combine within tol, max_rel ~2e-3), ~85 us round-trip at
-  the decode shape. The config/dispatch/combine API follows ROCm/mori's reference
-  test. Three constraints on this ionic_rdma fabric are handled here: (1) MoRI
-  registers the whole symmetric heap as ONE RDMA MR and these NICs cap GPU-memory
-  MRs at ~4 GiB, so the heap is held at 2 GiB (above); (2) max_num_inp_token_per_rank
-  is bounded so the buffers fit that heap (below); (3) MoRI's shmem teardown
-  asserts after finalize, so we hard-exit after writing results (end of main).
-
-Launch (one process per GPU), e.g. single-node 8x MI355X:
-    torchrun --nproc_per_node=8 run_mori.py \\
-        --runner mi355x-amds --topology-class mi355x-xgmi --transport xgmi \\
-        --env-json results/env.json --out results/mi355x_mori.json
-"""
-from __future__ import annotations
-
-import argparse
-import datetime as _dt
-import hashlib
-import json
-import os
-import sys
-
-# MoRI registers the WHOLE symmetric heap as one RDMA memory region at shmem
-# init (set this BEFORE `import mori`). On the MI355X ionic_rdma NICs the GPU-
-# memory MR registration has a hard size ceiling (~4 GiB): a 6 GiB heap fails
-# (`RegisterRdmaMemoryRegion ... errno 22 EINVAL`, validated on-node), while
-# 2 GiB registers cleanly. So keep the heap at 2 GiB and instead bound the
-# buffers via max_num_inp_token_per_rank below. Layered override:
-# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "2G".
-os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
-                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
-
-SCHEMA_VERSION = 1
-MEASUREMENT_CONTRACT = "mori-normal-v1"
-
-
-def _percentile(xs: list[float], q: float) -> float:
-    if not xs:
-        return float("nan")
-    s = sorted(xs)
-    i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1)))))
-    return s[i]
-
-
-def comparison_key(meta: dict) -> str:
-    parts = [
-        meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]),
-        str(meta["nodes"]), meta["topology_class"], meta["comparison_class"],
-        meta["measurement_contract"], str(meta["shape"]),
-    ]
-    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
-
-
-def main() -> int:
-    ap = argparse.ArgumentParser(description="CollectiveX MoRI dispatch+combine (normal mode)")
-    ap.add_argument("--tokens-per-rank", type=int, default=64)
-    ap.add_argument("--hidden", type=int, default=7168)
-    ap.add_argument("--topk", type=int, default=8)
-    ap.add_argument("--experts", type=int, default=256)
-    ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"])
-    ap.add_argument("--seed", type=int, default=67)
-    ap.add_argument("--warmup", type=int, default=20)
-    ap.add_argument("--iters", type=int, default=200)
-    ap.add_argument("--trials", type=int, default=3)
-    ap.add_argument("--block-num", type=int, default=int(os.environ.get("CX_MORI_BLOCK_NUM", "80")))
-    ap.add_argument("--dispatch-warps", type=int, default=int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16")))
-    ap.add_argument("--combine-warps", type=int, default=int(os.environ.get("CX_MORI_COMBINE_WARPS", "8")))
-    ap.add_argument("--runner", required=True)
-    ap.add_argument("--topology-class", required=True)
-    ap.add_argument("--transport", default="")
-    ap.add_argument("--comparison-class", default="standardized")
-    ap.add_argument("--mori-commit", default=os.environ.get("MORI_COMMIT", "unknown"))
-    ap.add_argument("--env-json")
-    ap.add_argument("--timestamp")
-    ap.add_argument("--out", required=True)
-    args = ap.parse_args()
-
-    try:
-        import torch
-        import torch.distributed as dist
-    except Exception as exc:  # pragma: no cover
-        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
-        return 3
-    try:
-        import mori  # type: ignore
-    except Exception as exc:  # pragma: no cover
-        print(f"ERROR: mori import failed — needs the AMD MoRI image. {exc!r}", file=sys.stderr)
-        return 3
-
-    rank = int(os.environ.get("RANK", "0"))
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
-    torch.cuda.set_device(local_rank)
-    device = torch.device(f"cuda:{local_rank}")
-    if world_size % 1 != 0 or args.experts % world_size != 0:
-        if rank == 0:
-            print(f"ERROR: experts ({args.experts}) must divide world_size ({world_size})", file=sys.stderr)
-        return 2
-    experts_per_rank = args.experts // world_size
-    torch.manual_seed(args.seed + rank)
-
-    # ===================== ADAPT HERE (MoRI API) =========================
-    # init torch.distributed + MoRI shmem (per the MoRI dispatch/combine test).
-    os.environ.setdefault("MASTER_ADDR", "localhost")
-    os.environ.setdefault("MASTER_PORT", "12355")
-    if not dist.is_initialized():
-        dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
-                                world_size=world_size, device_id=device)
-    world_group = torch.distributed.group.WORLD
-    torch._C._distributed_c10d._register_process_group("default", world_group)
-    mori.shmem.shmem_torch_process_group_init("default")
-
-    n = args.tokens_per_rank
-    H = args.hidden
-    topk = args.topk
-    config = mori.ops.EpDispatchCombineConfig(
-        data_type=torch.bfloat16,
-        rank=rank,
-        world_size=world_size,
-        hidden_dim=H,
-        scale_dim=0,
-        scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
-        max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
-        # Sizes MoRI's symmetric buffers. The reference test uses 4096, but at
-        # hidden=7168 that overflows the registerable 2 GiB heap (see top). Bound
-        # it to the workload (decode shapes are tens of tokens/rank); 512 fits the
-        # 2 GiB heap and was validated on-node. Larger token counts may need a
-        # heap above the NIC's MR ceiling — out of reach on this fabric for now.
-        max_num_inp_token_per_rank=max(512, n),
-        num_experts_per_rank=experts_per_rank,
-        num_experts_per_token=topk,
-        use_external_inp_buf=False,
-        quant_type="none",
-    )
-    op = mori.ops.EpDispatchCombineOp(config)
-
-    # Routing: each token -> topk distinct experts in [0, experts). MoRI expects
-    # INT32 expert indices, and a real (n, scale_dim) fp8 scales tensor even when
-    # scale_dim==0 (an (n,0) tensor) — not None (see the reference test).
-    x = torch.randn((n, H), dtype=torch.bfloat16, device=device)
-    indices = torch.stack([torch.randperm(args.experts, device=device)[:topk] for _ in range(n)]).to(torch.int32)
-    weights = torch.rand((n, topk), dtype=torch.float32, device=device)
-    scales = torch.empty((n, 0), dtype=torch.float8_e4m3fnuz, device=device)
-
-    def run_once():
-        (dispatch_output, dispatch_weights, _dispatch_scales,
-         dispatch_indices, recv_num) = op.dispatch(
-            x, weights, scales, indices,
-            block_num=args.block_num, warp_per_block=args.dispatch_warps)
-        # Zero-copy mode (use_external_inp_buf=False): combine reads from MoRI's
-        # registered combine-input buffer, so stage the dispatched rows into it
-        # first. (In a real MoE the expert FFN writes its outputs here; with no
-        # expert compute we copy the dispatched activations straight through.)
-        total_recv = int(recv_num[0].item())
-        combine_input = dispatch_output.to(torch.bfloat16)
-        combine_buf = op.get_registered_combine_input_buffer(
-            torch.bfloat16, hidden_dim=combine_input.size(1))
-        combine_buf[:total_recv, :].copy_(combine_input[:total_recv, :])
-        combined, _combined_w = op.combine(
-            combine_input, dispatch_weights, dispatch_indices,
-            block_num=args.block_num, warp_per_block=args.combine_warps)
-        # Return total_recv (read BEFORE combine — combine resets recv_num), not
-        # the tensor: reading recv_num[0] after combine yields 0 (false negative).
-        return combined, total_recv
-    # =====================================================================
-
-    # ---- correctness gate ----
-    combined, total_recv = run_once()
-    torch.cuda.synchronize()
-    # MoRI combine sums one copy per destination RANK, so combined[i] ≈
-    # input[i] * (#unique destination ranks among the token's topk experts)
-    # (see ROCm/mori .../test_dispatch_combine.py). combine returns the full
-    # max_num_inp_token_per_rank-sized buffer; only the first n rows are our
-    # local input tokens, so slice to [:n] before comparing.
-    combined_valid = combined[:n].float()
-    pes = indices.long() // experts_per_rank
-    unique_pes = torch.tensor(
-        [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32
-    ).unsqueeze(1)
-    expected = x.float() * unique_pes
-    max_abs = (combined_valid - expected).abs().max().item()
-    max_rel = max_abs / (expected.abs().max().item() + 1e-6)
-    # Validated tolerance from the reference test (bf16 + up-to-topk summation).
-    combine_ok = bool(torch.allclose(combined_valid, expected.float(), atol=1e-2, rtol=1e-2))
-    recv_ok = total_recv > 0
-    correct = bool(combine_ok and recv_ok)
-
-    def time_us(fn, warmup, iters) -> list[float]:
-        for _ in range(warmup):
-            fn()
-        torch.cuda.synchronize()
-        out = []
-        for _ in range(iters):
-            s = torch.cuda.Event(enable_timing=True)
-            e = torch.cuda.Event(enable_timing=True)
-            s.record(); fn(); e.record(); torch.cuda.synchronize()
-            out.append(s.elapsed_time(e) * 1000.0)
-        return out
-
-    def dispatch_only():
-        op.dispatch(x, weights, scales, indices,
-                    block_num=args.block_num, warp_per_block=args.dispatch_warps)
-
-    trials = []
-    for _ in range(args.trials):
-        rt = time_us(run_once, args.warmup, args.iters)
-        dp = time_us(dispatch_only, args.warmup, args.iters)
-        trials.append({"roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99),
-                       "dispatch_us_p50": _percentile(dp, 50)})
-
-    local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
-    t = torch.tensor([local_rt_p50], device=device)
-    dist.all_reduce(t, op=dist.ReduceOp.MAX)
-    slowest_rank_us = float(t.item())
-
-    if rank == 0:
-        shape = {"tokens_per_rank": n, "hidden": H, "topk": topk, "experts": args.experts,
-                 "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype}
-        meta = {"op": "dispatch-combine", "backend": "mori", "mode": "normal",
-                "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
-                "topology_class": args.topology_class, "comparison_class": args.comparison_class,
-                "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape}
-        rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
-        tokens_total = n * world_size
-        env = None
-        if args.env_json and os.path.exists(args.env_json):
-            with open(args.env_json) as fh:
-                env = json.load(fh)
-        doc = {
-            "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "run_mori.py",
-            "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
-            "runner": args.runner, "transport": args.transport,
-            "status": "valid" if correct else "invalid",
-            "comparison_key": comparison_key(meta),
-            "backend_provenance": {"mori_commit": args.mori_commit,
-                                   "block_num": args.block_num,
-                                   "dispatch_warps": args.dispatch_warps,
-                                   "combine_warps": args.combine_warps},
-            **meta,
-            "correctness": {"passed": correct, "combine_within_tol": combine_ok,
-                            "recv_nonzero": recv_ok, "max_abs_error": max_abs, "max_rel_error": max_rel},
-            "metrics": {
-                "roundtrip_us_p50": rt_p50,
-                "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials),
-                "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials),
-                "slowest_rank_roundtrip_us": slowest_rank_us,
-                "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None,
-            },
-            "trials": trials, "environment": env,
-        }
-        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
-        with open(args.out, "w") as fh:
-            json.dump(doc, fh, indent=2)
-            fh.write("\n")
-        print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us "
-              f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}")
-
-    # MoRI's shmem teardown asserts when the EpDispatchCombineOp is destroyed
-    # after shmem_finalize() (CheckStatusValid abort -> SIGABRT on this build,
-    # validated on-node). The result JSON is already written above, so just sync
-    # the ranks and hard-exit, skipping the buggy finalize/destructor path.
-    try:
-        dist.barrier()
-    except Exception:
-        pass
-    sys.stdout.flush()
-    sys.stderr.flush()
-    os._exit(0 if correct else 1)
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
index 013ce3151..90be0e480 100644
--- a/experimental/CollectiveX/summarize.py
+++ b/experimental/CollectiveX/summarize.py
@@ -117,6 +117,32 @@ def _fnum(x, fmt):
     return format(x, fmt) if isinstance(x, (int, float)) else "—"
 
 
+def _moe_sorted(moe):
+    return sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""), x.get("ep_size", 0)))
+
+
+def _moe_sweep_table(d):
+    """Markdown sweep table for one EP doc — the rows already ARE the ladder, so
+    emit one row per source-tokens-per-rank point. Skips old single-point docs
+    (no rows[])."""
+    rows = d.get("rows")
+    if not rows:
+        return []
+    sh = d.get("shape", {})
+    head = (f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · "
+            f"H{sh.get('hidden')} top{sh.get('topk')} E{sh.get('experts')} "
+            f"{sh.get('dispatch_dtype')} {sh.get('routing')}** — latency vs source tokens/rank\n")
+    out = [head,
+           "| tokens/rank | global tokens | dispatch µs | combine µs | round-trip µs | tokens/s | recv tok | correct |",
+           "|--:|--:|--:|--:|--:|--:|--:|:--:|"]
+    for r in rows:
+        out.append(f"| {r.get('tokens_per_rank')} | {r.get('global_tokens')} | "
+                   f"{_fnum(r.get('dispatch_us_p50'), '.2f')} | {_fnum(r.get('combine_us_p50'), '.2f')} | "
+                   f"{_fnum(r.get('roundtrip_us_p50'), '.2f')} | {_fnum(r.get('tokens_per_second'), '.3e')} | "
+                   f"{r.get('recv_tokens', '—')} | {'✅' if r.get('correct') else '❌'} |")
+    return out
+
+
 def render_plain(nccl, moe, n_valid, total) -> str:
     out = []
     hdr = "CollectiveX results"
@@ -133,15 +159,14 @@ def render_plain(nccl, moe, n_valid, total) -> str:
             out.append(f"  {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}"
                        f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}")
     if moe:
-        out.append("\nMoE dispatch+combine (DeepEP / MoRI):")
-        out.append(f"  {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13}  correct")
-        for d in sorted(moe, key=lambda x: x.get("backend", "")):
+        out.append("\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):")
+        out.append(f"  {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'rt_p50':>9}  correct")
+        for d in _moe_sorted(moe):  # same (backend, phase, ep_size) order as the markdown tables
             m, c = d.get("metrics", {}), d.get("correctness", {})
-            tps = m.get("tokens_per_second")
-            out.append(f"  {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}"
-                       f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}"
-                       f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}"
-                       f"{(tps if tps is not None else float('nan')):>13.3e}   {c.get('passed')}")
+            out.append(f"  {d.get('backend',''):<9}{d.get('phase',''):<8}{str(d.get('ep_size','')):>3} {d.get('status',''):<9}"
+                       f"{str(m.get('headline_tokens_per_rank','')):>5}"
+                       f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}{(m.get('combine_us_p50') or float('nan')):>10.1f}"
+                       f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}   {c.get('passed')}")
     return "\n".join(out)
 
 
@@ -167,15 +192,22 @@ def render_markdown(nccl, moe, n_valid, total) -> str:
                    "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). Small "
                    "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.")
     if moe:
-        out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n")
-        out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |")
-        out.append("|---|---|---|--:|--:|--:|--:|:--:|")
-        for d in sorted(moe, key=lambda x: x.get("backend", "")):
+        out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n")
+        out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line "
+                   "sweep tables below carry the full source-tokens-per-rank curve.\n")
+        out.append("| backend | phase | ep | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | round-trip p50 (µs) | tokens/s | correct |")
+        out.append("|---|---|--:|---|--:|--:|--:|--:|--:|:--:|")
+        for d in _moe_sorted(moe):
             m, c = d.get("metrics", {}), d.get("correctness", {})
-            out.append(f"| `{d.get('backend')}` | {d.get('mode')} | {_emoji(d.get('status'))} | "
-                       f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | "
-                       f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | "
-                       f"{'✅' if c.get('passed') else '❌'} |")
+            out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('ep_size','')} | {_emoji(d.get('status'))} | "
+                       f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | "
+                       f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p50'), '.1f')} | "
+                       f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |")
+        for d in _moe_sorted(moe):
+            out += _moe_sweep_table(d)
+        out.append("\n> EP sweep: only source tokens/rank varies along a line; global tokens = "
+                   "tokens/rank × ep. Dispatch and combine are timed **separately** (combine's "
+                   "setup dispatch runs untimed); round-trip = dispatch + combine.")
     if not total:
         out.append("\n> No result files found — the benchmark produced nothing.")
     return "\n".join(out)
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 000000000..c54ccd00f
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal mode.
+
+Ports the validated dispatch/combine sequence from the old run_deepep.py into the
+ep_harness Backend protocol. The harness owns the token sweep + separated timing;
+this file owns only DeepEP's API calls and its correctness reference.
+
+  !!! DeepEP's Python API is VERSION-SENSITIVE (V2 moved NVSHMEM->NCCL and unified
+  the APIs). The dispatch/combine block follows the documented normal-mode
+  intranode API; validate against the deep_ep commit actually built at job time
+  (rebuild-deepep) and recorded in provenance.
+
+Correctness (per DeepEP's tests/legacy/test_intranode.py): a pure dispatch->combine
+round trip with no expert compute reconstructs x only after dividing by the number
+of ranks each token was sent to, i.e. combined_x / is_token_in_rank.sum(dim=1).
+So the harness expects combined ≈ x * ranks_per_token.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+
+try:
+    from deep_ep import Buffer  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the built DeepEP
+    print("ERROR: deep_ep import failed — DeepEP must be built at job setup "
+          f"(rebuild-deepep). {exc!r}", file=sys.stderr)
+    raise
+
+
+class DeepEPBackend:
+    name = "deepep"
+    mode = "normal"
+    measurement_contract = "deepep-normal-v1"
+    combine_needs_redispatch = False  # DeepEP combine reuses the handle (its own bench does too)
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Allocate the DeepEP Buffer on the WORLD group, try to set the comm-SM budget, record provenance."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.group = dist.group.WORLD
+        # Intranode normal mode: NVLink buffer only (no RDMA for single node). Size
+        # to hold the largest sweep point's routed traffic. Prefill's large-T points
+        # (up to 4096 tok/rank) need a bigger buffer than decode — validated on
+        # B200 (EP8) and GB200 (EP4) at 4 GiB through T=4096; decode is fine at 2 GiB.
+        # Override with CX_DEEPEP_NVL_BYTES (an unset OR empty value uses the default).
+        _default_nvl = (4 if args.phase == "prefill" else 2) * 1024 * 1024 * 1024
+        num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES") or _default_nvl)
+        self.buffer = Buffer(self.group, num_nvl_bytes, 0)
+        try:
+            Buffer.set_num_sms(args.num_comm_sms)
+        except Exception as exc:  # pragma: no cover - version dependent
+            if rank == 0:
+                print(f"WARN: could not set num_sms={args.num_comm_sms}: {exc!r}", file=sys.stderr)
+        self.backend_provenance = {
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT", "unknown"),
+            "num_nvl_bytes": num_nvl_bytes, "num_comm_sms": args.num_comm_sms,
+        }
+        if args.dispatch_dtype == "fp8":
+            args.dispatch_dtype = "bf16"  # fall back on EVERY rank so all ranks agree; only rank 0 warns
+            if rank == 0:
+                print("WARN: deepep fp8 dispatch payload not wired for the exact-reconstruction gate yet; using bf16. (provenance reflects bf16.)", file=sys.stderr)
+
+    def buffer_cap(self, args):
+        return None  # NVLink buffer is large; no hard per-T ceiling like MoRI's heap
+
+    def make_problem(self, T):
+        a = self.args
+        H, topk, E = a.hidden, a.topk, a.experts
+        x = torch.randn((T, H), dtype=torch.bfloat16, device=self.device)
+        if a.routing == "zipf":
+            probs = (1.0 / torch.arange(1, E + 1, device=self.device).float())
+            topk_idx = torch.multinomial(probs.expand(T, E), topk, replacement=False).to(torch.int64)
+        else:  # balanced / uniform: topk distinct experts drawn uniformly per token
+            topk_idx = torch.stack([
+                torch.randperm(E, device=self.device)[:topk] for _ in range(T)
+            ]).to(torch.int64)
+        topk_weights = torch.softmax(
+            torch.randn((T, topk), device=self.device, dtype=torch.float32), dim=-1)
+        return types.SimpleNamespace(T=T, x=x, topk_idx=topk_idx, topk_weights=topk_weights)
+
+    def dispatch(self, p):
+        # ===================== DeepEP normal-mode dispatch =====================
+        (num_tokens_per_rank, _, num_tokens_per_expert,
+         is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch(
+            p.x, topk_idx=p.topk_idx, topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert)
+        # =======================================================================
+        return types.SimpleNamespace(
+            recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle,
+            is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert)
+
+    def stage(self, p, h):
+        # DeepEP combine consumes recv_x directly (no separate registered buffer to
+        # stage into) — the "expert outputs" are recv_x itself for a pure round trip.
+        return None
+
+    def combine(self, p, h):
+        combined_x, _, _ = self.buffer.combine(h.recv_x, h.handle, topk_weights=h.recv_topk_weights)
+        return combined_x
+
+    def expected(self, p, h):
+        # combined ≈ x * (#ranks each token was dispatched to)
+        ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float()
+        return p.x.float() * ranks_per_token, p.T
+
+    def recv_tokens(self, h):
+        return int(h.recv_x.shape[0])
+
+    def finalize(self, rc):
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 000000000..01214a3de
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core for the EP benchmark. The per-backend adapters
+(`ep_deepep.py`, `ep_mori.py`) implement a small duck-typed protocol; this module
+owns everything else: the source-tokens-per-rank sweep, the SEPARATED dispatch /
+combine / round-trip timing, the correctness gate, and the provenance-tagged JSON
+doc the summarizer + plotter consume.
+
+Measurement model (see the CollectiveX EP framework notes):
+  * Primary x-axis is SOURCE TOKENS PER RANK, T in {1,2,4,8,...}. One row per T.
+    Only T varies along a line; everything else (backend, ep degree, phase,
+    precision, top-k, experts, hidden, routing, mode, comm-SMs) is FIXED and
+    identifies the line.
+  * Dispatch and combine are SEPARATE measurements. The combine timing window
+    contains ONLY combine(): the dispatch that produces its handle/layout (and
+    the "expert outputs" staged into the combine input) runs UNTIMED. The
+    round-trip is a third, distinct measurement (dispatch + combine).
+  * Both x values are recorded per row — tokens_per_rank and
+    global_tokens = T * ep_size — so a frontend can toggle weak-scaling (fixed
+    tokens/rank) vs strong-scaling (fixed global tokens) without re-running.
+
+stdlib-only at module top (torch is passed in by the entrypoint after a guarded
+import) so this file `py_compile`s on a machine without torch.
+
+Backend protocol (see ep_deepep.py / ep_mori.py):
+    name: str; mode: str           # "deepep" | "mori"; "normal" | "ll"
+    needs_gradual_ramp: bool       # OPTIONAL (default False): sweep ramps 1,2,4,... up to max T
+    measurement_contract: str      # e.g. "deepep-normal-v1"
+    combine_needs_redispatch: bool # True if combine consumes the dispatch state
+    backend_provenance: dict
+    buffer_cap(args) -> int|None   # max T the backend's buffers can hold (None = unbounded)
+    make_problem(T) -> problem     # build x[T,H], topk_idx[T,topk], topk_weights, scales
+    dispatch(problem) -> handle    # ONLY the dispatch comm op (timed for dispatch-only)
+    stage(problem, handle)         # untimed: place "expert outputs" into combine input
+    combine(problem, handle) -> tensor   # ONLY the combine comm op (timed for combine-only)
+    expected(problem, handle) -> (tensor, n_compare)   # reference for the gate
+    recv_tokens(handle) -> int     # realized tokens received this rank (comm volume)
+    finalize(rc) -> int|NoReturn   # clean shutdown (mori hard-exits)
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+
+SCHEMA_VERSION = 1  # emitted as "schema_version" in every result document
+
+# Phase-default sweeps. Decode: a handful of active sequences per rank (small T).
+# Prefill: a chunk of context tokens per rank (large T). Powers of two so the
+# x-axis is even on a log scale. Either is overridable via --tokens-ladder; both
+# get clamped to the backend's buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128]
+PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096]
+
+# Bytes per dispatch-payload element, keyed by --dispatch-dtype; feeds the comm-volume / algbw estimate.
+_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp8": 1}
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """Register the backend-independent CLI flags on `ap` (the entrypoint adds --backend)."""
+    add = ap.add_argument
+    # Workload shape: these stay FIXED along a line; --tokens-ladder is the only sweep.
+    add("--phase", default="decode", choices=["decode", "prefill"],
+        help="decode (small T) or prefill (large T); picks the default ladder")
+    add("--tokens-ladder", default="",
+        help="space/comma-separated source-tokens-per-rank sweep; blank = phase default")
+    add("--hidden", type=int, default=7168)
+    add("--topk", type=int, default=8)
+    add("--experts", type=int, default=256, help="TOTAL experts (fixed across ep degrees)")
+    add("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"])
+    add("--routing", default="balanced", choices=["balanced", "uniform", "zipf"])
+    add("--num-comm-sms", type=int, default=24, help="standardized communication-SM budget")
+    add("--num-ep-groups", type=int, default=1,
+        help="concurrent EP groups on the node (1 = the ordinary line; >1 is a distinct experiment)")
+    add("--seed", type=int, default=67)
+    add("--warmup", type=int, default=10)  # measurement: untimed calls before sampling
+    add("--iters", type=int, default=50)
+    # Provenance and output; the first two have no default and are mandatory.
+    for flag in ("--runner", "--topology-class"):
+        add(flag, required=True)
+    add("--transport", default="")
+    add("--comparison-class", default="standardized")
+    add("--env-json")
+    add("--timestamp")
+    add("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Build the T sweep and return (kept, dropped). An explicit `spec` wins over
+    the phase default; non-positive values are discarded and the rest sorted and
+    de-duplicated; values above `cap` come back separately so callers can report them."""
+    if spec and spec.strip():
+        requested = [int(tok) for tok in spec.replace(",", " ").split()]
+    elif phase == "decode":
+        requested = DECODE_LADDER
+    else:
+        requested = PREFILL_LADDER
+    ordered = sorted({t for t in requested if t > 0})
+    if cap is None:
+        return ordered, []
+    kept = [t for t in ordered if t <= cap]
+    dropped = [t for t in ordered if t > cap]
+    return kept, dropped
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """Nearest-rank percentile: sorted `xs` at index round(q% of len-1), clamped; NaN if empty."""
+    if not xs:
+        return float("nan")
+    ranked, last = sorted(xs), len(xs) - 1
+    return ranked[max(0, min(last, int(round(q / 100.0 * last))))]
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """Return `iters` per-call latencies in microseconds, measured with CUDA events.
+
+    `pre` is None: each sample times `fn()`. `pre` given: each sample first runs
+    `pre()` outside the timed window and synchronizes, so its GPU work cannot leak
+    past the start event, then times `fn(pre_result)` — how combine is isolated for
+    a backend needing a fresh dispatch+stage per sample. Warmup takes the same path.
+    """
+
+    def prepare():
+        if pre is None:
+            return None
+        prepared = pre()
+        torch.cuda.synchronize()  # fence the untimed setup off from what follows
+        return prepared
+
+    def sample() -> float:
+        prepared = prepare()
+        start = torch.cuda.Event(enable_timing=True)
+        stop = torch.cuda.Event(enable_timing=True)
+        start.record()
+        fn() if pre is None else fn(prepared)
+        stop.record()
+        torch.cuda.synchronize()
+        return start.elapsed_time(stop) * 1000.0  # elapsed_time() is in ms
+
+    for _ in range(max(0, warmup)):
+        prepared = prepare()
+        fn() if pre is None else fn(prepared)
+    torch.cuda.synchronize()
+    return [sample() for _ in range(iters)]
+
+
+def comparison_key(meta: dict) -> str:
+    """Short hash naming the curve a row belongs to. Only the FIXED configuration
+    feeds it: tokens_per_rank is the x-axis, so including it would turn every sweep
+    point into its own line. ep_size, num_ep_groups, phase and topology_class are
+    included, which keeps EP4 vs EP8, decode vs prefill, and concurrent-group runs
+    on distinct curves instead of being overlaid."""
+    text_fields = ("op", "backend", "mode", "phase")
+    count_fields = ("ep_size", "num_ep_groups", "nodes")
+    tag_fields = ("topology_class", "comparison_class", "measurement_contract")
+    parts = [meta[k] for k in text_fields] + [str(meta[k]) for k in count_fields]
+    parts += [meta[k] for k in tag_fields]
+    parts.append(json.dumps(meta["shape"], sort_keys=True))
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def _reduce_max(torch, dist, device, vals: list[float]) -> list[float]:
+    buf = torch.tensor(vals, device=device, dtype=torch.float64)
+    dist.all_reduce(buf, op=dist.ReduceOp.MAX)  # buf now holds the element-wise max
+    return list(map(float, buf.tolist()))
+
+
+def _reduce_min_int(torch, dist, device, v: int) -> int:
+    cell = torch.tensor([v], device=device, dtype=torch.int64)
+    dist.all_reduce(cell, op=dist.ReduceOp.MIN)  # cell now holds the smallest v passed in
+    return int(cell.item())
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+    """Drive the source-tokens-per-rank sweep for one fully-specified line.
+
+    For each T: build the problem, run one untimed dispatch->stage->combine for
+    the correctness gate, then take three SEPARATE timings — dispatch-only,
+    combine-only (dispatch+stage untimed), and the round trip. Latencies are
+    reduced MAX across ranks (a collective finishes with its slowest rank);
+    correctness is reduced MIN (any rank failing fails the point). Rank 0 writes
+    one JSON doc. Returns 2 on a bad config; else 1 on rank 0 if a point failed, 0 otherwise.
+    """
+    ep_size = world_size // max(1, args.num_ep_groups)
+    if ep_size < 1 or args.experts % ep_size != 0:  # ep_size is 0 when groups > world_size
+        if rank == 0:
+            print(f"ERROR: experts ({args.experts}) must be divisible by ep_size ({ep_size}, must be >= 1)")
+        return 2
+    experts_per_rank = args.experts // ep_size
+    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)
+
+    cap = backend.buffer_cap(args)
+    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+    if rank == 0 and dropped:
+        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+              f"(hidden={args.hidden}); not silently truncated.")
+    if not ladder:
+        if rank == 0:
+            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+        return 2
+    # Some backends (MoRI) wedge on a COLD dispatch that jumps straight to a large
+    # token count; they set needs_gradual_ramp so the sweep approaches its max T
+    # through a geometric ramp from 1 (validated on MI355X to avoid the hang while
+    # still reaching 512). A naturally-gradual ladder (decode) is unchanged.
+    if getattr(backend, "needs_gradual_ramp", False):
+        top, ramp, t = ladder[-1], [], 1
+        while t < top:
+            ramp.append(t)
+            t *= 2
+        ramp.append(top)
+        if rank == 0 and ramp != ladder:
+            print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}")
+        ladder = ramp
+
+    rows: list[dict] = []
+    for T in ladder:
+        problem = backend.make_problem(T)
+
+        # ---- correctness gate (untimed): dispatch -> stage experts -> combine ----
+        h = backend.dispatch(problem)
+        backend.stage(problem, h)
+        combined = backend.combine(problem, h)
+        torch.cuda.synchronize()
+        recv_local = backend.recv_tokens(h)
+        exp, n_cmp = backend.expected(problem, h)
+        got = combined[:n_cmp].float()
+        max_abs = (got - exp[:n_cmp].float()).abs().max().item()
+        denom = exp[:n_cmp].float().abs().max().item() + 1e-6
+        max_rel = max_abs / denom
+        local_ok = 1 if (max_rel < 2e-2 and recv_local > 0) else 0
+
+        # ---- three separate timings ----
+        disp = time_us(torch, lambda p=problem: backend.dispatch(p), args.warmup, args.iters)
+
+        def prep(p=problem):
+            hh = backend.dispatch(p)
+            backend.stage(p, hh)
+            return hh
+
+        if backend.combine_needs_redispatch:
+            comb = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+                           args.warmup, args.iters, pre=prep)
+        else:
+            hh = prep()
+            comb = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+                           args.warmup, args.iters)
+
+        def roundtrip(p=problem):
+            hh = backend.dispatch(p)
+            backend.stage(p, hh)
+            return backend.combine(p, hh)
+
+        rt = time_us(torch, roundtrip, args.warmup, args.iters)
+
+        # ---- reduce across ranks ----
+        d50, d99 = percentile(disp, 50), percentile(disp, 99)
+        c50, c99 = percentile(comb, 50), percentile(comb, 99)
+        r50, r99 = percentile(rt, 50), percentile(rt, 99)
+        (d50, d99, c50, c99, r50, r99) = _reduce_max(
+            torch, dist, device, [d50, d99, c50, c99, r50, r99])
+        recv = int(_reduce_max(torch, dist, device, [float(recv_local)])[0])
+        global_ok = _reduce_min_int(torch, dist, device, local_ok)
+        max_rel = _reduce_max(torch, dist, device, [max_rel])[0]
+
+        global_tokens = T * ep_size
+        dispatch_bytes = recv * args.hidden * elem_bytes
+        # Algorithmic bandwidth: realized received payload / dispatch time. Labelled
+        # "alg" (not bus) — an EP bus-bandwidth model is backend-specific and out of
+        # scope; latency is the primary metric, this is a comm-volume sanity figure.
+        disp_algbw = (dispatch_bytes / (d50 * 1e3)) if d50 > 0 else 0.0
+        tps = (global_tokens / (r50 * 1e-6)) if r50 > 0 else None
+
+        rows.append({
+            "tokens_per_rank": T,
+            "global_tokens": global_tokens,
+            "dispatch_us_p50": d50, "dispatch_us_p99": d99,
+            "combine_us_p50": c50, "combine_us_p99": c99,
+            "roundtrip_us_p50": r50, "roundtrip_us_p99": r99,
+            "recv_tokens": recv,
+            "dispatch_bytes": dispatch_bytes,
+            "dispatch_algbw_gbps": disp_algbw,
+            "tokens_per_second": tps,
+            "correct": bool(global_ok),
+            "max_rel_error": max_rel,
+        })
+        if rank == 0:
+            print(f"  T={T:<5} disp={d50:8.2f}us combine={c50:8.2f}us rt={r50:8.2f}us "
+                  f"recv={recv:<6} correct={bool(global_ok)}")
+
+    if rank != 0:
+        return 0
+
+    all_ok = bool(rows) and all(r["correct"] for r in rows)
+    shape = {
+        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
+        "routing": args.routing, "num_comm_sms": args.num_comm_sms,
+    }
+    meta = {
+        "op": "ep-dispatch-combine", "backend": backend.name, "mode": backend.mode,
+        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
+        "num_ep_groups": args.num_ep_groups,
+        "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+        "measurement_contract": backend.measurement_contract, "shape": shape,
+    }
+    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        "status": "valid" if all_ok else "invalid",
+        "comparison_key": comparison_key(meta),
+        "x_axis": {"primary": "tokens_per_rank",
+                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
+        "backend_provenance": backend.backend_provenance,
+        **meta,
+        "correctness": {"passed": all_ok,
+                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
+                        "points": len(rows)},
+        "metrics": {
+            "headline_tokens_per_rank": headline["tokens_per_rank"],
+            "dispatch_us_p50": headline["dispatch_us_p50"],
+            "combine_us_p50": headline["combine_us_p50"],
+            "roundtrip_us_p50": headline["roundtrip_us_p50"],
+            "roundtrip_us_p99": headline["roundtrip_us_p99"],
+            "tokens_per_second": headline["tokens_per_second"],
+        },
+        "rows": rows,
+        "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"{backend.name} ep-dispatch-combine [{args.phase}]: status={doc['status']} "
+          f"{len(rows)} points, headline T={headline['tokens_per_rank']} "
+          f"disp={headline['dispatch_us_p50']:.1f}us combine={headline['combine_us_p50']:.1f}us "
+          f"-> {args.out}")
+    return 0 if all_ok else 1
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 000000000..0b5257f36
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode.
+
+Ports the validated dispatch/combine sequence from the old run_mori.py into the
+ep_harness Backend protocol. The harness owns the token sweep + separated timing;
+this file owns MoRI's API and the three ionic_rdma-fabric constraints found on
+MI355X (all validated on-node, see CONTAINERS.md):
+  1. MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init, and
+     these NICs cap GPU-memory MRs at ~4 GiB — a 6 GiB heap fails (errno 22),
+     2 GiB registers. So hold the heap at 2 GiB and bound the buffers via
+     max_num_inp_token_per_rank (=> buffer_cap clamps the token sweep).
+  2. combine() resets recv_num, so read it BEFORE combine; combine returns the
+     full max_num_inp_token_per_rank buffer, so compare only the first T rows.
+  3. MoRI's shmem teardown asserts (CheckStatusValid -> SIGABRT) when the op is
+     destroyed after shmem_finalize(); finalize() hard-exits past it.
+
+combine_needs_redispatch = True: combine consumes the dispatch state (recv_num),
+so the harness re-dispatches (untimed) before each timed combine sample.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set this
+# BEFORE `import mori`. 2 GiB registers cleanly on the MI355X ionic_rdma NICs;
+# larger fails. Layered: explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > 2G.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the AMD MoRI image
+    print("ERROR: mori import failed — needs the AMD MoRI image "
+          f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr)
+    raise
+
+
+class MoRIBackend:
+    """ep_harness backend adapter around mori.ops.EpDispatchCombineOp; bf16 payloads only."""
+    name = "mori"
+    mode = "normal"
+    measurement_contract = "mori-normal-v1"
+    combine_needs_redispatch = True  # harness re-runs dispatch+stage (untimed) per combine sample
+    # MoRI wedges on a COLD dispatch that jumps straight to a large token count
+    # (validated on MI355X: a fresh-shmem sweep starting at T=128 hangs, while a
+    # gradual 1,2,4,...,512 sweep runs every point, 256/512 included). So the
+    # harness ramps this backend's ladder geometrically from 1 up to its max.
+    needs_gradual_ramp = True
+
+    def __init__(self, args, rank, world_size, local_rank, device):  # local_rank is not used here
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.ep_size = world_size // max(1, args.num_ep_groups)
+        self.experts_per_rank = args.experts // self.ep_size
+        self.block_num = int(os.environ.get("CX_MORI_BLOCK_NUM", "80"))
+        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
+        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))
+        if args.dispatch_dtype != "bf16":
+            if rank == 0:
+                print(f"WARN: mori adapter validated for bf16 (quant_type=none); "
+                      f"'{args.dispatch_dtype}' not wired — using bf16.", file=sys.stderr)
+            args.dispatch_dtype = "bf16"  # mutates the shared args so later readers see the real dtype
+
+        # init MoRI shmem on the torch process group (per the reference test).
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        # Size the symmetric buffers to the registerable heap (see buffer_cap). The
+        # op is built ONCE and reused for every T in the sweep; a T<=cap problem
+        # just fills the first T rows of the fixed buffer.
+        self._cap = self.buffer_cap(args)
+        self.config = mori.ops.EpDispatchCombineConfig(
+            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+            hidden_dim=args.hidden, scale_dim=0,
+            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+            max_num_inp_token_per_rank=max(512, self._cap),
+            num_experts_per_rank=self.experts_per_rank,
+            num_experts_per_token=args.topk,
+            use_external_inp_buf=False, quant_type="none",
+        )
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        self.backend_provenance = {
+            "mori_commit": os.environ.get("MORI_COMMIT", "unknown"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "block_num": self.block_num,
+            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
+        }
+
+    def buffer_cap(self, args):  # `args` is accepted per the harness protocol but not read
+        # Largest tokens/rank the 2 GiB registerable heap holds at this hidden size.
+        # 512 was validated on-node at hidden=7168; override via CX_MORI_MAX_TOKENS
+        # once a larger heap/ceiling is confirmed. Prefill ladders clamp to this.
+        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))
+
+    def make_problem(self, T):  # random bf16 activations + per-token random top-k expert picks
+        a = self.args  # NOTE(review): a.routing / a.seed are not read here — confirm intended
+        device, H, topk, E = self.device, a.hidden, a.topk, a.experts
+        x = torch.randn((T, H), dtype=torch.bfloat16, device=device)
+        # MoRI expects INT32 expert indices and a real (T, scale_dim) fp8 scales
+        # tensor even when scale_dim==0 (an (T,0) tensor), not None.
+        indices = torch.stack([
+            torch.randperm(E, device=device)[:topk] for _ in range(T)
+        ]).to(torch.int32)
+        weights = torch.rand((T, topk), dtype=torch.float32, device=device)
+        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=device)
+        return types.SimpleNamespace(T=T, x=x, indices=indices, weights=weights, scales=scales)
+
+    def dispatch(self, p):  # runs op.dispatch and snapshots the receive count into the handle
+        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
+            p.x, p.weights, p.scales, p.indices,
+            block_num=self.block_num, warp_per_block=self.dispatch_warps)
+        # Read total_recv BEFORE any combine — combine() resets recv_num (a later
+        # read yields 0, a false "received nothing").
+        total_recv = int(recv_num[0].item())
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices, total_recv=total_recv,
+            combine_input=dispatch_output.to(torch.bfloat16))
+
+    def stage(self, p, h):  # untimed by the harness; returns None
+        # Zero-copy mode (use_external_inp_buf=False): combine reads MoRI's
+        # registered combine-input buffer, so stage the dispatched rows into it.
+        # In a real MoE the expert FFN writes its outputs here; with no expert
+        # compute we copy the dispatched activations straight through.
+        buf = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1))
+        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])
+
+    def combine(self, p, h):  # returns only the combined tensor; the second output is dropped
+        combined, _w = self.op.combine(
+            h.combine_input, h.dispatch_weights, h.dispatch_indices,
+            block_num=self.block_num, warp_per_block=self.combine_warps)
+        return combined
+
+    def expected(self, p, h):  # -> (reference tensor, number of leading rows to compare)
+        # MoRI combine sums one copy per destination RANK, so combined[i] ≈
+        # x[i] * (#unique destination ranks among the token's topk experts).
+        pes = p.indices.long() // self.experts_per_rank
+        unique_pes = torch.tensor(
+            [len(set(row.tolist())) for row in pes], device=self.device, dtype=torch.float32
+        ).unsqueeze(1)
+        return p.x.float() * unique_pes, p.T
+
+    def recv_tokens(self, h):  # the count captured inside dispatch(), before combine resets it
+        return int(h.total_recv)
+
+    def finalize(self, rc):  # never returns: ends the process via os._exit
+        # MoRI's shmem teardown asserts when the op is destroyed after
+        # shmem_finalize() (CheckStatusValid -> SIGABRT on this build). The result
+        # JSON is already written, so sync the ranks and hard-exit past it.
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)  # any non-zero rc is reported as exit status 1
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 000000000..898e4de51
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
+
+Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
+via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
+are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
+      --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
+      --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json
+
+  torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
+      --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
+      --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import ep_harness  # noqa: E402  (stdlib-only; safe before torch)
+
+
+def main() -> int:
+    """Parse the CLI, bring up torch.distributed, and run the sweep on the chosen backend."""
+    parser = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    parser.add_argument("--backend", required=True, choices=["deepep", "mori"])
+    ep_harness.add_common_args(parser)
+    args = parser.parse_args()
+    use_mori = args.backend == "mori"
+
+    # torch is imported here, guarded, so a missing install yields exit code 3.
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    env = os.environ
+    rank, world_size = int(env.get("RANK", "0")), int(env.get("WORLD_SIZE", "1"))
+    local_rank = int(env.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    env.setdefault("MASTER_ADDR", "localhost")
+    env.setdefault("MASTER_PORT", "12355")
+
+    # Process group first. MoRI inits its shmem on a group it registers as
+    # "default" and wants the gloo+nccl combo with an explicit device_id (per its
+    # reference test); DeepEP uses a plain nccl group.
+    if not dist.is_initialized():
+        mori_pg = dict(backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world_size, device_id=device)
+        dist.init_process_group(**(mori_pg if use_mori else dict(backend="nccl")))
+
+    if use_mori:
+        from ep_mori import MoRIBackend as Backend
+    else:
+        from ep_deepep import DeepEPBackend as Backend
+
+    backend = Backend(args, rank, world_size, local_rank, device)
+    if rank == 0:
+        print(f"[run_ep] backend={args.backend} phase={args.phase} world={world_size} "
+              f"ep_size={world_size // max(1, args.num_ep_groups)} hidden={args.hidden} "
+              f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype}")
+
+    rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    # Teardown is the backend's job: DeepEP hands rc back cleanly, while MoRI
+    # hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From e2717a341cf1514d4be6393db16121889db7bf19 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:57:51 +0800
Subject: [PATCH 17/17] CollectiveX: make MI355X launcher CI-robust (writable
 lock dir + node pin)

The MI355X MoRI jobs failed in CI when they landed on cold nodes: the squash
lock was created next to the squash in /var/lib/squash, which is root/admin-owned
on some nodes (flock -> "Bad file descriptor"), and nodes without the node-local
squash need a slow cold import that also hits lock/cache permissions.

- launch_mi355x-amds.sh: put the import lock in a guaranteed-writable per-node
  dir (CX_LOCK_DIR, default /tmp), not beside the squash; add CX_NODELIST to pin
  the allocation to nodes that already hold the squash.
- workflow: pin MI355X jobs (push + dispatch) to the warm-squash nodes
  (mia1-p01-g10,g15). Widen once the squash is staged cluster-wide.

The EP sweep itself is already hardware-validated (MoRI decode + prefill); this
only fixes squash setup so the jobs reach it in CI.
---
 .../workflows/collectivex-experimental.yml    |  6 ++++
 .../launchers/launch_mi355x-amds.sh           | 29 +++++++++++++++----
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml
index e2a8e2ff2..6965424ab 100644
--- a/.github/workflows/collectivex-experimental.yml
+++ b/.github/workflows/collectivex-experimental.yml
@@ -92,6 +92,10 @@ jobs:
     env:
       CX_BENCH: mori
       CX_PHASE: ${{ matrix.phase }}
+      # Pin to the MI355X nodes that hold the node-local squash and have a writable
+      # /var/lib/squash; other nodes need a slow cold import that can fail on lock/
+      # cache permissions. Widen once the squash is staged cluster-wide.
+      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
@@ -133,6 +137,8 @@ jobs:
       CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
       # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
       CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+      # MI355X: pin to the warm-squash, writable nodes (see the push job).
+      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
         with: { clean: true }
diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
index 8092b84b4..3a7ceccb3 100644
--- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
+++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh
@@ -34,6 +34,10 @@ TIME_MIN="${CX_TIME:-60}"   # generous: a cold enroot import of the large ROCm i
 IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}"
 SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}"   # node-local on MI355X
 EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}"
+# Optional node pin. The node-local squash is only staged on some nodes, and on
+# others /var/lib/squash isn't writable (cold import fails). Set CX_NODELIST to
+# pin CI to nodes that already hold the squash (overrides the exclude list).
+NODELIST="${CX_NODELIST:-}"
 MOUNT_DIR=/ix
 TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
 
@@ -52,15 +56,27 @@ cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH im
 # AMD workspace is compute-visible (the serving launcher bind-mounts it directly),
 # so no staging; the node-local squash is handled via srun below.
 MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
-SQUASH_FILE="$SQUASH_DIR/$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g').sqsh"
-LOCK_FILE="${SQUASH_FILE}.lock"
-cx_log "squash(node-local)=$SQUASH_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
+SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')"
+SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh"
+# Lock in a guaranteed-writable per-node dir, NOT next to the squash: on some
+# nodes /var/lib/squash is root/admin-owned, so even a world-readable squash
+# can't get a sibling .lock created (flock -> "Bad file descriptor"). CX_LOCK_DIR
+# overrides. The lock only serializes concurrent imports on the same node.
+LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock"
+cx_log "squash(node-local)=$SQUASH_FILE  lock=$LOCK_FILE  mount=$MOUNT_SRC -> $MOUNT_DIR"
 
 if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi
 command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node"
 
-salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \
-       --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+# Pin via CX_NODELIST when set (NOTE: --nodelist requests ALL listed nodes, not any one), else exclude the known-bad ones.
+if [ -n "$NODELIST" ]; then
+  cx_log "node pin: --nodelist=$NODELIST"
+  salloc --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \
+         --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+else
+  salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \
+         --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME"
+fi
 JOB_ID="$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)"
 [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID"
 cx_log "JOB_ID=$JOB_ID"
@@ -71,7 +87,8 @@ trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT
 # shellcheck disable=SC2016  # $(...) must expand on the remote node, not here
 srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true
 srun --jobid="$JOB_ID" bash -c "
-  exec 9>\"$LOCK_FILE\"
+  mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true
+  exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; }
   flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; }
   if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then
     echo 'squash present: $SQUASH_FILE'