From 83761d008389ebcbbcd10e54e6c116f85b267066 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:03:14 +0800 Subject: [PATCH 01/17] Add CollectiveX experimental cross-vendor collective/EP benchmark Per-SKU launch adapters (launch_.sh) that run any benchmark via a CX_BENCH selector through a shared run_in_container.sh; multi-arch digest-pinned sglang container; NCCL-primitive + DeepEP dispatch/combine benchmarks with provenance + correctness gating; and an on:push workflow (GB200 NCCL smoke; workflow_dispatch for B200/DeepEP/larger sweeps). Validated on hardware: NCCL primitives on B200 (8x NVLink) and GB200 (4x NVL72 MNNVL); DeepEP dispatch/combine on GB200 (correctness-gated). --- .../workflows/collectivex-experimental.yml | 108 ++ experimental/CollectiveX/.gitignore | 12 + experimental/CollectiveX/CONTAINERS.md | 57 ++ experimental/CollectiveX/README.md | 103 ++ experimental/CollectiveX/env_capture.py | 250 +++++ experimental/CollectiveX/launchers/common.sh | 99 ++ .../launchers/launch_b200-dgxc-slurm.sh | 101 ++ .../CollectiveX/launchers/launch_b200-dgxc.sh | 64 ++ .../CollectiveX/launchers/launch_gb200-nv.sh | 67 ++ .../CollectiveX/launchers/run_in_container.sh | 74 ++ experimental/CollectiveX/plan.md | 939 ++++++++++++++++++ experimental/CollectiveX/plot.py | 141 +++ experimental/CollectiveX/requirements.txt | 9 + experimental/CollectiveX/results/.gitkeep | 3 + experimental/CollectiveX/run_deepep.py | 260 +++++ experimental/CollectiveX/run_nccl.py | 262 +++++ .../fixtures/all_reduce_perf_b200_8gpu.txt | 50 + 17 files changed, 2599 insertions(+) create mode 100644 .github/workflows/collectivex-experimental.yml create mode 100644 experimental/CollectiveX/.gitignore create mode 100644 experimental/CollectiveX/CONTAINERS.md create mode 100644 experimental/CollectiveX/README.md create mode 100644 experimental/CollectiveX/env_capture.py create mode 100644 experimental/CollectiveX/launchers/common.sh 
create mode 100644 experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh create mode 100644 experimental/CollectiveX/launchers/launch_b200-dgxc.sh create mode 100644 experimental/CollectiveX/launchers/launch_gb200-nv.sh create mode 100644 experimental/CollectiveX/launchers/run_in_container.sh create mode 100644 experimental/CollectiveX/plan.md create mode 100644 experimental/CollectiveX/plot.py create mode 100644 experimental/CollectiveX/requirements.txt create mode 100644 experimental/CollectiveX/results/.gitkeep create mode 100644 experimental/CollectiveX/run_deepep.py create mode 100644 experimental/CollectiveX/run_nccl.py create mode 100644 experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml new file mode 100644 index 000000000..6b07c2d56 --- /dev/null +++ b/.github/workflows/collectivex-experimental.yml @@ -0,0 +1,108 @@ +name: CollectiveX Experimental + +# Orchestration only — all benchmark logic lives in experimental/CollectiveX/. +# Push to the feature branch runs a small GB200 NCCL smoke (no merge to main +# needed); workflow_dispatch runs a chosen SKU + benchmark (the lane for B200, +# DeepEP, and larger sweeps). Each job lands on the SKU's self-hosted runner and +# invokes that SKU's launch script — the same launch_${RUNNER_NAME%%_*}.sh +# convention the serving benchmarks use. 
+ +on: + push: + branches: + - collectivex + paths: + - 'experimental/CollectiveX/**' + - '.github/workflows/collectivex-experimental.yml' + workflow_dispatch: + inputs: + sku: + description: Self-hosted runner pool (label from .github/configs/runners.yaml) + type: choice + default: gb200 + options: [gb200, b200, b200-multinode, b300, gb300] + benchmark: + description: Which benchmark to run + type: choice + default: nccl + options: [nccl, deepep, all] + ops: + description: NCCL ops (space-separated); blank = default set + type: string + default: '' + min_bytes: + description: nccl-tests min message size + type: string + default: '8' + max_bytes: + description: nccl-tests max message size + type: string + default: '8G' + ngpus: + description: GPUs per node (blank = SKU default) + type: string + default: '' + +concurrency: + group: collectivex-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + # Push -> short GB200 NCCL smoke (idle capacity; never auto-contends with the + # B200 serving sweep). GB200 runner workspace is staged to compute-visible + # Lustre via CX_STAGE_DIR. + smoke: + if: github.event_name == 'push' + runs-on: gb200 + timeout-minutes: 60 + env: + CX_BENCH: nccl + CX_NGPUS: '4' + CX_MAX_BYTES: 1G + CX_TIME: '20' + CX_STAGE_DIR: /mnt/lustre01/users-public/sa-shared/cx-stage + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - name: Launch GB200 NCCL smoke + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_smoke_gb200_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn + + # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner. 
+ dispatch: + if: github.event_name == 'workflow_dispatch' + runs-on: ${{ inputs.sku }} + timeout-minutes: 120 + env: + CX_BENCH: ${{ inputs.benchmark }} + CX_OPS: ${{ inputs.ops }} + CX_MIN_BYTES: ${{ inputs.min_bytes }} + CX_MAX_BYTES: ${{ inputs.max_bytes }} + CX_NGPUS: ${{ inputs.ngpus }} + # GB200/watchtower needs a compute-visible workspace; harmless elsewhere. + CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 + with: { clean: true } + - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} + env: + RUNNER_NAME: ${{ runner.name }} + run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Upload results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ github.run_id }} + path: experimental/CollectiveX/results/*.json + if-no-files-found: warn diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore new file mode 100644 index 000000000..4235a8ce9 --- /dev/null +++ b/experimental/CollectiveX/.gitignore @@ -0,0 +1,12 @@ +# in-container nccl-tests build cache +.nccl-tests/ +# python +__pycache__/ +*.pyc +# generated run artifacts: captured env embeds hostnames / GPU UUIDs / NIC GUIDs, +# so keep results out of git (CI uploads them as workflow artifacts instead). +# Sanitized headline numbers live in CONTAINERS.md. 
+results/*.json +results/plots/ +results/raw_*.txt +results/raw_*.txt.stderr diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md new file mode 100644 index 000000000..94ab7377f --- /dev/null +++ b/experimental/CollectiveX/CONTAINERS.md @@ -0,0 +1,57 @@ +# CollectiveX — container & library versions + +One **multi-arch, digest-pinned** container is used for all NVIDIA SKUs, so B200 +(x86_64) and GB200 (aarch64) share a single reference and the cross-vendor +comparison is truly same-image. Set in `launchers/common.sh` (`cx_default_image`). + +## Default container (all NVIDIA SKUs) + +- **Image (pin by digest):** `lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356` — the OCI image index for tag `v0.5.12-cu130`. +- **Multi-arch manifest list:** linux/amd64 (`sha256:015f39a4…`) + linux/arm64 (`sha256:7a76819e…`). One digest; `enroot import` on each host pulls the matching arch. **Use the digest-only ref** (`repo@sha256:`) in `common.sh` — enroot 400s on a combined `tag@sha256:` reference. +- **Importing needs registry creds:** anonymous Docker Hub pulls return 401 in ad-hoc SSH sessions; the CI runners import with their configured credentials (the serving sweeps pull images routinely), and already-staged squashes need no import. The refactored launcher path was validated on the already-staged `v0.5.11-cu130` (same multi-arch cu130 line). +- **DeepEP: NOT bundled** here → `run_in_container.sh` builds it via `rebuild-deepep` at job setup (CX_BENCH=deepep). The NCCL path needs no DeepEP. +- **nccl-tests build:** in-container (login nodes have no `nvcc`), `CX_NCCL_HOME=/usr` (system `nccl.h` in `/usr/include`), `CX_CUDA_HOME=/usr/local/cuda`. cu130 lineage ⇒ CUDA 13; confirm exact NCCL/torch on first run and append below. 
+ +## Audited reference (cu130 lineage) + +Live audit of the sibling DeepSeek-V4 image `lmsysorg/sglang:deepseek-v4-grace-blackwell` (aarch64) on GB200, 2026-06-23 — the multi-arch `v0.5.12-cu130` should match closely (same cu130 base); reconfirm on first run: + +| Component | Version | +|---|---| +| OS / arch | Ubuntu 24.04.3, aarch64 | +| CUDA (`nvcc`) | 13.0 (V13.0.88) | +| NCCL (system `/usr/include/nccl.h`) | 2.28.3; torch-bundled 2.27.7 | +| PyTorch | 2.9.1+cu130 | +| DeepEP | bundled in *that* image; **not** in the multi-arch default | +| NVSHMEM | `libnvshmem_host.so.3` present | +| OpenMPI / gcc / make | 4.1.6 / 13.3.0 / 4.3 | +| GPU / driver | GB200, 580.126.20 | + +**Version caveat:** the nccl-tests binary links **system NCCL** (2.28.x), while torch/DeepEP use the **bundled** NCCL (2.27.x). Record both in provenance (env_capture does); don't compare an nccl-tests curve against a DeepEP run as if NCCL were identical. + +## Bundled-DeepEP reference images (not the default) + +If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch image, these arch-specific images bundle it (pin by digest): + +- B200 (amd64): `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b` (pre-staged on B200) +- GB200 (arm64): `lmsysorg/sglang:deepseek-v4-grace-blackwell@sha256:4f583347d7ff08aef7e16dbb4985b2a7c147ff49a0c261d5e27b8f5f41719368` (staged on GB200 Lustre) + +Select via `CX_IMAGE=…@sha256:…` on the launch script. + +## Cluster access / QOS + +- **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account. +- **GB200** (`watchtower`): account `benchmark`, qos `normal`, partition `batch` (`AllowQos=ALL`); idle capacity available. Runner workspace is **not** compute-visible → set `CX_STAGE_DIR` to a Lustre path (the launcher rsyncs there). 
+ +## First real results (Milestone-0 spike, on the DeepSeek-V4 images) + +nccl-tests (system NCCL 2.28.3), all correctness-passed, peak bus-bw: + +| op | B200 8× (NVLink island, x86_64) | GB200 4× (NVL72 MNNVL, aarch64) | +|---|---|---| +| all_reduce | 835 GB/s | 689 GB/s | +| all_gather | 653 | 658 | +| reduce_scatter | 667 | 661 | +| alltoall | 638 | 666 | + +(B200 vs GB200 carry distinct `comparison_key`s by topology-class, so they are labelled-distinct, not silently merged. Re-run on the multi-arch default to refresh under one image.) diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md new file mode 100644 index 000000000..3b18c048d --- /dev/null +++ b/experimental/CollectiveX/README.md @@ -0,0 +1,103 @@ +# CollectiveX + +Cross-vendor collective / EP-library benchmark (see `plan.md`). Per-SKU **launch +adapters** (InferenceX-style `launch_<sku>.sh`) run **any benchmark** — selected +by `CX_BENCH` — through a shared in-container runner, and a GitHub Actions +workflow triggers runs on `push` (no merge to main needed). Milestone-0 headline +already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL). + +> Experimental: WIP, not an official InferenceMAX result. All logic stays under +> `experimental/CollectiveX/`; the only file outside is the orchestration-only +> workflow. 
+ +## Files + +| File | Role | +|---|---| +| `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) | +| `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) | +| `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) | +| `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | +| `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | +| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/all) | +| `launchers/launch_<sku>.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL) | +| `CONTAINERS.md` | the pinned multi-arch container + audited library versions | +| `results/` | flat JSON artifacts (+ `plots/`, raw captures) | +| `tests/fixtures/` | captured nccl-tests output for offline parser checks | + +## Run + +### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`) + +- **push** to the `collectivex` branch touching `experimental/CollectiveX/**` (or the workflow file) → short **GB200 NCCL smoke** (idle + capacity; never auto-contends with the B200 serving sweep). +- **workflow_dispatch** → pick `sku` (gb200 / b200 / b200-multinode / …), + `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's + self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. + +(The workflow only fires once the branch is pushed to GitHub.) 
+ +### Directly on a cluster login node + +```bash +# benchmark is selected by CX_BENCH (default nccl) +bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, NCCL primitives +CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild) +bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # B200 8× NVLink +bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh # B200 2-node, cross-IB +``` + +Knobs: `CX_BENCH` (nccl|deepep|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, +`CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible +staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate +nothing). Results land in `experimental/CollectiveX/results/`. + +### Offline (no GPU) — verify the parser/JSON pipeline + +```bash +python3 run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \ + --world-size 8 --nodes 1 --runner b200-dgxc --topology-class b200-nvlink-island --out /tmp/parsed.json +python3 env_capture.py # prints a (degraded, off-GPU) env record +python3 plot.py --results-dir results --out-dir results/plots # needs matplotlib +``` + +## Container + +One **multi-arch, digest-pinned** image for all NVIDIA SKUs: +`lmsysorg/sglang@sha256:4219…f356` (the image index for tag `v0.5.12-cu130`; amd64 + arm64; digest-only ref, because enroot rejects a combined `tag@sha256` ref). See +`CONTAINERS.md` for versions, the DeepEP-rebuild note, and the digest-pinned +DeepSeek-V4 fallback images. + +## How it runs (confirmed against the live clusters) + +- Adapters mirror `runners/launch_*.sh`: `salloc` → enroot squash (import only if + missing) → `srun --container-image=… --container-mounts=<src>:/ix` → in-container + `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account + `benchmark`. +- Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in + `.nccl-tests/`, `CX_NCCL_HOME=/usr`). 
Single-node uses `-g N`; the 2-node + adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`). +- The sglang image installs editable under `/workspace`, so the repo is mounted at + **`/ix`**. GB200 compute nodes don't see the runner workspace → `CX_STAGE_DIR` + rsyncs the tree to Lustre first. +- Every result embeds an `env_capture` record and a `comparison_key`; topology + class is part of the key, so B200(IB/NVLink) and GB200(MNNVL) stay labelled + distinct, never silently overlaid. + +## Status & known risks + +- **Spike done on real hardware** (both SKUs, 4 NCCL primitives, correctness-passed) + — on the DeepSeek-V4 images. Now standardizing on the **multi-arch** default; + validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9). +- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds + it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; + `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against + the built commit. B200 (x86_64) first; GB200 (aarch64) follows. +- **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a + compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container + or srt-slurm. CX_BENCH=nccl only for now. +- **B200 QOS:** account `benchmark` has only `gpu-2_qos` (the serving-sweep + partition); idle `gpu-1` needs a QOS grant. GB200 `batch` is open. + +Once the multi-arch image is validated end-to-end, freeze the schema from the +artifacts (plan: "Freeze the contract"). diff --git a/experimental/CollectiveX/env_capture.py b/experimental/CollectiveX/env_capture.py new file mode 100644 index 000000000..b906a0497 --- /dev/null +++ b/experimental/CollectiveX/env_capture.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — Layer-0 environment + topology capture. 
+ +Emits a JSON document describing the node a collective benchmark ran on, so +every result is provenance-tagged and a B200-vs-GB200 comparison is defensible. +Standard library only (so it runs in any minimal container, and off-GPU it +degrades gracefully instead of crashing). torch is used only if importable. + +Usage: + python env_capture.py --out results/env_b200-dgxc.json + python env_capture.py --redact --out env.json # hash hostnames/IPs/UUIDs + +Importable: + from env_capture import capture_environment + env = capture_environment(redact=False) +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import platform +import re +import shutil +import socket +import subprocess +import sys + +SCHEMA_VERSION = 1 + +# Env vars worth recording — transport/tuning knobs that change what a +# collective actually does (esp. the GB200 MNNVL flags vs B200). +ENV_PREFIXES = ("NCCL_", "NVSHMEM_", "MC_", "UCX_", "SGLANG_DEEPEP", "DEEPEP_") +ENV_EXACT = ( + "CUDA_VISIBLE_DEVICES", + "CUDA_DEVICE_ORDER", + "SLURM_JOB_ID", + "SLURM_NNODES", + "SLURM_NTASKS", + "SLURM_JOB_PARTITION", + # Image identity — set by the launcher so the bundle records what ran. 
+ "COLLECTIVEX_IMAGE", + "COLLECTIVEX_IMAGE_DIGEST", +) + + +def _run(cmd: list[str], timeout: int = 20) -> str | None: + """Run a command, return stdout (stripped) or None if unavailable.""" + if shutil.which(cmd[0]) is None: + return None + try: + out = subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, check=False + ) + except (subprocess.TimeoutExpired, OSError): + return None + if out.returncode != 0: + return None + return out.stdout.strip() + + +def _redact(value: str | None) -> str | None: + """Stable short hash so artifacts can be shared without leaking + hostnames / IPs / GPU UUIDs / IB GUIDs while staying joinable.""" + if not value: + return value + return "redacted-" + hashlib.sha256(value.encode()).hexdigest()[:12] + + +def _gpus(redact: bool) -> dict: + """GPU inventory via nvidia-smi (None fields off-GPU).""" + info: dict = {"source": None, "count": None, "devices": []} + q = _run( + [ + "nvidia-smi", + "--query-gpu=name,uuid,memory.total,compute_cap,pci.bus_id", + "--format=csv,noheader,nounits", + ] + ) + if q is None: + return info + info["source"] = "nvidia-smi" + devices = [] + for line in q.splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) < 5: + continue + name, uuid, mem_mib, cc, bus = parts[:5] + devices.append( + { + "name": name, + "uuid": _redact(uuid) if redact else uuid, + "memory_total_mib": int(mem_mib) if mem_mib.isdigit() else mem_mib, + "compute_capability": cc, + "pci_bus_id": _redact(bus) if redact else bus, + } + ) + info["count"] = len(devices) + info["devices"] = devices + return info + + +def _driver_cuda() -> dict: + out = _run( + ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"] + ) + driver = out.splitlines()[0].strip() if out else None + # `nvidia-smi` (no args) prints the CUDA driver-API version in its header. 
+ cuda = None + header = _run(["nvidia-smi"]) + if header: + m = re.search(r"CUDA Version:\s*([0-9.]+)", header) + if m: + cuda = m.group(1) + return {"driver_version": driver, "cuda_version": cuda} + + +def _torch_info() -> dict: + """NCCL / torch build info — only if torch is importable in this env.""" + info: dict = {"available": False} + try: + import torch # type: ignore + except Exception: + return info + info["available"] = True + info["torch_version"] = torch.__version__ + try: + info["cuda_runtime"] = torch.version.cuda + except Exception: + info["cuda_runtime"] = None + try: + if torch.cuda.is_available(): + nccl = torch.cuda.nccl.version() + # version() returns an int (e.g. 22304) or a tuple, depending on build. + info["nccl_version"] = ( + ".".join(map(str, nccl)) if isinstance(nccl, tuple) else nccl + ) + info["device_count"] = torch.cuda.device_count() + info["device_name"] = torch.cuda.get_device_name(0) + cc = torch.cuda.get_device_capability(0) + info["compute_capability"] = f"{cc[0]}.{cc[1]}" + except Exception as exc: # pragma: no cover - hardware dependent + info["error"] = repr(exc) + return info + + +def _topology(redact: bool) -> dict: + """GPU/NIC topology matrix + a fingerprint to gate comparability. + + The fingerprint is a hash of the structural part of `nvidia-smi topo -m` + (the connection legend), so two nodes with the same wiring share a key + even if absolute device IDs differ.""" + topo = _run(["nvidia-smi", "topo", "-m"]) + if topo is None: + return {"source": None, "matrix": None, "fingerprint": None} + # Fingerprint the link-type tokens (NV#, NODE, SYS, PIX, PXB, ...) only — + # ignore GPU/NIC labels and whitespace so it's placement-stable. + tokens = re.findall(r"\b(NV\d+|NODE|SYS|PIX|PXB|PHB|X)\b", topo) + fingerprint = hashlib.sha256(" ".join(tokens).encode()).hexdigest()[:16] + return { + "source": "nvidia-smi topo -m", + # The matrix can contain hostnames in some setups; redact wholesale. 
+ "matrix": ("" if redact else topo), + "fingerprint": fingerprint, + } + + +def _rdma(redact: bool) -> dict: + """RDMA/IB device presence: names from `ibv_devinfo -l`, else from a single `ibstat -l` call; names are hashed when redact is set.""" + devices: list[str] = [] + listing = _run(["ibv_devinfo", "-l"]) + if listing: + for line in listing.splitlines()[1:]: # first line is a count + name = line.strip() + if name: + devices.append(name) + else: # run ibstat once; `or ""` covers _run() returning None + devices = [d.strip() for d in (_run(["ibstat", "-l"]) or "").splitlines() if d.strip()] + return { + "available": bool(devices), + "devices": [_redact(d) if redact else d for d in devices], + } + + +def _env_vars() -> dict: + out = {} + for k, v in os.environ.items(): + if k in ENV_EXACT or any(k.startswith(p) for p in ENV_PREFIXES): + out[k] = v + return dict(sorted(out.items())) + + +def capture_environment(redact: bool = False, timestamp: str | None = None) -> dict: + """Return a JSON-serializable environment/provenance record.""" + host = socket.gethostname() + return { + "schema_version": SCHEMA_VERSION, + "captured_at": timestamp or _dt.datetime.now().astimezone().isoformat(), + "redacted": redact, + "host": _redact(host) if redact else host, + "platform": { + "system": platform.system(), + "release": platform.release(), + "machine": platform.machine(), # x86_64 vs aarch64 (B200 vs GB200) + "python": sys.version.split()[0], + }, + "gpus": _gpus(redact), + "driver": _driver_cuda(), + "torch": _torch_info(), + "topology": _topology(redact), + "rdma": _rdma(redact), + "env": _env_vars(), + } + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX Layer-0 environment capture") + ap.add_argument("--out", help="write JSON here (default: stdout)") + ap.add_argument( + "--redact", + action="store_true", + help="hash hostnames / IPs / GPU UUIDs / IB GUIDs for shareable artifacts", + ) + ap.add_argument( + "--timestamp", + help="ISO timestamp to stamp (default: now); pass one for reproducible bundles", + ) + args = ap.parse_args() + + env = 
capture_environment(redact=args.redact, timestamp=args.timestamp) + blob = json.dumps(env, indent=2) + if args.out: + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + fh.write(blob + "\n") + # A one-line human summary to stdout (the JSON is the artifact). + g = env["gpus"] + print( + f"env -> {args.out} | machine={env['platform']['machine']} " + f"gpus={g['count']} topo_fp={env['topology']['fingerprint']}" + ) + else: + print(blob) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh new file mode 100644 index 000000000..445cdb5ca --- /dev/null +++ b/experimental/CollectiveX/launchers/common.sh @@ -0,0 +1,99 @@ +# shellcheck shell=bash +# CollectiveX — shared launcher helpers (sourced, not executed). +# +# Cluster-generic scaffolding only (Slurm/container/build/staging); no +# model-serving. Logging goes to stderr so functions can `echo` a single +# result on stdout. + +cx_log() { printf '[collectivex] %s\n' "$*" >&2; } +cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; } + +# Single multi-arch, digest-pinned container for ALL NVIDIA SKUs. +# This is the OCI image index for tag `v0.5.12-cu130`, covering BOTH linux/amd64 +# (B200) and linux/arm64 (GB200); enroot import on each host pulls the matching +# arch from the index. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.) +# Pinned by DIGEST ONLY (no tag): enroot mis-parses a combined `tag@sha256` ref +# and 400s at auth, so we use `repo@sha256:` — also the stricter pin. +# NOTE: DeepEP is NOT bundled here -> run_in_container.sh builds it via +# rebuild-deepep at job setup. (The arch-specific deepseek-v4-{blackwell, +# grace-blackwell} images DO bundle DeepEP — see CONTAINERS.md — but are not +# multi-arch and are not used by default.) 
+CX_IMAGE_MULTIARCH="lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356" + +cx_default_image() { + case "$1" in + b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;; + *) cx_die "no default image for runner prefix: $1" ;; + esac +} + +# cx_ensure_squash <squash_dir> <image> -> echoes the squash file path. +# Imports via enroot only if a valid squash is not already present (flock-guarded, +# mirroring runners/launch_b200-dgxc.sh). A lock timeout or failed import aborts here instead of echoing a path to a squash that does not exist. +cx_ensure_squash() { + local squash_dir="$1" image="$2" + mkdir -p "$squash_dir" 2>/dev/null || true + local key sq locks + key="$(printf '%s' "$image" | sed 's|[/:@#]|_|g')" # '|' delimiter: '#' is in the bracket set, and GNU sed ends the regex at the first unescaped delimiter + sq="$squash_dir/${key}.sqsh" + locks="$squash_dir/.locks"; mkdir -p "$locks" 2>/dev/null || true + ( + flock -w 900 9 || cx_die "lock timeout for $sq" + if unsquashfs -l "$sq" >/dev/null 2>&1; then + cx_log "squash present: $sq" + else + cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)" + rm -f "$sq" + enroot import -o "$sq" "docker://$image" >&2 || cx_die "enroot import failed for $image" + unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq" + fi + ) 9>"$locks/${key}.lock" || cx_die "no usable squash for $image" # cx_die above only exits the subshell; propagate it + echo "$sq" +} + +# cx_stage_repo <repo_root> [stage_dir] -> echoes the mount-source root. +# Some clusters (e.g. GB200/watchtower) do not cross-mount the runner workspace +# to compute nodes. If CX_STAGE_DIR is set, rsync the CollectiveX tree onto that +# compute-visible shared FS and mount from there. No-op (echo repo_root) when +# stage_dir is empty or equals repo_root. 
+cx_stage_repo() { + local repo_root="$1" stage_dir="${2:-}" + if [ -z "$stage_dir" ] || [ "$stage_dir" = "$repo_root" ]; then + echo "$repo_root"; return 0 + fi + mkdir -p "$stage_dir/experimental" || cx_die "cannot create stage dir $stage_dir" + cx_log "staging experimental/CollectiveX -> $stage_dir (compute-visible)" + rsync -a --delete \ + --exclude='.nccl-tests/' --exclude='__pycache__/' --exclude='results/plots/' \ + "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" >&2 \ + || cx_die "rsync to stage dir failed" + echo "$stage_dir" +} + +# cx_build_nccl_tests -> echoes the build/ dir. +# Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built. +# CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang +# cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed. +cx_build_nccl_tests() { + local parent="$1" mpi="${2:-0}" dir bin + dir="$parent/nccl-tests" + bin="$dir/build/all_reduce_perf" + if [ -x "$bin" ]; then + cx_log "nccl-tests already built: $dir/build" + echo "$dir/build"; return 0 + fi + mkdir -p "$parent" + if [ ! 
-d "$dir/.git" ]; then + cx_log "cloning nccl-tests -> $dir" + git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \ + || cx_die "git clone nccl-tests failed" + fi + cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr})" + make -C "$dir" -j MPI="$mpi" \ + CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \ + NCCL_HOME="${CX_NCCL_HOME:-/usr}" \ + ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \ + || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME; need nccl.h + libnccl)" + [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin" + echo "$dir/build" +} diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh new file mode 100644 index 000000000..a58411343 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# CollectiveX — 2-node B200 SKU adapter (cross CX-7 InfiniBand spine), x86_64. +# +# The other half of the headline: the same primitives as single-node B200, but +# spanning two nodes so the transport is InfiniBand rather than NVLink. Contrast +# with GB200, where the 2-node-equivalent stays on NVL72 NVLink (MNNVL). +# +# Multi-node orchestration differs from single-node, so this adapter does NOT +# use run_in_container.sh: it builds nccl-tests (MPI=1), runs each op across all +# ranks (raw capture), then parses on the login node. Currently CX_BENCH=nccl +# only (multi-node DeepEP/MNNVL is the srt-slurm follow-up). +# +# SPIKE CAVEATS: needs `srun --mpi=pmix` wired for pyxis and a compute-visible +# checkout — set CX_STAGE_DIR to a shared FS (e.g. /home/sa-shared/cx-stage) if +# the runner workspace is not cross-mounted to compute. +# +# Run: bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +CX_BENCH="${CX_BENCH:-nccl}" +[ "$CX_BENCH" = "nccl" ] || cx_die "launch_b200-dgxc-slurm.sh supports CX_BENCH=nccl only (got '$CX_BENCH'); multi-node DeepEP is a follow-up" + +RUNNER_NAME="${RUNNER_NAME:-b200-dgxc-slurm}" +PARTITION="${CX_PARTITION:-gpu-2}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" +GPUS_PER_NODE="${CX_GPUS_PER_NODE:-8}" +NODES="${CX_NODES:-2}" +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image b200)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +TOPO="b200-nvlink-island+cx7-ib" +WORLD=$((NODES * GPUS_PER_NODE)) +MPI_FLAG="${CX_SRUN_MPI:-pmix}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" + +declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf + [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf ) + +cx_log "runner=$RUNNER_NAME nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \ + --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" + --no-container-entrypoint) 
+ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json" + +# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node). +srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" \ + bash -c ' + set -euo pipefail + cd /ix/experimental/CollectiveX + source launchers/common.sh + mkdir -p results + cx_build_nccl_tests "$PWD/.nccl-tests" 1 >/dev/null + python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" + ' + +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/build" +OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" + +# 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS. +for op in $OPS; do + raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt" + cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG) -> $raw" + srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \ + --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \ + --export=ALL,NCCL_CUMEM_ENABLE=1 \ + "$BUILD_IN_CTR/${BIN[$op]}" -b "${CX_MIN_BYTES:-8}" -e "${CX_MAX_BYTES:-8G}" -f 2 -g 1 -c 1 -w 5 -n 20 \ + > "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)" + + # 3) Parse on the login node (pure stdlib python; no container needed). 
+ python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \ + --world-size "$WORLD" --nodes "$NODES" \ + --runner "$RUNNER_NAME" --topology-class "$TOPO" --transport ib \ + --env-json "$ENVJSON" \ + --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \ + --timestamp "$TS" || cx_log "WARN: parse $op failed" +done + +cx_log "done — JSON artifacts under $CX_DIR/results/" diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh new file mode 100644 index 000000000..a1b5c0135 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# CollectiveX — B200 single-node SKU adapter (8x B200, NVLink island, x86_64). +# +# Thin adapter: handles B200-specific allocation/container, then hands off to +# launchers/run_in_container.sh which runs whichever benchmark CX_BENCH selects +# (nccl | deepep | all). Mirrors runners/launch_b200-dgxc.sh (salloc + enroot +# squash + srun --container) with all model-serving stripped. +# +# Run from inside the InferenceX checkout on the B200 login node: +# bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # nccl (default) +# CX_BENCH=deepep bash .../launch_b200-dgxc.sh # DeepEP (rebuild) +# +# Env knobs: CX_PARTITION(gpu-2) CX_ACCOUNT(benchmark) CX_NGPUS(8) CX_TIME(30) +# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES +# CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-b200-dgxc}" +PARTITION="${CX_PARTITION:-gpu-2}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image b200)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/home/sa-shared/containers}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +export NCCL_CUMEM_ENABLE=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \ + --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh new file mode 
100644 index 000000000..35cdb8e28 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray. +# +# Thin adapter: handles GB200-specific allocation/container/transport-env, then +# hands off to launchers/run_in_container.sh which runs whichever benchmark +# CX_BENCH selects (nccl | deepep | all). The same NCCL primitive shape that +# runs on B200 (NVLink island + CX-7 IB across nodes) runs here entirely inside +# the NVL72 NVLink (MNNVL) domain — that contrast is the headline. +# +# Run from inside the InferenceX checkout on the GB200 login node: +# bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # nccl (default) +# CX_BENCH=deepep bash .../launch_gb200-nv.sh # DeepEP (rebuild) +# +# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NGPUS(4) CX_TIME(30) +# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES +# CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-gb200-nv}" +PARTITION="${CX_PARTITION:-batch}" +ACCOUNT="${CX_ACCOUNT:-benchmark}" +NGPUS="${CX_NGPUS:-4}" # NVL72 compute tray = 4 GPU/node +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +# Exported so srun --export=ALL carries them into run_in_container.sh. +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl" +export CX_BENCH="${CX_BENCH:-nccl}" +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +# Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded. 
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS (aarch64) bench=$CX_BENCH" +cx_log "image=$IMAGE" +SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" +MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \ + --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" \ + --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home \ + --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + +cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh new file mode 100644 index 000000000..7729528b2 --- /dev/null +++ b/experimental/CollectiveX/launchers/run_in_container.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# CollectiveX — generic in-container benchmark dispatcher (single-node). +# +# Runs INSIDE the container under `srun`, invoked by every per-SKU adapter +# (launch_<sku>.sh). The SKU adapter handles allocation/container/transport-env; +# this script decides WHICH benchmark to run from CX_BENCH, so any benchmark can +# be driven through any SKU's launch script. Writes provenance-tagged JSON to +# results/.
+# +# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO +# Selector: CX_BENCH = nccl | deepep | all (default nccl) +# NCCL knobs: CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME +# DeepEP knobs: CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE +set -euo pipefail + +cd /ix/experimental/CollectiveX +# shellcheck source=common.sh +source launchers/common.sh +mkdir -p results + +: "${CX_RUNNER:?CX_RUNNER not set}" +: "${CX_NGPUS:?CX_NGPUS not set}" +: "${CX_TS:?CX_TS not set}" +: "${CX_TOPO:?CX_TOPO not set}" +CX_BENCH="${CX_BENCH:-nccl}" +CX_TRANSPORT="${CX_TRANSPORT:-}" +ENVJSON="results/env_${CX_RUNNER}_${CX_TS}.json" + +cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO" +python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS" + +run_nccl_suite() { + local build ops op + build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" # single-node: MPI=0, -g N + ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" + for op in $ops; do + python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \ + --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \ + --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1 \ + || cx_log "WARN: nccl $op failed" + done +} + +run_deepep_suite() { + # DeepEP is not bundled in the multi-arch image. Try to import; if absent, + # attempt rebuild-deepep (srt-slurm setup script) when available, else skip. + if ! 
python3 -c "import deep_ep" 2>/dev/null; then + if command -v rebuild-deepep.sh >/dev/null 2>&1; then + cx_log "building DeepEP via rebuild-deepep.sh" + rebuild-deepep.sh >&2 || cx_log "WARN: rebuild-deepep.sh failed" + else + cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; skipping deepep" + return 0 + fi + fi + torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \ + --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ + --dispatch-dtype "${CX_DISPATCH_DTYPE:-fp8}" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \ + || cx_log "WARN: deepep run failed" +} + +case "$CX_BENCH" in + nccl) run_nccl_suite ;; + deepep) run_deepep_suite ;; + all) run_nccl_suite; run_deepep_suite ;; + *) cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;; +esac + +echo "=== results ==="; ls -1 results/*.json diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md new file mode 100644 index 000000000..365b23455 --- /dev/null +++ b/experimental/CollectiveX/plan.md @@ -0,0 +1,939 @@ +# CollectiveX — Plan + +> **How to read this.** This is the single canonical plan. It is **spike-first** and **scoped to `experimental/CollectiveX/`** on a branch — nothing in the production serving path changes until a promotion decision is made later. Part 1 is background (what CollectiveX is, reconstructed from team discussion). Part 2 is the implementation plan. Where this plan says "now," it means the Milestone 0 spike; "later" items (GitHub workflow, database, app frontend) are deliberately deferred. All repository references (runners, launchers, workflows, matrix logic, the `experimental/` charter) were verified against the live InferenceX repo — see References. 
+ +--- + +# Part 1 — Background + +## What it is + +CollectiveX is a benchmarking workstream under the InferenceX umbrella. It measures **collective communication** and **MoE dispatch/combine**, and performs **apples-to-apples, cross-vendor comparison of expert-parallel (EP) libraries** across NVIDIA and AMD (TPU later). The intended deliverables are an **OSS benchmark project** and a **public explainer article** — a credible cross-vendor collective benchmark plus the story around it. + +## Why + +Existing public benchmarks don't offer trustworthy, like-for-like collective/EP comparison across vendors. CollectiveX fills that gap by reusing InferenceX's runner and cluster infrastructure to produce reproducible, provenance-tagged results. + +## Current state + +- An initial MVP exists: it collected collective and kernel shapes and produced MoE dispatch/combine results on NVIDIA. +- **Normal mode works; low-latency (LL) mode is blocked** on IBGDA enablement — a direct GPU↔NIC data-and-control path over PCIe that removes CPU coordination and simplifies MoE dispatch/combine collectives — which depends on cluster-networking work outside this project. +- The main near-term enabler is NVIDIA networking / IBGDA; the AMD EP stack and AMD networking (Ultra Ethernet) are the cross-vendor counterpart. + +--- + +# Part 2 — Implementation plan + +## Implementation status (built) + +The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that: + +- **Multi-arch, digest-pinned container** for all NVIDIA SKUs: `lmsysorg/sglang:v0.5.12-cu130@sha256:4219…f356` (amd64 + arm64) — one reference for both arches; DeepEP via `rebuild-deepep`. See `CONTAINERS.md`.
+- **Per-SKU launch adapters** (`launchers/launch_<sku>.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`. +- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. + +This supersedes the Milestone-0 "light single-script launcher" sketch below where the two differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). + +## Scope and placement + +CollectiveX starts as an **experimental project on its own branch**, fully contained under `experimental/CollectiveX/`: + +```bash +git switch main +git pull --ff-only +git switch -c collectivex +mkdir -p experimental/CollectiveX +``` + +This matches the repository's intent: `experimental/` is explicitly non-core ("experimental WIP code that is mostly Claude Code generated… not intended for production use or as part of the official InferenceMAX results"). + +For the experimental phase, **everything stays inside `experimental/CollectiveX/**`**. Do **not** modify: + +```text +benchmarks/ +runners/ +utils/ +.github/configs/ +perf-changelog.yaml +InferenceX-app +``` + +The only eventual exception is a minimal workflow dispatcher under `.github/workflows/` (because executable workflows must live there); all real CollectiveX logic, schemas, launchers, and processing stay under `experimental/CollectiveX/`. + +**This supersedes any notion of CollectiveX becoming a top-level InferenceX subsystem or extending the production serving matrix up front.** Promotion — into core InferenceX, into a dedicated repo, or into InferenceX-app's database/frontend — is an explicit *later* decision (Milestone 4), made only after the benchmark contract has stabilized on real hardware.
+ +### What InferenceX already gives us + +InferenceX's existing execution model is almost exactly the control plane CollectiveX needs: + +1. Generate and strictly validate a matrix on a GitHub-hosted runner. +2. Fan jobs out to named or labelled self-hosted runners. +3. Those listeners submit work to Slurm (or launch Docker locally). +4. Normalize outputs. +5. Upload artifacts. +6. Aggregate and dispatch ingestion to the dashboard. + +`e2e-tests.yml` already divides generated configs into job families and invokes reusable single-node and multi-node workflows; `benchmark-tmpl.yml` cleans up resources, checks out the selected ref, **derives the launcher from the runner name**, launches the job, validates outputs, and uploads normalized results. Runner listeners live on cluster login/controller nodes while jobs run on compute nodes via Slurm; runner names/labels are load-bearing — the name prefix selects the launcher and exact names/SKU labels control scheduling. + +CollectiveX reuses all of this, but enters through **CollectiveX-specific launchers** rather than threading fake models through the serving launchers (see Cluster reuse). + +## Architecture + +Four planes, cleanly separated: + +- **Control plane:** scheduling, runners, cleanup, artifact movement, workflow metadata (reused from InferenceX). +- **Benchmark plane:** collective semantics, backend invocation, correctness, timing. +- **Data plane:** canonical result records, raw per-rank samples, topology and provenance. +- **Presentation plane:** comparable subsets, charts, history, diagnostics. 
+ +Data flow within the experimental directory: + +```text +Portable shape definitions + + +Backend definitions + + +Target/cluster definitions + ↓ +CollectiveX matrix resolver + ↓ +Resolved shards + ↓ +Existing InferenceX self-hosted runner + ↓ +experimental/CollectiveX/launchers/* + ↓ +Backend adapter (NCCL / RCCL / DeepEP / AITER / MoRI / …) + ↓ +Versioned result bundle + ↓ +Aggregator + regression checker + ↓ +Static experimental report → (later) InferenceX-app ingestion → Postgres → /collectives +``` + +### Target structure at promotion (Milestone 4) + +This packaged layout is the **promotion target**, not the spike. Milestone 0 uses the light layout in the rollout section below (`run_nccl.py` / `run_deepep.py` / `env_capture.py` / `plot.py` + flat `results/`); the structure here is what CollectiveX grows into *if* it is promoted out of `experimental/`. + +```text +InferenceX/ +├── experimental/ +│ ├── README.md +│ └── CollectiveX/ +│ ├── README.md +│ ├── DESIGN.md +│ ├── ROADMAP.md +│ ├── pyproject.toml +│ ├── Makefile +│ │ +│ ├── src/ +│ │ └── collectivex/ +│ │ ├── __init__.py +│ │ ├── cli.py +│ │ ├── config/ +│ │ │ ├── models.py +│ │ │ ├── loader.py +│ │ │ ├── resolver.py +│ │ │ └── matrix.py +│ │ ├── benchmark/ +│ │ │ ├── harness.py +│ │ │ ├── timing.py +│ │ │ ├── correctness.py +│ │ │ ├── routing.py +│ │ │ └── metrics.py +│ │ ├── backends/ +│ │ │ ├── base.py +│ │ │ ├── fake.py +│ │ │ ├── nccl_tests.py +│ │ │ ├── rccl_tests.py +│ │ │ ├── deepep.py +│ │ │ └── framework_ep.py +│ │ ├── cluster/ +│ │ │ ├── inventory.py +│ │ │ ├── capabilities.py +│ │ │ ├── environment.py +│ │ │ └── launcher.py +│ │ ├── results/ +│ │ │ ├── models.py +│ │ │ ├── writer.py +│ │ │ ├── aggregate.py +│ │ │ ├── compare.py +│ │ │ └── redact.py +│ │ └── report/ +│ │ ├── build.py +│ │ └── templates/ +│ │ +│ ├── configs/ +│ │ ├── suites/ +│ │ │ ├── smoke.yaml +│ │ │ ├── primitives.yaml +│ │ │ ├── moe-decode.yaml +│ │ │ ├── moe-prefill.yaml +│ │ │ └── full.yaml +│ │ ├── shapes/ +│ │ │ ├── 
synthetic/ +│ │ │ └── traced/ +│ │ ├── backends/ +│ │ ├── targets/ +│ │ └── clusters.yaml +│ │ +│ ├── launchers/ +│ │ ├── common.sh +│ │ ├── launch_b200-dgxc.sh # B200 single node +│ │ ├── launch_b200-dgxc-slurm.sh # B200 multinode +│ │ └── launch_gb200-nv.sh # GB200 NVL72 +│ │ +│ ├── schemas/ +│ │ ├── case-v1.schema.json +│ │ ├── result-v1.schema.json +│ │ ├── manifest-v1.schema.json +│ │ └── environment-v1.schema.json +│ │ +│ ├── scripts/ +│ │ ├── bootstrap.sh +│ │ ├── run_suite.sh +│ │ ├── run_shard.sh +│ │ └── build_report.sh +│ │ +│ ├── tests/ +│ │ ├── fixtures/ +│ │ ├── test_config.py +│ │ ├── test_matrix.py +│ │ ├── test_parsers.py +│ │ ├── test_correctness.py +│ │ └── test_comparability.py +│ │ +│ └── docs/ +│ ├── BENCHMARK_CONTRACT.md +│ ├── BACKEND_ADAPTER.md +│ ├── SHAPE_REGISTRY.md +│ ├── RESULT_FORMAT.md +│ ├── FRONTEND.md +│ └── PROMOTION_CRITERIA.md +│ +└── .github/workflows/ + └── collectivex-experimental.yml # Added only when cluster CI begins (Milestone 2) +``` + +> Note: launcher names mirror the real runner-name prefixes. The spike adds the three NVIDIA launchers above; AMD (`launch_mi355x-amds.sh`) and others follow. + +## Benchmark model — keep four concepts separate + +CollectiveX needs its **own** schema. Do **not** reuse or extend the serving matrix, which is built around model / ISL / OSL / framework / TP / EP / concurrency and lives in `utils/matrix_logic/generate_sweep_configs.py`. Representing collectives with fake model names, `ISL=0`, or overloaded concurrency fields would create permanent technical debt. CollectiveX gets its own matrix logic (in the packaged layout, `src/collectivex/config/matrix.py`) — introduced with the workflow at Milestone 2, not the spike — rather than touching `utils/matrix_logic/generate_sweep_configs.py`. 
+ +The model keeps four concepts independent: + +**Shape** — the logical communication workload: + +```text +operation, message size, tokens per rank, hidden size, top-k, +expert count, routing distribution, dtype, phase +``` + +**Backend** — the implementation under test: + +```text +NCCL, RCCL, DeepEP, AITER, MoRI, framework-native EP, reference implementation +``` + +**Target** — where and how it runs: + +```text +runner type, cluster, nodes, GPUs per node, rank placement, +fabric, container image, transport capabilities +``` + +**Suite** — a curated selection of shape × backend × target combinations. Keeping these separate prevents copying the same DeepSeek/MiniMax shape into every NVIDIA and AMD configuration. + +### Portable definitions + +Shape: + +```yaml +schema-version: 1 +shape-id: moe.decode.h7168.top8.e256.t64.uniform.v1 + +kind: moe +phase: decode +operation: dispatch-combine + +shape: + tokens-per-rank: 64 + hidden-size: 7168 + top-k: 8 + num-experts: 256 + dispatch-dtype: fp8 + combine-dtype: bf16 + routing: + distribution: uniform + seed: 67 + expert-alignment: 16 +``` + +Backend: + +```yaml +backend-id: deepep-normal +backend: deepep +mode: normal + +source: + repository: deepseek-ai/DeepEP + ref: pinned-commit + +settings: + async-overlap: false + num-comm-sms: standardized + qp-count: auto +``` + +Target: + +```yaml +target-id: b200-dgxc-4n +runner-type: b200-multinode +cluster-id: b200-dgxc + +resources: + nodes: 4 + gpus-per-node: 8 + exclusive: true + +placement: + ranks-per-node: 8 + rank-order: contiguous + +capabilities: + rdma: true + ibgda: experimental + nvshmem: true +``` + +Suite: + +```yaml +suite-id: moe-decode-smoke + +shapes: + - moe.decode.h7168.top8.e256.t64.uniform.v1 + +backends: + - deepep-normal + - deepep-low-latency + +targets: + - b200-dgxc-2n + +measurement: + warmup-iterations: 20 + measured-iterations: 200 + trials: 3 + correctness: full +``` + +### Case identity + +A **case** is one immutable, versioned point: the 
natural key composes the three concepts — + +```text +case-id = <backend-id>__<shape-id>__<target-id> +e.g. deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n + nccl__allreduce.fp16.logsweep.v1__b200-dgxc-2n +``` + +A shape must never silently change; a newly extracted distribution gets a new versioned `shape-id`. + +**Required shape fields — primitives:** operation; logical element count; datatype; input/output bytes; in-place vs out-of-place; reduction op (where applicable); world size; rank placement; host-driven vs device-driven launch; blocking/synchronization semantics. + +**Required shape fields — MoE (additional):** tokens per rank; hidden size; top-k; number of experts; EP size; dispatch and combine dtypes; routing distribution; expert alignment/padding; capacity constraints; quantization scale representation; cached vs recomputed routing layout; communication-SM count; async-overlap mode. DeepEP shows why these must be first-class — its interface takes tokens/rank, hidden size, top-k, expert count, FP8 mode and comm-SM settings, and exposes async dispatch/combine. + +### Shape registry + +Two independent shape sources: + +**Synthetic** — for continuous curves and hardware characterization (logarithmic byte sweep for primitives; token-count sweep for MoE; EP-scaling sweep; uniform and controlled-skew routing; intranode and internode placements; decode-oriented and prefill-oriented regimes). Don't build every Cartesian combination; define named suites (`primitive-latency-v1`, `primitive-bandwidth-v1`, `moe-decode-v1`, `moe-prefill-v1`, `moe-skew-v1`, `scaleout-v1`). + +**Trace-derived** — extracted from real InferenceX runs/profiles: + +```text +models/deepseek-v4/decode/ +models/minimax-m3/decode/ +models/kimi-k2.7/prefill/ +``` + +Each traced shape retains: source workflow run; model/config; phase; layer/layer-group; observed token histogram; routing skew; concurrent collective count; framework version; extraction-tool version.
InferenceX already has a targeted profiling workflow (`profile.yml`) with optional MoE debug output and a separate trace-storage path — a natural source for real shapes rather than only guessed synthetic inputs. + +## Benchmark layers and comparison classes + +| Layer | Purpose | Examples | +|---|---|---| +| **L0 Environment** | Prove the cluster is benchmarkable | topology, NIC/GPU state, peer access, RDMA, IBGDA capability, version capture | +| **L1 Primitive collectives** | Characterize the raw communication substrate | send/recv, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv | +| **L2 MoE communication** | Compare real EP libraries | dispatch, combine, dispatch+combine round trip, normal and low-latency modes | +| **L3 Integrated pipelines** | Communication in realistic operator sequences | route → permute → dispatch → grouped GEMM → combine → unpermute | +| **L4 E2E correlation** | Explain InferenceX serving performance | isolated CollectiveX result linked to the corresponding InferenceX run/profile | + +The MVP concentrates on **L1 and L2**. L3 overlaps OperatorX and comes after the contracts are stable; L4 is the eventual tie-back to serving. + +**L0 — Environment validation** (before measuring anything): GPU count/identity; GPU/NIC topology; CUDA/ROCm version; driver version; NCCL/RCCL version; RDMA device visibility; peer-access matrix; IBGDA/SHMEM capability; container digest; clock/power state; selected network interfaces. A failed probe yields one clear `environment-invalid` result, not dozens of misleading backend failures. + +**L1 — Primitives:** send/receive, all-reduce, all-gather, reduce-scatter, all-to-all, all-to-allv. Use vendor test programs where possible rather than rewriting primitives. Measure two regions separately: latency (bytes→low KiB) and bandwidth (MiB→GiB). + +**L2 — MoE collectives:** dispatch, combine, dispatch+combine. 
Dimensions: tokens/rank, hidden size, top-k, expert count, EP size, dispatch dtype, combine dtype, routing skew, normal vs low-latency, comm-SM count, node count. + +### Three comparison classes + +Every result is tagged with exactly one, and they must never be silently mixed on one chart: + +| Class | Meaning | +|---|---| +| `standardized` | Matched logical shape **and** fixed resource budget — same shape, topology, dtype, correctness contract, allowed comm-SMs, and timing boundaries. The main apples-to-apples comparison. | +| `backend-optimized` | Same logical output, but each library uses its recommended comm-SMs / protocols / QP count / buffer sizing / graph capture / tuning. Answers "what is the best each stack can do?" | +| `framework-integrated` | The actual path used by SGLang / vLLM / TensorRT-LLM / Dynamo. Connects to InferenceX; not a pure microbenchmark. | + +### Comparability key + +Every result gets a machine-generated comparison key; rows with different keys are not connected on the same curve by default: + +```text +operation, shape ID, dtype, world size, node count, rank placement, +routing distribution, comparison class, measurement contract version, topology class +``` + +## Measurement and correctness + +### Timing boundaries + +Record separately — never report one latency that sometimes includes JIT and sometimes doesn't: + +```text +1. communicator creation +2. buffer allocation and registration +3. first invocation / JIT +4. warmed steady-state invocation +5. host launch time +6. GPU completion time +7. optional end-to-end framework-visible time +``` + +Per measured iteration: synchronize before starting (unless explicitly testing queued execution); use GPU events for device duration and host monotonic time for API/launch duration; retain per-rank measurements; aggregate only after rank-level data is stored; report the **slowest rank** as well as the average. 
+ +### Correctness as a hard gate + +A result is `valid` only after correctness passes. A fast result that fails correctness stays visible as `invalid` — never silently dropped. + +Primitive checks: deterministic input; expected reduction result; guard regions around buffers; in-place and out-of-place checks; dtype-specific tolerances. + +MoE checks: token conservation; correct expert assignment; correct routing weights; valid permutation metadata; dispatch output vs reference; combine output vs reference; no padded-token leakage; deterministic routing hash. + +Failed results remain in artifacts, e.g.: + +```json +{ + "status": "invalid", + "correctness_passed": false, + "error": "combine result exceeded bf16 tolerance" +} +``` + +### Routing distributions + +At minimum: uniform; single-hot/worst-case concentration; Zipf-like skew; bounded imbalance; replayed real histogram. Store the routing seed and the generated assignment hash. + +### Metrics + +| Category | Metrics | +|---|---| +| Latency | p50, p90, p95, p99, min, max | +| Rank behavior | slowest-rank latency, rank spread, coefficient of variation | +| Primitive throughput | algorithm bandwidth, bus bandwidth, effective bytes/s | +| MoE throughput | tokens/s, logical payload GB/s, dispatch and combine separately | +| Efficiency | bandwidth relative to declared topology bottleneck | +| Host overhead | API launch time, CPU utilization where available | +| GPU overhead | communication SM count, GPU active time, optional power | +| Memory | persistent buffer bytes, peak temporary bytes | +| Overlap | standalone comm, standalone compute, overlapped duration, overlap efficiency | +| Reliability | initialization failures, hangs, retries, correctness failures | +| Provenance | all software, image, driver, firmware and topology identifiers | + +### Bandwidth definitions + +NCCL `algbw`/`busbw` are stored but not treated as universal (NCCL applies operation-specific correction factors). 
MoE libraries often report **logical bottleneck bandwidth** (may include local-rank traffic or exclude metadata/padding; DeepEP explicitly publishes logical bandwidth). Store separate fields, and use `null` rather than a deceptive inference when a backend can't expose physical bytes: + +```text +logical_payload_bytes +allocated_payload_bytes +estimated_link_bytes +metadata_bytes +padding_bytes +``` + +## Result and artifact format + +Each shard emits a versioned bundle: + +```text +output/ +├── manifest.json +├── cases.json +├── results.jsonl +├── rank-samples.jsonl.gz +├── summary.json +├── environment/ +│ ├── gpu.json +│ ├── network.json +│ ├── topology.json +│ └── software.json +├── raw/ +│ ├── stdout.log +│ ├── stderr.log +│ └── backend-output/ +├── commands/ +│ └── reproduce.sh +└── profiles/ +``` + +**Manifest** (invariant run-level metadata): schema version; workflow run + attempt; source SHA/ref; cluster ID; runner; Slurm job ID; node count; topology fingerprint; image digest; backend commit/build; start/end timestamps; redaction version. + +**Result row:** + +```json +{ + "schema_version": 1, + "case_id": "deepep-normal__moe.decode.h7168.top8.e256.t64.uniform.v1__b200-dgxc-4n", + "status": "valid", + "trial": 1, + "backend": "deepep", + "mode": "normal", + "comparison_class": "standardized", + "metrics": { + "latency_us_p50": 0, + "latency_us_p99": 0, + "slowest_rank_us_p50": 0, + "logical_bandwidth_gbps": 0, + "tokens_per_second": 0, + "rank_spread_pct": 0, + "persistent_buffer_bytes": 0 + }, + "correctness": { "passed": true, "max_abs_error": 0, "max_rel_error": 0 } +} +``` + +Use an explicit `schema_version` from the beginning — do not repeat the app's historical need to infer schema version from whether a field happens to exist. + +## Backend adapters + +Each adapter implements a small contract: + +```python +class CollectiveBackend: + def probe(self, environment) -> CapabilityReport: ... + def prepare(self, case, workdir) -> PreparedCommand: ... 
+ def run(self, prepared, launcher) -> RawRun: ... + def parse(self, raw_run) -> list[RankSample]: ... + def validate(self, case, raw_run) -> CorrectnessReport: ... + def describe(self) -> BackendProvenance: ... +``` + +**Tier 0 — communication baselines:** NVIDIA `nccl-tests`, ROCm `rccl-tests`, optionally PyTorch distributed as a common-API baseline. Don't rewrite primitives from scratch — `nccl-tests` already supports multi-node, warmups, correctness checking (`-c 1`), per-rank aggregation, device-driven implementations, and separate CPU-time reporting. *(Confirm whether the installed build emits JSON; if not, parse the text table.)* + +**Tier 1 — MoE dispatch/combine:** upstream DeepEP, ROCm DeepEP, and the NVIDIA/AMD EP paths already used by the InferenceX serving stacks. **Version pins are first-class.** Upstream DeepEP V2 changed NVSHMEM→NCCL, unified high-throughput and low-latency APIs, changed buffer behavior, and removed a previous zero-SM LL mode; ROCm's port has different maturity, NIC variants, rocSHMEM dependencies. DeepEP is **built at job setup** (via `rebuild-deepep.sh`, resolved by srt-slurm), not shipped in the image — its build time and `aarch64` (GB200) feasibility are tracked spike risks. A chart labelled only "DeepEP" is therefore ambiguous — store: + +```text +backend name, upstream/fork, git commit, API generation, +transport backend, build flags, runtime library versions, container digest +``` + +**Tier 2 — additional optimized stacks (later):** MSCCL++, AITER comm/fusion paths, MoRI/Pollara, NVSHMEM/rocSHMEM microbenchmarks, framework-native fused collectives. + +## Rollout — spike-first + +**Spike-first.** No schema, Pydantic model, or comparison contract is frozen until one real, correctness-gated number exists on real hardware. 
The first milestone is a single end-to-end spike on **two NVIDIA topologies, B200 and GB200**, chosen because they exercise the two transport regimes that matter: B200 is an 8-GPU NVLink island with CX-7 InfiniBand between nodes; GB200 is an NVL72 multi-node-NVLink (MNNVL) domain. Running the same collective across both is itself the first headline result, and it forces the provenance and comparison-class machinery to be real from line one. The schema is the spike's *output*, extracted from the artifacts it produces — not its input. AMD and all platform work (workflow, DB, frontend) follow. + +### Milestone 0 — NVIDIA B200 + GB200 spike + +One milestone, NVIDIA-only, end to end. This collapses the former "design contract," "CPU framework," "primitive NVIDIA baseline," and the NVIDIA half of "MoE MVP" into a single vertical slice that produces real numbers on real fabric. + +Scaffolding — deliberately light, matching `experimental/` convention (bare scripts + flat JSON + a plot; no package / Pydantic / JSON-schemas yet — those arrive at the contract freeze): + +```text +experimental/CollectiveX/ + README.md + run_nccl.py # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON) + run_deepep.py # one dispatch+combine shape, normal mode + env_capture.py # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json + plot.py # matplotlib, like token_position_decode_slo/*/plot_*.py + launchers/ + common.sh + launch_b200-dgxc.sh # B200 single node (b200-dgxc runner → 8-GPU NVLink island, x86_64) + launch_b200-dgxc-slurm.sh # B200 multinode (b200-multinode runner → CX-7 IB spine) + launch_gb200-nv.sh # GB200 (gb200 runner → NVL72 MNNVL, aarch64, 4 GPU/node) + results/*.json # flat, hand-verifiable +``` + +Reuse existing patterns rather than reinventing: `experimental/dsv32/bench.py` for `torch.cuda.Event` timing and stdout environment capture, and `experimental/token_position_decode_slo/glm-5/{bmk_*_sbatch.sh,plot_sla_frontier.py}` for 
Slurm orchestration + plotting. Mirror the runner→launcher routing convention (`bash ./launchers/launch_${RUNNER_NAME%%_*}.sh`) so the runner name selects the CollectiveX launcher as the serving path does. + +**DeepEP is not prebuilt in any image.** The serving recipes build it at job setup via `setup_script: rebuild-deepep.sh` (resolved by srt-slurm; see `benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`). The spike reuses that same rebuild path — on B200 (x86_64) first. Pin images by digest from `.github/configs/nvidia-master.yaml`: B200 `lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b`; GB200 `lmsysorg/sglang:nightly-dev-cu13-20260608-303757cc` (an unpinned nightly today — capture its digest before relying on it). + +What it measures: + +```text +Primitives (stock nccl-tests, -c 1 for correctness) — on BOTH B200 and GB200: + all-reduce, all-gather, reduce-scatter, all-to-all + latency regime (bytes→KiB) and bandwidth regime (MiB→GiB) + B200 : 8 GPU/node (x86_64); 1 node (NVLink island) and 2 nodes (cross CX-7 IB) + GB200 : 4 GPU/node (aarch64); 1 node and 2+ nodes — all still inside the NVL72 NVLink (MNNVL) domain + +MoE (DeepEP, normal mode only — LL mode is the known-broken/blocked path, out of scope): + one decode-shaped dispatch+combine: tokens-per-rank=64, hidden=7168, + top-k=8, experts=256, dispatch fp8 + correctness: token conservation + combine vs a reference implementation + B200 (x86_64) first; GB200 DeepEP is a fast-follow once the aarch64 rebuild-deepep path is proven +``` + +The headline is the **same NCCL primitive shape on both topologies**: B200's 2-node path crosses CX-7 InfiniBand, while GB200's stays on NVL72 NVLink (MNNVL). That IB-vs-MNNVL contrast at a matched logical shape is the result worth publishing. (nccl-tests and DeepEP must be built for `aarch64` on GB200 — the reason DeepEP is B200-first.) 
+ +Provenance captured on every row from the first run — non-negotiable even in a spike, because it is what makes the B200-vs-GB200 number defensible: + +```text +topology-class b200-nvlink-island(+cx7-ib) | gb200-nvl72-mnnvl +transport actually used (NVLink / IB / NVSHMEM-IBGDA), derived from flags + measured behavior +transport env set/recorded: + B200 : NCCL_CUMEM_ENABLE=1 + GB200 : NCCL_CUMEM_ENABLE=1, NCCL_MNNVL_ENABLE=1, MC_FORCE_MNNVL=1 + (also seen in serving: NCCL_P2P_LEVEL=NVL, SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK) +comm-SM count, QP count where applicable +backend commit + API generation + build flags +container digest, CUDA / driver / NCCL versions +comparison-class tag (standardized where shape, dtype and SM budget match) +``` + +These flags come from validated GB200 serving recipes (`…/srt-slurm-recipes/sglang/qwen3.5/gb200-fp8/`); MNNVL is GB200/GB300-only, which is exactly what makes the transport differ from B200. + +Output: a result bundle on disk (`manifest.json`, `results.jsonl`, `environment/`, `raw/`, `commands/reproduce.sh`). Hand-verify the first rows; do not build a generated Pydantic contract yet. + +Exit criteria: + +* real NCCL latency + bandwidth curves on **both** B200 and GB200, correctness-passed (the headline) +* one DeepEP dispatch+combine number (normal mode) on **B200**, correctness-passed; GB200 DeepEP as the immediate fast-follow +* every row carries topology-class, transport, comparison-class and full provenance +* a B200-vs-GB200 side-by-side that the comparison key permits **and labels as topology-class-differing** — that labeled comparison is the intended result, not an accident +* **only now** freeze the schema (`CollectiveCase` / `CollectiveResult` / manifest), extracted from these artifacts + +Explicitly out of scope for the spike: AMD, IBGDA low-latency mode, GitHub Actions, database, frontend, trace-derived shapes, and the fake backend as a deliverable (keep a trivial one only if it speeds offline tests). 
+ +### Milestone 1 — AMD parity + +Bring the AMD side up against the schema the spike froze — not in parallel with it: + +```text +RCCL-tests adapter (mirror the nccl-tests text-table parser) +one AMD launcher (launch_mi355x-amds.sh) +one AMD MoE dispatch/combine backend (DeepEP ROCm / AITER / MoRI) +equivalent shapes + identical result contract +first cross-vendor (NVIDIA vs AMD) comparison +``` + +Record the AMD transport stack (rocSHMEM, MoRI-IO / Pollara, NIC variant) with the same provenance rigor the spike established. An unlabeled "DeepEP" row compared across vendors is meaningless. + +### Milestone 2 — GitHub workflow + +Add (orchestration only; see GitHub workflow design below): + +```text +collectivex-experimental.yml +preflight +canary +matrix sharding +artifact collection +regression comparison +static report artifact +``` + +Do not connect it to `perf-changelog.yaml`. + +### Milestone 3 — Trace-derived shapes + +Extract representative shapes from InferenceX profiles (DeepSeek V4, MiniMax M3, Kimi). Every traced shape must retain: source workflow run; source configuration; framework version; model phase; extraction-tool version; routing-histogram hash. + +### Milestone 4 — Promotion decision + +Only then decide whether to: keep CollectiveX permanently experimental; move it into core InferenceX; extract it into a dedicated repository; or integrate its data into InferenceX-app (database + `/collectives` frontend). + +### First PRs (the spike) + +The spike lands as a few small PRs, each producing something runnable — not a docs-and-schema PR: + +```text +1. Scaffold + NCCL on B200 single node + run_nccl.py (text-table parser), env_capture.py, plot.py, + launchers/launch_b200-dgxc.sh, results/*.json + → lands when it emits a real all-reduce curve with provenance from an 8-GPU B200 + +2. 
B200 multinode + GB200 + launchers/launch_b200-dgxc-slurm.sh, launchers/launch_gb200-nv.sh + → lands when the same primitive runs on 2-node B200 (cross-IB) and on GB200 NVL72 (MNNVL), + each tagged with topology-class and transport (aarch64 build for GB200) + +3. DeepEP dispatch+combine — B200 first + run_deepep.py, routing generator + reference combine for correctness, + reusing rebuild-deepep at job setup + → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow + +4. Freeze the contract + extract the case / result / manifest schema from the bundles produced in 1–3; + add fixtures captured from real output — this is where the packaged structure begins +``` + +The first objective is a real, provenance-tagged, correctness-gated number on two NVIDIA topologies — the contract is the spike's output, not its foundation. + +## Cluster reuse and capability inventory + +### What to reuse + +Existing self-hosted runner registrations; exact runner labels; Slurm access from runner hosts; checkout and artifact patterns; resource-cleanup strategy; repository secrets; container caches where appropriate. The runner inventory (`.github/configs/runners.yaml`) already enumerates H100, H200, B200, B300, GB200, GB300, MI300X, MI325X, MI355X fleets and groups such as `h200-multinode`, `b200-multinode`, individual nodes, etc. CollectiveX **reads** this file rather than duplicating runner names. + +### What not to reuse directly + +Do not call the serving launchers (`runners/launch_${RUNNER_NAME%%_*}.sh`) — they carry model-serving assumptions (model paths, framework setup, result naming). Mirror the **selection convention** with CollectiveX launchers instead: + +```bash +bash experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh +``` + +Each CollectiveX launcher handles only: Slurm allocation; container image; mounts; network environment; rank launch; result copy-back; cleanup. 
There are **two launch paths**, mirroring the serving side: **single-node** B200 mirrors the `salloc … --gres=gpu:N --exclusive … && srun --container-image=<image>` pattern in `runners/launch_b200-dgxc.sh`; **multi-node** B200/GB200 drives **srt-slurm** (`srtctl apply -f <recipe>`), which already knows how to rebuild DeepEP and set the MNNVL env — so the CollectiveX GB200 launcher is a thin wrapper handing srt-slurm a CollectiveX recipe, not a from-scratch sbatch. (Later, common Slurm/container functions can be factored into a shared lib used by both systems.) + +> Runner-name subtlety to handle in `inventory.py`: one physical cluster can appear under multiple prefixes — `b200-dgxc_NN` routes to `launch_b200-dgxc.sh` (single-node) while `b200-dgxc-slurm_N` (label `b200-multinode`) routes to `launch_b200-dgxc-slurm.sh`. One fabric domain can therefore span several runner labels. + +### Capability overlay + +`inventory.py` loads `../../../.github/configs/runners.yaml` and combines it with a CollectiveX capability overlay — one source of truth for runner names, CollectiveX metadata kept isolated: + +```yaml +b200-multinode: + launcher: b200-dgxc-slurm + vendor: nvidia + hardware: b200 + topology-class: b200-nvlink-cx7 + fabric-domain: b200-dgxc-main + gpus-per-node: 8 + arch: x86_64 + max-nodes: 16 + scheduler: slurm + container-runtime: enroot-pyxis + capabilities: + nccl: true + deepep: true # built at job setup via rebuild-deepep, not prebuilt + rdma: true + nvshmem: true + ibgda: experimental # capability present ≠ currently validated + scheduling: + exclusive-nodes: true + max-parallel-shards: 1 + +gb200: + launcher: gb200-nv + vendor: nvidia + hardware: gb200 + topology-class: gb200-nvl72-mnnvl + gpus-per-node: 4 # NVL72 compute tray + arch: aarch64 # nccl-tests + DeepEP must build for aarch64 + scheduler: srt-slurm + transport-env: { NCCL_CUMEM_ENABLE: 1, NCCL_MNNVL_ENABLE: 1, MC_FORCE_MNNVL: 1 } + capabilities: + nccl: true + deepep: true # rebuilt at setup; aarch64 path is
a tracked risk + mnnvl: true # GB200/GB300 only + ibgda: experimental +``` + +`fabric-domain` is essential: two jobs on separate compute nodes may still contend for the same leaf/spine network, so **GitHub concurrency is keyed by fabric domain, not GPU SKU**. The inventory distinguishes hardware capability, software currently installed, and feature state (known-good vs experimental vs temporarily broken) — IBGDA support and "IBGDA low-latency currently validated" are different properties. + +**Operational coexistence with the serving sweep.** `b200-multinode` is only three runners (`b200-dgxc-slurm_7/8/9`), **shared with the production serving sweeps**, and srt-slurm allocations are long. Exclusive nodes + `max-parallel-shards: 1` + fabric-domain serialization means CollectiveX and the serving sweep contend for the same scarce runners. Decide the scheduling/coexistence policy (off-hours windows? a dedicated runner?) before enabling any recurring CollectiveX suite, rather than discovering the contention in CI. + +## GitHub workflow design (Milestone 2) + +When cluster CI begins, add one small orchestration-only file — `.github/workflows/collectivex-experimental.yml` — with no benchmarking logic: + +```text +validate → resolve matrix → preflight canaries → benchmark shards +→ aggregate → compare against baseline → build static report → upload artifacts +``` + +Triggers while on the branch: + +```yaml +on: + push: + branches: [ collectivex ] + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml + pull_request: + paths: + - experimental/CollectiveX/** + - .github/workflows/collectivex-experimental.yml +``` + +Later, after a minimal dispatcher exists on `main`, add `workflow_dispatch` with inputs: `ref, suite, target, backend, shape, profile` (and comparison class / normal-LL-both / dry-run). + +Jobs: + +1. 
**Validate** — install the package; validate all suite/shape/backend/cluster YAML; confirm runner references exist in `runners.yaml`; reject unknown fields; emit the resolved run plan as an artifact. (Match InferenceX's strict Pydantic practice — models reject extra fields.) +2. **Compile and shard** — **do not** generate one job per benchmark point. Group cases by `cluster, node count, GPU placement, container image, backend build, transport mode, fabric domain, profiler requirement`. A shard runs many compatible points under one Slurm allocation (avoids thousands of matrix jobs, repeated communicator init, queue latency, repeated container import). Bounded runtime; record per-case failures unless the cluster itself is unhealthy. +3. **Preflight** — confirm GPU count; validate peer access; enumerate NICs; test RDMA/device visibility; verify backend libraries; run a tiny correctness case; capture topology/software. A failed preflight marks the whole shard `environment-invalid` rather than manufacturing dozens of backend failures. +4. **Canary** — for each `(cluster, backend, mode)` group, run one small representative case; launch the larger matrix only after it passes (mirrors InferenceX's canary-before-full-sweep). +5. **Benchmark** (`collectivex-benchmark-tmpl.yml`) — run on the resolved runner label; unique Slurm job name from workflow/attempt/shard; exclusive nodes; serialize/limit by `fabric-domain`; call the CollectiveX launcher; upload results even on partial failure; always upload environment+logs; fail the job only after artifact creation. +6. **Aggregate and regress** — validate every result against JSON schema; reject duplicate natural keys; merge rank samples and summaries; compute trial aggregates; compare against the most recent compatible baseline; publish a step summary; upload one `results_collectivex` bundle. +7. 
**Dispatch ingestion** (only once promoted to feed the app) — repository-dispatch the InferenceX-app repo with `{ "benchmark-family": "collectivex", "run-id": "...", "run-attempt": "..." }`. + +Use a separate `collectivex-changelog.yaml`: a CollectiveX backend change must not trigger the expensive serving sweep through `perf-changelog.yaml`, and a serving change must not launch every collective suite. + +## Regression policy (Milestone 2+) + +A compatible baseline requires exact matches on: case ID; cluster ID; topology fingerprint (or approved topology class); backend; comparison class; normal/LL mode; node and rank placement; dtype and shape; measurement-contract version. **Do not compare "same GPU SKU" across materially different fabrics.** + +```text +regression if: + correctness changed pass → fail + OR median latency degradation exceeds max(fixed floor, cluster noise threshold) + OR bandwidth degradation exceeds max(fixed floor, cluster noise threshold) +``` + +Derive each cluster's noise threshold from repeated baseline measurements via median absolute deviation — don't hard-code a universal 3% before knowing each fabric's noise. Retain failed, timed-out, and invalid results; reliability is part of the benchmark. + +## Reporting, database, and frontend + +**Now (spike / Milestone 2): a static, artifact-driven report.** Do not begin by changing InferenceX-app. 
+ +```bash +python -m collectivex.report --results output/aggregate.json --output output/report/ +``` + +```text +report/ +├── index.html +├── data.json +├── assets/ +└── runs/ + └── <run-id>.html +``` + +Report views: **Overview** (supported clusters/backends, latest run, correctness failures, recent regressions, coverage matrix); **Primitive explorer** (latency / algbw / busbw / rank-spread vs payload size; single-node vs multinode); **MoE explorer** (dispatch & combine latency vs tokens/rank; tokens/s vs EP size; uniform vs skewed; normal vs LL; comm-SMs vs performance); **Case details** (exact shape, backend commit, container digest, topology fingerprint, environment, command, correctness report, rank-level distribution, raw logs). A **comparison warning** must visibly reject invalid comparisons: + +```text +Not directly comparable: +- different routing distribution +- different topology class +- different communication-SM budget +- standardized versus backend-optimized mode +``` + +**Later (Milestone 4 / promotion into InferenceX-app):** add `/collectives` to the app (Next.js, React Query, raw API rows, client-side transforms, D3 charts; tab metadata/routing are centralized). Avoid a single global "CollectiveX score" at launch. Port the report views, plus Library Comparison, Scale-and-topology, and Historical-regression views, and a run-detail drawer. The frontend computes the `comparison-key` and refuses to connect rows with differing keys by default — **this guard matters more than any individual chart.** + +API routes (app): + +```text +/api/v1/collectives +/api/v1/collectives/availability +/api/v1/collectives/history +/api/v1/collectives/runs/:id +/api/v1/collectives/artifacts/:id +``` + +Continue the app convention: API returns raw DB rows; the frontend does chart-specific transforms. + +**Database (app, later).** Do not put CollectiveX rows in `benchmark_results` (its identity is serving configs + ISL/OSL/concurrency).
Reuse `workflow_runs`, then add: + +```sql +collective_workloads(id, case_id, schema_version, family, operation, shape jsonb) +collective_environments(id, cluster_id, hardware, topology_class, topology_hash, software jsonb, capabilities jsonb) +collective_configs(id, workload_id, environment_id, backend, backend_version, comparison_class, mode, nodes, gpus_per_node, world_size, settings jsonb) +collective_results(id, workflow_run_id, config_id, trial, date, status, metrics jsonb, + latency_p50_us, latency_p99_us, logical_bandwidth_gbps, bus_bandwidth_gbps, + tokens_per_second, rank_skew_pct, error) +collective_artifacts(result_id, artifact_type, storage_url, metadata jsonb) +collective_availability(date, hardware, cluster_id, backend, family, operation, mode) +``` + +Follow the app's hybrid design (JSONB for evolving metrics; indexed "hot" columns for common filters; idempotent ingestion; natural unique keys; denormalized date; latest-results materialized view). Keep raw per-rank samples in artifacts/object storage, not in Postgres. + +## Future expansions + +The spike de-risks the path to the actual deliverable — a public OSS collective benchmark and an explainer article. Expansion axes, roughly near → far, with dependencies: + +**Hardware breadth.** B300 / GB300 next (GB300 is also MNNVL, with known disagg KV-transfer wins) → H100 / H200 as a cheaper, more-available **InfiniBand baseline** ideal for characterizing per-fabric noise → AMD MI300X / MI325X / MI355X (this is Milestone 1) → TPU (far; a separate stack and toolchain). + +**Backend breadth.** Framework-native EP (the `framework-integrated` class — ties numbers back to the SGLang/vLLM serving paths) → MSCCL++, NVSHMEM / rocSHMEM microbenchmarks, AITER comm/fusion, MoRI / Pollara (AMD). + +**IBGDA low-latency mode.** The recurring strategic blocker and the original "LL is broken" story; gated on the NVIDIA SRE maintenance window for B200/B300. 
Highest narrative value — add as an experimental suite the moment it unblocks. + +**Scale-out.** 2 → 4 → 8 → 16 nodes; on GB200, intra-NVL72 vs cross-rack scaling-efficiency curves (where MNNVL ends and the inter-rack fabric begins). + +**L3 integrated operator path.** route → permute → dispatch → grouped-GEMM → combine → unpermute — the bridge to OperatorX. + +**L4 e2e correlation.** Link an isolated dispatch/combine number to the same shape's cost inside a real serving run via `profile.yml` traces — the "explain serving performance" payoff and the tie-back to the core product. + +**Trace-derived shapes (Milestone 3).** DeepSeek V4 / MiniMax M3 / Kimi token-histogram and routing-skew extraction, so the synthetic shapes are anchored to real workloads. + +**AMD Ultra Ethernet (UEC).** The AMD networking path; pairs with the MoRI / Pollara backends. + +**Productization (north star).** Static report → public OSS benchmark site + the explainer article; promotion into InferenceX-app (`/collectives` + Postgres + nightly suite + regression alerts) at Milestone 2 / 4. + +## Continuous benchmark — vision & scope + +Goal: a continuous benchmark that reproduces the spike automatically and grows into a credible cross-vendor EP/collective comparison. **Start with balanced DeepSeek shapes, intranode EP**, then venture to advanced cases. Target **≥1 EP library per platform** first — DeepEP on NVIDIA, MoRI on AMD. 
+ +### EP library landscape +- MoRI (AMD) — https://github.com/ROCm/mori +- DeepEP / DeepEPv2 / Hybrid-EP — https://github.com/deepseek-ai/DeepEP (hybrid: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) +- NVIDIA NCCL EP — https://github.com/NVIDIA/nccl/tree/master/contrib/nccl_ep +- UCCL — https://github.com/uccl-project/uccl +- NVLink One-Sided AllToAll EP (mainly NVL72) — TensorRT-LLM blog18 (Optimizing MoE Communication with One-Sided AllToAll over NVLink) +- NIXL EP — https://github.com/ai-dynamo/nixl/tree/main/examples/device/ep + +### Shapes & axes +- **Classic DeepSeek V3:** hidden 7168, top-8, 256 routable experts. +- **Prefill vs decode** (# tokens). +- **Normal EP vs low-latency (LL) EP.** +- **Dispatch precision:** NVFP4, MXFP4, MXFP8, BF16. +- **Combine precision:** MXFP8, direct-cast FP8, BF16, NVFP4 — see MoRI #311, flashinfer #3643 / #3376. +- **Balanced vs unbalanced vs EPLB.** +- **Realistic shapes from InferenceX models** — collect hidden sizes / routing (Qwen3.5 has an unusual top-k). + +### Other inference collectives (later) +- KV-cache transfer: MoRI-IO, NIXL, Mooncake; CPU↔GPU offload — `experimental/kvcache_transfer_DtoH_HtoD/benchmark.py`. +- Low-latency one-shot / two-shot all-reduce (SGLang & vLLM in-tree kernels + AITER / FlashInfer variants) — e.g. sglang `sgl-kernel/csrc/allreduce/quick_all_reduce.cuh`. + +### Reference benchmark scripts to draw from +- flashinfer PR #3000; ROCm/mori `tests/python/ops`; DeepEP `tests/legacy`. + +### Learning resources +- arXiv 2511.15076, 2603.13606, 2512.19849, 2412.19437. + +## Things not to do + +* Do not add collective fields to the existing serving matrix. +* Do not make one GitHub Actions job per payload size. +* Do not call all logical-bandwidth figures "bus bandwidth." +* Do not compare different topology fingerprints as though GPU SKU were sufficient. +* Do not silently discard failed or incorrect results. 
+* Do not let a backend choose undocumented tuning parameters (in `standardized` mode). +* Do not make low-latency mode the only reported result. +* Do not publish one overall ranking before coverage and comparison contracts are stable. +* Do not start with every EP library, TPU, UEC, and every model shape. +* Do not store full raw rank samples indefinitely in Postgres. +* Do not expose internal hostnames, paths, NIC GUIDs, IP addresses, or private image references in public artifacts. +* Do not freeze the schema before the spike has produced a real artifact to freeze it from. + +## References (verified against the live InferenceX repo) + +- `experimental/README.md` — the non-core / "not official results" charter this project lives under. +- `.github/configs/runners.yaml` — runner labels and exact names (H100…GB300, AMD MI3xx). +- `.github/workflows/benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`, `profile.yml`, `speedbench-al.yml` — the `bash ./runners/launch_${RUNNER_NAME%%_*}.sh` selection convention. +- `runners/launch_*.sh` — existing per-cluster launchers (`launch_b200-dgxc.sh`, `launch_b200-dgxc-slurm.sh`, `launch_gb200-nv.sh`, `launch_mi355x-amds.sh`, …). +- `utils/matrix_logic/generate_sweep_configs.py`, `validation.py` — the serving matrix CollectiveX must **not** extend. +- `.github/workflows/e2e-tests.yml`, `collect-results.yml` — the validate → fan-out → collect control plane being reused. +- `perf-changelog.yaml` — the additions-only serving gate CollectiveX must **not** trigger. 
+- NVIDIA Magnum IO NVSHMEM + GPUDirect Async (IBGDA): `https://developer.nvidia.com/blog/improving-network-performance-of-hpc-systems-using-nvidia-magnum-io-nvshmem-and-gpudirect-async/` diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py new file mode 100644 index 000000000..0106c61c9 --- /dev/null +++ b/experimental/CollectiveX/plot.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — plot NCCL primitive curves, B200 vs GB200. + +Loads run_nccl.py result JSONs from results/, and for each operation draws two +panels: latency-vs-size and bus-bandwidth-vs-size, overlaying one curve per +(runner, topology-class, world-size). The B200(IB)-vs-GB200(MNNVL) contrast at +a matched shape is the intended overlay and the spike's headline. + +Comparison guard (plan §Comparability): curves are only overlaid when they +share op + dtype + comparison-class + measurement-contract. Anything else is +reported as "not directly comparable" and skipped rather than silently mixed. + + python plot.py --results-dir results --out-dir results/plots + +matplotlib + (optional) numpy. Run on a workstation/laptop over the JSON +artifacts; no GPU needed. 
+""" +from __future__ import annotations + +import argparse +import glob +import json +import os +from collections import defaultdict + +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +def _human(nbytes: int) -> str: + for unit in ("B", "KiB", "MiB", "GiB"): + if nbytes < 1024 or unit == "GiB": + return f"{nbytes:.0f}{unit}" if unit == "B" else f"{nbytes/1:.0f}{unit}" + nbytes /= 1024 + return str(nbytes) + + +def load_nccl_results(results_dir: str) -> list[dict]: + docs = [] + for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))): + try: + d = json.load(open(path)) + except (json.JSONDecodeError, OSError): + continue + if d.get("family") == "nccl" and d.get("rows"): + d["_path"] = path + docs.append(d) + return docs + + +def curve_label(d: dict) -> str: + return f"{d['runner']} · {d['topology_class']} · ws{d['world_size']}" + + +def overlay_signature(d: dict) -> tuple: + """Fields that must match for two curves to share a chart (topology and + world-size are deliberately NOT here — they are the comparison axis).""" + return (d["op"], d.get("dtype"), d.get("comparison_class"), d.get("measurement_contract")) + + +def plot_op(op: str, docs: list[dict], out_dir: str) -> str | None: + if not docs: + return None + # Comparison guard: keep the dominant signature, warn on the rest. 
+ sigs = defaultdict(list) + for d in docs: + sigs[overlay_signature(d)].append(d) + main_sig = max(sigs, key=lambda s: len(sigs[s])) + keep = sigs[main_sig] + for sig, ds in sigs.items(): + if sig == main_sig: + continue + for d in ds: + print(f" [guard] skipping {curve_label(d)} for op={op}: not directly " + f"comparable (dtype/class/contract differs: {sig} vs {main_sig})") + + fig, (ax_lat, ax_bw) = plt.subplots(1, 2, figsize=(14, 5)) + for d in sorted(keep, key=curve_label): + rows = sorted(d["rows"], key=lambda r: r["size_bytes"]) + sizes = [r["size_bytes"] for r in rows] + lat = [r["out_of_place"]["time_us"] for r in rows] + bw = [r["busbw_gbps"] for r in rows] + label = curve_label(d) + ax_lat.plot(sizes, lat, "o-", linewidth=2, markersize=4, label=label) + ax_bw.plot(sizes, bw, "o-", linewidth=2, markersize=4, label=label) + + for ax in (ax_lat, ax_bw): + ax.set_xscale("log", base=2) + ax.set_xlabel("Message size (bytes)") + ax.grid(True, alpha=0.3) + ax.legend(fontsize=9) + ax_lat.set_yscale("log") + ax_lat.set_ylabel("Latency (µs, out-of-place)") + ax_lat.set_title(f"{op}: latency vs size") + ax_bw.set_ylabel("Bus bandwidth (GB/s)") + ax_bw.set_title(f"{op}: bus bandwidth vs size") + fig.suptitle( + f"CollectiveX · {op} · dtype={main_sig[1]} · class={main_sig[2]} " + f"(topology is the comparison axis)", + fontsize=11, + ) + fig.tight_layout() + os.makedirs(out_dir, exist_ok=True) + out = os.path.join(out_dir, f"nccl_{op}.png") + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + return out + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX primitive plots") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--out-dir", default="results/plots") + ap.add_argument("--op", help="only plot this op") + args = ap.parse_args() + + docs = load_nccl_results(args.results_dir) + if not docs: + print(f"no nccl result JSONs found in {args.results_dir}/") + return 1 + + by_op = defaultdict(list) + for d 
in docs: + by_op[d["op"]].append(d) + + ops = [args.op] if args.op else sorted(by_op) + made = [] + for op in ops: + out = plot_op(op, by_op.get(op, []), args.out_dir) + if out: + made.append(out) + print(f"wrote {out} ({len(by_op[op])} curve(s))") + if not made: + print("nothing plotted") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt new file mode 100644 index 000000000..574afb1f0 --- /dev/null +++ b/experimental/CollectiveX/requirements.txt @@ -0,0 +1,9 @@ +# CollectiveX spike dependencies. +# +# run_nccl.py + env_capture.py : Python standard library only (run anywhere). +# run_deepep.py : torch + deep_ep — provided by the benchmark +# container; DeepEP is built at job setup +# (rebuild-deepep), NOT pinned here. +# plot.py : the only thing worth a local venv: +matplotlib +numpy diff --git a/experimental/CollectiveX/results/.gitkeep b/experimental/CollectiveX/results/.gitkeep new file mode 100644 index 000000000..8940934a2 --- /dev/null +++ b/experimental/CollectiveX/results/.gitkeep @@ -0,0 +1,3 @@ +# CollectiveX result bundles land here as flat *.json (one per runner×op), +# plus plots/ and raw_*.txt captures (gitignored). Keep this file so the dir +# exists before the first run. diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py new file mode 100644 index 000000000..44a3ae3e0 --- /dev/null +++ b/experimental/CollectiveX/run_deepep.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +"""CollectiveX spike — DeepEP MoE dispatch+combine (normal mode), B200 first. + +One decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed, +emitting the same flat-JSON provenance shape as run_nccl.py. + +Scope (plan §Milestone 0): normal mode only — low-latency (LL) mode is the +known-broken/blocked IBGDA path and is out of scope for the spike. 
B200 +(x86_64) first; GB200 is the fast-follow once the aarch64 rebuild-deepep path +is proven. + + !!! DeepEP's Python API is VERSION-SENSITIVE (the plan notes V2 changed + NVSHMEM->NCCL, unified the APIs, and removed zero-SM LL mode). The + dispatch/combine block below follows the documented normal-mode intranode + API and is marked "ADAPT HERE" — validate the call signatures against the + DeepEP commit actually built by rebuild-deepep at job time, and record that + commit in provenance. Build is done at job setup, not shipped in the image. + +Launch (one process per GPU), e.g. single-node 8x B200: + torchrun --nproc_per_node=8 run_deepep.py \\ + --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/b200_deepep.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "deepep-normal-v1" + + +def _percentile(xs: list[float], q: float) -> float: + if not xs: + return float("nan") + s = sorted(xs) + i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1))))) + return s[i] + + +def comparison_key(meta: dict) -> str: + parts = [ + meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]), + str(meta["nodes"]), meta["topology_class"], meta["comparison_class"], + meta["measurement_contract"], str(meta["shape"]), + ] + return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX DeepEP dispatch+combine (normal mode)") + # shape (decode-ish default from the plan) + ap.add_argument("--tokens-per-rank", type=int, default=64) + ap.add_argument("--hidden", type=int, default=7168) + ap.add_argument("--topk", type=int, default=8) + ap.add_argument("--experts", type=int, default=256) + ap.add_argument("--dispatch-dtype", default="fp8", choices=["fp8", "bf16"]) + 
ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"]) + ap.add_argument("--seed", type=int, default=67) + # measurement + ap.add_argument("--warmup", type=int, default=20) + ap.add_argument("--iters", type=int, default=200) + ap.add_argument("--trials", type=int, default=3) + ap.add_argument("--num-sms", type=int, default=24, help="communication SMs (standardized budget)") + # provenance + ap.add_argument("--runner", required=True) + ap.add_argument("--topology-class", required=True) + ap.add_argument("--transport", default="") + ap.add_argument("--comparison-class", default="standardized") + ap.add_argument("--deepep-commit", default=os.environ.get("DEEPEP_COMMIT", "unknown")) + ap.add_argument("--env-json") + ap.add_argument("--timestamp") + ap.add_argument("--out", required=True) + args = ap.parse_args() + + # ---- imports guarded so a missing build fails loudly, not cryptically ---- + try: + import torch + import torch.distributed as dist + except Exception as exc: # pragma: no cover + print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr) + return 3 + try: + from deep_ep import Buffer # type: ignore + except Exception as exc: # pragma: no cover + print( + "ERROR: deep_ep import failed — DeepEP must be built at job setup " + f"(rebuild-deepep). {exc!r}", + file=sys.stderr, + ) + return 3 + + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + torch.cuda.set_device(local_rank) + if not dist.is_initialized(): + dist.init_process_group("nccl") + group = dist.group.WORLD + device = torch.device(f"cuda:{local_rank}") + torch.manual_seed(args.seed + rank) + + n = args.tokens_per_rank + H = args.hidden + topk = args.topk + E = args.experts + + # Input tokens + routing. Weights sum to 1 per token so that a pure + # dispatch->combine round trip (no expert compute) reconstructs x. 
+ x = torch.randn((n, H), dtype=torch.bfloat16, device=device) + if args.routing == "uniform": + topk_idx = torch.stack([ + torch.randperm(E, device=device)[:topk] for _ in range(n) + ]).to(torch.int64) + else: # zipf-ish skew toward low expert ids + probs = (1.0 / torch.arange(1, E + 1, device=device).float()) + topk_idx = torch.multinomial(probs.expand(n, E), topk, replacement=False).to(torch.int64) + topk_weights = torch.softmax(torch.randn((n, topk), device=device, dtype=torch.float32), dim=-1) + + # Buffer sizing: intranode uses NVLink buffer only (no RDMA for single node). + # Numbers follow DeepEP's intranode test guidance; tune per build. + num_nvl_bytes = 1024 * 1024 * 1024 + num_rdma_bytes = 0 + buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes) + + def run_once(): + # ===================== ADAPT HERE (DeepEP API) ======================= + # Normal-mode intranode dispatch/combine. Signatures below match the + # documented DeepEP normal API; confirm against the built commit. 
+ (num_tokens_per_rank, _, num_tokens_per_expert, + is_token_in_rank, _) = buffer.get_dispatch_layout(topk_idx, E) + recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = buffer.dispatch( + x, + topk_idx=topk_idx, + topk_weights=topk_weights, + num_tokens_per_rank=num_tokens_per_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=num_tokens_per_expert, + ) + combined_x, _, _ = buffer.combine(recv_x, handle, topk_weights=recv_topk_weights) + # ===================================================================== + return combined_x, num_tokens_per_expert, is_token_in_rank + + # ---- correctness gate (run before timing; a fast wrong answer is invalid) ---- + combined_x, num_tokens_per_expert, is_token_in_rank = run_once() + torch.cuda.synchronize() + expected_routed = n * topk + routed = int(torch.as_tensor(num_tokens_per_expert).sum().item()) + token_conservation = (routed == expected_routed) + # DeepEP combine sums one copy of each token per destination RANK, so the + # dispatch->combine round trip reconstructs x only after dividing by the + # number of ranks each token was sent to (per DeepEP's own check in + # tests/legacy/test_intranode.py: combined_x / is_token_in_rank.sum(dim=1)). 
+ ranks_per_token = is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float() + check_x = combined_x.float() / ranks_per_token + max_abs = (check_x - x.float()).abs().max().item() + max_rel = (max_abs / (x.float().abs().max().item() + 1e-6)) + combine_ok = max_rel < 2e-2 # bf16 dispatch/combine round-trip tolerance + correct = bool(token_conservation and combine_ok) + + # ---- timing (CUDA events; per-rank; reduce for slowest rank) ---- + def time_ms(fn, warmup, iters) -> list[float]: + for _ in range(warmup): + fn() + torch.cuda.synchronize() + out = [] + for _ in range(iters): + s = torch.cuda.Event(enable_timing=True) + e = torch.cuda.Event(enable_timing=True) + s.record() + fn() + e.record() + torch.cuda.synchronize() + out.append(s.elapsed_time(e) * 1000.0) # ms -> us + return out + + def dispatch_only(): + (npr, _, npe, itir, _) = buffer.get_dispatch_layout(topk_idx, E) + buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights, + num_tokens_per_rank=npr, is_token_in_rank=itir, + num_tokens_per_expert=npe) + + trials = [] + for _ in range(args.trials): + rt = time_ms(run_once, args.warmup, args.iters) # dispatch+combine round trip + dp = time_ms(dispatch_only, args.warmup, args.iters) # dispatch only + trials.append({ + "roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99), + "dispatch_us_p50": _percentile(dp, 50), + }) + + local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) + # slowest rank across the world + t = torch.tensor([local_rt_p50], device=device) + dist.all_reduce(t, op=dist.ReduceOp.MAX) + slowest_rank_us = float(t.item()) + + if rank == 0: + shape = { + "tokens_per_rank": n, "hidden": H, "topk": topk, "experts": E, + "dispatch_dtype": args.dispatch_dtype, "routing": args.routing, + "num_comm_sms": args.num_sms, + } + meta = { + "op": "dispatch-combine", "backend": "deepep", "mode": "normal", + "world_size": world_size, "nodes": max(1, world_size // 8), + "topology_class": 
args.topology_class, "comparison_class": args.comparison_class, + "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape, + } + tokens_total = n * world_size + rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) + env = None + if args.env_json and os.path.exists(args.env_json): + env = json.load(open(args.env_json)) + doc = { + "schema_version": SCHEMA_VERSION, + "family": "moe", + "generated_by": "run_deepep.py", + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, + "transport": args.transport, + "status": "valid" if correct else "invalid", + "comparison_key": comparison_key(meta), + "backend_provenance": {"deepep_commit": args.deepep_commit}, + **meta, + "correctness": { + "passed": correct, "token_conservation": token_conservation, + "combine_within_tol": combine_ok, "max_abs_error": max_abs, "max_rel_error": max_rel, + }, + "metrics": { + "roundtrip_us_p50": rt_p50, + "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials), + "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials), + "slowest_rank_roundtrip_us": slowest_rank_us, + "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None, + }, + "trials": trials, + "environment": env, + } + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + print( + f"deepep dispatch-combine: status={doc['status']} " + f"rt_p50={rt_p50:.1f}us slowest_rank={slowest_rank_us:.1f}us " + f"correct={correct} -> {args.out}" + ) + + dist.barrier() + dist.destroy_process_group() + return 0 if correct else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py new file mode 100644 index 000000000..d32de9f23 --- /dev/null +++ b/experimental/CollectiveX/run_nccl.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 
+"""CollectiveX spike — NCCL primitive benchmark wrapper. + +Runs stock `nccl-tests` binaries (built in-container at job time — the login +nodes have no nvcc), parses the text table (NOT JSON — we do not assume the +build emits JSON), and writes a flat, provenance-tagged JSON result the plot +script and the eventual schema-freeze can consume. + +Standard library only, so it runs in any minimal container. + +Run (inside the container, after building nccl-tests): + python run_nccl.py --op all_reduce \\ + --nccl-tests-dir /tmp/nccl-tests/build \\ + --world-size 8 --min-bytes 8 --max-bytes 8G \\ + --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\ + --env-json results/env.json --out results/b200_all_reduce.json + +Verify the parser offline (no GPU needed): + python run_nccl.py --op all_reduce --parse-only tests/fixtures/all_reduce_perf_b200_8gpu.txt \\ + --world-size 8 --runner b200-dgxc --topology-class b200-nvlink-island \\ + --out /tmp/parsed.json +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os +import subprocess +import sys + +SCHEMA_VERSION = 1 +MEASUREMENT_CONTRACT = "nccl-tests-v1" + +# op -> nccl-tests binary name +OP_BINARY = { + "all_reduce": "all_reduce_perf", + "all_gather": "all_gather_perf", + "reduce_scatter": "reduce_scatter_perf", + "alltoall": "alltoall_perf", + "all_to_all": "alltoall_perf", + "broadcast": "broadcast_perf", + "sendrecv": "sendrecv_perf", +} + + +def _f(tok: str): + """Parse a numeric cell; nccl-tests prints 'N/A' for #wrong when -c 0.""" + if tok in ("N/A", "n/a", "-"): + return None + try: + return float(tok) + except ValueError: + return None + + +def parse_nccl_table(text: str) -> tuple[list[dict], dict]: + """Parse nccl-tests stdout into per-size rows + a run summary. 
+ + Robust across ops: the column count varies (all_reduce/reduce_scatter carry + redop+root; all_gather/alltoall do not), but every op prints the same 8 + trailing numeric columns — out-of-place (time, algbw, busbw, #wrong) then + in-place (time, algbw, busbw, #wrong). `size` is always the first token and + `type` the third. So we key off the first token and the last 8 tokens. + """ + rows: list[dict] = [] + summary: dict = {"avg_busbw_gbps": None, "out_of_bounds": None, "check_passed": None} + for line in text.splitlines(): + s = line.strip() + if not s: + continue + if s.startswith("#"): + if "Avg bus bandwidth" in s: + summary["avg_busbw_gbps"] = _f(s.split(":")[-1].strip()) + elif "Out of bounds values" in s: + tail = s.split(":")[-1].strip() + summary["out_of_bounds"] = tail + summary["check_passed"] = tail.endswith("OK") + continue + toks = s.split() + # Data line: first token is the byte size (all digits), and we need the + # 8 trailing metric columns plus size+count+type up front (>=11 tokens). + if len(toks) < 11 or not toks[0].isdigit(): + continue + tail = toks[-8:] + size = int(toks[0]) + dtype = toks[2] if len(toks) >= 3 else None + oop_wrong = _f(tail[3]) + ip_wrong = _f(tail[7]) + rows.append( + { + "size_bytes": size, + "dtype": dtype, + "out_of_place": { + "time_us": _f(tail[0]), + "algbw_gbps": _f(tail[1]), + "busbw_gbps": _f(tail[2]), + "wrong": oop_wrong, + }, + "in_place": { + "time_us": _f(tail[4]), + "algbw_gbps": _f(tail[5]), + "busbw_gbps": _f(tail[6]), + "wrong": ip_wrong, + }, + # convenience: best (max) busbw across the two placements + "busbw_gbps": max( + [b for b in (_f(tail[2]), _f(tail[6])) if b is not None], + default=None, + ), + "correct": ( + None + if oop_wrong is None and ip_wrong is None + else ((oop_wrong or 0) == 0 and (ip_wrong or 0) == 0) + ), + } + ) + return rows, summary + + +def comparison_key(meta: dict) -> str: + """Machine key gating which rows may share a curve (see plan §Comparability). 
+ Topology-class is intentionally part of the key, so B200(IB) and + GB200(MNNVL) are labelled distinct rather than silently overlaid.""" + parts = [ + meta["op"], + meta["dtype"], + str(meta["world_size"]), + str(meta["nodes"]), + meta["topology_class"], + meta["comparison_class"], + meta["measurement_contract"], + ] + digest = hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] + return digest + + +def build_command(args, binary_path: str) -> list[str]: + cmd: list[str] = [] + if args.launch_prefix: + cmd += args.launch_prefix.split() + cmd += [ + binary_path, + "-b", str(args.min_bytes), + "-e", str(args.max_bytes), + "-f", str(args.factor), + "-g", str(args.gpus_per_proc), + "-c", str(args.check), + "-w", str(args.warmup), + "-n", str(args.iters), + ] + if args.extra_args: + cmd += args.extra_args.split() + return cmd + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX NCCL primitive runner") + ap.add_argument("--op", required=True, choices=sorted(OP_BINARY)) + ap.add_argument("--nccl-tests-dir", help="dir containing _perf binaries (build/)") + ap.add_argument("--parse-only", help="parse this captured stdout file instead of running") + # nccl-tests knobs + ap.add_argument("--min-bytes", default="8") + ap.add_argument("--max-bytes", default="8G") + ap.add_argument("--factor", type=int, default=2, help="size step factor") + ap.add_argument("--gpus-per-proc", type=int, default=8, + help="-g: GPUs per process (single-node multi-GPU). Use 1 under MPI.") + ap.add_argument("--check", type=int, default=1, help="-c: 1 enables correctness check") + ap.add_argument("--warmup", type=int, default=5) + ap.add_argument("--iters", type=int, default=20) + ap.add_argument("--extra-args", default="", help="extra args appended to the binary") + ap.add_argument("--launch-prefix", default="", + help="e.g. 
'mpirun -np 16 --hostfile hf' for multi-node; empty for single-node -g mode") + # provenance + ap.add_argument("--runner", required=True, help="runner label, e.g. b200-dgxc") + ap.add_argument("--world-size", type=int, required=True, help="total ranks/GPUs in the run") + ap.add_argument("--nodes", type=int, default=1) + ap.add_argument("--topology-class", required=True, + help="e.g. b200-nvlink-island, b200-nvlink-island+cx7-ib, gb200-nvl72-mnnvl") + ap.add_argument("--transport", default="", help="observed transport label: nvlink | ib | mnnvl") + ap.add_argument("--comparison-class", default="standardized", + choices=["standardized", "backend-optimized", "framework-integrated"]) + ap.add_argument("--env-json", help="path to env_capture.py output to embed") + ap.add_argument("--timestamp", help="ISO timestamp (default now)") + ap.add_argument("--out", required=True) + args = ap.parse_args() + + binary = OP_BINARY[args.op] + command = None + if args.parse_only: + with open(args.parse_only) as fh: + stdout = fh.read() + ran_ok = True + else: + if not args.nccl_tests_dir: + ap.error("--nccl-tests-dir is required unless --parse-only is given") + binary_path = os.path.join(args.nccl_tests_dir, binary) + if not os.path.exists(binary_path): + print(f"ERROR: binary not found: {binary_path}", file=sys.stderr) + return 2 + command = build_command(args, binary_path) + print("running:", " ".join(command), file=sys.stderr) + proc = subprocess.run(command, capture_output=True, text=True, check=False) + stdout = proc.stdout + ran_ok = proc.returncode == 0 + if not ran_ok: + print(stdout, file=sys.stderr) + print(proc.stderr, file=sys.stderr) + print(f"ERROR: {binary} exited {proc.returncode}", file=sys.stderr) + + rows, summary = parse_nccl_table(stdout) + dtype = rows[0]["dtype"] if rows else None + + meta = { + "op": args.op, + "dtype": dtype, + "world_size": args.world_size, + "nodes": args.nodes, + "topology_class": args.topology_class, + "comparison_class": 
args.comparison_class, + "measurement_contract": MEASUREMENT_CONTRACT, + } + + env = None + if args.env_json and os.path.exists(args.env_json): + with open(args.env_json) as fh: + env = json.load(fh) + + doc = { + "schema_version": SCHEMA_VERSION, + "family": "nccl", + "generated_by": "run_nccl.py", + "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), + "runner": args.runner, + "binary": binary, + "command": " ".join(command) if command else f"", + "transport": args.transport, + "status": "valid" if (summary.get("check_passed") in (True, None) and ran_ok and rows) else "invalid", + "comparison_key": comparison_key(meta), + **meta, + "summary": summary, + "num_rows": len(rows), + "rows": rows, + "environment": env, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w") as fh: + json.dump(doc, fh, indent=2) + fh.write("\n") + + print( + f"{args.op}: parsed {len(rows)} sizes -> {args.out} " + f"(status={doc['status']}, avg_busbw={summary.get('avg_busbw_gbps')} GB/s, " + f"key={doc['comparison_key']})" + ) + return 0 if doc["status"] == "valid" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt new file mode 100644 index 000000000..c8825164e --- /dev/null +++ b/experimental/CollectiveX/tests/fixtures/all_reduce_perf_b200_8gpu.txt @@ -0,0 +1,50 @@ +# nThread 1 nGpus 8 minBytes 8 maxBytes 8589934592 step: 2(factor) warmup iters: 5 iters: 20 agg iters: 1 validation: 1 graph: 0 +# +# Using devices +# Rank 0 Group 0 Pid 12345 on b200-node device 0 [0x1b] NVIDIA B200 +# Rank 1 Group 0 Pid 12345 on b200-node device 1 [0x43] NVIDIA B200 +# Rank 2 Group 0 Pid 12345 on b200-node device 2 [0x52] NVIDIA B200 +# Rank 3 Group 0 Pid 12345 on b200-node device 3 [0x61] NVIDIA B200 +# Rank 4 Group 0 Pid 12345 on b200-node device 4 [0x9d] 
NVIDIA B200 +# Rank 5 Group 0 Pid 12345 on b200-node device 5 [0xc3] NVIDIA B200 +# Rank 6 Group 0 Pid 12345 on b200-node device 6 [0xd1] NVIDIA B200 +# Rank 7 Group 0 Pid 12345 on b200-node device 7 [0xdf] NVIDIA B200 +# +# out-of-place in-place +# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 9.62 0.00 0.00 0 9.60 0.00 0.00 0 + 16 4 float sum -1 9.61 0.00 0.00 0 9.59 0.00 0.00 0 + 32 8 float sum -1 9.63 0.00 0.00 0 9.62 0.00 0.00 0 + 64 16 float sum -1 9.60 0.00 0.00 0 9.58 0.00 0.00 0 + 128 32 float sum -1 9.64 0.01 0.02 0 9.63 0.01 0.02 0 + 256 64 float sum -1 9.66 0.03 0.05 0 9.64 0.03 0.05 0 + 512 128 float sum -1 9.69 0.05 0.09 0 9.67 0.05 0.09 0 + 1024 256 float sum -1 9.74 0.11 0.18 0 9.72 0.11 0.18 0 + 2048 512 float sum -1 9.82 0.21 0.37 0 9.80 0.21 0.37 0 + 4096 1024 float sum -1 9.97 0.41 0.72 0 9.95 0.41 0.72 0 + 8192 2048 float sum -1 10.22 0.80 1.40 0 10.20 0.80 1.40 0 + 16384 4096 float sum -1 10.81 1.52 2.65 0 10.79 1.52 2.65 0 + 32768 8192 float sum -1 11.93 2.75 4.81 0 11.90 2.75 4.81 0 + 65536 16384 float sum -1 13.62 4.81 8.42 0 13.59 4.82 8.43 0 + 131072 32768 float sum -1 16.94 7.74 13.54 0 16.90 7.76 13.57 0 + 262144 65536 float sum -1 23.14 11.33 19.83 0 23.10 11.35 19.86 0 + 524288 131072 float sum -1 35.62 14.72 25.76 0 35.55 14.75 25.81 0 + 1048576 262144 float sum -1 60.40 17.36 30.38 0 60.30 17.39 30.43 0 + 2097152 524288 float sum -1 76.50 27.41 47.97 0 76.40 27.45 48.04 0 + 4194304 1048576 float sum -1 110.20 38.06 66.61 0 110.05 38.11 66.70 0 + 8388608 2097152 float sum -1 165.80 50.60 88.55 0 165.60 50.66 88.65 0 + 16777216 4194304 float sum -1 250.10 67.08 117.40 0 249.80 67.16 117.54 0 + 33554432 8388608 float sum -1 360.50 93.08 162.90 0 360.10 93.18 163.07 0 + 67108864 16777216 float sum -1 520.80 128.85 225.50 0 520.20 129.00 225.75 0 + 134217728 33554432 float sum -1 720.30 186.34 326.10 0 719.50 186.55 326.46 0 + 
268435456 67108864 float sum -1 1080.50 248.43 434.80 0 1079.20 248.73 435.27 0 + 536870912 134217728 float sum -1 1990.20 269.76 472.10 0 1988.50 269.99 472.49 0 + 1073741824 268435456 float sum -1 3940.60 272.48 476.84 0 3938.10 272.65 477.14 0 + 2147483648 536870912 float sum -1 7850.10 273.56 478.73 0 7846.20 273.69 478.96 0 + 4294967296 1073741824 float sum -1 15680.50 273.91 479.34 0 15673.80 274.03 479.55 0 + 8589934592 2147483648 float sum -1 31250.80 274.87 481.02 0 31238.10 274.98 481.22 0 +# +# Out of bounds values : 0 OK +# Avg bus bandwidth : 168.42 +# From b7ed913b66905c0e380fa82495b7741ad3280473 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 15:55:30 +0800 Subject: [PATCH 02/17] CollectiveX: import container by multi-arch tag, fix CI import hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GB200 on:push smoke hung 25 min in enroot import: a bare digest ref (repo@sha256:) can't form an anonymous Docker Hub token scope, so enroot prompted for a password and blocked in non-interactive CI. Import by the multi-arch TAG instead (anonymous auth works, same as the serving launchers) and add &2; } cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; } -# Single multi-arch, digest-pinned container for ALL NVIDIA SKUs. -# This is the OCI image index for tag `v0.5.12-cu130`, covering BOTH linux/amd64 -# (B200) and linux/arm64 (GB200); enroot import on each host pulls the matching -# arch from the index. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.) -# Pinned by DIGEST ONLY (no tag): enroot mis-parses a combined `tag@sha256` ref -# and 400s at auth, so we use `repo@sha256:` — also the stricter pin. -# NOTE: DeepEP is NOT bundled here -> run_in_container.sh builds it via -# rebuild-deepep at job setup. 
(The arch-specific deepseek-v4-{blackwell, -# grace-blackwell} images DO bundle DeepEP — see CONTAINERS.md — but are not -# multi-arch and are not used by default.) -CX_IMAGE_MULTIARCH="lmsysorg/sglang@sha256:42194170546745092e74cd5f81ad32a7c6e944c7111fe7bf13588152277ff356" +# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI +# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import +# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.) +# IMPORT BY TAG, not by digest: enroot's anonymous Docker Hub token scope is built +# from the tag; a bare `repo@sha256:` ref makes enroot prompt for a password and +# HANG in non-interactive CI (and a combined `tag@sha256` ref 400s). The expected +# multi-arch index digest is recorded for provenance/verification: +CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975" +# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based +# squash creation on these nodes — "failed to mount overlay ... Invalid argument". +# v0.5.11-cu130 imports cleanly and is pre-staged on GB200.) +# DeepEP is NOT bundled here -> run_in_container.sh builds it via rebuild-deepep. +# (The arch-specific deepseek-v4-{blackwell,grace-blackwell} images DO bundle +# DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.) 
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130" cx_default_image() { case "$1" in @@ -44,7 +48,10 @@ cx_ensure_squash() { else cx_log "enroot import docker://$image -> $sq (one-time, multi-GB)" rm -f "$sq" - enroot import -o "$sq" "docker://$image" >&2 || cx_die "enroot import failed for $image" + # &2 \ + || cx_die "enroot import failed for $image (anonymous auth needs a TAG ref, not a bare digest; or pre-stage the squash)" unsquashfs -l "$sq" >/dev/null 2>&1 || cx_die "import produced no valid squash: $sq" fi ) 9>"$locks/${key}.lock" diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md index 365b23455..6ceb512ef 100644 --- a/experimental/CollectiveX/plan.md +++ b/experimental/CollectiveX/plan.md @@ -28,7 +28,7 @@ Existing public benchmarks don't offer trustworthy, like-for-like collective/EP The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that: -- **Multi-arch, digest-pinned container** for all NVIDIA SKUs: `lmsysorg/sglang:v0.5.12-cu130@sha256:4219…f356` (amd64 + arm64) — one reference both arches; DeepEP via `rebuild-deepep`. See `CONTAINERS.md`. +- **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. - **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`. 
- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. From ccfae8edc8a027516742603f464ffd00731fbebc Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:03:48 +0800 Subject: [PATCH 03/17] CollectiveX: copy staged results back to checkout for artifact upload On the GB200 Actions path, CX_STAGE_DIR makes the launcher rsync the tree to compute-visible Lustre and the container writes results/ there; upload-artifact reads the checkout's results/ (empty), so the green smoke produced no artifact. Add cx_collect_results to copy result JSONs from the stage dir back to the checkout after the run (no-op when no staging was used). --- experimental/CollectiveX/launchers/common.sh | 13 +++++++++++++ .../CollectiveX/launchers/launch_b200-dgxc.sh | 1 + .../CollectiveX/launchers/launch_gb200-nv.sh | 1 + 3 files changed, 15 insertions(+) diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh index f3997cf9e..d8d5749eb 100644 --- a/experimental/CollectiveX/launchers/common.sh +++ b/experimental/CollectiveX/launchers/common.sh @@ -77,6 +77,19 @@ cx_stage_repo() { echo "$stage_dir" } +# cx_collect_results +# When the run used a staged (compute-visible) mount, copy result JSONs back to +# the original checkout's results/ so the workflow's upload-artifact (which reads +# the checkout, not the stage dir) finds them. No-op when no staging was used. 
+cx_collect_results() { + local mount_src="$1" repo_root="$2" dst + [ "$mount_src" = "$repo_root" ] && return 0 + dst="$repo_root/experimental/CollectiveX/results" + mkdir -p "$dst" + cp "$mount_src/experimental/CollectiveX/results/"*.json "$dst/" 2>/dev/null || true + cx_log "copied results from stage dir -> $dst (for artifact upload)" +} + # cx_build_nccl_tests -> echoes the build/ dir. # Runs IN-CONTAINER (login nodes have no nvcc). Cached: skips if already built. # CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh index a1b5c0135..29e4eea56 100644 --- a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh @@ -61,4 +61,5 @@ srun --jobid="$JOB_ID" \ --no-container-entrypoint --export=ALL \ bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh index 35cdb8e28..8b24a710d 100644 --- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -64,4 +64,5 @@ srun --jobid="$JOB_ID" \ --no-container-entrypoint --export=ALL \ bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" From b3841719bd6e9fec538059d701da16011c29c5e5 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:23:27 +0800 Subject: [PATCH 04/17] CollectiveX: per-job summary table + address PR review findings Add summarize.py (compact NCCL/DeepEP results table, printed 
at end of every job) and make it the result gate. Fix review findings: benchmark failures/skipped-deepep now fail the job instead of reporting green (#1); DeepEP nodes from SLURM_NNODES not world_size//8 (#3); apply Buffer.set_num_sms so num_comm_sms is real (#8); nccl-tests -c 1 with a missing check footer is now invalid (#7); use context managers for file reads (#4,#5); launchers export COLLECTIVEX_IMAGE/_DIGEST for provenance (#9); trim workflow_dispatch sku options to launcher-backed pools (#2). Artifact-path finding (#6) already fixed via cx_collect_results. --- .../workflows/collectivex-experimental.yml | 6 +- .../launchers/launch_b200-dgxc-slurm.sh | 2 + .../CollectiveX/launchers/launch_b200-dgxc.sh | 2 + .../CollectiveX/launchers/launch_gb200-nv.sh | 2 + .../CollectiveX/launchers/run_in_container.sh | 42 ++++--- experimental/CollectiveX/plot.py | 3 +- experimental/CollectiveX/run_deepep.py | 12 +- experimental/CollectiveX/run_nccl.py | 3 +- experimental/CollectiveX/summarize.py | 119 ++++++++++++++++++ 9 files changed, 167 insertions(+), 24 deletions(-) create mode 100644 experimental/CollectiveX/summarize.py diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index 6b07c2d56..4446473e9 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -17,10 +17,12 @@ on: workflow_dispatch: inputs: sku: - description: Self-hosted runner pool (label from .github/configs/runners.yaml) + # Only SKUs with a matching launchers/launch_.sh are offered — + # runner.name's prefix selects the script, so an SKU without one fails. 
+ description: Self-hosted runner pool (must have a CollectiveX launcher) type: choice default: gb200 - options: [gb200, b200, b200-multinode, b300, gb300] + options: [gb200, b200-dgxc, b200-multinode] benchmark: description: Which benchmark to run type: choice diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh index a58411343..e5add9189 100644 --- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh @@ -40,6 +40,8 @@ TOPO="b200-nvlink-island+cx7-ib" WORLD=$((NODES * GPUS_PER_NODE)) MPI_FLAG="${CX_SRUN_MPI:-pmix}" export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +# Record container identity in env_capture provenance (propagated via --export=ALL). +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf ) diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh index 29e4eea56..42d860975 100644 --- a/experimental/CollectiveX/launchers/launch_b200-dgxc.sh +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc.sh @@ -35,6 +35,8 @@ export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" export CX_TOPO="b200-nvlink-island" CX_TRANSPORT="nvlink" export CX_BENCH="${CX_BENCH:-nccl}" export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +# Record container identity in env_capture provenance. 
+export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" export NCCL_CUMEM_ENABLE=1 cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH" diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh index 8b24a710d..60d5b297d 100644 --- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -37,6 +37,8 @@ export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl" export CX_BENCH="${CX_BENCH:-nccl}" export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" +# Record container identity in env_capture provenance. +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" # Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded. export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh index 7729528b2..cde27ac1c 100644 --- a/experimental/CollectiveX/launchers/run_in_container.sh +++ b/experimental/CollectiveX/launchers/run_in_container.sh @@ -30,45 +30,51 @@ cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS" run_nccl_suite() { - local build ops op - build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" # single-node: MPI=0, -g N + local build ops op sfail=0 + build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1 # single-node: MPI=0, -g N ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" for op in $ops; do - python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \ - --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \ - --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ - --env-json "$ENVJSON" --out 
"results/${CX_RUNNER}_${op}_${CX_TS}.json" \ - --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1 \ - || cx_log "WARN: nccl $op failed" + if ! python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \ + --world-size "$CX_NGPUS" --nodes 1 --gpus-per-proc "$CX_NGPUS" \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \ + --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then + cx_log "WARN: nccl $op failed or invalid"; sfail=1 + fi done + return "$sfail" } run_deepep_suite() { # DeepEP is not bundled in the multi-arch image. Try to import; if absent, - # attempt rebuild-deepep (srt-slurm setup script) when available, else skip. + # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a + # failure, not a silent skip — the caller asked for deepep. if ! python3 -c "import deep_ep" 2>/dev/null; then if command -v rebuild-deepep.sh >/dev/null 2>&1; then cx_log "building DeepEP via rebuild-deepep.sh" - rebuild-deepep.sh >&2 || cx_log "WARN: rebuild-deepep.sh failed" + rebuild-deepep.sh >&2 || { cx_log "WARN: rebuild-deepep.sh failed"; return 1; } else - cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; skipping deepep" - return 0 + cx_log "WARN: deep_ep not importable and no rebuild-deepep.sh on PATH; cannot run deepep" + return 1 fi fi torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \ --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \ --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ - --dispatch-dtype "${CX_DISPATCH_DTYPE:-fp8}" \ + --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" \ --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \ - || cx_log "WARN: deepep run failed" + || { cx_log "WARN: deepep run failed"; return 1; } } +rc=0 case 
"$CX_BENCH" in - nccl) run_nccl_suite ;; - deepep) run_deepep_suite ;; - all) run_nccl_suite; run_deepep_suite ;; + nccl) run_nccl_suite || rc=1 ;; + deepep) run_deepep_suite || rc=1 ;; + all) run_nccl_suite || rc=1; run_deepep_suite || rc=1 ;; *) cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;; esac -echo "=== results ==="; ls -1 results/*.json +# Summary table for the log; also fails the job if no valid results were produced. +python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1 +exit "$rc" diff --git a/experimental/CollectiveX/plot.py b/experimental/CollectiveX/plot.py index 0106c61c9..c24136ebc 100644 --- a/experimental/CollectiveX/plot.py +++ b/experimental/CollectiveX/plot.py @@ -40,7 +40,8 @@ def load_nccl_results(results_dir: str) -> list[dict]: docs = [] for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))): try: - d = json.load(open(path)) + with open(path) as _f: + d = json.load(_f) except (json.JSONDecodeError, OSError): continue if d.get("family") == "nccl" and d.get("rows"): diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py index 44a3ae3e0..3d61c69e4 100644 --- a/experimental/CollectiveX/run_deepep.py +++ b/experimental/CollectiveX/run_deepep.py @@ -126,6 +126,13 @@ def main() -> int: num_nvl_bytes = 1024 * 1024 * 1024 num_rdma_bytes = 0 buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes) + # Apply the standardized communication-SM budget so the recorded + # num_comm_sms reflects the actual run (best-effort across DeepEP versions). 
+ try: + Buffer.set_num_sms(args.num_sms) + except Exception as exc: # pragma: no cover - API/version dependent + if rank == 0: + print(f"WARN: could not set num_sms={args.num_sms}: {exc!r}", file=sys.stderr) def run_once(): # ===================== ADAPT HERE (DeepEP API) ======================= @@ -207,7 +214,7 @@ def dispatch_only(): } meta = { "op": "dispatch-combine", "backend": "deepep", "mode": "normal", - "world_size": world_size, "nodes": max(1, world_size // 8), + "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")), "topology_class": args.topology_class, "comparison_class": args.comparison_class, "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape, } @@ -215,7 +222,8 @@ def dispatch_only(): rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) env = None if args.env_json and os.path.exists(args.env_json): - env = json.load(open(args.env_json)) + with open(args.env_json) as _fh: + env = json.load(_fh) doc = { "schema_version": SCHEMA_VERSION, "family": "moe", diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py index d32de9f23..993c0c06d 100644 --- a/experimental/CollectiveX/run_nccl.py +++ b/experimental/CollectiveX/run_nccl.py @@ -236,7 +236,8 @@ def main() -> int: "binary": binary, "command": " ".join(command) if command else f"", "transport": args.transport, - "status": "valid" if (summary.get("check_passed") in (True, None) and ran_ok and rows) else "invalid", + "status": ("valid" if (rows and ran_ok and (summary.get("check_passed") is True + or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"), "comparison_key": comparison_key(meta), **meta, "summary": summary, diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py new file mode 100644 index 000000000..bb439dcb4 --- /dev/null +++ b/experimental/CollectiveX/summarize.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""CollectiveX — print a compact summary table of 
a run's results. + +Reads the result JSONs a job produced (filtered by runner + timestamp when +given) and prints one table per family (NCCL primitives, MoE/DeepEP). Runs at +the end of every job (from run_in_container.sh) so the Slurm/Actions log shows a +digestible table, not just file paths. + +Doubles as a result gate: exits non-zero if no valid results were produced (so a +benchmark that failed/skipped doesn't get reported as a green job). + + python summarize.py --results-dir results --runner gb200-nv_1 --ts +""" +from __future__ import annotations + +import argparse +import glob +import json +import os + + +def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[dict]: + docs = [] + for path in sorted(glob.glob(os.path.join(results_dir, "*.json"))): + base = os.path.basename(path) + if base.startswith("env_"): + continue + if runner and not base.startswith(f"{runner}_"): + continue + if ts and ts not in base: + continue + try: + with open(path) as fh: + d = json.load(fh) + except (json.JSONDecodeError, OSError): + continue + if d.get("family") in ("nccl", "moe"): + d["_base"] = base + docs.append(d) + return docs + + +def _peak_busbw(rows: list[dict]) -> float: + return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) + + +def _min_lat(rows: list[dict]) -> float: + vals = [r["out_of_place"]["time_us"] for r in rows + if r.get("out_of_place", {}).get("time_us") is not None] + return min(vals) if vals else float("nan") + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX result summary table") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--runner", default=None) + ap.add_argument("--ts", default=None) + args = ap.parse_args() + + docs = load_results(args.results_dir, args.runner, args.ts) + nccl = [d for d in docs if d["family"] == "nccl"] + moe = [d for d in docs if d["family"] == "moe"] + + hdr = "CollectiveX results" + if docs: + d0 = docs[0] + hdr += (f" — 
runner={d0.get('runner')} topology={d0.get('topology_class')}" + f" transport={d0.get('transport')}") + print("\n" + "=" * len(hdr)) + print(hdr) + print("=" * len(hdr)) + + n_valid = 0 + + if nccl: + ws = nccl[0].get("world_size") + print(f"\nNCCL primitives (world={ws}, dtype={nccl[0].get('dtype')}):") + print(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}") + print(f" {'':<16}{'':<9}{'(GB/s)':>12}{'(us)':>10}{'(GB/s)':>11}") + for d in sorted(nccl, key=lambda x: x["op"]): + rows = d.get("rows", []) + n_valid += d.get("status") == "valid" + avg = (d.get("summary") or {}).get("avg_busbw_gbps") + print(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" + f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") + + if moe: + print("\nMoE / DeepEP dispatch+combine:") + print(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}" + f"{'disp_p50':>10}{'tokens/s':>13}{' correct'}") + print(f" {'':<10}{'':<8}{'':<9}{'(us)':>9}{'(us)':>9}{'(us)':>10}{'':>13}") + for d in sorted(moe, key=lambda x: x.get("backend", "")): + m = d.get("metrics", {}) + c = d.get("correctness", {}) + n_valid += d.get("status") == "valid" + tps = m.get("tokens_per_second") + print(f" {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}" + f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}" + f"{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}" + f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}" + f"{(tps if tps is not None else float('nan')):>13.3e}" + f" {c.get('passed')}") + + total = len(docs) + print(f"\n{n_valid}/{total} results valid.\n") + if total == 0: + print("ERROR: no result files found to summarize — benchmark produced nothing.") + return 1 + if n_valid == 0: + print("ERROR: no valid results — failing the job.") + return 1 + if n_valid < total: + print(f"WARNING: {total - n_valid} result(s) invalid.") + return 1 + return 0 + + +if __name__ == "__main__": + raise 
SystemExit(main()) From f48daed804fc07174f7b5fc153ac6da21708833d Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 16:50:12 +0800 Subject: [PATCH 05/17] CollectiveX: render results as a GitHub Actions job summary summarize.py --markdown emits GitHub-flavored markdown tables (NCCL + DeepEP); a per-job 'Results summary' workflow step appends it to $GITHUB_STEP_SUMMARY so the run page shows a rendered table (per the GitHub job-summaries feature). Plain-text mode still drives the in-container result gate. --- .../workflows/collectivex-experimental.yml | 6 + experimental/CollectiveX/README.md | 6 +- experimental/CollectiveX/summarize.py | 145 +++++++++++------- 3 files changed, 99 insertions(+), 58 deletions(-) diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index 4446473e9..c63b56635 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -73,6 +73,9 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" - name: Upload results if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -101,6 +104,9 @@ jobs: env: RUNNER_NAME: ${{ runner.name }} run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + - name: Results summary + if: always() + run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" - name: Upload results if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index 
4fb871bf1..606eeb395 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -31,11 +31,13 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL - **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle capacity; never auto-contends with the B200 serving sweep). -- **workflow_dispatch** → pick `sku` (gb200 / b200 / b200-multinode / …), +- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode), `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. -(The workflow only fires once the branch is pushed to GitHub.) +Each job renders a results table to the **GitHub Actions job summary** (via +`summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs +as an artifact. (The workflow only fires once the branch is pushed to GitHub.) ### Directly on a cluster login node diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py index bb439dcb4..8d81b13ee 100644 --- a/experimental/CollectiveX/summarize.py +++ b/experimental/CollectiveX/summarize.py @@ -1,15 +1,17 @@ #!/usr/bin/env python3 -"""CollectiveX — print a compact summary table of a run's results. +"""CollectiveX — summarize a run's results. -Reads the result JSONs a job produced (filtered by runner + timestamp when -given) and prints one table per family (NCCL primitives, MoE/DeepEP). Runs at -the end of every job (from run_in_container.sh) so the Slurm/Actions log shows a -digestible table, not just file paths. - -Doubles as a result gate: exits non-zero if no valid results were produced (so a -benchmark that failed/skipped doesn't get reported as a green job). 
+Two output modes over the same data: + (default) a plain-text table for the Slurm/container log; ALSO the result + gate — exits non-zero if no valid results were produced, so a + failed/skipped benchmark doesn't get reported as a green job. + --markdown GitHub-flavored markdown for a GitHub Actions job summary + (https://github.blog/.../supercharging-github-actions-with-job-summaries/); + reporting only, always exits 0. A workflow step appends this to + $GITHUB_STEP_SUMMARY so the run page shows a rendered table. python summarize.py --results-dir results --runner gb200-nv_1 --ts + python summarize.py --results-dir results --markdown >> "$GITHUB_STEP_SUMMARY" """ from __future__ import annotations @@ -35,82 +37,113 @@ def load_results(results_dir: str, runner: str | None, ts: str | None) -> list[d except (json.JSONDecodeError, OSError): continue if d.get("family") in ("nccl", "moe"): - d["_base"] = base docs.append(d) return docs -def _peak_busbw(rows: list[dict]) -> float: +def _peak_busbw(rows): return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) -def _min_lat(rows: list[dict]) -> float: +def _min_lat(rows): vals = [r["out_of_place"]["time_us"] for r in rows if r.get("out_of_place", {}).get("time_us") is not None] return min(vals) if vals else float("nan") -def main() -> int: - ap = argparse.ArgumentParser(description="CollectiveX result summary table") - ap.add_argument("--results-dir", default="results") - ap.add_argument("--runner", default=None) - ap.add_argument("--ts", default=None) - args = ap.parse_args() +def _fnum(x, fmt): + return format(x, fmt) if isinstance(x, (int, float)) else "—" - docs = load_results(args.results_dir, args.runner, args.ts) - nccl = [d for d in docs if d["family"] == "nccl"] - moe = [d for d in docs if d["family"] == "moe"] +def render_plain(nccl, moe, n_valid, total) -> str: + out = [] hdr = "CollectiveX results" - if docs: - d0 = docs[0] - hdr += (f" — runner={d0.get('runner')} 
topology={d0.get('topology_class')}" - f" transport={d0.get('transport')}") - print("\n" + "=" * len(hdr)) - print(hdr) - print("=" * len(hdr)) + if nccl or moe: + d0 = (nccl + moe)[0] + hdr += f" — runner={d0.get('runner')} topology={d0.get('topology_class')} transport={d0.get('transport')}" + out += ["=" * len(hdr), hdr, "=" * len(hdr)] + if nccl: + out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):") + out.append(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}") + for d in sorted(nccl, key=lambda x: x["op"]): + rows = d.get("rows", []) + avg = (d.get("summary") or {}).get("avg_busbw_gbps") + out.append(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" + f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") + if moe: + out.append("\nMoE / DeepEP dispatch+combine:") + out.append(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13} correct") + for d in sorted(moe, key=lambda x: x.get("backend", "")): + m, c = d.get("metrics", {}), d.get("correctness", {}) + tps = m.get("tokens_per_second") + out.append(f" {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}" + f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}" + f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}" + f"{(tps if tps is not None else float('nan')):>13.3e} {c.get('passed')}") + out.append(f"\n{n_valid}/{total} results valid.") + return "\n".join(out) + - n_valid = 0 +def _emoji(status) -> str: + return "✅ valid" if status == "valid" else f"❌ {status}" + +def render_markdown(nccl, moe, n_valid, total) -> str: + out = [] + if nccl or moe: + d0 = (nccl + moe)[0] + out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}") if nccl: - ws = nccl[0].get("world_size") - print(f"\nNCCL primitives (world={ws}, 
dtype={nccl[0].get('dtype')}):") - print(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}") - print(f" {'':<16}{'':<9}{'(GB/s)':>12}{'(us)':>10}{'(GB/s)':>11}") + out.append(f"\n### NCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n") + out.append("| op | status | peak busbw (GB/s) | min lat (µs) | avg busbw (GB/s) |") + out.append("|---|---|--:|--:|--:|") for d in sorted(nccl, key=lambda x: x["op"]): rows = d.get("rows", []) - n_valid += d.get("status") == "valid" avg = (d.get("summary") or {}).get("avg_busbw_gbps") - print(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" - f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") - + out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | " + f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |") if moe: - print("\nMoE / DeepEP dispatch+combine:") - print(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}" - f"{'disp_p50':>10}{'tokens/s':>13}{' correct'}") - print(f" {'':<10}{'':<8}{'':<9}{'(us)':>9}{'(us)':>9}{'(us)':>10}{'':>13}") + out.append("\n### MoE / DeepEP dispatch+combine\n") + out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |") + out.append("|---|---|---|--:|--:|--:|--:|:--:|") for d in sorted(moe, key=lambda x: x.get("backend", "")): - m = d.get("metrics", {}) - c = d.get("correctness", {}) - n_valid += d.get("status") == "valid" - tps = m.get("tokens_per_second") - print(f" {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}" - f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}" - f"{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}" - f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}" - f"{(tps if tps is not None else float('nan')):>13.3e}" - f" {c.get('passed')}") + m, c = d.get("metrics", {}), d.get("correctness", {}) + out.append(f"| `{d.get('backend')}` | {d.get('mode')} | 
{_emoji(d.get('status'))} | " + f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | " + f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | " + f"{'✅' if c.get('passed') else '❌'} |") + badge = "✅" if (total and n_valid == total) else "⚠️" + out.append(f"\n{badge} **{n_valid}/{total} results valid.**") + if not total: + out.append("\n> No result files found — the benchmark produced nothing.") + return "\n".join(out) + + +def main() -> int: + ap = argparse.ArgumentParser(description="CollectiveX result summary") + ap.add_argument("--results-dir", default="results") + ap.add_argument("--runner", default=None) + ap.add_argument("--ts", default=None) + ap.add_argument("--markdown", action="store_true", + help="emit GitHub job-summary markdown (reporting only; always exits 0)") + args = ap.parse_args() + docs = load_results(args.results_dir, args.runner, args.ts) + nccl = [d for d in docs if d["family"] == "nccl"] + moe = [d for d in docs if d["family"] == "moe"] total = len(docs) - print(f"\n{n_valid}/{total} results valid.\n") + n_valid = sum(d.get("status") == "valid" for d in docs) + + if args.markdown: + print(render_markdown(nccl, moe, n_valid, total)) + return 0 # reporting step — never fail the job here + + print(render_plain(nccl, moe, n_valid, total)) if total == 0: - print("ERROR: no result files found to summarize — benchmark produced nothing.") - return 1 - if n_valid == 0: - print("ERROR: no valid results — failing the job.") + print("ERROR: no result files found — benchmark produced nothing.") return 1 if n_valid < total: - print(f"WARNING: {total - n_valid} result(s) invalid.") + print(f"ERROR: {total - n_valid} result(s) invalid — failing the job.") return 1 return 0 From be9cc91cd4e083189afcf1493e6d4975c59121c8 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 17:13:56 +0800 Subject: [PATCH 06/17] CollectiveX: add 
MI355X / MoRI EP path (dispatch+combine) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First AMD / cross-vendor reach, scaffolded ahead of Milestone 1: - run_mori.py: MoRI dispatch+combine (normal mode), correctness-gated, mirroring ROCm/mori's dispatch_combine example — int32 routing indices, (n,0) fp8 scales, the zero-copy registered-combine-input-buffer staging step, and expected = input x (#unique destination ranks). Emits the same flat JSON shape (family=moe, backend=mori) with CUDA-event timing. - launchers/launch_mi355x-amds.sh: AMD adapter — partition compute, no account, --cpus-per-task=128, node-local /var/lib/squash imported via srun on the allocated node, --container-writable --container-remap-root, forces CX_BENCH=mori, mounts the (compute-visible) checkout at /ix. - launchers/run_in_container.sh: run_mori_suite + mori case (nccl|deepep|mori|all). - launchers/common.sh: ROCm MoRI image (rocm/sgl-dev:...-mori-0227-2) in cx_default_image for mi355x*/mi350x*/mi325x*/mi300x*. - workflow: mi355x sku + mori benchmark options for workflow_dispatch. - docs: CONTAINERS.md AMD section, README files/run/risks, plan.md status. Not yet hardware-validated (no MI355X access) — MoRI's Python API is version-sensitive (marked ADAPT HERE); the first runner job is the validation, as GB200 was for DeepEP. The ROCm image isn't digest-pinned yet. 
--- .../workflows/collectivex-experimental.yml | 5 +- experimental/CollectiveX/CONTAINERS.md | 12 + experimental/CollectiveX/README.md | 25 +- experimental/CollectiveX/launchers/common.sh | 7 + .../launchers/launch_mi355x-amds.sh | 91 +++++++ .../CollectiveX/launchers/run_in_container.sh | 24 +- experimental/CollectiveX/plan.md | 3 +- experimental/CollectiveX/run_mori.py | 254 ++++++++++++++++++ 8 files changed, 409 insertions(+), 12 deletions(-) create mode 100644 experimental/CollectiveX/launchers/launch_mi355x-amds.sh create mode 100644 experimental/CollectiveX/run_mori.py diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index c63b56635..c98646efe 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -22,12 +22,13 @@ on: description: Self-hosted runner pool (must have a CollectiveX launcher) type: choice default: gb200 - options: [gb200, b200-dgxc, b200-multinode] + options: [gb200, b200-dgxc, b200-multinode, mi355x] benchmark: + # mori runs only on mi355x; nccl/deepep/all on the NVIDIA SKUs. description: Which benchmark to run type: choice default: nccl - options: [nccl, deepep, all] + options: [nccl, deepep, mori, all] ops: description: NCCL ops (space-separated); blank = default set type: string diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index 3aff25194..1c82e0f66 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -39,6 +39,18 @@ If a bundled DeepEP is needed before `rebuild-deepep` is wired on the multi-arch Select via `CX_IMAGE=…@sha256:…` on the launch script. +## AMD container (MI355X) — MoRI EP + +AMD CDNA4 cannot run the CUDA multi-arch image; MI355X uses a ROCm image that +bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image` +for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). 
+ +- **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image. +- **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. +- **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). +- **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up. +- **NOT yet validated on hardware** (no MI355X access at authoring). Treat the first runner job as the validation, exactly as `run_deepep.py` was on GB200. Likely first-run touch-ups: MoRI Python API signatures (`EpDispatchCombineConfig` kwargs, `dispatch`/`combine`/`get_registered_combine_input_buffer`), then fill a version table here (ROCm, torch, RCCL, MoRI commit). + ## Cluster access / QOS - **B200** (`slurm-login-slinky`): account `benchmark`, **only `gpu-2_qos`** → partition `gpu-2` only (shared with the serving sweep). `gpu-1`/`all` (idle) need `gpu-1_qos`/`all_qos`, not associated with this account. 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index 606eeb395..ac489f541 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -17,10 +17,11 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL | `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) | | `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) | | `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) | +| `run_mori.py` | MoRI (AMD) dispatch+combine, normal mode, correctness-gated (torch + MoRI) | | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | -| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/all) | -| `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL) | +| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) | +| `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI) | | `CONTAINERS.md` | the pinned multi-arch container + audited library versions | | `results/` | flat JSON artifacts (+ `plots/`, raw captures) | | `tests/fixtures/` | captured nccl-tests output for offline parser checks | @@ -31,9 +32,10 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL - **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle capacity; never auto-contends with the B200 serving sweep). 
-- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode), - `benchmark` (nccl / deepep / all), ops, sizes, ngpus. Lands on that SKU's - self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. +- **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / + mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops, + sizes, ngpus. Lands on that SKU's self-hosted runner and runs + `launch_${RUNNER_NAME%%_*}.sh`. Each job renders a results table to the **GitHub Actions job summary** (via `summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs @@ -47,9 +49,10 @@ bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB2 CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild) bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # B200 8× NVLink bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh # B200 2-node, cross-IB +bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X 8× XGMI, MoRI EP (AMD; forces CX_BENCH=mori) ``` -Knobs: `CX_BENCH` (nccl|deepep|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, +Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, `CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate nothing). Results land in `experimental/CollectiveX/results/`. @@ -78,6 +81,10 @@ DeepSeek-V4 fallback images. missing) → `srun --container-image=… --container-mounts=:/ix` → in-container `run_in_container.sh`. B200 partition `gpu-2`, GB200 partition `batch`, account `benchmark`. +- **AMD MI355X** (`launch_mi355x-amds.sh`, MoRI / `CX_BENCH=mori`) diverges: partition + `compute`, no account, pyxis `--container-writable --container-remap-root`, and a + **node-local** squash (`/var/lib/squash`) imported via `srun` on the allocated node + (not the login node). 
Workspace is bind-mounted directly (no `CX_STAGE_DIR`). - Login nodes have no `nvcc`, so `nccl-tests` is **built in-container** (cached in `.nccl-tests/`, `CX_NCCL_HOME=/usr`). Single-node uses `-g N`; the 2-node adapter builds `MPI=1` and launches one rank per GPU (`srun --mpi=pmix`). @@ -97,6 +104,12 @@ DeepSeek-V4 fallback images. it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against the built commit. B200 (x86_64) first; GB200 (aarch64) follows. +- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **scaffolded, not yet + run on hardware** (no MI355X access). It mirrors `ROCm/mori`'s dispatch/combine + example — config + the `get_registered_combine_input_buffer` zero-copy path, + correctness `expected = input × (#unique destination ranks)`. The API is + version-sensitive (`ADAPT HERE`), so the first runner job is the validation, like + GB200 was for DeepEP; the AMD ROCm image isn't digest-pinned yet. - **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container or srt-slurm. CX_BENCH=nccl only for now. diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh index d8d5749eb..7d63dfdc8 100644 --- a/experimental/CollectiveX/launchers/common.sh +++ b/experimental/CollectiveX/launchers/common.sh @@ -24,8 +24,15 @@ CX_IMAGE_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca5 # DeepEP — see CONTAINERS.md — but are not multi-arch and are not the default.) CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130" +# AMD (ROCm/CDNA): the multi-arch NVIDIA image above is x86_64+aarch64 CUDA and +# cannot run on MI355X. AMD uses a separate ROCm image that bundles MoRI (the +# AMD EP library). Single-arch (linux/amd64 host, ROCm runtime); not digest- +# pinned yet — pin once validated on the runner. 
See CONTAINERS.md. +CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2" + cx_default_image() { case "$1" in + mi355x*|mi350x*|mi325x*|mi300x*) echo "$CX_IMAGE_AMD_MORI" ;; b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;; *) cx_die "no default image for runner prefix: $1" ;; esac diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh new file mode 100644 index 000000000..f6901f7d4 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# CollectiveX — MI355X (AMD CDNA4, 8 GPU/node) SKU adapter: MoRI dispatch/combine. +# +# AMD counterpart to the NVIDIA adapters. Differs from them in ways taken from +# the real runners/launch_mi355x-amds.sh: +# * partition `compute`, no --account (cluster default), --cpus-per-task=128, +# and known-bad nodes excluded; +# * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on +# the allocated node (not on the login node like the shared-FS NVIDIA path); +# * pyxis flags --container-writable --container-remap-root for the ROCm image. +# MoRI is the only AMD backend wired (CX_BENCH=mori); rccl-tests primitives are a +# follow-up. +# +# !!! NOT yet validated on hardware (no MI355X cluster access at authoring time). +# Treat the first on-runner run as validation — like run_deepep.py was on GB200. +# +# Run from inside the InferenceX checkout on the MI355X login node: +# bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh +# +# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(30) CX_IMAGE +# CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_DRYRUN(0) +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=common.sh +source "$HERE/common.sh" + +RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}" +PARTITION="${CX_PARTITION:-compute}" +NGPUS="${CX_NGPUS:-8}" +TIME_MIN="${CX_TIME:-30}" +IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}" +SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}" # node-local on MI355X +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" + +# MoRI is the only AMD backend wired today; force it. +if [ "${CX_BENCH:-mori}" != "mori" ]; then + cx_log "mi355x: CX_BENCH='${CX_BENCH}' not supported on AMD yet; using mori" +fi +export CX_BENCH=mori +export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi" +export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" + +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=mori image=$IMAGE" +# AMD workspace is compute-visible (the serving launcher bind-mounts it directly), +# so no staging; the node-local squash is handled via srun below. 
+MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +SQUASH_FILE="$SQUASH_DIR/$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" +cx_log "squash(node-local)=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" + +if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi +command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" + +salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" +cx_log "JOB_ID=$JOB_ID" +trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + +# Clear stray containers, then enroot-import to the node-local squash (flock, +# /dev/null || true' || true +srun --jobid="$JOB_ID" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 900 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then + echo 'squash present: $SQUASH_FILE' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" \"docker://$IMAGE\" /dev/null; then + cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori" + return 1 + fi + torchrun --nproc_per_node="$CX_NGPUS" run_mori.py \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \ + --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_mori_${CX_TS}.json" \ + || { cx_log "WARN: mori run failed"; return 1; } +} + rc=0 case "$CX_BENCH" in nccl) run_nccl_suite || rc=1 ;; deepep) run_deepep_suite || rc=1 ;; + mori) run_mori_suite || rc=1 ;; all) run_nccl_suite || rc=1; run_deepep_suite || rc=1 ;; - *) cx_die "unknown 
CX_BENCH=$CX_BENCH (want nccl|deepep|all)" ;; + *) cx_die "unknown CX_BENCH=$CX_BENCH (want nccl|deepep|mori|all)" ;; esac # Summary table for the log; also fails the job if no valid results were produced. diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md index 6ceb512ef..ced877dd8 100644 --- a/experimental/CollectiveX/plan.md +++ b/experimental/CollectiveX/plan.md @@ -29,8 +29,9 @@ Existing public benchmarks don't offer trustworthy, like-for-like collective/EP The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) and GB200 (4× NVL72 MNNVL, aarch64) — 4 NCCL primitives, correctness-passed, topology-keyed distinctly (peak bus-bw: B200 all-reduce 835 GB/s; GB200 689 GB/s). Built on top of that: - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. -- **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|all) through a shared `launchers/run_in_container.sh`. +- **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`. - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. 
+- **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP.
 
 This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental).
 
diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py
new file mode 100644
index 000000000..d4d0297ef
--- /dev/null
+++ b/experimental/CollectiveX/run_mori.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+"""CollectiveX spike — MoRI (AMD) MoE dispatch+combine, normal mode.
+
+AMD counterpart to run_deepep.py, using ROCm MoRI's EpDispatchCombine op. One
+decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed,
+emitting the same flat-JSON shape (family=moe, backend=mori).
+
+  !!! MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block
+  below follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py
+  and is marked "ADAPT HERE" — validate the signatures against the MoRI build in
+  the image (rocm/sgl-dev:...-mori-...) and record its commit. This file has NOT
+  been run on MI355X yet (no cluster access at authoring time); treat the first
+  on-runner run as the validation, exactly as run_deepep.py was for GB200.
+
+Launch (one process per GPU), e.g. single-node 8x MI355X:
+    torchrun --nproc_per_node=8 run_mori.py \\
+        --runner mi355x-amds --topology-class mi355x-xgmi --transport xgmi \\
+        --env-json results/env.json --out results/mi355x_mori.json
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import os
+import sys
+
+SCHEMA_VERSION = 1
+MEASUREMENT_CONTRACT = "mori-normal-v1"
+
+
+def _percentile(xs: list[float], q: float) -> float:
+    """Nearest-rank percentile `q` (0-100) of `xs`; NaN for an empty list."""
+    if not xs:
+        return float("nan")
+    s = sorted(xs)
+    i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1)))))
+    return s[i]
+
+
+def comparison_key(meta: dict) -> str:
+    """16-hex-digit SHA-256 prefix over the fields that identify a measurement."""
+    parts = [
+        meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]),
+        str(meta["nodes"]), meta["topology_class"], meta["comparison_class"],
+        meta["measurement_contract"], str(meta["shape"]),
+    ]
+    return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def main() -> int:
+    """Run one MoRI dispatch+combine point; rank 0 writes the JSON. Returns the exit code."""
+    ap = argparse.ArgumentParser(description="CollectiveX MoRI dispatch+combine (normal mode)")
+    ap.add_argument("--tokens-per-rank", type=int, default=64)
+    ap.add_argument("--hidden", type=int, default=7168)
+    ap.add_argument("--topk", type=int, default=8)
+    ap.add_argument("--experts", type=int, default=256)
+    ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"])
+    ap.add_argument("--seed", type=int, default=67)
+    ap.add_argument("--warmup", type=int, default=20)
+    ap.add_argument("--iters", type=int, default=200)
+    ap.add_argument("--trials", type=int, default=3)
+    ap.add_argument("--block-num", type=int, default=int(os.environ.get("CX_MORI_BLOCK_NUM", "80")))
+    ap.add_argument("--dispatch-warps", type=int, default=int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16")))
+    ap.add_argument("--combine-warps", type=int, default=int(os.environ.get("CX_MORI_COMBINE_WARPS", "8")))
+    ap.add_argument("--runner", required=True)
+    ap.add_argument("--topology-class", required=True)
+    ap.add_argument("--transport", default="")
+    ap.add_argument("--comparison-class", default="standardized")
+    ap.add_argument("--mori-commit", default=os.environ.get("MORI_COMMIT", "unknown"))
+    ap.add_argument("--env-json")
+    ap.add_argument("--timestamp")
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+    try:
+        import mori  # type: ignore
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: mori import failed — needs the AMD MoRI image. {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    if args.dispatch_dtype != "bf16":
+        # The config below is hard-wired to bf16 with quant_type="none"; refuse rather
+        # than stamp an fp8 label (it feeds `shape` and comparison_key) on a bf16 run.
+        if rank == 0:
+            print(f"ERROR: --dispatch-dtype {args.dispatch_dtype} is not wired yet (bf16 only)", file=sys.stderr)
+        return 2
+    if args.experts % world_size != 0:
+        if rank == 0:
+            print(f"ERROR: experts ({args.experts}) must be divisible by world_size ({world_size})", file=sys.stderr)
+        return 2
+    experts_per_rank = args.experts // world_size
+    torch.manual_seed(args.seed + rank)
+
+    # ===================== ADAPT HERE (MoRI API) =========================
+    # init torch.distributed + MoRI shmem (per the MoRI dispatch/combine test).
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+    if not dist.is_initialized():
+        dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
+                                world_size=world_size, device_id=device)
+    world_group = torch.distributed.group.WORLD
+    torch._C._distributed_c10d._register_process_group("default", world_group)
+    mori.shmem.shmem_torch_process_group_init("default")
+
+    n = args.tokens_per_rank
+    H = args.hidden
+    topk = args.topk
+    config = mori.ops.EpDispatchCombineConfig(
+        data_type=torch.bfloat16,
+        rank=rank,
+        world_size=world_size,
+        hidden_dim=H,
+        scale_dim=0,
+        scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+        max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+        max_num_inp_token_per_rank=max(4096, n),
+        num_experts_per_rank=experts_per_rank,
+        num_experts_per_token=topk,
+        use_external_inp_buf=False,
+        quant_type="none",
+    )
+    op = mori.ops.EpDispatchCombineOp(config)
+
+    # Routing: each token -> topk distinct experts in [0, experts). MoRI expects
+    # INT32 expert indices, and a real (n, scale_dim) fp8 scales tensor even when
+    # scale_dim==0 (an (n,0) tensor) — not None (see the reference test).
+    x = torch.randn((n, H), dtype=torch.bfloat16, device=device)
+    indices = torch.stack([torch.randperm(args.experts, device=device)[:topk] for _ in range(n)]).to(torch.int32)
+    weights = torch.rand((n, topk), dtype=torch.float32, device=device)
+    scales = torch.empty((n, 0), dtype=torch.float8_e4m3fnuz, device=device)
+
+    def run_once():
+        (dispatch_output, dispatch_weights, _dispatch_scales,
+         dispatch_indices, recv_num) = op.dispatch(
+            x, weights, scales, indices,
+            block_num=args.block_num, warp_per_block=args.dispatch_warps)
+        # Zero-copy mode (use_external_inp_buf=False): combine reads from MoRI's
+        # registered combine-input buffer, so stage the dispatched rows into it
+        # first. (In a real MoE the expert FFN writes its outputs here; with no
+        # expert compute we copy the dispatched activations straight through.)
+        total_recv = int(recv_num[0].item())
+        combine_input = dispatch_output.to(torch.bfloat16)
+        combine_buf = op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=combine_input.size(1))
+        combine_buf[:total_recv, :].copy_(combine_input[:total_recv, :])
+        combined, _combined_w = op.combine(
+            combine_input, dispatch_weights, dispatch_indices,
+            block_num=args.block_num, warp_per_block=args.combine_warps)
+        return combined, recv_num
+    # =====================================================================
+
+    # ---- correctness gate ----
+    combined, recv_num = run_once()
+    torch.cuda.synchronize()
+    # MoRI combine sums one copy per destination RANK, so combined[i] ≈
+    # input[i] * (#unique destination ranks among the token's topk experts)
+    # (see ROCm/mori .../test_dispatch_combine.py).
+    pes = indices.long() // experts_per_rank
+    unique_pes = torch.tensor(
+        [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32
+    ).unsqueeze(1)
+    expected = x.float() * unique_pes
+    max_abs = (combined.float() - expected).abs().max().item()
+    max_rel = max_abs / (expected.abs().max().item() + 1e-6)
+    # Validated tolerance from the reference test (bf16 + up-to-topk summation).
+    combine_ok = bool(torch.allclose(combined.float(), expected.float(), atol=1e-2, rtol=1e-2))
+    recv_ok = bool(int(recv_num[0].item()) > 0) if recv_num is not None else True
+    correct = bool(combine_ok and recv_ok)
+    # Only rank 0 writes the JSON, so fold every rank's verdict into `correct`:
+    # a failure on any other rank must not be published as status=valid.
+    ok_flag = torch.tensor([1.0 if correct else 0.0], device=device)
+    dist.all_reduce(ok_flag, op=dist.ReduceOp.MIN)
+    correct = bool(ok_flag.item() > 0.5)
+
+    def time_us(fn, warmup, iters) -> list[float]:
+        for _ in range(warmup):
+            fn()
+        torch.cuda.synchronize()
+        out = []
+        for _ in range(iters):
+            s = torch.cuda.Event(enable_timing=True)
+            e = torch.cuda.Event(enable_timing=True)
+            s.record(); fn(); e.record(); torch.cuda.synchronize()
+            out.append(s.elapsed_time(e) * 1000.0)
+        return out
+
+    def dispatch_only():
+        op.dispatch(x, weights, scales, indices,
+                    block_num=args.block_num, warp_per_block=args.dispatch_warps)
+
+    trials = []
+    for _ in range(args.trials):
+        rt = time_us(run_once, args.warmup, args.iters)
+        dp = time_us(dispatch_only, args.warmup, args.iters)
+        trials.append({"roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99),
+                       "dispatch_us_p50": _percentile(dp, 50)})
+
+    local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
+    t = torch.tensor([local_rt_p50], device=device)
+    dist.all_reduce(t, op=dist.ReduceOp.MAX)
+    slowest_rank_us = float(t.item())
+
+    if rank == 0:
+        shape = {"tokens_per_rank": n, "hidden": H, "topk": topk, "experts": args.experts,
+                 "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype}
+        meta = {"op": "dispatch-combine", "backend": "mori", "mode": "normal",
+                "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+                "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+                "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape}
+        rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials)
+        tokens_total = n * world_size
+        env = None
+        if args.env_json and os.path.exists(args.env_json):
+            with open(args.env_json) as fh:
+                env = json.load(fh)
+        doc = {
+            "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "run_mori.py",
+            "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+            "runner": args.runner, "transport": args.transport,
+            "status": "valid" if correct else "invalid",
+            "comparison_key": comparison_key(meta),
+            "backend_provenance": {"mori_commit": args.mori_commit,
+                                   "block_num": args.block_num,
+                                   "dispatch_warps": args.dispatch_warps,
+                                   "combine_warps": args.combine_warps},
+            **meta,
+            "correctness": {"passed": correct, "combine_within_tol": combine_ok,
+                            "recv_nonzero": recv_ok, "max_abs_error": max_abs, "max_rel_error": max_rel},
+            "metrics": {
+                "roundtrip_us_p50": rt_p50,
+                "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials),
+                "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials),
+                "slowest_rank_roundtrip_us": slowest_rank_us,
+                "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None,
+            },
+            "trials": trials, "environment": env,
+        }
+        os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+        with open(args.out, "w") as fh:
+            json.dump(doc, fh, indent=2)
+            fh.write("\n")
+        print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us "
+              f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}")
+
+    try:
+        mori.shmem.shmem_finalize()
+    except Exception:
+        pass
+    dist.barrier()
+    dist.destroy_process_group()
+    return 0 if correct else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
From d8ee9bf858a3471f2899276fa1a22aedfce8f32a Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:25:10 +0800
Subject: [PATCH 07/17] CollectiveX: run MI355X MoRI on push; align launcher with serving script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- workflow: replace the on:push GB200 NCCL smoke with the MI355X MoRI dispatch/combine run (runs-on: mi355x, CX_BENCH=mori), and name the job "CollectiveX Experimental" (no longer "smoke").
GB200/B200 NCCL + DeepEP remain on workflow_dispatch. - launch_mi355x-amds.sh: adapt more faithfully to runners/launch_mi355x-amds.sh — squeue by job-name only (no -u), flock -w 600, and clear ROCm gpucore.* dumps after the run so the next checkout is clean. Bump default CX_TIME to 60 for a cold ROCm-image import. - summarize.py: drop the "N/N results valid." footer from both the job-summary (markdown) and plain output; the failure gate still reports invalid results. Relabel the MoE section "MoE dispatch+combine (DeepEP / MoRI)". - docs: README/plan describe push -> MI355X MoRI. --- .../workflows/collectivex-experimental.yml | 33 +++++++++---------- experimental/CollectiveX/README.md | 4 +-- .../launchers/launch_mi355x-amds.sh | 11 ++++--- experimental/CollectiveX/plan.md | 2 +- experimental/CollectiveX/summarize.py | 7 ++-- 5 files changed, 27 insertions(+), 30 deletions(-) diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index c98646efe..fcfdcb88e 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -1,11 +1,11 @@ name: CollectiveX Experimental # Orchestration only — all benchmark logic lives in experimental/CollectiveX/. -# Push to the feature branch runs a small GB200 NCCL smoke (no merge to main -# needed); workflow_dispatch runs a chosen SKU + benchmark (the lane for B200, -# DeepEP, and larger sweeps). Each job lands on the SKU's self-hosted runner and -# invokes that SKU's launch script — the same launch_${RUNNER_NAME%%_*}.sh -# convention the serving benchmarks use. +# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no +# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane +# for GB200/B200 NCCL, DeepEP, and larger sweeps). 
Each job lands on the SKU's +# self-hosted runner and invokes that SKU's launch script — the same +# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use. on: push: @@ -54,23 +54,20 @@ permissions: contents: read jobs: - # Push -> short GB200 NCCL smoke (idle capacity; never auto-contends with the - # B200 serving sweep). GB200 runner workspace is staged to compute-visible - # Lustre via CX_STAGE_DIR. - smoke: + # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and + # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute- + # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs. + experimental: + name: CollectiveX Experimental if: github.event_name == 'push' - runs-on: gb200 - timeout-minutes: 60 + runs-on: mi355x + timeout-minutes: 90 env: - CX_BENCH: nccl - CX_NGPUS: '4' - CX_MAX_BYTES: 1G - CX_TIME: '20' - CX_STAGE_DIR: /mnt/lustre01/users-public/sa-shared/cx-stage + CX_BENCH: mori steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 with: { clean: true } - - name: Launch GB200 NCCL smoke + - name: Launch MI355X MoRI env: RUNNER_NAME: ${{ runner.name }} run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" @@ -81,7 +78,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: collectivex_smoke_gb200_${{ github.run_id }} + name: collectivex_mi355x_mori_${{ github.run_id }} path: experimental/CollectiveX/results/*.json if-no-files-found: warn diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index ac489f541..11bbd8aaa 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -30,8 +30,8 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL ### Via GitHub Actions (`.github/workflows/collectivex-experimental.yml`) -- **push** to `experimental/CollectiveX/**` → short **GB200 NCCL smoke** (idle - 
capacity; never auto-contends with the B200 serving sweep). +- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine + run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner). - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops, sizes, ngpus. Lands on that SKU's self-hosted runner and runs diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh index f6901f7d4..f1117229c 100644 --- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -17,7 +17,7 @@ # Run from inside the InferenceX checkout on the MI355X login node: # bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # -# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(30) CX_IMAGE +# Env knobs: CX_PARTITION(compute) CX_NGPUS(8) CX_TIME(60) CX_IMAGE # CX_SQUASH_DIR(/var/lib/squash) CX_EXCLUDE_NODES CX_DRYRUN(0) set -euo pipefail @@ -30,7 +30,7 @@ source "$HERE/common.sh" RUNNER_NAME="${RUNNER_NAME:-mi355x-amds}" PARTITION="${CX_PARTITION:-compute}" NGPUS="${CX_NGPUS:-8}" -TIME_MIN="${CX_TIME:-30}" +TIME_MIN="${CX_TIME:-60}" # generous: a cold enroot import of the large ROCm image IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}" SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}" # node-local on MI355X EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" @@ -59,7 +59,7 @@ command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm lo salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" -JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" +JOB_ID="$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)" [ -n "$JOB_ID" ] || cx_die "could not resolve allocated 
JOB_ID" cx_log "JOB_ID=$JOB_ID" trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT @@ -70,7 +70,7 @@ trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true srun --jobid="$JOB_ID" bash -c " exec 9>\"$LOCK_FILE\" - flock -w 900 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } + flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then echo 'squash present: $SQUASH_FILE' else @@ -88,4 +88,7 @@ srun --jobid="$JOB_ID" \ bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" +# ROCm can leave gpucore.* dumps in the workdir on a crash; clear them so the +# next checkout on this runner is clean (mirrors the serving launcher). +rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md index ced877dd8..7f1e19d64 100644 --- a/experimental/CollectiveX/plan.md +++ b/experimental/CollectiveX/plan.md @@ -30,7 +30,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. - **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`. 
-- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → GB200 NCCL smoke; `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. +- **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. - **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP. This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). 
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py index 8d81b13ee..dd51f7c73 100644 --- a/experimental/CollectiveX/summarize.py +++ b/experimental/CollectiveX/summarize.py @@ -71,7 +71,7 @@ def render_plain(nccl, moe, n_valid, total) -> str: out.append(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") if moe: - out.append("\nMoE / DeepEP dispatch+combine:") + out.append("\nMoE dispatch+combine (DeepEP / MoRI):") out.append(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13} correct") for d in sorted(moe, key=lambda x: x.get("backend", "")): m, c = d.get("metrics", {}), d.get("correctness", {}) @@ -80,7 +80,6 @@ def render_plain(nccl, moe, n_valid, total) -> str: f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}" f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}" f"{(tps if tps is not None else float('nan')):>13.3e} {c.get('passed')}") - out.append(f"\n{n_valid}/{total} results valid.") return "\n".join(out) @@ -103,7 +102,7 @@ def render_markdown(nccl, moe, n_valid, total) -> str: out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | " f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |") if moe: - out.append("\n### MoE / DeepEP dispatch+combine\n") + out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n") out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |") out.append("|---|---|---|--:|--:|--:|--:|:--:|") for d in sorted(moe, key=lambda x: x.get("backend", "")): @@ -112,8 +111,6 @@ def render_markdown(nccl, moe, n_valid, total) -> str: f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | " f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | " f"{'✅' if 
c.get('passed') else '❌'} |") - badge = "✅" if (total and n_valid == total) else "⚠️" - out.append(f"\n{badge} **{n_valid}/{total} results valid.**") if not total: out.append("\n> No result files found — the benchmark produced nothing.") return "\n".join(out) From ac3f1b9df26072a81dfe397c13edae75bce652a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 17:37:58 +0800 Subject: [PATCH 08/17] CollectiveX: size MoRI symmetric heap (first MI355X run hit the 2 GiB default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First MI355X run reached the MoRI dispatch kernel — salloc, ROCm-image import, mount, torchrun, 8-rank Gloo + shmem init, and EpDispatchCombineConfig/op/dispatch all worked, confirming the API signatures. It OOM'd MoRI's default 2 GiB static symmetric heap (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). run_mori.py now sets MORI_SHMEM_HEAP_SIZE before `import mori` (default 16 GiB, override CX_MORI_HEAP_BYTES). Docstring + CONTAINERS.md record the finding; correctness/timing validated by the heap-sized re-run. --- experimental/CollectiveX/CONTAINERS.md | 2 +- experimental/CollectiveX/run_mori.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index 1c82e0f66..ee4114cff 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -49,7 +49,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. 
- **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up. -- **NOT yet validated on hardware** (no MI355X access at authoring). Treat the first runner job as the validation, exactly as `run_deepep.py` was on GB200. Likely first-run touch-ups: MoRI Python API signatures (`EpDispatchCombineConfig` kwargs, `dispatch`/`combine`/`get_registered_combine_input_buffer`), then fill a version table here (ROCm, torch, RCCL, MoRI commit). +- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default 16 GiB; override `CX_MORI_HEAP_BYTES`). Correctness + timing are validated by the heap-sized re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit). ## Cluster access / QOS diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py index d4d0297ef..dc724d398 100644 --- a/experimental/CollectiveX/run_mori.py +++ b/experimental/CollectiveX/run_mori.py @@ -5,12 +5,12 @@ decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed, emitting the same flat-JSON shape (family=moe, backend=mori). - !!! MoRI's Python API is VERSION-SENSITIVE. 
The config/dispatch/combine block - below follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py - and is marked "ADAPT HERE" — validate the signatures against the MoRI build in - the image (rocm/sgl-dev:...-mori-...) and record its commit. This file has NOT - been run on MI355X yet (no cluster access at authoring time); treat the first - on-runner run as the validation, exactly as run_deepep.py was for GB200. + MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block below + follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py. The + first MI355X run (image rocm/sgl-dev:...-mori-0227-2) confirmed the setup + + config + dispatch path reach the MoRI kernel; it OOM'd the default 2 GiB + symmetric heap, now sized up via MORI_SHMEM_HEAP_SIZE above. The correctness + gate and timing are validated by the heap-sized re-run. Launch (one process per GPU), e.g. single-node 8x MI355X: torchrun --nproc_per_node=8 run_mori.py \\ @@ -26,6 +26,15 @@ import os import sys +# MoRI's symmetric-memory heap defaults to 2 GiB (static) — too small for the +# DeepSeek hidden size (7168) across 8 ranks: the dispatch/combine buffers +# overflow it ("Out of static heap memory ... Increase via MORI_SHMEM_HEAP_SIZE", +# observed on the first MI355X run). Size it generously here, BEFORE `import mori` +# (the heap is created at shmem init); MI355X HBM is ample. Layered override: +# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_BYTES > 16 GiB default. 
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", + os.environ.get("CX_MORI_HEAP_BYTES", str(16 * 1024**3))) + SCHEMA_VERSION = 1 MEASUREMENT_CONTRACT = "mori-normal-v1" From 46208f23b281c4c7e3bf8e91636ef845bca4b4cf Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 17:48:08 +0800 Subject: [PATCH 09/17] CollectiveX: set MoRI heap to 6G (16 GiB failed RDMA MR registration) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The heap-bump run cleared the 2 GiB OOM but then failed registering the 16 GiB symmetric heap as an RDMA memory region (errno 22 EINVAL, size=17179869184). ROCm/mori's reference test uses MORI_SHMEM_HEAP_SIZE="6G" single-node — big enough for the hidden=7168 dispatch/combine buffers, small enough to register. Match it: default "6G" (override CX_MORI_HEAP_SIZE). The rest of the config already matches the reference (max_num_inp_token_per_rank=4096, hidden=7168, backend cpu:gloo,cuda:nccl), so this lands on the proven single-node setup. --- experimental/CollectiveX/CONTAINERS.md | 2 +- experimental/CollectiveX/run_mori.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index ee4114cff..701656ce7 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -49,7 +49,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). 
pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up. -- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default 16 GiB; override `CX_MORI_HEAP_BYTES`). Correctness + timing are validated by the heap-sized re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit). +- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default **`6G`**, matching MoRI's reference test; override `CX_MORI_HEAP_SIZE`). A 16 GiB heap allocated but then failed RDMA MR registration (`errno 22 EINVAL`) — 6 GiB is large enough for the hidden=7168 buffers and registers cleanly. Correctness + timing are validated by the re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit). 
## Cluster access / QOS diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py index dc724d398..b5aaff3b8 100644 --- a/experimental/CollectiveX/run_mori.py +++ b/experimental/CollectiveX/run_mori.py @@ -26,14 +26,15 @@ import os import sys -# MoRI's symmetric-memory heap defaults to 2 GiB (static) — too small for the -# DeepSeek hidden size (7168) across 8 ranks: the dispatch/combine buffers -# overflow it ("Out of static heap memory ... Increase via MORI_SHMEM_HEAP_SIZE", -# observed on the first MI355X run). Size it generously here, BEFORE `import mori` -# (the heap is created at shmem init); MI355X HBM is ample. Layered override: -# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_BYTES > 16 GiB default. +# MoRI's symmetric-memory heap defaults to 2 GiB (static), too small for the +# DeepSeek hidden size (7168) across 8 ranks (dispatch/combine buffers overflow +# it). Set it BEFORE `import mori` (the heap is created at shmem init). Use the +# reference test's "6G": big enough for the buffers, and small enough to +# RDMA-register — a 16 GiB heap allocated fine but failed RDMA MR registration +# (errno 22 EINVAL) on the first heap-bumped MI355X run. Layered override: +# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "6G". 
os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", - os.environ.get("CX_MORI_HEAP_BYTES", str(16 * 1024**3))) + os.environ.get("CX_MORI_HEAP_SIZE", "6G")) SCHEMA_VERSION = 1 MEASUREMENT_CONTRACT = "mori-normal-v1" From b62de9949d9348af732037bce2c0c51169d21f91 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:34:27 +0800 Subject: [PATCH 10/17] CollectiveX: MoRI MI355X validated on hardware; fix heap/buffer/teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drove run_mori.py to a correct run on 8x MI355X (on-node via salloc+srun): dispatch+combine numerically correct (combine within tol, max_rel ~2e-3), ~85us round-trip at the decode shape. The first runs surfaced five issues, all fixed and re-validated: - RDMA MR ceiling: MoRI registers the WHOLE symmetric heap as one RDMA MR at init (even single-node; no disable-RDMA knob). The ionic_rdma NICs cap GPU MRs at ~4 GiB — a 6 GiB heap fails (RegisterRdmaMemoryRegion errno 22), 2 GiB registers. Hold heap at MORI_SHMEM_HEAP_SIZE=2G (override CX_MORI_HEAP_SIZE). - Buffer sizing: max_num_inp_token_per_rank 4096 -> max(512, n) so the buffers fit the 2 GiB heap (4096 was inherited from the reference test). - Correctness shape: combine returns the full max-token buffer; compare only combined[:n] against expected. - recv count: read total_recv BEFORE combine (combine resets recv_num, which made recv_nonzero a false negative). - Teardown: MoRI's shmem teardown asserts (CheckStatusValid -> SIGABRT) when the op is destroyed after shmem_finalize(); hard-exit after writing results. Docs (README/plan/CONTAINERS) updated from "scaffolded" to validated, with the fabric constraints recorded.
--- experimental/CollectiveX/CONTAINERS.md | 7 ++- experimental/CollectiveX/README.md | 13 ++--- experimental/CollectiveX/plan.md | 2 +- experimental/CollectiveX/run_mori.py | 66 ++++++++++++++++---------- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index 701656ce7..52dfc3b80 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -49,7 +49,12 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). - **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up. -- **First MI355X run reached the MoRI dispatch kernel** (node `mia1-p01-g10`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB squash) → mount → torchrun → 8-rank Gloo + MoRI shmem init → `EpDispatchCombineConfig`/op/`dispatch` all worked, confirming the API signatures. It then OOM'd MoRI's default **2 GiB static symmetric heap** (hidden=7168 dispatch/combine buffers across 8 ranks request ~0.9 GiB each). `run_mori.py` now sets **`MORI_SHMEM_HEAP_SIZE`** before `import mori` (default **`6G`**, matching MoRI's reference test; override `CX_MORI_HEAP_SIZE`). 
A 16 GiB heap allocated but then failed RDMA MR registration (`errno 22 EINVAL`) — 6 GiB is large enough for the hidden=7168 buffers and registers cleanly. Correctness + timing are validated by the re-run; then fill a version table here (ROCm, torch, RCCL, MoRI commit). +- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`: + - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here. + - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now. + - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `run_mori.py` hard-exits after writing results to avoid it. + + Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image. ## Cluster access / QOS diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index 11bbd8aaa..4540033b4 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -104,12 +104,13 @@ DeepSeek-V4 fallback images. 
it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against the built commit. B200 (x86_64) first; GB200 (aarch64) follows. -- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **scaffolded, not yet - run on hardware** (no MI355X access). It mirrors `ROCm/mori`'s dispatch/combine - example — config + the `get_registered_combine_input_buffer` zero-copy path, - correctness `expected = input × (#unique destination ranks)`. The API is - version-sensitive (`ADAPT HERE`), so the first runner job is the validation, like - GB200 was for DeepEP; the AMD ROCm image isn't digest-pinned yet. +- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **validated on + hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip). + It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer` + zero-copy path, `expected = input × #unique-destination-ranks`). Three + ionic_rdma-fabric constraints are baked in (see `CONTAINERS.md`): a 2 GiB heap + (the NICs cap RDMA MRs at ~4 GiB), a bounded `max_num_inp_token_per_rank`, and a + hard-exit past MoRI's buggy shmem teardown. The ROCm image isn't digest-pinned yet. - **Multi-node** (`launch_b200-dgxc-slurm.sh`) assumes `srun --mpi=pmix` + a compute-visible checkout (`CX_STAGE_DIR`); else fall back to mpirun-in-container or srt-slurm. CX_BENCH=nccl only for now. diff --git a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md index 7f1e19d64..d39f96967 100644 --- a/experimental/CollectiveX/plan.md +++ b/experimental/CollectiveX/plan.md @@ -31,7 +31,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. 
Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. - **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`. - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. -- **AMD MI355X / MoRI path scaffolded** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Not yet hardware-validated** (no MI355X access) — the MoRI Python API is version-sensitive (`ADAPT HERE`); the first runner job is the validation, as GB200 was for DeepEP. +- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. 
**Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `run_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`). This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py index b5aaff3b8..f99775427 100644 --- a/experimental/CollectiveX/run_mori.py +++ b/experimental/CollectiveX/run_mori.py @@ -5,12 +5,14 @@ decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed, emitting the same flat-JSON shape (family=moe, backend=mori). - MoRI's Python API is VERSION-SENSITIVE. The config/dispatch/combine block below - follows ROCm/mori examples/ops/dispatch_combine/test_dispatch_combine.py. The - first MI355X run (image rocm/sgl-dev:...-mori-0227-2) confirmed the setup + - config + dispatch path reach the MoRI kernel; it OOM'd the default 2 GiB - symmetric heap, now sized up via MORI_SHMEM_HEAP_SIZE above. The correctness - gate and timing are validated by the heap-sized re-run. + VALIDATED on MI355X (8x, image rocm/sgl-dev:...-mori-0227-2): dispatch+combine + numerically correct (combine within tol, max_rel ~2e-3), ~85 us round-trip at + the decode shape. The config/dispatch/combine API follows ROCm/mori's reference + test. 
Three constraints on this ionic_rdma fabric are handled here: (1) MoRI + registers the whole symmetric heap as ONE RDMA MR and these NICs cap GPU-memory + MRs at ~4 GiB, so the heap is held at 2 GiB (above); (2) max_num_inp_token_per_rank + is bounded so the buffers fit that heap (below); (3) MoRI's shmem teardown + asserts after finalize, so we hard-exit after writing results (end of main). Launch (one process per GPU), e.g. single-node 8x MI355X: torchrun --nproc_per_node=8 run_mori.py \\ @@ -26,15 +28,15 @@ import os import sys -# MoRI's symmetric-memory heap defaults to 2 GiB (static), too small for the -# DeepSeek hidden size (7168) across 8 ranks (dispatch/combine buffers overflow -# it). Set it BEFORE `import mori` (the heap is created at shmem init). Use the -# reference test's "6G": big enough for the buffers, and small enough to -# RDMA-register — a 16 GiB heap allocated fine but failed RDMA MR registration -# (errno 22 EINVAL) on the first heap-bumped MI355X run. Layered override: -# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "6G". +# MoRI registers the WHOLE symmetric heap as one RDMA memory region at shmem +# init (set this BEFORE `import mori`). On the MI355X ionic_rdma NICs the GPU- +# memory MR registration has a hard size ceiling (~4 GiB): a 6 GiB heap fails +# (`RegisterRdmaMemoryRegion ... errno 22 EINVAL`, validated on-node), while +# 2 GiB registers cleanly. So keep the heap at 2 GiB and instead bound the +# buffers via max_num_inp_token_per_rank below. Layered override: +# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "2G". 
os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", - os.environ.get("CX_MORI_HEAP_SIZE", "6G")) + os.environ.get("CX_MORI_HEAP_SIZE", "2G")) SCHEMA_VERSION = 1 MEASUREMENT_CONTRACT = "mori-normal-v1" @@ -127,7 +129,12 @@ def main() -> int: scale_dim=0, scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(), max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(), - max_num_inp_token_per_rank=max(4096, n), + # Sizes MoRI's symmetric buffers. The reference test uses 4096, but at + # hidden=7168 that overflows the registerable 2 GiB heap (see top). Bound + # it to the workload (decode shapes are tens of tokens/rank); 512 fits the + # 2 GiB heap and was validated on-node. Larger token counts may need a + # heap above the NIC's MR ceiling — out of reach on this fabric for now. + max_num_inp_token_per_rank=max(512, n), num_experts_per_rank=experts_per_rank, num_experts_per_token=topk, use_external_inp_buf=False, @@ -160,25 +167,30 @@ def run_once(): combined, _combined_w = op.combine( combine_input, dispatch_weights, dispatch_indices, block_num=args.block_num, warp_per_block=args.combine_warps) - return combined, recv_num + # Return total_recv (read BEFORE combine — combine resets recv_num), not + # the tensor: reading recv_num[0] after combine yields 0 (false negative). + return combined, total_recv # ===================================================================== # ---- correctness gate ---- - combined, recv_num = run_once() + combined, total_recv = run_once() torch.cuda.synchronize() # MoRI combine sums one copy per destination RANK, so combined[i] ≈ # input[i] * (#unique destination ranks among the token's topk experts) - # (see ROCm/mori .../test_dispatch_combine.py). + # (see ROCm/mori .../test_dispatch_combine.py). combine returns the full + # max_num_inp_token_per_rank-sized buffer; only the first n rows are our + # local input tokens, so slice to [:n] before comparing. 
+ combined_valid = combined[:n].float() pes = indices.long() // experts_per_rank unique_pes = torch.tensor( [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32 ).unsqueeze(1) expected = x.float() * unique_pes - max_abs = (combined.float() - expected).abs().max().item() + max_abs = (combined_valid - expected).abs().max().item() max_rel = max_abs / (expected.abs().max().item() + 1e-6) # Validated tolerance from the reference test (bf16 + up-to-topk summation). - combine_ok = bool(torch.allclose(combined.float(), expected.float(), atol=1e-2, rtol=1e-2)) - recv_ok = bool(int(recv_num[0].item()) > 0) if recv_num is not None else True + combine_ok = bool(torch.allclose(combined_valid, expected.float(), atol=1e-2, rtol=1e-2)) + recv_ok = total_recv > 0 correct = bool(combine_ok and recv_ok) def time_us(fn, warmup, iters) -> list[float]: @@ -251,13 +263,17 @@ def dispatch_only(): print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us " f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}") + # MoRI's shmem teardown asserts when the EpDispatchCombineOp is destroyed + # after shmem_finalize() (CheckStatusValid abort -> SIGABRT on this build, + # validated on-node). The result JSON is already written above, so just sync + # the ranks and hard-exit, skipping the buggy finalize/destructor path. 
try: - mori.shmem.shmem_finalize() + dist.barrier() except Exception: pass - dist.barrier() - dist.destroy_process_group() - return 0 if correct else 1 + sys.stdout.flush() + sys.stderr.flush() + os._exit(0 if correct else 1) if __name__ == "__main__": From 481ef595a59ae616062c82dcd7ffc6d1e654dd38 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 19:15:17 +0800 Subject: [PATCH 11/17] CollectiveX: wire rccl-tests collective primitives for MI355X (CX_BENCH=nccl) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the AMD collective-primitive path so all_reduce/reduce_scatter/all_gather/ alltoall run on MI355X, not just MoRI: - common.sh: cx_build_rccl_tests — clones ROCm/rccl-tests and builds with `make` against /opt/rocm (amdclang++/librccl). It's a nccl-tests fork producing the same _perf binaries and output format, so run_nccl.py parses it unchanged. Validated building + running all 4 ops in-container on MI355X (correctness OK). - run_in_container.sh: run_nccl_suite picks rccl-tests on ROCm (/opt/rocm or hipcc), nccl-tests otherwise; identical op loop + run_nccl.py invocation. - launch_mi355x-amds.sh: honor CX_BENCH (mori default | nccl) instead of forcing mori; same -g N single-node 8-GPU launch. - docs: README/CONTAINERS note the rccl path. B200 already has the nccl path; this makes primitives available on all three SKUs via workflow_dispatch. 
--- experimental/CollectiveX/CONTAINERS.md | 2 +- experimental/CollectiveX/README.md | 8 +++-- experimental/CollectiveX/launchers/common.sh | 30 +++++++++++++++++++ .../launchers/launch_mi355x-amds.sh | 18 ++++++----- .../CollectiveX/launchers/run_in_container.sh | 14 +++++++-- 5 files changed, 57 insertions(+), 15 deletions(-) diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index 52dfc3b80..1d84bffd5 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -48,7 +48,7 @@ for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). - **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image. - **MoRI:** bundled in-image (build tag `mori-0227`). `run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). -- **Transport:** intra-node **XGMI** (8× MI355X). No rccl-tests primitive path is wired on AMD yet — **MoRI only** (`CX_BENCH=mori`); RCCL primitives are a follow-up. +- **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged). 
- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`: - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here. - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now. 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index 4540033b4..5cea3b15b 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -21,7 +21,7 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | | `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) | -| `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI) | +| `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) | | `CONTAINERS.md` | the pinned multi-arch container + audited library versions | | `results/` | flat JSON artifacts (+ `plots/`, raw captures) | | `tests/fixtures/` | captured nccl-tests output for offline parser checks | @@ -33,7 +33,8 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL - **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner). - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / - mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only), ops, + mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl` + on MI355X runs rccl-tests), ops, sizes, ngpus. Lands on that SKU's self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. 
@@ -49,7 +50,8 @@ bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB2 CX_BENCH=deepep bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # GB200, DeepEP (rebuild) bash experimental/CollectiveX/launchers/launch_b200-dgxc.sh # B200 8× NVLink bash experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh # B200 2-node, cross-IB -bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X 8× XGMI, MoRI EP (AMD; forces CX_BENCH=mori) +bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X 8× XGMI, MoRI EP (CX_BENCH=mori, default) +CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # MI355X primitives via rccl-tests ``` Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh index 7d63dfdc8..10b46eb31 100644 --- a/experimental/CollectiveX/launchers/common.sh +++ b/experimental/CollectiveX/launchers/common.sh @@ -124,3 +124,33 @@ cx_build_nccl_tests() { [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin" echo "$dir/build" } + +# cx_build_rccl_tests -> echoes the build/ dir. +# AMD/ROCm counterpart of cx_build_nccl_tests: ROCm/rccl-tests is a fork of +# nccl-tests producing the SAME binary names (_perf) and output format, so +# run_nccl.py parses it unchanged. `make` defaults to ROCm at /opt/rocm +# (amdclang++ + librccl); validated building in-container on MI355X. Override +# CX_ROCM_HOME / CX_RCCL_HOME / CX_MPI_HOME if the toolchain lives elsewhere. +cx_build_rccl_tests() { + local parent="$1" mpi="${2:-0}" dir bin + dir="$parent/rccl-tests" + bin="$dir/build/all_reduce_perf" + if [ -x "$bin" ]; then + cx_log "rccl-tests already built: $dir/build" + echo "$dir/build"; return 0 + fi + mkdir -p "$parent" + if [ ! 
-d "$dir/.git" ]; then + cx_log "cloning rccl-tests -> $dir" + git clone --depth 1 https://github.com/ROCm/rccl-tests.git "$dir" >&2 \ + || cx_die "git clone rccl-tests failed" + fi + cx_log "building rccl-tests (MPI=$mpi, ROCm ${CX_ROCM_HOME:-/opt/rocm})" + make -C "$dir" -j MPI="$mpi" \ + ${CX_ROCM_HOME:+HIP_HOME="$CX_ROCM_HOME"} \ + ${CX_RCCL_HOME:+RCCL_HOME="$CX_RCCL_HOME"} \ + ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \ + || cx_die "rccl-tests build failed (need ROCm + librccl; try CX_ROCM_HOME)" + [ -x "$bin" ] || cx_die "rccl-tests build produced no binary at $bin" + echo "$dir/build" +} diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh index f1117229c..5d76ee667 100644 --- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -8,8 +8,8 @@ # * squash is NODE-LOCAL (/var/lib/squash), so enroot import runs via srun on # the allocated node (not on the login node like the shared-FS NVIDIA path); # * pyxis flags --container-writable --container-remap-root for the ROCm image. -# MoRI is the only AMD backend wired (CX_BENCH=mori); rccl-tests primitives are a -# follow-up. +# AMD backends: CX_BENCH=mori (MoRI EP dispatch/combine, default) or nccl +# (collective primitives via rccl-tests, the ROCm nccl-tests fork). # # !!! NOT yet validated on hardware (no MI355X cluster access at authoring time). # Treat the first on-runner run as validation — like run_deepep.py was on GB200. @@ -37,16 +37,18 @@ EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" MOUNT_DIR=/ix TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" -# MoRI is the only AMD backend wired today; force it. -if [ "${CX_BENCH:-mori}" != "mori" ]; then - cx_log "mi355x: CX_BENCH='${CX_BENCH}' not supported on AMD yet; using mori" -fi -export CX_BENCH=mori +# AMD backends wired: mori (MoRI EP dispatch/combine) and nccl (collective +# primitives via rccl-tests). 
Default mori; honor an explicit CX_BENCH. +export CX_BENCH="${CX_BENCH:-mori}" +case "$CX_BENCH" in + mori|nccl) ;; + *) cx_log "mi355x: CX_BENCH='$CX_BENCH' unsupported on AMD (want mori|nccl); using mori"; export CX_BENCH=mori ;; +esac export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" export CX_TOPO="mi355x-xgmi" CX_TRANSPORT="xgmi" export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" -cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=mori image=$IMAGE" +cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH image=$IMAGE" # AMD workspace is compute-visible (the serving launcher bind-mounts it directly), # so no staging; the node-local squash is handled via srun below. MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh index c1cf532e9..f2bb60513 100644 --- a/experimental/CollectiveX/launchers/run_in_container.sh +++ b/experimental/CollectiveX/launchers/run_in_container.sh @@ -31,8 +31,16 @@ cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX python3 env_capture.py --out "$ENVJSON" --timestamp "$CX_TS" run_nccl_suite() { - local build ops op sfail=0 - build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1 # single-node: MPI=0, -g N + local build ops op sfail=0 impl=nccl + # AMD/ROCm -> rccl-tests (fork; same binaries + output, parsed by run_nccl.py); + # NVIDIA/CUDA -> nccl-tests. Both single-node: MPI=0, -g N. + if [ -d /opt/rocm ] || command -v hipcc >/dev/null 2>&1; then + impl=rccl + build="$(cx_build_rccl_tests "$PWD/.nccl-tests" 0)" || return 1 + else + build="$(cx_build_nccl_tests "$PWD/.nccl-tests" 0)" || return 1 + fi + cx_log "collective impl=$impl build=$build" ops="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" for op in $ops; do if ! 
python3 run_nccl.py --op "$op" --nccl-tests-dir "$build" \ @@ -40,7 +48,7 @@ run_nccl_suite() { --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${op}_${CX_TS}.json" \ --min-bytes "${CX_MIN_BYTES:-8}" --max-bytes "${CX_MAX_BYTES:-8G}" --check 1; then - cx_log "WARN: nccl $op failed or invalid"; sfail=1 + cx_log "WARN: $impl $op failed or invalid"; sfail=1 fi done return "$sfail" From 78322de627833673d1ca65d5d039e0e5a2240e8b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 19:16:35 +0800 Subject: [PATCH 12/17] CollectiveX: key dispatch concurrency by SKU so B200/MI355X runs don't cancel each other --- .github/workflows/collectivex-experimental.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index fcfdcb88e..451c3e676 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -47,7 +47,9 @@ on: default: '' concurrency: - group: collectivex-${{ github.ref }}-${{ github.event_name }} + # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do + # not cancel each other; push has no sku input -> shares one 'push' group. 
+ group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }} cancel-in-progress: true permissions: From 2b2357322bfd9a8979272a31825b2f1fb5ce73bb Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 19:58:59 +0800 Subject: [PATCH 13/17] CollectiveX: render busbw & latency vs bytes/rank sweep tables in the job summary --- experimental/CollectiveX/summarize.py | 91 +++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py index dd51f7c73..013ce3151 100644 --- a/experimental/CollectiveX/summarize.py +++ b/experimental/CollectiveX/summarize.py @@ -45,10 +45,72 @@ def _peak_busbw(rows): return max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) -def _min_lat(rows): - vals = [r["out_of_place"]["time_us"] for r in rows - if r.get("out_of_place", {}).get("time_us") is not None] - return min(vals) if vals else float("nan") +_OP_ORDER = ["all_reduce", "reduce_scatter", "all_gather", "alltoall"] + + +def _row_lat(r): + vals = [(r.get(k) or {}).get("time_us") for k in ("out_of_place", "in_place")] + vals = [v for v in vals if v is not None] + return min(vals) if vals else None + + +def _lat_floor(rows): + # Small-message latency floor: time at the smallest REAL (size>0) message. + # (Sub-granularity 0-byte rows are a no-op ~1 us and not a real latency.) 
+ real = [r for r in rows if (r.get("size_bytes") or 0) > 0] + if not real: + return float("nan") + v = _row_lat(min(real, key=lambda r: r["size_bytes"])) + return v if v is not None else float("nan") + + +def _at_size(rows, size, fn): + for r in rows: + if r.get("size_bytes") == size: + return fn(r) + return None + + +def _fmt_bytes(b): + for u, s in ((2**30, "GiB"), (2**20, "MiB"), (2**10, "KiB")): + if b >= u and b % u == 0: + return f"{b // u} {s}" + return f"{b} B" + + +def _ops_sorted(nccl): + present = {d.get("op") for d in nccl} + ordered = [o for o in _OP_ORDER if o in present] + return ordered + sorted(present - set(ordered)) + + +def _ladder(nccl): + sizes = sorted({r["size_bytes"] for d in nccl for r in d.get("rows", []) + if (r.get("size_bytes") or 0) > 0}) + if not sizes: + return [] + cand = [16384, 262144, 4194304, 67108864, 268435456, 1073741824, 4294967296] + lad = [s for s in cand if s in set(sizes) and s < sizes[-1]] + lad.append(sizes[-1]) + return lad + + +def _sweep_table(nccl, title, rowfn, fmt): + lad = _ladder(nccl) + if not lad: + return [] + ops = _ops_sorted(nccl) + rows_by_op = {d.get("op"): d.get("rows", []) for d in nccl} + out = [f"\n**{title}**\n", + "| bytes/rank | " + " | ".join(f"`{o}`" for o in ops) + " |", + "|---" + "|--:" * len(ops) + "|"] + for s in lad: + cells = [] + for o in ops: + v = _at_size(rows_by_op.get(o, []), s, rowfn) + cells.append(format(v, fmt) if isinstance(v, (int, float)) else "—") + out.append(f"| {_fmt_bytes(s)} | " + " | ".join(cells) + " |") + return out def _fnum(x, fmt): @@ -64,12 +126,12 @@ def render_plain(nccl, moe, n_valid, total) -> str: out += ["=" * len(hdr), hdr, "=" * len(hdr)] if nccl: out.append(f"\nNCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')}):") - out.append(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'min lat':>10}{'avg busbw':>11}") + out.append(f" {'op':<16}{'status':<9}{'peak busbw':>12}{'lat floor':>10}{'avg busbw':>11}") for d in sorted(nccl, 
key=lambda x: x["op"]): rows = d.get("rows", []) avg = (d.get("summary") or {}).get("avg_busbw_gbps") out.append(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" - f"{_min_lat(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") + f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") if moe: out.append("\nMoE dispatch+combine (DeepEP / MoRI):") out.append(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13} correct") @@ -93,14 +155,17 @@ def render_markdown(nccl, moe, n_valid, total) -> str: d0 = (nccl + moe)[0] out.append(f"## CollectiveX results — `{d0.get('runner')}` · {d0.get('topology_class')} · {d0.get('transport') or 'n/a'}") if nccl: - out.append(f"\n### NCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n") - out.append("| op | status | peak busbw (GB/s) | min lat (µs) | avg busbw (GB/s) |") - out.append("|---|---|--:|--:|--:|") - for d in sorted(nccl, key=lambda x: x["op"]): + out.append(f"\n### NCCL/RCCL primitives (world={nccl[0].get('world_size')}, dtype={nccl[0].get('dtype')})\n") + out.append("| op | status | peak busbw (GB/s) | lat floor (µs) |") + out.append("|---|---|--:|--:|") + for d in sorted(nccl, key=lambda x: _OP_ORDER.index(x["op"]) if x["op"] in _OP_ORDER else 99): rows = d.get("rows", []) - avg = (d.get("summary") or {}).get("avg_busbw_gbps") - out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | " - f"{_min_lat(rows):.2f} | {_fnum(avg, '.1f')} |") + out.append(f"| `{d['op']}` | {_emoji(d.get('status'))} | {_peak_busbw(rows):.1f} | {_lat_floor(rows):.2f} |") + out += _sweep_table(nccl, "Bus bandwidth vs bytes/rank (GB/s)", lambda r: r.get("busbw_gbps"), ".1f") + out += _sweep_table(nccl, "Latency vs bytes/rank (µs)", _row_lat, ".2f") + out.append("\n> bytes/rank = nccl/rccl-tests message size (= per-rank for all-reduce / " + "reduce-scatter / all-to-all; all-gather 
input/rank = size ÷ #GPUs). Small " + "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.") if moe: out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n") out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |") From a3a492c56353c710dad493176b7f664d58393c16 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 20:23:19 +0800 Subject: [PATCH 14/17] CollectiveX: GB200 8-GPU multi-node MNNVL path (CX_NODES), validated on-node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit launch_gb200-nv.sh now branches on CX_NODES: 1 (default) keeps the single-tray 4-GPU dispatcher path; >1 runs across the NVL72 NVLink fabric (e.g. CX_NODES=2 = 8 GPU) by building nccl-tests MPI=1, running each op across WORLD ranks via `srun --mpi=pmix` (1 GPU/rank) with the MNNVL env, and parsing on the login node — mirroring launch_b200-dgxc-slurm but staying on NVLink instead of IB. Validated on GB200 (2x watchtower-navy trays, 8 GPU): all 4 ops valid, peak busbw all_reduce 822.8 / reduce_scatter 670.6 / all_gather 651.2 / alltoall 625.0 GB/s — ~30% over single-tray and on par with B200 8-GPU NVLink, i.e. MNNVL engaged (not an IB fallback). - common.sh: cx_build_nccl_tests auto-detects MPI_HOME for MPI=1 (Debian OpenMPI headers live under /usr/lib//openmpi/include; MPI_HOME=/usr fails). Works x86_64 + aarch64. - launch_b200-dgxc-slurm.sh: fix BUILD_IN_CTR path (.nccl-tests/nccl-tests/build). - workflow: add `nodes` dispatch input -> CX_NODES. 
--- .../workflows/collectivex-experimental.yml | 5 + experimental/CollectiveX/launchers/common.sh | 14 ++- .../launchers/launch_b200-dgxc-slurm.sh | 2 +- .../CollectiveX/launchers/launch_gb200-nv.sh | 117 ++++++++++++++---- 4 files changed, 108 insertions(+), 30 deletions(-) diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index 451c3e676..19f48fc30 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -45,6 +45,10 @@ on: description: GPUs per node (blank = SKU default) type: string default: '' + nodes: + description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node. + type: string + default: '' concurrency: # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do @@ -95,6 +99,7 @@ jobs: CX_MIN_BYTES: ${{ inputs.min_bytes }} CX_MAX_BYTES: ${{ inputs.max_bytes }} CX_NGPUS: ${{ inputs.ngpus }} + CX_NODES: ${{ inputs.nodes }} # GB200/watchtower needs a compute-visible workspace; harmless elsewhere. CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} steps: diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh index 10b46eb31..259f1cfa6 100644 --- a/experimental/CollectiveX/launchers/common.sh +++ b/experimental/CollectiveX/launchers/common.sh @@ -115,12 +115,20 @@ cx_build_nccl_tests() { git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$dir" >&2 \ || cx_die "git clone nccl-tests failed" fi - cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr})" + # MPI=1 needs MPI_HOME. On Debian/Ubuntu OpenMPI the headers live under + # /usr/lib//openmpi/include (NOT /usr/include), so MPI_HOME=/usr fails; + # point it at that openmpi dir (libmpi resolves via the default linker path). + # Works for both x86_64 (B200) and aarch64 (GB200). Override with CX_MPI_HOME. 
+ local mpi_home="${CX_MPI_HOME:-}" + if [ "$mpi" = "1" ] && [ -z "$mpi_home" ]; then + mpi_home="$(ls -d /usr/lib/*/openmpi 2>/dev/null | head -n1)" + fi + cx_log "building nccl-tests (MPI=$mpi, NCCL_HOME=${CX_NCCL_HOME:-/usr}${mpi_home:+, MPI_HOME=$mpi_home})" make -C "$dir" -j MPI="$mpi" \ CUDA_HOME="${CX_CUDA_HOME:-/usr/local/cuda}" \ NCCL_HOME="${CX_NCCL_HOME:-/usr}" \ - ${CX_MPI_HOME:+MPI_HOME="$CX_MPI_HOME"} >&2 \ - || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME; need nccl.h + libnccl)" + ${mpi_home:+MPI_HOME="$mpi_home"} >&2 \ + || cx_die "nccl-tests build failed (try a different CX_NCCL_HOME/CX_MPI_HOME; need nccl.h + libnccl)" [ -x "$bin" ] || cx_die "nccl-tests build produced no binary at $bin" echo "$dir/build" } diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh index e5add9189..312a7b33a 100644 --- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh @@ -78,7 +78,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" ' -BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/build" +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build" OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" # 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS. diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh index 60d5b297d..30b336d5b 100644 --- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -1,19 +1,23 @@ #!/usr/bin/env bash # CollectiveX — GB200 (NVL72, MNNVL domain) SKU adapter. aarch64, 4 GPU/tray. 
# -# Thin adapter: handles GB200-specific allocation/container/transport-env, then -# hands off to launchers/run_in_container.sh which runs whichever benchmark -# CX_BENCH selects (nccl | deepep | all). The same NCCL primitive shape that -# runs on B200 (NVLink island + CX-7 IB across nodes) runs here entirely inside -# the NVL72 NVLink (MNNVL) domain — that contrast is the headline. +# Two paths, selected by CX_NODES: +# * CX_NODES=1 (default): single tray, 4 GPU, intra-tray MNNVL. Hands off to +# run_in_container.sh (CX_BENCH = nccl | deepep | all), -g 4. +# * CX_NODES>1: multi-node over the NVL72 NVLink fabric (MNNVL), e.g. CX_NODES=2 +# = 8 GPU. nccl only — builds nccl-tests (MPI=1), runs each op across all ranks +# via `srun --mpi=pmix` (1 GPU/rank), parses on the login node. Same shape that +# runs single-node B200 (NVLink island) and multi-node B200 (CX-7 IB) — here it +# stays entirely on NVL72 NVLink. Validated 8-GPU (2 trays) on-node. # # Run from inside the InferenceX checkout on the GB200 login node: -# bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # nccl (default) -# CX_BENCH=deepep bash .../launch_gb200-nv.sh # DeepEP (rebuild) +# bash experimental/CollectiveX/launchers/launch_gb200-nv.sh # 4 GPU, nccl +# CX_NODES=2 bash .../launch_gb200-nv.sh # 8 GPU MNNVL +# CX_BENCH=deepep bash .../launch_gb200-nv.sh # 4 GPU, DeepEP # -# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NGPUS(4) CX_TIME(30) -# CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH CX_OPS CX_MIN_BYTES CX_MAX_BYTES -# CX_DRYRUN(0) +# Env knobs: CX_PARTITION(batch) CX_ACCOUNT(benchmark) CX_NODES(1) +# CX_GPUS_PER_NODE(4) CX_TIME(30) CX_IMAGE CX_SQUASH_DIR CX_STAGE_DIR CX_BENCH +# CX_OPS CX_MIN_BYTES CX_MAX_BYTES CX_SRUN_MPI(pmix) CX_DRYRUN(0) set -euo pipefail HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -25,24 +29,24 @@ source "$HERE/common.sh" RUNNER_NAME="${RUNNER_NAME:-gb200-nv}" PARTITION="${CX_PARTITION:-batch}" ACCOUNT="${CX_ACCOUNT:-benchmark}" 
-NGPUS="${CX_NGPUS:-4}" # NVL72 compute tray = 4 GPU/node +GPUS_PER_NODE="${CX_GPUS_PER_NODE:-4}" # NVL72 compute tray = 4 GPU/node +NODES="${CX_NODES:-1}" TIME_MIN="${CX_TIME:-30}" IMAGE="${CX_IMAGE:-$(cx_default_image gb200)}" SQUASH_DIR="${CX_SQUASH_DIR:-/mnt/lustre01/users-public/sa-shared}" MOUNT_DIR=/ix TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +WORLD=$((NODES * GPUS_PER_NODE)) -# Exported so srun --export=ALL carries them into run_in_container.sh. -export CX_RUNNER="$RUNNER_NAME" CX_NGPUS="$NGPUS" CX_TS="$TS" +export CX_RUNNER="$RUNNER_NAME" CX_TS="$TS" export CX_TOPO="gb200-nvl72-mnnvl" CX_TRANSPORT="mnnvl" export CX_BENCH="${CX_BENCH:-nccl}" export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" -# Record container identity in env_capture provenance. export COLLECTIVEX_IMAGE="$IMAGE" COLLECTIVEX_IMAGE_DIGEST="${CX_IMAGE_DIGEST:-}" # Validated GB200 MNNVL transport env (from serving recipes) — set AND recorded. export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 -cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS (aarch64) bench=$CX_BENCH" +cx_log "runner=$RUNNER_NAME partition=$PARTITION nodes=$NODES x ${GPUS_PER_NODE}gpu world=$WORLD bench=$CX_BENCH (aarch64)" cx_log "image=$IMAGE" SQUASH_FILE="$(cx_ensure_squash "$SQUASH_DIR" "$IMAGE")" MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" @@ -51,20 +55,81 @@ cx_log "squash=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" -salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$NGPUS" \ - --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +# ---------------------------------------------------------------------------- +if [ "$NODES" -le 1 ]; then + # Single tray (4 GPU): generic dispatcher, -g N single process. 
+ export CX_NGPUS="$GPUS_PER_NODE" + salloc --partition="$PARTITION" --account="$ACCOUNT" --gres=gpu:"$GPUS_PER_NODE" \ + --exclusive --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" + JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" + [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" + cx_log "JOB_ID=$JOB_ID" + trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT + srun --jobid="$JOB_ID" \ + --container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export=ALL \ + bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" + cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" + cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" + exit 0 +fi + +# ---------------------------------------------------------------------------- +# Multi-node MNNVL (nccl only): mirrors launch_b200-dgxc-slurm but stays on the +# NVL72 NVLink fabric. Build nccl-tests MPI=1, run each op across WORLD ranks +# (1 GPU/rank) via srun --mpi=pmix, parse on the login node. 
+[ "$CX_BENCH" = "nccl" ] || cx_die "GB200 multi-node supports CX_BENCH=nccl only (got '$CX_BENCH')" +MPI_FLAG="${CX_SRUN_MPI:-pmix}" +declare -A BIN=( [all_reduce]=all_reduce_perf [all_gather]=all_gather_perf + [reduce_scatter]=reduce_scatter_perf [alltoall]=alltoall_perf ) + +salloc --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPUS_PER_NODE" --exclusive --time="$TIME_MIN" \ + --no-shell --job-name="$RUNNER_NAME" JOB_ID="$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)" [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" -cx_log "JOB_ID=$JOB_ID" +cx_log "JOB_ID=$JOB_ID nodes=[$(squeue -j "$JOB_ID" -h -o %N 2>/dev/null)]" trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT -srun --jobid="$JOB_ID" \ - --container-image="$SQUASH_FILE" \ - --container-mounts="$MOUNT_SRC:$MOUNT_DIR" \ - --no-container-mount-home \ - --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ - --no-container-entrypoint --export=ALL \ - bash "$MOUNT_DIR/experimental/CollectiveX/launchers/run_in_container.sh" +COMMON_MOUNT=(--container-image="$SQUASH_FILE" --container-mounts="$MOUNT_SRC:$MOUNT_DIR" + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" + --no-container-entrypoint) +ENVJSON="$MOUNT_SRC/experimental/CollectiveX/results/env_${RUNNER_NAME}_${TS}.json" + +# 1) Build nccl-tests (MPI=1) + capture environment (single task, one node). +srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \ + --export=ALL,CX_TS="$TS",CX_RUNNER="$RUNNER_NAME" /dev/null + python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" + ' + +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build" +OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" + +# 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS. 
+for op in $OPS; do + raw="$MOUNT_SRC/experimental/CollectiveX/results/raw_${RUNNER_NAME}_${op}_${TS}.txt" + cx_log "running $op across $WORLD ranks (mpi=$MPI_FLAG, MNNVL) -> $raw" + srun --jobid="$JOB_ID" --mpi="$MPI_FLAG" --nodes="$NODES" \ + --ntasks="$WORLD" --ntasks-per-node="$GPUS_PER_NODE" "${COMMON_MOUNT[@]}" \ + --export=ALL,NCCL_CUMEM_ENABLE=1,NCCL_MNNVL_ENABLE=1,MC_FORCE_MNNVL=1 "$raw" 2>"$raw.stderr" || cx_log "WARN: $op srun returned nonzero (see $raw.stderr)" + + # 3) Parse on the login node (pure stdlib; no container needed). + python3 "$CX_DIR/run_nccl.py" --op "$op" --parse-only "$raw" \ + --world-size "$WORLD" --nodes "$NODES" \ + --runner "$RUNNER_NAME" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" \ + --out "$CX_DIR/results/${RUNNER_NAME}_${op}_${TS}.json" \ + --timestamp "$TS" || cx_log "WARN: parse $op failed" +done cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" -cx_log "done — JSON artifacts under $MOUNT_SRC/experimental/CollectiveX/results/" +cx_log "done — JSON artifacts under $CX_DIR/results/" From 871086dd0b648180447e4dd0bac3556370f51686 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Tue, 23 Jun 2026 20:37:03 +0800 Subject: [PATCH 15/17] CollectiveX: fix multi-node build cache (MPI=0 vs MPI=1) + gate all-zero busbw The first GB200 8-GPU CI run came back green but all-zero busbw: it reused a cached MPI=0 nccl-tests build in the staging dir, and an MPI=0 binary under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw formula -> 0), so every rank printed its own table (232 rows) and check still "passed". - common.sh: cache MPI=0 and MPI=1 builds in separate dirs (nccl-tests vs nccl-tests-mpi) so they never cross-contaminate. - launch_gb200-nv.sh / launch_b200-dgxc-slurm.sh: read the -mpi build dir. - run_nccl.py: a result with peak busbw == 0 is now `invalid` (fails the gate), so a non-communicating run goes red instead of green-zero. 
--- experimental/CollectiveX/launchers/common.sh | 8 ++++++-- .../CollectiveX/launchers/launch_b200-dgxc-slurm.sh | 2 +- experimental/CollectiveX/launchers/launch_gb200-nv.sh | 2 +- experimental/CollectiveX/run_nccl.py | 8 +++++++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/experimental/CollectiveX/launchers/common.sh b/experimental/CollectiveX/launchers/common.sh index 259f1cfa6..e560fc987 100644 --- a/experimental/CollectiveX/launchers/common.sh +++ b/experimental/CollectiveX/launchers/common.sh @@ -102,8 +102,12 @@ cx_collect_results() { # CX_NCCL_HOME defaults to /usr (system nccl.h in /usr/include on the sglang # cu130 images); override CX_CUDA_HOME / CX_NCCL_HOME / CX_MPI_HOME if needed. cx_build_nccl_tests() { - local parent="$1" mpi="${2:-0}" dir bin - dir="$parent/nccl-tests" + local parent="$1" mpi="${2:-0}" dir bin sfx="" + # Cache MPI=0 and MPI=1 builds in SEPARATE dirs. A single-node (MPI=0) binary + # reused under `srun --mpi=pmix` runs as N standalone world=1 procs (busbw=0); + # keying the cache by flavor prevents that cross-contamination. 
+ [ "$mpi" = "1" ] && sfx="-mpi" + dir="$parent/nccl-tests$sfx" bin="$dir/build/all_reduce_perf" if [ -x "$bin" ]; then cx_log "nccl-tests already built: $dir/build" diff --git a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh index 312a7b33a..b7a03b2c1 100644 --- a/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh +++ b/experimental/CollectiveX/launchers/launch_b200-dgxc-slurm.sh @@ -78,7 +78,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" --export=ALL,CX python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" ' -BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build" +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build" OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" # 2) Per op: run across all ranks (one GPU per task), tee raw output to shared FS. diff --git a/experimental/CollectiveX/launchers/launch_gb200-nv.sh b/experimental/CollectiveX/launchers/launch_gb200-nv.sh index 30b336d5b..4863b9c10 100644 --- a/experimental/CollectiveX/launchers/launch_gb200-nv.sh +++ b/experimental/CollectiveX/launchers/launch_gb200-nv.sh @@ -109,7 +109,7 @@ srun --jobid="$JOB_ID" --ntasks=1 --nodes=1 "${COMMON_MOUNT[@]}" \ python3 env_capture.py --out "results/env_${CX_RUNNER}_${CX_TS}.json" --timestamp "$CX_TS" ' -BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests/build" +BUILD_IN_CTR="$MOUNT_DIR/experimental/CollectiveX/.nccl-tests/nccl-tests-mpi/build" OPS="${CX_OPS:-all_reduce all_gather reduce_scatter alltoall}" # 2) Per op: run across all ranks (1 GPU/rank), tee raw output to the shared FS. 
diff --git a/experimental/CollectiveX/run_nccl.py b/experimental/CollectiveX/run_nccl.py index 993c0c06d..c22654c59 100644 --- a/experimental/CollectiveX/run_nccl.py +++ b/experimental/CollectiveX/run_nccl.py @@ -227,6 +227,11 @@ def main() -> int: with open(args.env_json) as fh: env = json.load(fh) + # All-zero busbw means the benchmark didn't actually communicate — e.g. an + # MPI=0 binary launched under srun --mpi=pmix runs as N standalone world=1 + # procs (busbw formula -> 0). Don't let that pass the gate as "valid". + peak_busbw = max((r.get("busbw_gbps") or 0.0 for r in rows), default=0.0) + doc = { "schema_version": SCHEMA_VERSION, "family": "nccl", @@ -236,7 +241,8 @@ def main() -> int: "binary": binary, "command": " ".join(command) if command else f"", "transport": args.transport, - "status": ("valid" if (rows and ran_ok and (summary.get("check_passed") is True + "status": ("valid" if (rows and ran_ok and peak_busbw > 0.0 + and (summary.get("check_passed") is True or (args.check == 0 and summary.get("check_passed") is None))) else "invalid"), "comparison_key": comparison_key(meta), **meta, From 368cfbc6390cf69b864dedc121a79a12114b716b Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:51:36 +0800 Subject: [PATCH 16/17] CollectiveX: EP dispatch/combine token sweep with separated timing (tests/) Refactor the single-point DeepEP/MoRI drivers into a shared EP harness under tests/ that sweeps source-tokens-per-rank and times dispatch and combine SEPARATELY (combine's setup dispatch runs untimed; round-trip is a third measurement). One line = one fully-specified config (backend, ep degree, phase, dispatch precision, top-k/experts/hidden, routing); only T varies. Each row records both tokens_per_rank and global_tokens (= T * ep_size) for the weak/strong-scaling x-axis toggle, plus recv_tokens and an algbw estimate. comparison_key is built from the fixed config only (T excluded). 
- tests/ep_harness.py: phase-aware token ladder, CUDA-event timing (untimed `pre` hook isolates combine), fixed-config comparison_key, doc emission. - tests/ep_deepep.py, tests/ep_mori.py: backend adapters (ported the validated call sequences). MoRI ramps its ladder gradually 1..max (a cold dispatch that jumps straight to a large T wedges; the gradual ramp is validated to avoid it). - tests/run_ep.py: entrypoint; run_in_container.sh runs it per CX_PHASE. - summarize.py: per-backend EP sweep tables (dispatch/combine/round-trip vs tokens/rank) + a combine column on the headline. - workflow: phase matrix so decode + prefill land as separate jobs; EP inputs (phase, tokens_ladder, dispatch_dtype). - Validated on hardware (decode + prefill): MI355X MoRI (EP8), B200 DeepEP (EP8), GB200 DeepEP (EP4). - Replaces run_deepep.py / run_mori.py. --- .../workflows/collectivex-experimental.yml | 41 ++- experimental/CollectiveX/.gitignore | 2 + experimental/CollectiveX/CONTAINERS.md | 6 +- experimental/CollectiveX/README.md | 27 +- .../launchers/launch_mi355x-amds.sh | 2 +- .../CollectiveX/launchers/run_in_container.sh | 52 ++- experimental/CollectiveX/plan.md | 6 +- experimental/CollectiveX/run_deepep.py | 268 -------------- experimental/CollectiveX/run_mori.py | 280 -------------- experimental/CollectiveX/summarize.py | 64 +++- experimental/CollectiveX/tests/ep_deepep.py | 124 +++++++ experimental/CollectiveX/tests/ep_harness.py | 347 ++++++++++++++++++ experimental/CollectiveX/tests/ep_mori.py | 167 +++++++++ experimental/CollectiveX/tests/run_ep.py | 78 ++++ 14 files changed, 863 insertions(+), 601 deletions(-) delete mode 100644 experimental/CollectiveX/run_deepep.py delete mode 100644 experimental/CollectiveX/run_mori.py create mode 100644 experimental/CollectiveX/tests/ep_deepep.py create mode 100644 experimental/CollectiveX/tests/ep_harness.py create mode 100644 experimental/CollectiveX/tests/ep_mori.py create mode 100644 experimental/CollectiveX/tests/run_ep.py diff --git 
a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index 19f48fc30..e2a8e2ff2 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -49,6 +49,21 @@ on: description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node. type: string default: '' + phase: + # EP only. 'both' fans out to one job per phase (decode + prefill). + description: EP phase — decode (small T) / prefill (large T); 'both' = a job each + type: choice + default: both + options: [both, decode, prefill] + tokens_ladder: + description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default + type: string + default: '' + dispatch_dtype: + description: EP dispatch payload precision + type: choice + default: bf16 + options: [bf16, fp8] concurrency: # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do @@ -64,16 +79,23 @@ jobs: # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute- # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs. experimental: - name: CollectiveX Experimental + name: CollectiveX Experimental (${{ matrix.phase }}) if: github.event_name == 'push' runs-on: mi355x timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + # MI355X MoRI EP dispatch/combine, one job per phase: decode (small T) + + # prefill (large T, clamped to the registerable heap). 
+ phase: [decode, prefill] env: CX_BENCH: mori + CX_PHASE: ${{ matrix.phase }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 with: { clean: true } - - name: Launch MI355X MoRI + - name: Launch MI355X MoRI (${{ matrix.phase }}) env: RUNNER_NAME: ${{ runner.name }} run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" @@ -84,7 +106,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: collectivex_mi355x_mori_${{ github.run_id }} + name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }} path: experimental/CollectiveX/results/*.json if-no-files-found: warn @@ -93,6 +115,12 @@ jobs: if: github.event_name == 'workflow_dispatch' runs-on: ${{ inputs.sku }} timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + # 'both' -> one job per phase (decode + prefill); else a single job. Phase + # only affects EP (deepep/mori); nccl ignores it, so 'both' runs the identical nccl job twice. + phase: ${{ fromJSON(inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase)) }} env: CX_BENCH: ${{ inputs.benchmark }} CX_OPS: ${{ inputs.ops }} @@ -100,12 +128,15 @@ jobs: CX_MAX_BYTES: ${{ inputs.max_bytes }} CX_NGPUS: ${{ inputs.ngpus }} CX_NODES: ${{ inputs.nodes }} + CX_PHASE: ${{ matrix.phase }} + CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }} + CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }} # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 with: { clean: true } - - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} + - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }}) env: RUNNER_NAME: ${{ runner.name }} run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" @@ -116,6 +147,6 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ github.run_id }} + name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }} path: experimental/CollectiveX/results/*.json if-no-files-found: warn diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore index 4235a8ce9..a4717f5ff 100644 --- a/experimental/CollectiveX/.gitignore +++ b/experimental/CollectiveX/.gitignore @@ -10,3 +10,5 @@ results/*.json results/plots/ results/raw_*.txt results/raw_*.txt.stderr +# running local-only reflection log (not a committed artifact) +notes.md diff --git a/experimental/CollectiveX/CONTAINERS.md b/experimental/CollectiveX/CONTAINERS.md index 1d84bffd5..6b409bac0 100644 --- a/experimental/CollectiveX/CONTAINERS.md +++ b/experimental/CollectiveX/CONTAINERS.md @@ -46,13 +46,13 @@ bundles **MoRI** (AMD's EP dispatch/combine library). Set in `cx_default_image` for `mi355x*` (also `mi350x*`/`mi325x*`/`mi300x*`). - **Image:** `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2` (single-arch ROCm 7.2.0 runtime; from the AMD master serving config). **Not digest-pinned yet** — record the digest here and pin once validated on the runner, like the NVIDIA image. -- **MoRI:** bundled in-image (build tag `mori-0227`). 
`run_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. +- **MoRI:** bundled in-image (build tag `mori-0227`). `tests/ep_mori.py` follows the upstream `ROCm/mori` `tests`/`examples` dispatch+combine path; capture the exact MoRI commit (`MORI_COMMIT` env → provenance) on first run. - **Squash is NODE-LOCAL** (`/var/lib/squash`), not a shared FS, so `launch_mi355x-amds.sh` imports via `srun` on the allocated node (the NVIDIA adapters import on the login node onto shared FS). pyxis flags `--container-writable --container-remap-root` (matches the AMD serving launcher); workspace is bind-mounted directly (no `CX_STAGE_DIR`). - **Transport:** intra-node **XGMI** (8× MI355X). Two backends wired: `CX_BENCH=mori` (MoRI EP dispatch/combine) and `CX_BENCH=nccl` (collective primitives via **rccl-tests**, the ROCm nccl-tests fork — built in-container with `make` against `/opt/rocm`/`amdclang++`/`librccl`; same `_perf` binaries + output format as nccl-tests, so `run_nccl.py` parses it unchanged). -- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). Three ionic_rdma-fabric constraints, all handled in `run_mori.py`: +- **Validated on MI355X** (on-node via `salloc`+`srun`, nodes `mia1-p01-g10`/`g15`): `salloc` → enroot import (anonymous auth + tag, 24 layers → ~60 GB node-local squash) → torchrun → 8-rank Gloo + MoRI shmem → `EpDispatchCombineConfig`/dispatch/combine **numerically correct** (combine within tol, `max_rel ~2e-3`, ~85 µs round-trip at the decode shape). 
Three ionic_rdma-fabric constraints, all handled in `tests/ep_mori.py`: - **RDMA MR size ceiling (~4 GiB).** MoRI registers the *entire* symmetric heap as one RDMA MR at init — even single-node (no disable-RDMA knob exists; only `MORI_DISABLE_P2P`, which forces the opposite). On these ionic NICs a 6 GiB MR fails (`RegisterRdmaMemoryRegion … errno 22 EINVAL`) while 2 GiB registers. Heap is held at **`MORI_SHMEM_HEAP_SIZE=2G`** (override `CX_MORI_HEAP_SIZE`). The reference test's hardcoded `6G` is exactly why it can't run as-is here. - **Buffer sizing.** `max_num_inp_token_per_rank` is bounded (512 at the decode shape) so dispatch/combine buffers fit the 2 GiB heap. Much larger token counts would need a heap past the MR ceiling — out of reach on this fabric for now. - - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `run_mori.py` hard-exits after writing results to avoid it. + - **Teardown.** MoRI's shmem teardown asserts (`CheckStatusValid` → SIGABRT) when the op is destroyed after `shmem_finalize()`; `tests/ep_mori.py`'s `finalize()` hard-exits after writing results to avoid it. Still TODO: capture the exact MoRI commit + a version table (ROCm/torch/RCCL) into provenance, and digest-pin the image. 
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md index 5cea3b15b..a7c479b86 100644 --- a/experimental/CollectiveX/README.md +++ b/experimental/CollectiveX/README.md @@ -16,11 +16,12 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL |---|---| | `env_capture.py` | Layer-0 environment + topology fingerprint → JSON (stdlib only) | | `run_nccl.py` | run stock `nccl-tests`, parse the text table, emit flat JSON (stdlib only) | -| `run_deepep.py` | DeepEP dispatch+combine, normal mode, correctness-gated (torch + DeepEP) | -| `run_mori.py` | MoRI (AMD) dispatch+combine, normal mode, correctness-gated (torch + MoRI) | +| `tests/run_ep.py` | EP dispatch/combine entrypoint (torchrun): source-tokens-per-rank sweep, dispatch & combine timed **separately** | +| `tests/ep_harness.py` | shared EP harness: token ladder, separated timing, correctness gate, doc emission (stdlib top) | +| `tests/ep_deepep.py`, `tests/ep_mori.py` | per-backend adapters (DeepEP / MoRI) implementing the harness protocol | | `plot.py` | latency/bus-bw curves, B200-vs-GB200 overlay with a comparison guard (matplotlib) | | `launchers/common.sh` | shared helpers: image resolve, enroot squash, staging, nccl-tests build | -| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) | +| `launchers/run_in_container.sh` | generic in-container dispatcher — runs `CX_BENCH` (nccl/deepep/mori/all) over `CX_PHASE` | | `launchers/launch_.sh` | per-SKU adapters: `launch_b200-dgxc.sh` (8× NVLink), `launch_b200-dgxc-slurm.sh` (2-node IB), `launch_gb200-nv.sh` (NVL72 MNNVL), `launch_mi355x-amds.sh` (8× XGMI, AMD MoRI + rccl) | | `CONTAINERS.md` | the pinned multi-arch container + audited library versions | | `results/` | flat JSON artifacts (+ `plots/`, raw captures) | @@ -30,13 +31,15 @@ already ran for real on both B200 (8× NVLink island) and GB200 (4× NVL72 MNNVL ### Via GitHub Actions 
(`.github/workflows/collectivex-experimental.yml`) -- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** dispatch/combine - run (the "CollectiveX Experimental" job; lands on a free `mi355x-amds` runner). +- **push** to `experimental/CollectiveX/**` → the **MI355X MoRI** EP dispatch/combine + sweep, **one job per phase** (decode + prefill) via a matrix (lands on free + `mi355x-amds` runners). - **workflow_dispatch** → pick `sku` (gb200 / b200-dgxc / b200-multinode / mi355x), `benchmark` (nccl / deepep / mori / all — `mori` is AMD-only; `nccl` - on MI355X runs rccl-tests), ops, - sizes, ngpus. Lands on that SKU's self-hosted runner and runs - `launch_${RUNNER_NAME%%_*}.sh`. + on MI355X runs rccl-tests), `phase` (decode / prefill / **both** → a job each), + `tokens_ladder`, `dispatch_dtype`, ops, sizes, ngpus. Lands on that SKU's + self-hosted runner and runs `launch_${RUNNER_NAME%%_*}.sh`. For EP results + across all SKUs, dispatch once per `sku` with `phase=both`. Each job renders a results table to the **GitHub Actions job summary** (via `summarize.py --markdown` → `$GITHUB_STEP_SUMMARY`) and uploads the result JSONs @@ -57,7 +60,9 @@ CX_BENCH=nccl bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh # Knobs: `CX_BENCH` (nccl|deepep|mori|all), `CX_OPS`, `CX_MIN_BYTES`/`CX_MAX_BYTES`, `CX_NGPUS`, `CX_TIME`, `CX_IMAGE`, `CX_SQUASH_DIR`, `CX_STAGE_DIR` (compute-visible staging — needed on GB200/watchtower), `CX_DRYRUN=1` (print plan, allocate -nothing). Results land in `experimental/CollectiveX/results/`. +nothing). EP (deepep/mori) adds `CX_PHASE` (decode|prefill|both), `CX_TOKENS_LADDER` +(e.g. `"1 2 4 8 16 32 64 128"`), `CX_HIDDEN`/`CX_TOPK`/`CX_EXPERTS`, +`CX_DISPATCH_DTYPE`, `CX_NUM_EP_GROUPS`. Results land in `experimental/CollectiveX/results/`. ### Offline (no GPU) — verify the parser/JSON pipeline @@ -104,9 +109,9 @@ DeepSeek-V4 fallback images. validate it on first run and refresh `CONTAINERS.md` (expect CUDA 13 / NCCL 2.28 / torch 2.9). 
- **DeepEP** is not bundled in the multi-arch image → `run_in_container.sh` builds it via `rebuild-deepep` (CX_BENCH=deepep). Its Python API is version-sensitive; - `run_deepep.py` marks the dispatch/combine block `ADAPT HERE` — validate against + `tests/ep_deepep.py` follows the documented normal-mode API — validate against the built commit. B200 (x86_64) first; GB200 (aarch64) follows. -- **MoRI / MI355X** (`run_mori.py` + `launch_mi355x-amds.sh`) is **validated on +- **MoRI / MI355X** (`tests/ep_mori.py` + `launch_mi355x-amds.sh`) is **validated on hardware** (8× MI355X: dispatch+combine numerically correct, ~85 µs round-trip). It mirrors `ROCm/mori`'s example (config + `get_registered_combine_input_buffer` zero-copy path, `expected = input × #unique-destination-ranks`). Three diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh index 5d76ee667..8092b84b4 100644 --- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -12,7 +12,7 @@ # (collective primitives via rccl-tests, the ROCm nccl-tests fork). # # !!! NOT yet validated on hardware (no MI355X cluster access at authoring time). -# Treat the first on-runner run as validation — like run_deepep.py was on GB200. +# Treat the first on-runner run as validation — like the DeepEP path was on GB200. # # Run from inside the InferenceX checkout on the MI355X login node: # bash experimental/CollectiveX/launchers/launch_mi355x-amds.sh diff --git a/experimental/CollectiveX/launchers/run_in_container.sh b/experimental/CollectiveX/launchers/run_in_container.sh index f2bb60513..3874cabea 100644 --- a/experimental/CollectiveX/launchers/run_in_container.sh +++ b/experimental/CollectiveX/launchers/run_in_container.sh @@ -11,7 +11,10 @@ # Selector: CX_BENCH = nccl | deepep | mori | all (default nccl) # (mori = AMD ROCm EP; nccl/deepep = NVIDIA. `all` = nccl+deepep.) 
# NCCL knobs: CX_OPS, CX_MIN_BYTES, CX_MAX_BYTES, CX_TRANSPORT, CX_NCCL_HOME -# EP knobs (DeepEP/MoRI): CX_TOKENS_PER_RANK CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE +# EP knobs (DeepEP/MoRI), all -> tests/run_ep.py: +# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep +# CX_TOKENS_LADDER (space/comma sep; blank = phase default), CX_TOKENS_PER_RANK (legacy single point) +# CX_HIDDEN CX_TOPK CX_EXPERTS CX_DISPATCH_DTYPE CX_ROUTING CX_NUM_EP_GROUPS CX_NUM_COMM_SMS set -euo pipefail cd /ix/experimental/CollectiveX @@ -54,6 +57,38 @@ run_nccl_suite() { return "$sfail" } +# Resolve the source-tokens-per-rank sweep: explicit CX_TOKENS_LADDER wins; else +# the legacy single-point CX_TOKENS_PER_RANK becomes a one-point ladder; else +# blank => tests/run_ep.py picks the phase default (decode small / prefill large). +cx_ep_ladder() { + if [ -n "${CX_TOKENS_LADDER:-}" ]; then printf '%s' "$CX_TOKENS_LADDER" + elif [ -n "${CX_TOKENS_PER_RANK:-}" ]; then printf '%s' "$CX_TOKENS_PER_RANK" + else printf ''; fi +} + +# run_ep_suite +# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and +# combine are timed separately inside it. One JSON per (backend, phase). +run_ep_suite() { + local backend="$1" phase phases ladder rc=0 + ladder="$(cx_ep_ladder)" + phases="${CX_PHASE:-decode}" + [ "$phases" = "both" ] && phases="decode prefill" + for phase in $phases; do + cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'" + if ! 
torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py --backend "$backend" \ + --phase "$phase" --tokens-ladder "$ladder" \ + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ + --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" --routing "${CX_ROUTING:-balanced}" \ + --num-ep-groups "${CX_NUM_EP_GROUPS:-1}" --num-comm-sms "${CX_NUM_COMM_SMS:-24}" \ + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ + --env-json "$ENVJSON" --out "results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"; then + cx_log "WARN: $backend $phase run failed or invalid"; rc=1 + fi + done + return "$rc" +} + run_deepep_suite() { # DeepEP is not bundled in the multi-arch image. Try to import; if absent, # attempt rebuild-deepep (srt-slurm setup script). Inability to run is a @@ -67,13 +102,7 @@ run_deepep_suite() { return 1 fi fi - torchrun --nproc_per_node="$CX_NGPUS" run_deepep.py \ - --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ - --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \ - --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ - --dispatch-dtype "${CX_DISPATCH_DTYPE:-bf16}" \ - --env-json "$ENVJSON" --out "results/${CX_RUNNER}_deepep_${CX_TS}.json" \ - || { cx_log "WARN: deepep run failed"; return 1; } + run_ep_suite deepep } run_mori_suite() { @@ -84,12 +113,7 @@ run_mori_suite() { cx_log "WARN: mori not importable — needs the AMD MoRI image (rocm/sgl-dev:...-mori-...); cannot run mori" return 1 fi - torchrun --nproc_per_node="$CX_NGPUS" run_mori.py \ - --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" \ - --tokens-per-rank "${CX_TOKENS_PER_RANK:-64}" --hidden "${CX_HIDDEN:-7168}" \ - --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" \ - --env-json "$ENVJSON" --out "results/${CX_RUNNER}_mori_${CX_TS}.json" \ - || { cx_log "WARN: mori run failed"; return 1; } + run_ep_suite mori } rc=0 diff --git 
a/experimental/CollectiveX/plan.md b/experimental/CollectiveX/plan.md index d39f96967..d62bb7746 100644 --- a/experimental/CollectiveX/plan.md +++ b/experimental/CollectiveX/plan.md @@ -31,7 +31,7 @@ The Milestone-0 spike ran for real on **both** B200 (8× NVLink island, x86_64) - **Multi-arch container** for all NVIDIA SKUs: import by tag `lmsysorg/sglang:v0.5.11-cu130` (amd64 + arm64; index digest `sha256:061fb71f…` recorded for provenance) — one reference both arches; DeepEP via `rebuild-deepep`. Imported by tag, not digest (enroot anonymous auth needs a tag); v0.5.12-cu130 avoided (62-layer overlay-mount failure). See `CONTAINERS.md`. - **Per-SKU launch adapters** (`launchers/launch_.sh`, the InferenceX `launch_${RUNNER_NAME%%_*}.sh` convention) that run **any** benchmark via `CX_BENCH` (nccl|deepep|mori|all) through a shared `launchers/run_in_container.sh`. - **`on: push` workflow** (`.github/workflows/collectivex-experimental.yml`): push → MI355X MoRI dispatch/combine (the "CollectiveX Experimental" job); `workflow_dispatch` → chosen `sku`+`benchmark`. No merge to main; activates when the branch is pushed to GitHub. -- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `run_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. 
**Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `run_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`). +- **AMD MI355X / MoRI path validated** (first cross-vendor reach, ahead of Milestone 1): `tests/ep_mori.py` (MoRI dispatch+combine, mirrors `ROCm/mori`'s example with the zero-copy registered-combine-buffer path and `expected = input × unique-destination-ranks`), `launchers/launch_mi355x-amds.sh` (partition `compute`, node-local `/var/lib/squash` imported via `srun`, `--container-writable --container-remap-root`), ROCm MoRI image in `cx_default_image`, and `mi355x`/`mori` workflow options. **Validated on 8× MI355X** (dispatch+combine numerically correct, ~85 µs round-trip): the run surfaced three ionic_rdma-fabric constraints now baked into `tests/ep_mori.py` — a 2 GiB symmetric heap (these NICs cap RDMA MRs at ~4 GiB; MoRI registers the whole heap), a bounded `max_num_inp_token_per_rank`, and a hard-exit past MoRI's post-finalize shmem teardown assertion (see `CONTAINERS.md`). This supersedes the Milestone-0 "light single-script launcher" sketch below where they differ — launchers are now thin SKU adapters + a shared dispatcher (still light/experimental). 
@@ -562,7 +562,7 @@ Scaffolding — deliberately light, matching `experimental/` convention (bare sc experimental/CollectiveX/ README.md run_nccl.py # argparse; run stock nccl-tests, parse its text table (do NOT assume JSON) - run_deepep.py # one dispatch+combine shape, normal mode + tests/run_ep.py # EP dispatch/combine sweep (DeepEP/MoRI); dispatch & combine timed separately env_capture.py # Layer-0 env + topology fingerprint (torch.cuda.* + nvidia-smi topo) → json plot.py # matplotlib, like token_position_decode_slo/*/plot_*.py launchers/ @@ -678,7 +678,7 @@ The spike lands as a few small PRs, each producing something runnable — not a each tagged with topology-class and transport (aarch64 build for GB200) 3. DeepEP dispatch+combine — B200 first - run_deepep.py, routing generator + reference combine for correctness, + tests/ep_deepep.py, routing generator + reference combine for correctness, reusing rebuild-deepep at job setup → one decode shape, normal mode, on B200; GB200 DeepEP fast-follow diff --git a/experimental/CollectiveX/run_deepep.py b/experimental/CollectiveX/run_deepep.py deleted file mode 100644 index 3d61c69e4..000000000 --- a/experimental/CollectiveX/run_deepep.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 -"""CollectiveX spike — DeepEP MoE dispatch+combine (normal mode), B200 first. - -One decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed, -emitting the same flat-JSON provenance shape as run_nccl.py. - -Scope (plan §Milestone 0): normal mode only — low-latency (LL) mode is the -known-broken/blocked IBGDA path and is out of scope for the spike. B200 -(x86_64) first; GB200 is the fast-follow once the aarch64 rebuild-deepep path -is proven. - - !!! DeepEP's Python API is VERSION-SENSITIVE (the plan notes V2 changed - NVSHMEM->NCCL, unified the APIs, and removed zero-SM LL mode). 
The - dispatch/combine block below follows the documented normal-mode intranode - API and is marked "ADAPT HERE" — validate the call signatures against the - DeepEP commit actually built by rebuild-deepep at job time, and record that - commit in provenance. Build is done at job setup, not shipped in the image. - -Launch (one process per GPU), e.g. single-node 8x B200: - torchrun --nproc_per_node=8 run_deepep.py \\ - --runner b200-dgxc --topology-class b200-nvlink-island --transport nvlink \\ - --env-json results/env.json --out results/b200_deepep.json -""" -from __future__ import annotations - -import argparse -import datetime as _dt -import hashlib -import json -import os -import sys - -SCHEMA_VERSION = 1 -MEASUREMENT_CONTRACT = "deepep-normal-v1" - - -def _percentile(xs: list[float], q: float) -> float: - if not xs: - return float("nan") - s = sorted(xs) - i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1))))) - return s[i] - - -def comparison_key(meta: dict) -> str: - parts = [ - meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]), - str(meta["nodes"]), meta["topology_class"], meta["comparison_class"], - meta["measurement_contract"], str(meta["shape"]), - ] - return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] - - -def main() -> int: - ap = argparse.ArgumentParser(description="CollectiveX DeepEP dispatch+combine (normal mode)") - # shape (decode-ish default from the plan) - ap.add_argument("--tokens-per-rank", type=int, default=64) - ap.add_argument("--hidden", type=int, default=7168) - ap.add_argument("--topk", type=int, default=8) - ap.add_argument("--experts", type=int, default=256) - ap.add_argument("--dispatch-dtype", default="fp8", choices=["fp8", "bf16"]) - ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"]) - ap.add_argument("--seed", type=int, default=67) - # measurement - ap.add_argument("--warmup", type=int, default=20) - ap.add_argument("--iters", type=int, default=200) - 
ap.add_argument("--trials", type=int, default=3) - ap.add_argument("--num-sms", type=int, default=24, help="communication SMs (standardized budget)") - # provenance - ap.add_argument("--runner", required=True) - ap.add_argument("--topology-class", required=True) - ap.add_argument("--transport", default="") - ap.add_argument("--comparison-class", default="standardized") - ap.add_argument("--deepep-commit", default=os.environ.get("DEEPEP_COMMIT", "unknown")) - ap.add_argument("--env-json") - ap.add_argument("--timestamp") - ap.add_argument("--out", required=True) - args = ap.parse_args() - - # ---- imports guarded so a missing build fails loudly, not cryptically ---- - try: - import torch - import torch.distributed as dist - except Exception as exc: # pragma: no cover - print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr) - return 3 - try: - from deep_ep import Buffer # type: ignore - except Exception as exc: # pragma: no cover - print( - "ERROR: deep_ep import failed — DeepEP must be built at job setup " - f"(rebuild-deepep). {exc!r}", - file=sys.stderr, - ) - return 3 - - rank = int(os.environ.get("RANK", "0")) - world_size = int(os.environ.get("WORLD_SIZE", "1")) - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - torch.cuda.set_device(local_rank) - if not dist.is_initialized(): - dist.init_process_group("nccl") - group = dist.group.WORLD - device = torch.device(f"cuda:{local_rank}") - torch.manual_seed(args.seed + rank) - - n = args.tokens_per_rank - H = args.hidden - topk = args.topk - E = args.experts - - # Input tokens + routing. Weights sum to 1 per token so that a pure - # dispatch->combine round trip (no expert compute) reconstructs x. 
- x = torch.randn((n, H), dtype=torch.bfloat16, device=device) - if args.routing == "uniform": - topk_idx = torch.stack([ - torch.randperm(E, device=device)[:topk] for _ in range(n) - ]).to(torch.int64) - else: # zipf-ish skew toward low expert ids - probs = (1.0 / torch.arange(1, E + 1, device=device).float()) - topk_idx = torch.multinomial(probs.expand(n, E), topk, replacement=False).to(torch.int64) - topk_weights = torch.softmax(torch.randn((n, topk), device=device, dtype=torch.float32), dim=-1) - - # Buffer sizing: intranode uses NVLink buffer only (no RDMA for single node). - # Numbers follow DeepEP's intranode test guidance; tune per build. - num_nvl_bytes = 1024 * 1024 * 1024 - num_rdma_bytes = 0 - buffer = Buffer(group, num_nvl_bytes, num_rdma_bytes) - # Apply the standardized communication-SM budget so the recorded - # num_comm_sms reflects the actual run (best-effort across DeepEP versions). - try: - Buffer.set_num_sms(args.num_sms) - except Exception as exc: # pragma: no cover - API/version dependent - if rank == 0: - print(f"WARN: could not set num_sms={args.num_sms}: {exc!r}", file=sys.stderr) - - def run_once(): - # ===================== ADAPT HERE (DeepEP API) ======================= - # Normal-mode intranode dispatch/combine. Signatures below match the - # documented DeepEP normal API; confirm against the built commit. 
- (num_tokens_per_rank, _, num_tokens_per_expert, - is_token_in_rank, _) = buffer.get_dispatch_layout(topk_idx, E) - recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = buffer.dispatch( - x, - topk_idx=topk_idx, - topk_weights=topk_weights, - num_tokens_per_rank=num_tokens_per_rank, - is_token_in_rank=is_token_in_rank, - num_tokens_per_expert=num_tokens_per_expert, - ) - combined_x, _, _ = buffer.combine(recv_x, handle, topk_weights=recv_topk_weights) - # ===================================================================== - return combined_x, num_tokens_per_expert, is_token_in_rank - - # ---- correctness gate (run before timing; a fast wrong answer is invalid) ---- - combined_x, num_tokens_per_expert, is_token_in_rank = run_once() - torch.cuda.synchronize() - expected_routed = n * topk - routed = int(torch.as_tensor(num_tokens_per_expert).sum().item()) - token_conservation = (routed == expected_routed) - # DeepEP combine sums one copy of each token per destination RANK, so the - # dispatch->combine round trip reconstructs x only after dividing by the - # number of ranks each token was sent to (per DeepEP's own check in - # tests/legacy/test_intranode.py: combined_x / is_token_in_rank.sum(dim=1)). 
- ranks_per_token = is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float() - check_x = combined_x.float() / ranks_per_token - max_abs = (check_x - x.float()).abs().max().item() - max_rel = (max_abs / (x.float().abs().max().item() + 1e-6)) - combine_ok = max_rel < 2e-2 # bf16 dispatch/combine round-trip tolerance - correct = bool(token_conservation and combine_ok) - - # ---- timing (CUDA events; per-rank; reduce for slowest rank) ---- - def time_ms(fn, warmup, iters) -> list[float]: - for _ in range(warmup): - fn() - torch.cuda.synchronize() - out = [] - for _ in range(iters): - s = torch.cuda.Event(enable_timing=True) - e = torch.cuda.Event(enable_timing=True) - s.record() - fn() - e.record() - torch.cuda.synchronize() - out.append(s.elapsed_time(e) * 1000.0) # ms -> us - return out - - def dispatch_only(): - (npr, _, npe, itir, _) = buffer.get_dispatch_layout(topk_idx, E) - buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights, - num_tokens_per_rank=npr, is_token_in_rank=itir, - num_tokens_per_expert=npe) - - trials = [] - for _ in range(args.trials): - rt = time_ms(run_once, args.warmup, args.iters) # dispatch+combine round trip - dp = time_ms(dispatch_only, args.warmup, args.iters) # dispatch only - trials.append({ - "roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99), - "dispatch_us_p50": _percentile(dp, 50), - }) - - local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) - # slowest rank across the world - t = torch.tensor([local_rt_p50], device=device) - dist.all_reduce(t, op=dist.ReduceOp.MAX) - slowest_rank_us = float(t.item()) - - if rank == 0: - shape = { - "tokens_per_rank": n, "hidden": H, "topk": topk, "experts": E, - "dispatch_dtype": args.dispatch_dtype, "routing": args.routing, - "num_comm_sms": args.num_sms, - } - meta = { - "op": "dispatch-combine", "backend": "deepep", "mode": "normal", - "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")), - "topology_class": 
args.topology_class, "comparison_class": args.comparison_class, - "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape, - } - tokens_total = n * world_size - rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) - env = None - if args.env_json and os.path.exists(args.env_json): - with open(args.env_json) as _fh: - env = json.load(_fh) - doc = { - "schema_version": SCHEMA_VERSION, - "family": "moe", - "generated_by": "run_deepep.py", - "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(), - "runner": args.runner, - "transport": args.transport, - "status": "valid" if correct else "invalid", - "comparison_key": comparison_key(meta), - "backend_provenance": {"deepep_commit": args.deepep_commit}, - **meta, - "correctness": { - "passed": correct, "token_conservation": token_conservation, - "combine_within_tol": combine_ok, "max_abs_error": max_abs, "max_rel_error": max_rel, - }, - "metrics": { - "roundtrip_us_p50": rt_p50, - "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials), - "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials), - "slowest_rank_roundtrip_us": slowest_rank_us, - "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None, - }, - "trials": trials, - "environment": env, - } - os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) - with open(args.out, "w") as fh: - json.dump(doc, fh, indent=2) - fh.write("\n") - print( - f"deepep dispatch-combine: status={doc['status']} " - f"rt_p50={rt_p50:.1f}us slowest_rank={slowest_rank_us:.1f}us " - f"correct={correct} -> {args.out}" - ) - - dist.barrier() - dist.destroy_process_group() - return 0 if correct else 1 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/experimental/CollectiveX/run_mori.py b/experimental/CollectiveX/run_mori.py deleted file mode 100644 index f99775427..000000000 --- a/experimental/CollectiveX/run_mori.py +++ /dev/null @@ -1,280 +0,0 @@ 
-#!/usr/bin/env python3 -"""CollectiveX spike — MoRI (AMD) MoE dispatch+combine, normal mode. - -AMD counterpart to run_deepep.py, using ROCm MoRI's EpDispatchCombine op. One -decode-shaped dispatch+combine point, correctness-gated, CUDA-event timed, -emitting the same flat-JSON shape (family=moe, backend=mori). - - VALIDATED on MI355X (8x, image rocm/sgl-dev:...-mori-0227-2): dispatch+combine - numerically correct (combine within tol, max_rel ~2e-3), ~85 us round-trip at - the decode shape. The config/dispatch/combine API follows ROCm/mori's reference - test. Three constraints on this ionic_rdma fabric are handled here: (1) MoRI - registers the whole symmetric heap as ONE RDMA MR and these NICs cap GPU-memory - MRs at ~4 GiB, so the heap is held at 2 GiB (above); (2) max_num_inp_token_per_rank - is bounded so the buffers fit that heap (below); (3) MoRI's shmem teardown - asserts after finalize, so we hard-exit after writing results (end of main). - -Launch (one process per GPU), e.g. single-node 8x MI355X: - torchrun --nproc_per_node=8 run_mori.py \\ - --runner mi355x-amds --topology-class mi355x-xgmi --transport xgmi \\ - --env-json results/env.json --out results/mi355x_mori.json -""" -from __future__ import annotations - -import argparse -import datetime as _dt -import hashlib -import json -import os -import sys - -# MoRI registers the WHOLE symmetric heap as one RDMA memory region at shmem -# init (set this BEFORE `import mori`). On the MI355X ionic_rdma NICs the GPU- -# memory MR registration has a hard size ceiling (~4 GiB): a 6 GiB heap fails -# (`RegisterRdmaMemoryRegion ... errno 22 EINVAL`, validated on-node), while -# 2 GiB registers cleanly. So keep the heap at 2 GiB and instead bound the -# buffers via max_num_inp_token_per_rank below. Layered override: -# explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > "2G". 
-os.environ.setdefault("MORI_SHMEM_HEAP_SIZE", - os.environ.get("CX_MORI_HEAP_SIZE", "2G")) - -SCHEMA_VERSION = 1 -MEASUREMENT_CONTRACT = "mori-normal-v1" - - -def _percentile(xs: list[float], q: float) -> float: - if not xs: - return float("nan") - s = sorted(xs) - i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1))))) - return s[i] - - -def comparison_key(meta: dict) -> str: - parts = [ - meta["op"], meta["backend"], meta["mode"], str(meta["world_size"]), - str(meta["nodes"]), meta["topology_class"], meta["comparison_class"], - meta["measurement_contract"], str(meta["shape"]), - ] - return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16] - - -def main() -> int: - ap = argparse.ArgumentParser(description="CollectiveX MoRI dispatch+combine (normal mode)") - ap.add_argument("--tokens-per-rank", type=int, default=64) - ap.add_argument("--hidden", type=int, default=7168) - ap.add_argument("--topk", type=int, default=8) - ap.add_argument("--experts", type=int, default=256) - ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"]) - ap.add_argument("--seed", type=int, default=67) - ap.add_argument("--warmup", type=int, default=20) - ap.add_argument("--iters", type=int, default=200) - ap.add_argument("--trials", type=int, default=3) - ap.add_argument("--block-num", type=int, default=int(os.environ.get("CX_MORI_BLOCK_NUM", "80"))) - ap.add_argument("--dispatch-warps", type=int, default=int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))) - ap.add_argument("--combine-warps", type=int, default=int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))) - ap.add_argument("--runner", required=True) - ap.add_argument("--topology-class", required=True) - ap.add_argument("--transport", default="") - ap.add_argument("--comparison-class", default="standardized") - ap.add_argument("--mori-commit", default=os.environ.get("MORI_COMMIT", "unknown")) - ap.add_argument("--env-json") - ap.add_argument("--timestamp") - ap.add_argument("--out", 
required=True) - args = ap.parse_args() - - try: - import torch - import torch.distributed as dist - except Exception as exc: # pragma: no cover - print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr) - return 3 - try: - import mori # type: ignore - except Exception as exc: # pragma: no cover - print(f"ERROR: mori import failed — needs the AMD MoRI image. {exc!r}", file=sys.stderr) - return 3 - - rank = int(os.environ.get("RANK", "0")) - world_size = int(os.environ.get("WORLD_SIZE", "1")) - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - torch.cuda.set_device(local_rank) - device = torch.device(f"cuda:{local_rank}") - if world_size % 1 != 0 or args.experts % world_size != 0: - if rank == 0: - print(f"ERROR: experts ({args.experts}) must divide world_size ({world_size})", file=sys.stderr) - return 2 - experts_per_rank = args.experts // world_size - torch.manual_seed(args.seed + rank) - - # ===================== ADAPT HERE (MoRI API) ========================= - # init torch.distributed + MoRI shmem (per the MoRI dispatch/combine test). - os.environ.setdefault("MASTER_ADDR", "localhost") - os.environ.setdefault("MASTER_PORT", "12355") - if not dist.is_initialized(): - dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank, - world_size=world_size, device_id=device) - world_group = torch.distributed.group.WORLD - torch._C._distributed_c10d._register_process_group("default", world_group) - mori.shmem.shmem_torch_process_group_init("default") - - n = args.tokens_per_rank - H = args.hidden - topk = args.topk - config = mori.ops.EpDispatchCombineConfig( - data_type=torch.bfloat16, - rank=rank, - world_size=world_size, - hidden_dim=H, - scale_dim=0, - scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(), - max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(), - # Sizes MoRI's symmetric buffers. The reference test uses 4096, but at - # hidden=7168 that overflows the registerable 2 GiB heap (see top). 
Bound - # it to the workload (decode shapes are tens of tokens/rank); 512 fits the - # 2 GiB heap and was validated on-node. Larger token counts may need a - # heap above the NIC's MR ceiling — out of reach on this fabric for now. - max_num_inp_token_per_rank=max(512, n), - num_experts_per_rank=experts_per_rank, - num_experts_per_token=topk, - use_external_inp_buf=False, - quant_type="none", - ) - op = mori.ops.EpDispatchCombineOp(config) - - # Routing: each token -> topk distinct experts in [0, experts). MoRI expects - # INT32 expert indices, and a real (n, scale_dim) fp8 scales tensor even when - # scale_dim==0 (an (n,0) tensor) — not None (see the reference test). - x = torch.randn((n, H), dtype=torch.bfloat16, device=device) - indices = torch.stack([torch.randperm(args.experts, device=device)[:topk] for _ in range(n)]).to(torch.int32) - weights = torch.rand((n, topk), dtype=torch.float32, device=device) - scales = torch.empty((n, 0), dtype=torch.float8_e4m3fnuz, device=device) - - def run_once(): - (dispatch_output, dispatch_weights, _dispatch_scales, - dispatch_indices, recv_num) = op.dispatch( - x, weights, scales, indices, - block_num=args.block_num, warp_per_block=args.dispatch_warps) - # Zero-copy mode (use_external_inp_buf=False): combine reads from MoRI's - # registered combine-input buffer, so stage the dispatched rows into it - # first. (In a real MoE the expert FFN writes its outputs here; with no - # expert compute we copy the dispatched activations straight through.) 
- total_recv = int(recv_num[0].item()) - combine_input = dispatch_output.to(torch.bfloat16) - combine_buf = op.get_registered_combine_input_buffer( - torch.bfloat16, hidden_dim=combine_input.size(1)) - combine_buf[:total_recv, :].copy_(combine_input[:total_recv, :]) - combined, _combined_w = op.combine( - combine_input, dispatch_weights, dispatch_indices, - block_num=args.block_num, warp_per_block=args.combine_warps) - # Return total_recv (read BEFORE combine — combine resets recv_num), not - # the tensor: reading recv_num[0] after combine yields 0 (false negative). - return combined, total_recv - # ===================================================================== - - # ---- correctness gate ---- - combined, total_recv = run_once() - torch.cuda.synchronize() - # MoRI combine sums one copy per destination RANK, so combined[i] ≈ - # input[i] * (#unique destination ranks among the token's topk experts) - # (see ROCm/mori .../test_dispatch_combine.py). combine returns the full - # max_num_inp_token_per_rank-sized buffer; only the first n rows are our - # local input tokens, so slice to [:n] before comparing. - combined_valid = combined[:n].float() - pes = indices.long() // experts_per_rank - unique_pes = torch.tensor( - [len(set(row.tolist())) for row in pes], device=device, dtype=torch.float32 - ).unsqueeze(1) - expected = x.float() * unique_pes - max_abs = (combined_valid - expected).abs().max().item() - max_rel = max_abs / (expected.abs().max().item() + 1e-6) - # Validated tolerance from the reference test (bf16 + up-to-topk summation). 
- combine_ok = bool(torch.allclose(combined_valid, expected.float(), atol=1e-2, rtol=1e-2)) - recv_ok = total_recv > 0 - correct = bool(combine_ok and recv_ok) - - def time_us(fn, warmup, iters) -> list[float]: - for _ in range(warmup): - fn() - torch.cuda.synchronize() - out = [] - for _ in range(iters): - s = torch.cuda.Event(enable_timing=True) - e = torch.cuda.Event(enable_timing=True) - s.record(); fn(); e.record(); torch.cuda.synchronize() - out.append(s.elapsed_time(e) * 1000.0) - return out - - def dispatch_only(): - op.dispatch(x, weights, scales, indices, - block_num=args.block_num, warp_per_block=args.dispatch_warps) - - trials = [] - for _ in range(args.trials): - rt = time_us(run_once, args.warmup, args.iters) - dp = time_us(dispatch_only, args.warmup, args.iters) - trials.append({"roundtrip_us_p50": _percentile(rt, 50), "roundtrip_us_p99": _percentile(rt, 99), - "dispatch_us_p50": _percentile(dp, 50)}) - - local_rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) - t = torch.tensor([local_rt_p50], device=device) - dist.all_reduce(t, op=dist.ReduceOp.MAX) - slowest_rank_us = float(t.item()) - - if rank == 0: - shape = {"tokens_per_rank": n, "hidden": H, "topk": topk, "experts": args.experts, - "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype} - meta = {"op": "dispatch-combine", "backend": "mori", "mode": "normal", - "world_size": world_size, "nodes": int(os.environ.get("SLURM_NNODES", "1")), - "topology_class": args.topology_class, "comparison_class": args.comparison_class, - "measurement_contract": MEASUREMENT_CONTRACT, "shape": shape} - rt_p50 = sum(t["roundtrip_us_p50"] for t in trials) / len(trials) - tokens_total = n * world_size - env = None - if args.env_json and os.path.exists(args.env_json): - with open(args.env_json) as fh: - env = json.load(fh) - doc = { - "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "run_mori.py", - "generated_at": args.timestamp or 
_dt.datetime.now().astimezone().isoformat(), - "runner": args.runner, "transport": args.transport, - "status": "valid" if correct else "invalid", - "comparison_key": comparison_key(meta), - "backend_provenance": {"mori_commit": args.mori_commit, - "block_num": args.block_num, - "dispatch_warps": args.dispatch_warps, - "combine_warps": args.combine_warps}, - **meta, - "correctness": {"passed": correct, "combine_within_tol": combine_ok, - "recv_nonzero": recv_ok, "max_abs_error": max_abs, "max_rel_error": max_rel}, - "metrics": { - "roundtrip_us_p50": rt_p50, - "roundtrip_us_p99": sum(t["roundtrip_us_p99"] for t in trials) / len(trials), - "dispatch_us_p50": sum(t["dispatch_us_p50"] for t in trials) / len(trials), - "slowest_rank_roundtrip_us": slowest_rank_us, - "tokens_per_second": (tokens_total / (rt_p50 * 1e-6)) if rt_p50 else None, - }, - "trials": trials, "environment": env, - } - os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) - with open(args.out, "w") as fh: - json.dump(doc, fh, indent=2) - fh.write("\n") - print(f"mori dispatch-combine: status={doc['status']} rt_p50={rt_p50:.1f}us " - f"slowest_rank={slowest_rank_us:.1f}us correct={correct} -> {args.out}") - - # MoRI's shmem teardown asserts when the EpDispatchCombineOp is destroyed - # after shmem_finalize() (CheckStatusValid abort -> SIGABRT on this build, - # validated on-node). The result JSON is already written above, so just sync - # the ranks and hard-exit, skipping the buggy finalize/destructor path. 
- try: - dist.barrier() - except Exception: - pass - sys.stdout.flush() - sys.stderr.flush() - os._exit(0 if correct else 1) - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py index 013ce3151..90be0e480 100644 --- a/experimental/CollectiveX/summarize.py +++ b/experimental/CollectiveX/summarize.py @@ -117,6 +117,32 @@ def _fnum(x, fmt): return format(x, fmt) if isinstance(x, (int, float)) else "—" +def _moe_sorted(moe): + return sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""), x.get("ep_size", 0))) + + +def _moe_sweep_table(d): + """Markdown sweep table for one EP doc — the rows already ARE the ladder, so + emit one row per source-tokens-per-rank point. Skips old single-point docs + (no rows[]).""" + rows = d.get("rows") + if not rows: + return [] + sh = d.get("shape", {}) + head = (f"\n**`{d.get('backend')}` · {d.get('phase')} · ep{d.get('ep_size')} · " + f"H{sh.get('hidden')} top{sh.get('topk')} E{sh.get('experts')} " + f"{sh.get('dispatch_dtype')} {sh.get('routing')}** — latency vs source tokens/rank\n") + out = [head, + "| tokens/rank | global tokens | dispatch µs | combine µs | round-trip µs | tokens/s | recv tok | correct |", + "|--:|--:|--:|--:|--:|--:|--:|:--:|"] + for r in rows: + out.append(f"| {r.get('tokens_per_rank')} | {r.get('global_tokens')} | " + f"{_fnum(r.get('dispatch_us_p50'), '.2f')} | {_fnum(r.get('combine_us_p50'), '.2f')} | " + f"{_fnum(r.get('roundtrip_us_p50'), '.2f')} | {_fnum(r.get('tokens_per_second'), '.3e')} | " + f"{r.get('recv_tokens', '—')} | {'✅' if r.get('correct') else '❌'} |") + return out + + def render_plain(nccl, moe, n_valid, total) -> str: out = [] hdr = "CollectiveX results" @@ -133,15 +159,14 @@ def render_plain(nccl, moe, n_valid, total) -> str: out.append(f" {d['op']:<16}{d.get('status',''):<9}{_peak_busbw(rows):>12.1f}" f"{_lat_floor(rows):>10.2f}{(avg if avg is not None else float('nan')):>11.1f}") 
if moe: - out.append("\nMoE dispatch+combine (DeepEP / MoRI):") - out.append(f" {'backend':<10}{'mode':<8}{'status':<9}{'rt_p50':>9}{'rt_p99':>9}{'disp_p50':>10}{'tokens/s':>13} correct") - for d in sorted(moe, key=lambda x: x.get("backend", "")): + out.append("\nMoE EP dispatch/combine (DeepEP / MoRI) — headline (* = headline tokens/rank):") + out.append(f" {'backend':<9}{'phase':<8}{'ep':>3} {'status':<9}{'T*':>5}{'disp_p50':>10}{'comb_p50':>10}{'rt_p50':>9} correct") + for d in sorted(moe, key=lambda x: (x.get("backend", ""), x.get("phase", ""))): m, c = d.get("metrics", {}), d.get("correctness", {}) - tps = m.get("tokens_per_second") - out.append(f" {d.get('backend',''):<10}{d.get('mode',''):<8}{d.get('status',''):<9}" - f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f}{(m.get('roundtrip_us_p99') or float('nan')):>9.1f}" - f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}" - f"{(tps if tps is not None else float('nan')):>13.3e} {c.get('passed')}") + out.append(f" {d.get('backend',''):<9}{d.get('phase',''):<8}{str(d.get('ep_size','')):>3} {d.get('status',''):<9}" + f"{str(m.get('headline_tokens_per_rank','')):>5}" + f"{(m.get('dispatch_us_p50') or float('nan')):>10.1f}{(m.get('combine_us_p50') or float('nan')):>10.1f}" + f"{(m.get('roundtrip_us_p50') or float('nan')):>9.1f} {c.get('passed')}") return "\n".join(out) @@ -167,15 +192,22 @@ def render_markdown(nccl, moe, n_valid, total) -> str: "reduce-scatter / all-to-all; all-gather input/rank = size ÷ #GPUs). 
Small " "sizes are latency-bound (busbw ≈ 0); peak bandwidth is at the largest size.") if moe: - out.append("\n### MoE dispatch+combine (DeepEP / MoRI)\n") - out.append("| backend | mode | status | rt p50 (µs) | rt p99 (µs) | dispatch p50 (µs) | tokens/s | correct |") - out.append("|---|---|---|--:|--:|--:|--:|:--:|") - for d in sorted(moe, key=lambda x: x.get("backend", "")): + out.append("\n### MoE EP dispatch / combine (DeepEP / MoRI)\n") + out.append("Headline = the reference point (tokens/rank shown as `T*`); the per-line " + "sweep tables below carry the full source-tokens-per-rank curve.\n") + out.append("| backend | phase | ep | status | T\\* | dispatch p50 (µs) | combine p50 (µs) | round-trip p50 (µs) | tokens/s | correct |") + out.append("|---|---|--:|---|--:|--:|--:|--:|--:|:--:|") + for d in _moe_sorted(moe): m, c = d.get("metrics", {}), d.get("correctness", {}) - out.append(f"| `{d.get('backend')}` | {d.get('mode')} | {_emoji(d.get('status'))} | " - f"{_fnum(m.get('roundtrip_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p99'), '.1f')} | " - f"{_fnum(m.get('dispatch_us_p50'), '.1f')} | {_fnum(m.get('tokens_per_second'), '.3e')} | " - f"{'✅' if c.get('passed') else '❌'} |") + out.append(f"| `{d.get('backend')}` | {d.get('phase','')} | {d.get('ep_size','')} | {_emoji(d.get('status'))} | " + f"{m.get('headline_tokens_per_rank','—')} | {_fnum(m.get('dispatch_us_p50'), '.1f')} | " + f"{_fnum(m.get('combine_us_p50'), '.1f')} | {_fnum(m.get('roundtrip_us_p50'), '.1f')} | " + f"{_fnum(m.get('tokens_per_second'), '.3e')} | {'✅' if c.get('passed') else '❌'} |") + for d in _moe_sorted(moe): + out += _moe_sweep_table(d) + out.append("\n> EP sweep: only source tokens/rank varies along a line; global tokens = " + "tokens/rank × ep. 
Dispatch and combine are timed **separately** (combine's " + "setup dispatch runs untimed); round-trip = dispatch + combine.") if not total: out.append("\n> No result files found — the benchmark produced nothing.") return "\n".join(out) diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py new file mode 100644 index 000000000..c54ccd00f --- /dev/null +++ b/experimental/CollectiveX/tests/ep_deepep.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +"""CollectiveX EP backend adapter — DeepEP (NVIDIA), normal mode. + +Ports the validated dispatch/combine sequence from the old run_deepep.py into the +ep_harness Backend protocol. The harness owns the token sweep + separated timing; +this file owns only DeepEP's API calls and its correctness reference. + + !!! DeepEP's Python API is VERSION-SENSITIVE (V2 moved NVSHMEM->NCCL and unified + the APIs). The dispatch/combine block follows the documented normal-mode + intranode API; validate against the deep_ep commit actually built at job time + (rebuild-deepep) and recorded in provenance. + +Correctness (per DeepEP's tests/legacy/test_intranode.py): a pure dispatch->combine +round trip with no expert compute reconstructs x only after dividing by the number +of ranks each token was sent to, i.e. combined_x / is_token_in_rank.sum(dim=1). +So the harness expects combined ≈ x * ranks_per_token. +""" +from __future__ import annotations + +import os +import sys +import types + +import torch +import torch.distributed as dist + +try: + from deep_ep import Buffer # type: ignore +except Exception as exc: # pragma: no cover - needs the built DeepEP + print("ERROR: deep_ep import failed — DeepEP must be built at job setup " + f"(rebuild-deepep). 
{exc!r}", file=sys.stderr) + raise + + +class DeepEPBackend: + name = "deepep" + mode = "normal" + measurement_contract = "deepep-normal-v1" + combine_needs_redispatch = False # DeepEP combine reuses the handle (its own bench does too) + + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.group = dist.group.WORLD + # Intranode normal mode: NVLink buffer only (no RDMA for single node). Size + # to hold the largest sweep point's routed traffic. Prefill's large-T points + # (up to 4096 tok/rank) need a bigger buffer than decode — validated on + # B200 (EP8) and GB200 (EP4) at 4 GiB through T=4096; decode is fine at 2 GiB. + # Override with CX_DEEPEP_NVL_BYTES. + _default_nvl = (4 if args.phase == "prefill" else 2) * 1024 * 1024 * 1024 + num_nvl_bytes = int(os.environ.get("CX_DEEPEP_NVL_BYTES", str(_default_nvl))) + self.buffer = Buffer(self.group, num_nvl_bytes, 0) + try: + Buffer.set_num_sms(args.num_comm_sms) + except Exception as exc: # pragma: no cover - version dependent + if rank == 0: + print(f"WARN: could not set num_sms={args.num_comm_sms}: {exc!r}", file=sys.stderr) + self.backend_provenance = { + "deepep_commit": os.environ.get("DEEPEP_COMMIT", "unknown"), + "num_nvl_bytes": num_nvl_bytes, + "num_comm_sms": args.num_comm_sms, + } + if args.dispatch_dtype == "fp8" and rank == 0: + print("WARN: deepep fp8 dispatch payload not wired for the exact-reconstruction " + "gate yet; using bf16. 
(provenance reflects bf16.)", file=sys.stderr) + args.dispatch_dtype = "bf16" + + def buffer_cap(self, args): + return None # NVLink buffer is large; no hard per-T ceiling like MoRI's heap + + def make_problem(self, T): + a = self.args + H, topk, E = a.hidden, a.topk, a.experts + x = torch.randn((T, H), dtype=torch.bfloat16, device=self.device) + if a.routing == "zipf": + probs = (1.0 / torch.arange(1, E + 1, device=self.device).float()) + topk_idx = torch.multinomial(probs.expand(T, E), topk, replacement=False).to(torch.int64) + else: # balanced / uniform: topk distinct experts drawn uniformly per token + topk_idx = torch.stack([ + torch.randperm(E, device=self.device)[:topk] for _ in range(T) + ]).to(torch.int64) + topk_weights = torch.softmax( + torch.randn((T, topk), device=self.device, dtype=torch.float32), dim=-1) + return types.SimpleNamespace(T=T, x=x, topk_idx=topk_idx, topk_weights=topk_weights) + + def dispatch(self, p): + # ===================== DeepEP normal-mode dispatch ===================== + (num_tokens_per_rank, _, num_tokens_per_expert, + is_token_in_rank, _) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts) + recv_x, recv_topk_idx, recv_topk_weights, _, handle, _ = self.buffer.dispatch( + p.x, topk_idx=p.topk_idx, topk_weights=p.topk_weights, + num_tokens_per_rank=num_tokens_per_rank, is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=num_tokens_per_expert) + # ======================================================================= + return types.SimpleNamespace( + recv_x=recv_x, recv_topk_weights=recv_topk_weights, handle=handle, + is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert) + + def stage(self, p, h): + # DeepEP combine consumes recv_x directly (no separate registered buffer to + # stage into) — the "expert outputs" are recv_x itself for a pure round trip. 
+ return None + + def combine(self, p, h): + combined_x, _, _ = self.buffer.combine(h.recv_x, h.handle, topk_weights=h.recv_topk_weights) + return combined_x + + def expected(self, p, h): + # combined ≈ x * (#ranks each token was dispatched to) + ranks_per_token = h.is_token_in_rank.sum(dim=1, keepdim=True).clamp(min=1).float() + return p.x.float() * ranks_per_token, p.T + + def recv_tokens(self, h): + return int(h.recv_x.shape[0]) + + def finalize(self, rc): + try: + dist.barrier() + dist.destroy_process_group() + except Exception: + pass + return rc diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py new file mode 100644 index 000000000..01214a3de --- /dev/null +++ b/experimental/CollectiveX/tests/ep_harness.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness. + +Backend-agnostic core for the EP benchmark. The per-backend adapters +(`ep_deepep.py`, `ep_mori.py`) implement a small duck-typed protocol; this module +owns everything else: the source-tokens-per-rank sweep, the SEPARATED dispatch / +combine / round-trip timing, the correctness gate, and the provenance-tagged JSON +doc the summarizer + plotter consume. + +Measurement model (see the CollectiveX EP framework notes): + * Primary x-axis is SOURCE TOKENS PER RANK, T in {1,2,4,8,...}. One row per T. + Only T varies along a line; everything else (backend, ep degree, phase, + precision, top-k, experts, hidden, routing, mode, comm-SMs) is FIXED and + identifies the line. + * Dispatch and combine are SEPARATE measurements. The combine timing window + contains ONLY combine(): the dispatch that produces its handle/layout (and + the "expert outputs" staged into the combine input) runs UNTIMED. The + round-trip is a third, distinct measurement (dispatch + combine). 
+ * Both x values are recorded per row — tokens_per_rank and + global_tokens = T * ep_size — so a frontend can toggle weak-scaling (fixed + tokens/rank) vs strong-scaling (fixed global tokens) without re-running. + +stdlib-only at module top (torch is passed in by the entrypoint after a guarded +import) so this file `py_compile`s on a machine without torch. + +Backend protocol (see ep_deepep.py / ep_mori.py): + name: str # "deepep" | "mori" + mode: str # "normal" | "ll" + measurement_contract: str # e.g. "deepep-normal-v1" + combine_needs_redispatch: bool # True if combine consumes the dispatch state + backend_provenance: dict + buffer_cap(args) -> int|None # max T the backend's buffers can hold (None = unbounded) + make_problem(T) -> problem # build x[T,H], topk_idx[T,topk], topk_weights, scales + dispatch(problem) -> handle # ONLY the dispatch comm op (timed for dispatch-only) + stage(problem, handle) # untimed: place "expert outputs" into combine input + combine(problem, handle) -> tensor # ONLY the combine comm op (timed for combine-only) + expected(problem, handle) -> (tensor, n_compare) # reference for the gate + recv_tokens(handle) -> int # realized tokens received this rank (comm volume) + finalize(rc) -> int|NoReturn # clean shutdown (mori hard-exits) +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import os + +SCHEMA_VERSION = 1 + +# Phase-default sweeps. Decode: a handful of active sequences per rank (small T). +# Prefill: a chunk of context tokens per rank (large T). Powers of two so the +# x-axis is even on a log scale. Either is overridable via --tokens-ladder; both +# get clamped to the backend's buffer ceiling (MoRI's registerable heap). +DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128] +PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096] + +# bytes per element of the dispatch payload, for the comm-volume / algbw estimate. 
+_DTYPE_BYTES = {"bf16": 2, "fp16": 2, "fp8": 1} + + +def add_common_args(ap: argparse.ArgumentParser) -> None: + """CLI args shared by every backend (the entrypoint adds --backend).""" + # workload shape — FIXED params identify the line; only --tokens-ladder sweeps. + ap.add_argument("--phase", default="decode", choices=["decode", "prefill"], + help="decode (small T) or prefill (large T); picks the default ladder") + ap.add_argument("--tokens-ladder", default="", + help="space/comma-separated source-tokens-per-rank sweep; blank = phase default") + ap.add_argument("--hidden", type=int, default=7168) + ap.add_argument("--topk", type=int, default=8) + ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across ep degrees)") + ap.add_argument("--dispatch-dtype", default="bf16", choices=["bf16", "fp8"]) + ap.add_argument("--routing", default="balanced", choices=["balanced", "uniform", "zipf"]) + ap.add_argument("--num-comm-sms", type=int, default=24, help="standardized communication-SM budget") + ap.add_argument("--num-ep-groups", type=int, default=1, + help="concurrent EP groups on the node (1 = the ordinary line; >1 is a distinct experiment)") + ap.add_argument("--seed", type=int, default=67) + # measurement + ap.add_argument("--warmup", type=int, default=10) + ap.add_argument("--iters", type=int, default=50) + # provenance + ap.add_argument("--runner", required=True) + ap.add_argument("--topology-class", required=True) + ap.add_argument("--transport", default="") + ap.add_argument("--comparison-class", default="standardized") + ap.add_argument("--env-json") + ap.add_argument("--timestamp") + ap.add_argument("--out", required=True) + + +def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]: + """Return (ladder, dropped). 
Parse an explicit spec else the phase default; + keep only positive ints; clamp to `cap` (backend buffer ceiling) and report + what was dropped so truncation is never silent.""" + if spec and spec.strip(): + raw = [t.strip() for t in spec.replace(",", " ").split()] + want = [int(t) for t in raw if t] + else: + want = DECODE_LADDER if phase == "decode" else PREFILL_LADDER + want = sorted({t for t in want if t > 0}) + if cap is not None: + kept = [t for t in want if t <= cap] + dropped = [t for t in want if t > cap] + else: + kept, dropped = want, [] + return kept, dropped + + +def percentile(xs: list[float], q: float) -> float: + if not xs: + return float("nan") + s = sorted(xs) + i = max(0, min(len(s) - 1, int(round(q / 100.0 * (len(s) - 1))))) + return s[i] + + +def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]: + """CUDA-event timing in microseconds. + + Without `pre`: times `fn()`. With `pre`: runs `pre()` UNTIMED each iteration + (with a sync before the start event so its GPU work cannot bleed into the + measured window), then times `fn(pre_result)`. `pre` is how combine is + isolated for a backend whose combine consumes the dispatch state and so needs + a fresh dispatch+stage before every combine sample. + """ + def sample(): + arg = None + if pre is not None: + arg = pre() + torch.cuda.synchronize() + s = torch.cuda.Event(enable_timing=True) + e = torch.cuda.Event(enable_timing=True) + s.record() + fn(arg) if pre is not None else fn() + e.record() + torch.cuda.synchronize() + return s.elapsed_time(e) * 1000.0 # ms -> us + + for _ in range(max(0, warmup)): + if pre is not None: + a = pre() + torch.cuda.synchronize() + fn(a) + else: + fn() + torch.cuda.synchronize() + return [sample() for _ in range(iters)] + + +def comparison_key(meta: dict) -> str: + """Machine key gating which rows share a curve. 
Built from the FIXED config + ONLY — tokens_per_rank is the x-axis and MUST NOT be in the key, or every + sweep point would read as a different line. ep_size, num_ep_groups, phase and + topology-class ARE in the key, so EP4 vs EP8, decode vs prefill, and a + concurrent-groups run are labelled distinct rather than silently overlaid.""" + parts = [ + meta["op"], meta["backend"], meta["mode"], meta["phase"], + str(meta["ep_size"]), str(meta["num_ep_groups"]), str(meta["nodes"]), + meta["topology_class"], meta["comparison_class"], meta["measurement_contract"], + json.dumps(meta["shape"], sort_keys=True), + ] + return hashlib.sha256("|".join(parts).encode()).hexdigest()[:16]
+
+
+def _reduce_max(torch, dist, device, vals: list[float]) -> list[float]:
+    t = torch.tensor(vals, device=device, dtype=torch.float64)
+    dist.all_reduce(t, op=dist.ReduceOp.MAX)
+    return [float(x) for x in t.tolist()]
+
+
+def _reduce_min_int(torch, dist, device, v: int) -> int:
+    t = torch.tensor([v], device=device, dtype=torch.int64)
+    dist.all_reduce(t, op=dist.ReduceOp.MIN)
+    return int(t.item())
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+    """Drive the source-tokens-per-rank sweep for one fully-specified line.
+
+    For each T: build the problem, run one untimed dispatch->stage->combine for
+    the correctness gate, then take three SEPARATE timings — dispatch-only,
+    combine-only (dispatch+stage untimed), and the round trip. Latencies are
+    reduced MAX across ranks (a collective finishes with its slowest rank);
+    correctness is reduced MIN (any rank failing fails the point). Rank 0 writes
+    one JSON doc with a row per T. Returns a process exit code.
+    """
+    ep_size = world_size // max(1, args.num_ep_groups)
+    if ep_size < 1 or args.experts % ep_size != 0:
+        if rank == 0:
+            print(f"ERROR: ep_size ({ep_size}) must be >= 1 and divide experts ({args.experts})")
+        return 2
+    experts_per_rank = args.experts // ep_size
+    elem_bytes = _DTYPE_BYTES.get(args.dispatch_dtype, 2)
+
+    cap = backend.buffer_cap(args)
+    ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+    if rank == 0 and dropped:
+        print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+              f"(hidden={args.hidden}); not silently truncated.")
+    if not ladder:
+        if rank == 0:
+            print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+        return 2
+    # Some backends (MoRI) wedge on a COLD dispatch that jumps straight to a large
+    # token count; they set needs_gradual_ramp so the sweep approaches its max T
+    # through a geometric ramp from 1 (validated on MI355X to avoid the hang while
+    # still reaching 512). A naturally-gradual ladder (decode) is unchanged.
+    if getattr(backend, "needs_gradual_ramp", False):
+        top, ramp, t = ladder[-1], [], 1
+        while t < top:
+            ramp.append(t)
+            t *= 2
+        ramp.append(top)
+        if rank == 0 and ramp != ladder:
+            print(f"NOTE: {backend.name} sweep ramped gradually 1..{top} (cold-jump-safe): {ramp}")
+        ladder = ramp
+
+    rows: list[dict] = []
+    for T in ladder:
+        problem = backend.make_problem(T)
+
+        # ---- correctness gate (untimed): dispatch -> stage experts -> combine ----
+        h = backend.dispatch(problem)
+        backend.stage(problem, h)
+        combined = backend.combine(problem, h)
+        torch.cuda.synchronize()
+        recv_local = backend.recv_tokens(h)
+        exp, n_cmp = backend.expected(problem, h)
+        got = combined[:n_cmp].float()
+        max_abs = (got - exp[:n_cmp].float()).abs().max().item()
+        denom = exp[:n_cmp].float().abs().max().item() + 1e-6
+        max_rel = max_abs / denom
+        local_ok = 1 if (max_rel < 2e-2 and recv_local > 0) else 0
+
+        # ---- three separate timings ----
+        disp = time_us(torch, lambda p=problem: backend.dispatch(p), args.warmup, args.iters)
+
+        def prep(p=problem):
+            hh = backend.dispatch(p)
+            backend.stage(p, hh)
+            return hh
+
+        if backend.combine_needs_redispatch:
+            comb = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+                           args.warmup, args.iters, pre=prep)
+        else:
+            hh = prep()
+            comb = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+                           args.warmup, args.iters)
+
+        def roundtrip(p=problem):
+            hh = backend.dispatch(p)
+            backend.stage(p, hh)
+            return backend.combine(p, hh)
+
+        rt = time_us(torch, roundtrip, args.warmup, args.iters)
+
+        # ---- reduce across ranks ----
+        d50, d99 = percentile(disp, 50), percentile(disp, 99)
+        c50, c99 = percentile(comb, 50), percentile(comb, 99)
+        r50, r99 = percentile(rt, 50), percentile(rt, 99)
+        (d50, d99, c50, c99, r50, r99) = _reduce_max(
+            torch, dist, device, [d50, d99, c50, c99, r50, r99])
+        recv = int(_reduce_max(torch, dist, device, [float(recv_local)])[0])
+        global_ok = _reduce_min_int(torch, dist, device, local_ok)
+        max_rel = _reduce_max(torch, dist, device, [max_rel])[0]
+
+        global_tokens = T * ep_size
+        dispatch_bytes = recv * args.hidden * elem_bytes
+        # Algorithmic bandwidth: realized received payload / dispatch time. Labelled
+        # "alg" (not bus) — an EP bus-bandwidth model is backend-specific and out of
+        # scope; latency is the primary metric, this is a comm-volume sanity figure.
+        disp_algbw = (dispatch_bytes / (d50 * 1e3)) if d50 > 0 else 0.0
+        tps = (global_tokens / (r50 * 1e-6)) if r50 > 0 else None
+
+        rows.append({
+            "tokens_per_rank": T,
+            "global_tokens": global_tokens,
+            "dispatch_us_p50": d50, "dispatch_us_p99": d99,
+            "combine_us_p50": c50, "combine_us_p99": c99,
+            "roundtrip_us_p50": r50, "roundtrip_us_p99": r99,
+            "recv_tokens": recv,
+            "dispatch_bytes": dispatch_bytes,
+            "dispatch_algbw_gbps": disp_algbw,
+            "tokens_per_second": tps,
+            "correct": bool(global_ok),
+            "max_rel_error": max_rel,
+        })
+        if rank == 0:
+            print(f" T={T:<5} disp={d50:8.2f}us combine={c50:8.2f}us rt={r50:8.2f}us "
+                  f"recv={recv:<6} correct={bool(global_ok)}")
+
+    if rank != 0:
+        return 0
+
+    all_ok = bool(rows) and all(r["correct"] for r in rows)
+    shape = {
+        "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+        "experts_per_rank": experts_per_rank, "dispatch_dtype": args.dispatch_dtype,
+        "routing": args.routing, "num_comm_sms": args.num_comm_sms,
+    }
+    meta = {
+        "op": "ep-dispatch-combine", "backend": backend.name, "mode": backend.mode,
+        "phase": args.phase, "world_size": world_size, "ep_size": ep_size,
+        "num_ep_groups": args.num_ep_groups,
+        "nodes": int(os.environ.get("SLURM_NNODES", "1")),
+        "topology_class": args.topology_class, "comparison_class": args.comparison_class,
+        "measurement_contract": backend.measurement_contract, "shape": shape,
+    }
+    headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+    env = None
+    if args.env_json and os.path.exists(args.env_json):
+        with open(args.env_json) as fh:
+            env = json.load(fh)
+    doc = {
+        "schema_version": SCHEMA_VERSION, "family": "moe", "generated_by": "tests/run_ep.py",
+        "generated_at": args.timestamp or _dt.datetime.now().astimezone().isoformat(),
+        "runner": args.runner, "transport": args.transport,
+        "status": "valid" if all_ok else "invalid",
+        "comparison_key": comparison_key(meta),
+        "x_axis": {"primary": "tokens_per_rank",
+                   "global_relation": "global_tokens = tokens_per_rank * ep_size"},
+        "backend_provenance": backend.backend_provenance,
+        **meta,
+        "correctness": {"passed": all_ok,
+                        "max_rel_error": max((r["max_rel_error"] for r in rows), default=None),
+                        "points": len(rows)},
+        "metrics": {
+            "headline_tokens_per_rank": headline["tokens_per_rank"],
+            "dispatch_us_p50": headline["dispatch_us_p50"],
+            "combine_us_p50": headline["combine_us_p50"],
+            "roundtrip_us_p50": headline["roundtrip_us_p50"],
+            "roundtrip_us_p99": headline["roundtrip_us_p99"],
+            "tokens_per_second": headline["tokens_per_second"],
+        },
+        "rows": rows,
+        "environment": env,
+    }
+    os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True)
+    with open(args.out, "w") as fh:
+        json.dump(doc, fh, indent=2)
+        fh.write("\n")
+    print(f"{backend.name} ep-dispatch-combine [{args.phase}]: status={doc['status']} "
+          f"{len(rows)} points, headline T={headline['tokens_per_rank']} "
+          f"disp={headline['dispatch_us_p50']:.1f}us combine={headline['combine_us_p50']:.1f}us "
+          f"-> {args.out}")
+    return 0 if all_ok else 1
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 000000000..0b5257f36
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — MoRI (AMD ROCm), normal mode.
+
+Ports the validated dispatch/combine sequence from the old run_mori.py into the
+ep_harness Backend protocol.
The harness owns the token sweep + separated timing;
+this file owns MoRI's API and the three ionic_rdma-fabric constraints found on
+MI355X (all validated on-node, see CONTAINERS.md):
+  1. MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init, and
+     these NICs cap GPU-memory MRs at ~4 GiB — a 6 GiB heap fails (errno 22),
+     2 GiB registers. So hold the heap at 2 GiB and bound the buffers via
+     max_num_inp_token_per_rank (=> buffer_cap clamps the token sweep).
+  2. combine() resets recv_num, so read it BEFORE combine; combine returns the
+     full max_num_inp_token_per_rank buffer, so compare only the first T rows.
+  3. MoRI's shmem teardown asserts (CheckStatusValid -> SIGABRT) when the op is
+     destroyed after shmem_finalize(); finalize() hard-exits past it.
+
+combine_needs_redispatch = True: combine consumes the dispatch state (recv_num),
+so the harness re-dispatches (untimed) before each timed combine sample.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import types
+
+# MoRI registers the WHOLE symmetric heap as one RDMA MR at shmem init — set this
+# BEFORE `import mori`. 2 GiB registers cleanly on the MI355X ionic_rdma NICs;
+# larger fails. Layered: explicit MORI_SHMEM_HEAP_SIZE > CX_MORI_HEAP_SIZE > 2G.
+os.environ.setdefault("MORI_SHMEM_HEAP_SIZE",
+                      os.environ.get("CX_MORI_HEAP_SIZE", "2G"))
+
+import torch
+import torch.distributed as dist
+
+try:
+    import mori  # type: ignore
+except Exception as exc:  # pragma: no cover - needs the AMD MoRI image
+    print("ERROR: mori import failed — needs the AMD MoRI image "
+          f"(rocm/sgl-dev:...-mori-...). {exc!r}", file=sys.stderr)
+    raise
+
+
+class MoRIBackend:
+    name = "mori"
+    mode = "normal"
+    measurement_contract = "mori-normal-v1"
+    combine_needs_redispatch = True
+    # MoRI wedges on a COLD dispatch that jumps straight to a large token count
+    # (validated on MI355X: a fresh-shmem sweep starting at T=128 hangs, while a
+    # gradual sweep 1,2,4,...,512 runs every point fine — including 256/512). So
+    # the harness ramps this backend's ladder geometrically from 1 up to its max,
+    # turning any phase's sweep into the proven gradual ramp.
+    needs_gradual_ramp = True
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.ep_size = world_size // max(1, args.num_ep_groups)
+        self.experts_per_rank = args.experts // self.ep_size
+        self.block_num = int(os.environ.get("CX_MORI_BLOCK_NUM", "80"))
+        self.dispatch_warps = int(os.environ.get("CX_MORI_DISPATCH_WARPS", "16"))
+        self.combine_warps = int(os.environ.get("CX_MORI_COMBINE_WARPS", "8"))
+        if args.dispatch_dtype != "bf16":
+            if rank == 0:
+                print(f"WARN: mori adapter validated for bf16 (quant_type=none); "
+                      f"'{args.dispatch_dtype}' not wired — using bf16.", file=sys.stderr)
+            args.dispatch_dtype = "bf16"
+
+        # init MoRI shmem on the torch process group (per the reference test).
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)
+        mori.shmem.shmem_torch_process_group_init("default")
+
+        # Size the symmetric buffers to the registerable heap (see buffer_cap), but
+        # never below 512 rows. The op is built ONCE and reused for every T in the
+        # sweep; a T<=cap problem just fills the first T rows of the fixed buffer.
+        self._cap = self.buffer_cap(args)
+        self.config = mori.ops.EpDispatchCombineConfig(
+            data_type=torch.bfloat16, rank=rank, world_size=world_size,
+            hidden_dim=args.hidden, scale_dim=0,
+            scale_type_size=torch.tensor([], dtype=torch.float8_e4m3fnuz).element_size(),
+            max_token_type_size=torch.tensor([], dtype=torch.float32).element_size(),
+            max_num_inp_token_per_rank=max(512, self._cap),
+            num_experts_per_rank=self.experts_per_rank,
+            num_experts_per_token=args.topk,
+            use_external_inp_buf=False, quant_type="none",
+        )
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        self.backend_provenance = {
+            "mori_commit": os.environ.get("MORI_COMMIT", "unknown"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "block_num": self.block_num,
+            "dispatch_warps": self.dispatch_warps, "combine_warps": self.combine_warps,
+        }
+
+    def buffer_cap(self, args):
+        # Tokens/rank ceiling for the 2 GiB registerable heap — a fixed value (args unused).
+        # 512 was validated on-node at hidden=7168; override via CX_MORI_MAX_TOKENS
+        # once a larger heap/ceiling is confirmed. Prefill ladders clamp to this.
+        return int(os.environ.get("CX_MORI_MAX_TOKENS", "512"))
+
+    def make_problem(self, T):
+        a = self.args
+        device, H, topk, E = self.device, a.hidden, a.topk, a.experts
+        x = torch.randn((T, H), dtype=torch.bfloat16, device=device)
+        # MoRI expects INT32 expert indices and a real (T, scale_dim) fp8 scales
+        # tensor even when scale_dim==0 (an (T,0) tensor), not None.
+        indices = torch.stack([
+            torch.randperm(E, device=device)[:topk] for _ in range(T)
+        ]).to(torch.int32)
+        weights = torch.rand((T, topk), dtype=torch.float32, device=device)
+        scales = torch.empty((T, 0), dtype=torch.float8_e4m3fnuz, device=device)
+        return types.SimpleNamespace(T=T, x=x, indices=indices, weights=weights, scales=scales)
+
+    def dispatch(self, p):
+        (dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num) = self.op.dispatch(
+            p.x, p.weights, p.scales, p.indices,
+            block_num=self.block_num, warp_per_block=self.dispatch_warps)
+        # Read total_recv BEFORE any combine — combine() resets recv_num (a later
+        # read yields 0, a false "received nothing").
+        total_recv = int(recv_num[0].item())
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output, dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices, total_recv=total_recv,
+            combine_input=dispatch_output.to(torch.bfloat16))
+
+    def stage(self, p, h):
+        # Zero-copy mode (use_external_inp_buf=False): combine reads MoRI's
+        # registered combine-input buffer, so stage the dispatched rows into it.
+        # In a real MoE the expert FFN writes its outputs here; with no expert
+        # compute we copy the dispatched activations straight through.
+        buf = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1))
+        buf[:h.total_recv, :].copy_(h.combine_input[:h.total_recv, :])
+
+    def combine(self, p, h):
+        combined, _w = self.op.combine(
+            h.combine_input, h.dispatch_weights, h.dispatch_indices,
+            block_num=self.block_num, warp_per_block=self.combine_warps)
+        return combined
+
+    def expected(self, p, h):
+        # MoRI combine sums one copy per destination RANK, so combined[i] ≈
+        # x[i] * (#unique destination ranks among the token's topk experts).
+        pes = p.indices.long() // self.experts_per_rank
+        unique_pes = torch.tensor(
+            [len(set(row.tolist())) for row in pes], device=self.device, dtype=torch.float32
+        ).unsqueeze(1)
+        return p.x.float() * unique_pes, p.T
+
+    def recv_tokens(self, h):
+        return int(h.total_recv)
+
+    def finalize(self, rc):
+        # MoRI's shmem teardown asserts when the op is destroyed after
+        # shmem_finalize() (CheckStatusValid -> SIGABRT on this build). The result
+        # JSON is already written, so sync the ranks and hard-exit past it.
+        try:
+            dist.barrier()
+        except Exception:
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(0 if rc == 0 else 1)
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 000000000..898e4de51
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""CollectiveX — EP dispatch/combine benchmark entrypoint (run under torchrun).
+
+Picks a backend adapter (DeepEP or MoRI), runs the source-tokens-per-rank sweep
+via ep_harness, and writes one provenance-tagged JSON doc. Dispatch and combine
+are timed SEPARATELY (see ep_harness); only T varies along the resulting line.
+
+    torchrun --nproc_per_node=8 tests/run_ep.py --backend mori \\
+        --phase decode --runner mi355x-amds --topology-class mi355x-xgmi \\
+        --transport xgmi --env-json results/env.json --out results/mi355x_mori_decode.json
+
+    torchrun --nproc_per_node=8 tests/run_ep.py --backend deepep \\
+        --phase prefill --runner b200-dgxc --topology-class b200-nvlink-island \\
+        --transport nvlink --env-json results/env.json --out results/b200_deepep_prefill.json
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import ep_harness  # noqa: E402 (stdlib-only; safe before torch)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
+    ap.add_argument("--backend", required=True, choices=["deepep", "mori"])
+    ep_harness.add_common_args(ap)
+    args = ap.parse_args()
+
+    try:
+        import torch
+        import torch.distributed as dist
+    except Exception as exc:  # pragma: no cover
+        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
+        return 3
+
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+    os.environ.setdefault("MASTER_ADDR", "localhost")
+    os.environ.setdefault("MASTER_PORT", "12355")
+
+    # MoRI inits its shmem on a process group it registers as "default" and wants
+    # the gloo+nccl combo with an explicit device_id (per its reference test);
+    # DeepEP uses a plain nccl group. Both pass the rank/world_size parsed above.
+    if not dist.is_initialized():
+        if args.backend == "mori":
+            dist.init_process_group(backend="cpu:gloo,cuda:nccl", rank=rank,
+                                    world_size=world_size, device_id=device)
+        else:
+            dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    if args.backend == "mori":
+        from ep_mori import MoRIBackend as Backend
+    else:
+        from ep_deepep import DeepEPBackend as Backend
+
+    backend = Backend(args, rank, world_size, local_rank, device)
+    if rank == 0:
+        print(f"[run_ep] backend={args.backend} phase={args.phase} world={world_size} "
+              f"ep_size={world_size // max(1, args.num_ep_groups)} hidden={args.hidden} "
+              f"topk={args.topk} experts={args.experts} dtype={args.dispatch_dtype}")
+
+    rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
+    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
+    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
+    return backend.finalize(rc)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
From e2717a341cf1514d4be6393db16121889db7bf19 Mon Sep 17 00:00:00 2001 From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:57:51 +0800 Subject: [PATCH 17/17] CollectiveX: make MI355X launcher CI-robust (writable lock dir + node pin) The MI355X MoRI jobs failed in CI when they landed on cold nodes: the squash lock was created next to the squash in /var/lib/squash, which is root/admin-owned on some nodes (flock -> "Bad file descriptor"), and nodes without the node-local squash need a slow cold import that also hits lock/cache permissions. - launch_mi355x-amds.sh: put the import lock in a guaranteed-writable per-node dir (CX_LOCK_DIR, default /tmp), not beside the squash; add CX_NODELIST to pin the allocation to nodes that already hold the squash. - workflow: pin MI355X jobs (push + dispatch) to the warm-squash nodes (mia1-p01-g10,g15). Widen once the squash is staged cluster-wide. The EP sweep itself is already hardware-validated (MoRI decode + prefill); this only fixes squash setup so the jobs reach it in CI. --- .../workflows/collectivex-experimental.yml | 6 ++++ .../launchers/launch_mi355x-amds.sh | 29 +++++++++++++++---- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/.github/workflows/collectivex-experimental.yml b/.github/workflows/collectivex-experimental.yml index e2a8e2ff2..6965424ab 100644 --- a/.github/workflows/collectivex-experimental.yml +++ b/.github/workflows/collectivex-experimental.yml @@ -92,6 +92,10 @@ jobs: env: CX_BENCH: mori CX_PHASE: ${{ matrix.phase }} + # Pin to the MI355X nodes that hold the node-local squash and have a writable + # /var/lib/squash; other nodes need a slow cold import that can fail on lock/ + # cache permissions. Widen once the squash is staged cluster-wide.
+ CX_NODELIST: mia1-p01-g10,mia1-p01-g15 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 with: { clean: true } @@ -133,6 +137,8 @@ jobs: CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }} # GB200/watchtower needs a compute-visible workspace; harmless elsewhere. CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + # MI355X: pin to the warm-squash, writable nodes (see the push job). + CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0 with: { clean: true } diff --git a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh index 8092b84b4..3a7ceccb3 100644 --- a/experimental/CollectiveX/launchers/launch_mi355x-amds.sh +++ b/experimental/CollectiveX/launchers/launch_mi355x-amds.sh @@ -34,6 +34,10 @@ TIME_MIN="${CX_TIME:-60}" # generous: a cold enroot import of the large ROCm i IMAGE="${CX_IMAGE:-$(cx_default_image mi355x)}" SQUASH_DIR="${CX_SQUASH_DIR:-/var/lib/squash}" # node-local on MI355X EXCLUDE_NODES="${CX_EXCLUDE_NODES:-mia1-p01-g09,mia1-p01-g11}" +# Optional node pin. The node-local squash is only staged on some nodes, and on +# others /var/lib/squash isn't writable (cold-import fails). Pin CI to nodes that +# already hold the squash via CX_NODELIST (overrides the exclude list). +NODELIST="${CX_NODELIST:-}" MOUNT_DIR=/ix TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" @@ -52,15 +56,27 @@ cx_log "runner=$RUNNER_NAME partition=$PARTITION ngpus=$NGPUS bench=$CX_BENCH im # AMD workspace is compute-visible (the serving launcher bind-mounts it directly), # so no staging; the node-local squash is handled via srun below. 
MOUNT_SRC="$(cx_stage_repo "$REPO_ROOT" "${CX_STAGE_DIR:-}")" -SQUASH_FILE="$SQUASH_DIR/$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" -cx_log "squash(node-local)=$SQUASH_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" +SQUASH_KEY="$(printf '%s' "$IMAGE" | sed 's#[/:@#]#_#g')" +SQUASH_FILE="$SQUASH_DIR/${SQUASH_KEY}.sqsh" +# Lock in a guaranteed-writable per-node dir, NOT next to the squash: on some +# nodes /var/lib/squash is root/admin-owned, so even a world-readable squash +# can't get a sibling .lock created (flock -> "Bad file descriptor"). CX_LOCK_DIR +# overrides. The lock only serializes concurrent imports on the same node. +LOCK_FILE="${CX_LOCK_DIR:-/tmp}/${SQUASH_KEY}.sqsh.lock" +cx_log "squash(node-local)=$SQUASH_FILE lock=$LOCK_FILE mount=$MOUNT_SRC -> $MOUNT_DIR" if [ "${CX_DRYRUN:-0}" = "1" ]; then cx_log "CX_DRYRUN=1 — not allocating"; exit 0; fi command -v salloc >/dev/null || cx_die "salloc not found — run on the Slurm login node" -salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ - --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +# Pin to specific nodes (CX_NODELIST) when set, else exclude the known-bad ones. +if [ -n "$NODELIST" ]; then + cx_log "node pin: --nodelist=$NODELIST" + salloc --partition="$PARTITION" --nodelist="$NODELIST" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +else + salloc --partition="$PARTITION" --exclude="$EXCLUDE_NODES" --gres=gpu:"$NGPUS" \ + --exclusive --cpus-per-task=128 --time="$TIME_MIN" --no-shell --job-name="$RUNNER_NAME" +fi JOB_ID="$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)" [ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID" cx_log "JOB_ID=$JOB_ID" @@ -71,7 +87,8 @@ trap 'scancel "$JOB_ID" 2>/dev/null || true' EXIT # shellcheck disable=SC2016 # $(...) 
must expand on the remote node, not here srun --jobid="$JOB_ID" bash -c 'docker stop $(docker ps -aq) 2>/dev/null || true' || true srun --jobid="$JOB_ID" bash -c " - exec 9>\"$LOCK_FILE\" + mkdir -p \"$(dirname "$LOCK_FILE")\" 2>/dev/null || true + exec 9>\"$LOCK_FILE\" || { echo 'cannot open lock $LOCK_FILE' >&2; exit 1; } flock -w 600 9 || { echo 'lock timeout for $SQUASH_FILE' >&2; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" >/dev/null 2>&1; then echo 'squash present: $SQUASH_FILE'