Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
342 changes: 342 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2390,6 +2390,348 @@ glm5-fp4-b300-sglang-mtp:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }

# Benchmark entry for nvidia/GLM-5-NVFP4 on the gb200 runner using the
# dynamo-trt framework. Both multinode and disagg are enabled; each
# search-space entry under scenarios sizes its prefill and decode workers
# separately and points at a recipe file via CONFIG_FILE.
glm5-fp4-gb200-dynamo-trt:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13
model: nvidia/GLM-5-NVFP4
model-prefix: glm5
runner: gb200
precision: fp4
framework: dynamo-trt
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# Fixed-length scenario isl 1024 / osl 1024; every recipe below lives under
# the ISL1K_OSL1K/STP directory.
# In each entry the recipe filename mirrors the worker settings:
#   ctx<N>  -> prefill num-worker N
#   gen<N>  -> decode num-worker N
#   dep<K>  -> tp = ep = K with dp-attn: true
#   tep<K>  -> tp = ep = K with dp-attn: false
# Keep the filename and the tp/ep/dp-attn/num-worker fields in sync when
# editing an entry.
- isl: 1024
osl: 1024
search-space:
# STP configurations
- conc-list: [ 4 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 5 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 20 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch4_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch4_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 84 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
- conc-list: [ 168 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
# NOTE(review): this conc 25 entry sits between 168 and 284, i.e. out of
# ascending concurrency order. It uses the 5-worker tep4 decode layout (same
# as the conc 5 entry) rather than the 4-worker tep8 layout of its neighbours.
- conc-list: [ 25 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 284 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen4tep8_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 4
tp: 8
ep: 8
dp-attn: false
# From here on the decode side switches to a single worker with dp-attn: true
# (dep<K> recipes) instead of several tep workers.
- conc-list: [ 666 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 1229 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep32_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 2151 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep32_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# Same concurrency (2151) as the previous entry, but with decode at tp/ep 16
# (dep16, batch128 recipe) instead of tp/ep 32 (dep32, batch64 recipe).
- conc-list: [ 2151 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch128_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
- conc-list: [ 4301 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx2dep4_gen1dep16_batch256_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# Same concurrency (4301) as the previous entry, but with a single prefill
# worker and decode at tp/ep 8 (ctx1 / dep8, batch512 recipe).
- conc-list: [ 4301 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL1K_OSL1K/STP/ctx1dep4_gen1dep8_batch512_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
# Fixed-length scenario isl 8192 / osl 1024; every recipe below lives under
# the ISL8K_OSL1K/STP directory.
# In each entry the recipe filename mirrors the worker settings:
#   ctx<N>  -> prefill num-worker N
#   gen<N>  -> decode num-worker N
#   dep<K>  -> tp = ep = K with dp-attn: true
#   tep<K>  -> tp = ep = K with dp-attn: false
# Keep the filename and the tp/ep/dp-attn/num-worker fields in sync when
# editing an entry.
- isl: 8192
osl: 1024
search-space:
# STP configurations
# The first five entries (conc 5 to 105) share one layout, 1 prefill worker
# plus 5 tep4 decode workers, and differ only in the recipe's batch<N> value.
- conc-list: [ 5 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch1_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 10 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch2_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 25 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch4_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 50 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch8_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
- conc-list: [ 105 ]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx1dep4_gen5tep4_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 5
tp: 4
ep: 4
dp-attn: false
# From here on the decode side is a single worker with dp-attn: true (dep16 or
# dep32) and the prefill worker count grows from 2 up to 9.
- conc-list: [ 308 ]
prefill:
num-worker: 2
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx2dep4_gen1dep32_batch8_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 615 ]
prefill:
num-worker: 4
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx4dep4_gen1dep32_batch16_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 1127 ]
prefill:
num-worker: 5
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx5dep4_gen1dep16_batch64_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
- conc-list: [ 1229 ]
prefill:
num-worker: 6
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx6dep4_gen1dep32_batch32_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
- conc-list: [ 2151 ]
prefill:
num-worker: 9
tp: 4
ep: 4
dp-attn: true
additional-settings:
# https://github.com/NVIDIA/srt-slurm/blob/sa-submission-q2-2026/recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb0_mtp0.yaml
- "CONFIG_FILE=recipes/GLM5/disagg/trtllm_dynamo/gb200_nvfp4/ISL8K_OSL1K/STP/ctx9dep4_gen1dep16_batch128_eplb0_mtp0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

qwen3.5-fp8-b200-sglang-mtp:
image: lmsysorg/sglang:v0.5.12-cu130
model: Qwen/Qwen3.5-397B-A17B-FP8
Expand Down
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3863,3 +3863,13 @@
- "Align MiniMax-M3 B200 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 B200 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048."
- "Add TP4+EP4 coverage for MiniMax-M3 B200: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1779

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicate changelog triggers wrong sweep

Medium Severity

This commit appends a second perf-changelog.yaml block for minimaxm3-fp8-b200-vllm that matches the entry already at lines 3853–3858. process_changelog.py builds the PR sweep from added diff lines only, so that duplicate is treated as a new changelog contribution and schedules minimaxm3-fp8-b200-vllm benchmarks alongside glm5-fp4-gb200-dynamo-trt.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 0137194. Configure here.

# Changelog entry announcing the glm5-fp4-gb200-dynamo-trt config key.
- config-keys:
- glm5-fp4-gb200-dynamo-trt
description:
- "Add GLM-5 NVFP4 GB200 disaggregated TRT-LLM (STP, non-MTP) benchmarks via Dynamo"
- "New multinode model: glm5 with dynamo-trt framework on GB200"
- "Container: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.3.0-dev.1-cuda13"
- "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026 (gb200_nvfp4 STP recipes); prefill tp=4/ep=4 (dep4)"
- "launch_gb200-nv.sh: added glm5-fp4 case to dynamo-trt branch with SRT_SLURM_MODEL_PREFIX=nvidia/GLM-5-NVFP4"
# TODO(review): "XXX" is a placeholder, not a PR number; replace it with this
# pull request's actual number before merging so the link resolves.
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
6 changes: 5 additions & 1 deletion runners/launch_gb200-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@ elif [[ $FRAMEWORK == "dynamo-trt" ]]; then
export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
export SERVED_MODEL_NAME="kimi-k2.5-nvfp4"
export SRT_SLURM_MODEL_PREFIX="nvidia/Kimi-K2.5-NVFP4"
elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then
export MODEL_PATH="/mnt/lustre01/models/GLM-5-NVFP4"
export SERVED_MODEL_NAME="glm-5-nvfp4"
export SRT_SLURM_MODEL_PREFIX="nvidia/GLM-5-NVFP4"
else
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, or kimik2.5"
echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss, dsr1, kimik2.5, or glm5"

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GLM5 missing srt-slurm checkout

High Severity

The new glm5-fp4-gb200-dynamo-trt sweep uses CONFIG_FILE paths under recipes/GLM5/... on NVIDIA srt-slurm sa-submission-q2-2026, but launch_gb200-nv.sh only selects that repo for kimik2.5 dynamo-trt. glm5 falls through to the cquil11/srt-slurm-nv clone, so recipe files are missing and the launcher exits when validating CONFIG_FILE.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit e99f46e. Configure here.

exit 1
fi
elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
Expand Down