Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12242,6 +12242,32 @@ qwen3.5-fp4-b200-trt:
- { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] }
- { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }

# Benchmark matrix entry for the Qwen3.5 NVFP4 model on the b200 runner with the
# trt framework, single node (multinode: false). Every search-space row sets
# spec-decoding: "mtp", i.e. this is the MTP speculative-decoding sweep.
# Each row names one parallelism layout (tp / ep, optionally dp-attn) and the
# list of concurrencies (conc-list) to run for it.
# NOTE(review): presumably the rows map to TP-only (ep: 1), TEP (ep == tp) and
# DEP (dp-attn: true) layouts, matching the changelog wording — confirm.
# NOTE(review): the image uses a "registry#path" spelling while the changelog
# entry spells the same image with "/"; presumably the "#" form is what the
# container launcher expects — confirm before normalizing.
qwen3.5-fp4-b200-trt-mtp:
image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc18
model: nvidia/Qwen3.5-397B-A17B-NVFP4
model-prefix: qwen3.5
runner: b200
precision: fp4
framework: trt
multinode: false
scenarios:
fixed-seq-len:
# Scenario 1: 1024 input tokens / 1024 output tokens.
- isl: 1024
osl: 1024
search-space:
- { tp: 2, ep: 1, spec-decoding: "mtp", conc-list: [8] }
- { tp: 2, ep: 2, spec-decoding: "mtp", conc-list: [4] }
- { tp: 8, ep: 8, spec-decoding: "mtp", conc-list: [4] }
- { tp: 8, ep: 8, dp-attn: true, spec-decoding: "mtp", conc-list: [64, 128, 256, 512, 1024] }
# Scenario 2: 8192 input tokens / 1024 output tokens.
- isl: 8192
osl: 1024
search-space:
- { tp: 2, ep: 1, spec-decoding: "mtp", conc-list: [4] }
- { tp: 2, ep: 2, spec-decoding: "mtp", conc-list: [8, 16] }
- { tp: 4, ep: 4, spec-decoding: "mtp", conc-list: [4] }
- { tp: 8, ep: 8, spec-decoding: "mtp", conc-list: [4] }
- { tp: 8, ep: 8, dp-attn: true, spec-decoding: "mtp", conc-list: [128, 256, 1024] }

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
Expand Down
170 changes: 170 additions & 0 deletions benchmarks/single_node/fixed_seq_len/qwen3.5_fp4_b200_trt_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/usr/bin/env bash
#
# Single-node fixed-sequence-length benchmark for Qwen3.5 (FP4) on B200 with
# TensorRT-LLM and MTP speculative decoding.
#
# Required environment (passed to check_env_vars below):
#   MODEL, TP, CONC, ISL, OSL, MAX_MODEL_LEN, RANDOM_RANGE_RATIO,
#   RESULT_FILENAME, DP_ATTENTION, EP_SIZE
# Optional environment:
#   PORT          port used for the server and the benchmark client
#                 (defaults to 8888 when unset or empty)
#   SLURM_JOB_ID  when set, the job id and node name are logged

# Shared helpers (check_env_vars and the other benchmark functions used by this
# script) come from the common library two directories up.
source "$(dirname "$0")/../../benchmark_lib.sh"

# NOTE(review): presumably aborts the run when any listed variable is missing;
# confirm against benchmark_lib.sh.
check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  DP_ATTENTION \
  EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

# MTP (multi-token prediction) speculative decode requires the FlashInfer GDN
# prefill path to be disabled.
export TLLM_USE_FLASHINFER_GDN_PREFILL="0"

# A MODEL that is not an absolute path is treated as a hub id and fetched first.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

nvidia-smi

# PORT is used by the server launch, the readiness probe, the benchmark client
# and the eval, but it is not part of the validated list above. Fall back to a
# fixed port so an unset PORT cannot turn into an empty --port argument; a value
# supplied by the caller is kept as is.
PORT="${PORT:-8888}"

SERVER_LOG=/workspace/server.log
EXTRA_CONFIG_FILE="qwen3.5-fp4-trt-mtp.yml"
# Number of MTP draft layers written into speculative_config.
NUM_NEXTN_PREDICT_LAYERS=3

#######################################
# Pick the per-layout serving knobs.
#
# Attention-DP layouts run the CUTEDSL MoE backend; every other layout runs the
# TRTLLM backend. The KV-cache memory fraction is tuned per layout (there is no
# single derivable rule).
#
# Globals read:
#   DP_ATTENTION, CONC, ISL, TP, EP_SIZE
# Globals written:
#   MAX_BATCH_SIZE, MOE_BACKEND, KV_MEMORY_FRACTION, MODE_CONFIG,
#   BATCH_WAIT_MAX_TOKENS_RATIO (non-attention-DP layouts only)
#######################################
select_mode_config() {
  if [[ "$DP_ATTENTION" == "true" ]]; then
    # Attention-DP serves CONC/8 requests per batch. Round up instead of down:
    # plain integer division gives 0 for CONC < 8 (not a usable batch size) and
    # under-sizes the batch when CONC is not a multiple of 8. For multiples of
    # 8 the result is unchanged.
    # NOTE(review): the divisor 8 presumably mirrors the tp: 8 attention-DP
    # layouts in the sweep — confirm before reusing with another TP.
    MAX_BATCH_SIZE=$(( (CONC + 7) / 8 ))
    MOE_BACKEND="CUTEDSL"
    # 0.9 up to conc 512, backed off to 0.8 at conc 1024.
    if (( CONC >= 1024 )); then
      KV_MEMORY_FRACTION=0.8
    else
      KV_MEMORY_FRACTION=0.9
    fi
    # YAML fragment expanded at column 0 of the extra-options file; the
    # attention_dp_config children must stay nested under their parent key.
    MODE_CONFIG="enable_attention_dp: true
attention_dp_config:
  enable_balance: true
  batching_wait_iters: 10
  timeout_iters: 500"
  else
    MAX_BATCH_SIZE="$CONC"
    MOE_BACKEND="TRTLLM"
    # Fraction tuned per (ISL, TP, EP) layout; unknown layouts fall back to 0.8.
    case "${ISL}_tp${TP}_ep${EP_SIZE}" in
      1024_tp2_ep1) KV_MEMORY_FRACTION=0.6 ;;
      1024_tp2_ep2) KV_MEMORY_FRACTION=0.75 ;;
      1024_tp8_ep8) KV_MEMORY_FRACTION=0.8 ;;
      8192_tp2_ep1) KV_MEMORY_FRACTION=0.7 ;;
      8192_tp2_ep2) KV_MEMORY_FRACTION=0.6 ;;
      8192_tp4_ep4) KV_MEMORY_FRACTION=0.75 ;;
      8192_tp8_ep8) KV_MEMORY_FRACTION=0.8 ;;
      *) KV_MEMORY_FRACTION=0.8 ;;
    esac
    # Short-context runs hold less in flight, so they wait on a tighter token
    # ratio before flushing a batch.
    case "$ISL" in
      1024) BATCH_WAIT_MAX_TOKENS_RATIO=0.0625 ;;
      *) BATCH_WAIT_MAX_TOKENS_RATIO=0.45 ;;
    esac
    MODE_CONFIG="batch_wait_timeout_iters: 50
batch_wait_max_tokens_ratio: $BATCH_WAIT_MAX_TOKENS_RATIO"
  fi
}

select_mode_config

# Write the extra LLM API options file that is later handed to trtllm-serve via
# --extra_llm_api_options. The heredoc delimiter is unquoted, so
# $KV_MEMORY_FRACTION, $MOE_BACKEND, $NUM_NEXTN_PREDICT_LAYERS and $MODE_CONFIG
# are expanded by the shell at the moment the file is written.
# NOTE(review): cuda_graph_config.batch_sizes is a fixed list up to 128, while
# the server is started with --max_batch_size "$MAX_BATCH_SIZE" (often far
# smaller). Confirm that the runtime drops or clamps capture sizes above that
# cap before changing these tuned values.
# NOTE(review): as captured here the YAML keys carry no indentation and a pasted
# review comment sits between the batch_sizes list and moe_config; both look
# like extraction artifacts — verify against the checked-in script.
cat > "$EXTRA_CONFIG_FILE" << EOF
backend: pytorch
print_iter_log: true
enable_layerwise_nvtx_marker: false
disable_overlap_scheduler: false
enable_iter_perf_stats: true
enable_chunked_prefill: false
stream_interval: 20
num_postprocess_workers: 4
scheduler_config:
capacity_scheduler_policy: MAX_UTILIZATION
context_chunking_policy: FIRST_COME_FIRST_SERVED
kv_cache_config:
free_gpu_memory_fraction: $KV_MEMORY_FRACTION
enable_block_reuse: false
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA graph sizes exceed max batch

Medium Severity

The extra LLM config hardcodes cuda_graph_config.batch_sizes through 128, while trtllm-serve gets --max_batch_size from CONC or CONC/8 (often 4–16 in this recipe). Peer Qwen and TRT-MTP scripts tie CUDA graph capture to MAX_BATCH_SIZE via max_batch_size, so graph warmup can overshoot the runtime batch cap and risk validation failures or excess memory use on low-concurrency jobs.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 7649ae1. Configure here.

moe_config:
backend: $MOE_BACKEND
use_low_precision_moe_combine: true
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: $NUM_NEXTN_PREDICT_LAYERS
$MODE_CONFIG
EOF

# Dump the rendered extra-options file into the job log for later inspection.
printf '%s\n' "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

# Lower bounds applied to the served sequence length and the per-step token
# budget.
seq_len_floor=8192
token_floor=8192

# Serve at least seq_len_floor tokens of context, whatever the caller asked for.
MAX_MODEL_LEN=$(( MAX_MODEL_LEN < seq_len_floor ? seq_len_floor : MAX_MODEL_LEN ))

# The two benchmarked shapes get fixed token budgets; any other shape gets
# ISL + OSL + 256, raised to token_floor when smaller.
seq_shape="${ISL}_${OSL}"
if [[ "$seq_shape" == "8192_1024" ]]; then
  MAX_NUM_TOKENS=32768
elif [[ "$seq_shape" == "1024_1024" ]]; then
  MAX_NUM_TOKENS=16384
else
  MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
  MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS < token_floor ? token_floor : MAX_NUM_TOKENS ))
fi

# Eval-only runs replace both limits with the eval context length.
# NOTE(review): EVAL_MAX_MODEL_LEN is presumably set by setup_eval_context —
# confirm in benchmark_lib.sh.
if [[ "${EVAL_ONLY}" == "true" ]]; then
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
  MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
fi

# Begin sampling GPU power, temperature and clocks (once per second) for the
# duration of the run.
start_gpu_monitor

# Argument lists are assembled before tracing is switched on so the trace shows
# only the commands that are actually launched.
serve_args=(
  "$MODEL" --port="$PORT"
  --trust_remote_code
  --backend=pytorch
  --max_batch_size "$MAX_BATCH_SIZE"
  --max_seq_len="$MAX_MODEL_LEN"
  --max_num_tokens="$MAX_NUM_TOKENS"
  --tp_size="$TP" --ep_size="$EP_SIZE"
  --extra_llm_api_options="$EXTRA_CONFIG_FILE"
)

# The client sends ten prompts per unit of concurrency.
bench_args=(
  --model "$MODEL"
  --port "$PORT"
  --backend openai
  --input-len "$ISL"
  --output-len "$OSL"
  --random-range-ratio "$RANDOM_RANGE_RATIO"
  --num-prompts "$(( CONC * 10 ))"
  --max-concurrency "$CONC"
  --result-filename "$RESULT_FILENAME"
  --result-dir /workspace/
  --use-chat-template
)

set -x

# Start the server in the background under a single-rank mpirun, capturing both
# output streams in the server log.
mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "${serve_args[@]}" \
  > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!

# Block until the server answers (or the helper gives up).
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Throughput run.
run_benchmark_serving "${bench_args[@]}"

# Accuracy evaluation follows the throughput run only when RUN_EVAL is true.
if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# End GPU sampling and switch tracing back off.
stop_gpu_monitor
set +x
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4123,6 +4123,13 @@
- "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1897

- config-keys:
- qwen3.5-fp4-b200-trt-mtp
description:
- "Add Qwen3.5-397B-A17B-NVFP4 B200 single-node TensorRT-LLM benchmark with MTP speculative decode (1k/1k and 8k/1k) across a TP/TEP/DEP parallelism sweep"
- "Image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc18"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1894

- config-keys:
- dsr1-fp4-b200-sglang
description:
Expand Down