From c3221b2da2fc314b79c2429679e82b32d8601c75 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:26:18 +0900 Subject: [PATCH 01/13] feat: add minimaxm3-fp4-mi355x-atom-mtp benchmark script and CI config Add single-node ATOM MTP (EAGLE3) benchmark for MiniMax-M3-MXFP4 on MI355X with TP/EP/DP-attention support and matching amd-master.yaml entry. Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 22 +++++ .../minimaxm3_fp4_mi355x_atom_mtp.sh | 82 +++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 92f8a5609..d2ccbeb2c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2876,6 +2876,28 @@ minimaxm3-fp4-mi355x-atom: - { tp: 4, conc-start: 1, conc-end: 256 } - { tp: 8, conc-start: 1, conc-end: 2 } +minimaxm3-fp4-mi355x-atom-mtp: + image: rocm/atom-dev:M3 + model: amd/MiniMax-M3-MXFP4 + model-prefix: minimaxm3 + runner: mi355x + precision: fp4 + framework: atom + multinode: false + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 128, conc-end: 256 } + - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 8, conc-start: 1, conc-end: 2 } + # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh new file mode 100644 index 000000000..53238959a --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE \ + DP_ATTENTION + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +SERVER_LOG=/workspace/server.log + +PARALLEL_ARGS=(-tp "$TP") #TP +if [ "$DP_ATTENTION" = "true" ]; then + if [ "$EP_SIZE" -gt 1 ]; then #DP+EP + PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) + else #DP+TP + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi +fi + +SPEC_ARGS=(--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3 ) + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +MEM_FRAC_STATIC=0.8 + +set -x +# (srok), not yet +# --kv_cache_dtype fp8 \ +python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + -tp $TP \ + --max-model-len $MAX_MODEL_LEN $EP \ + "${PARALLEL_ARGS[@]}" \ + "${SPEC_ARGS[@]}" \ + --block-size 128 \ + --gpu-memory-utilization $MEM_FRAC_STATIC \ + --trust-remote-code \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +export PYTHONDONTWRITEBYTECODE=1 +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code \ + --use-chat-template + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x From d6426c4f567306edb80cf5f3eaa3aa8b988af2db Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:29:08 +0900 Subject: [PATCH 02/13] chore: add perf-changelog entry for minimaxm3-fp4-mi355x-atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 46bee4d44..c05963d36 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3964,6 +3964,13 @@ - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 +- config-keys: + - minimaxm3-fp4-mi355x-atom-mtp + description: + - "Add minimaxm3-fp4-mi355x-atom-mtp CI recipe: single-node ATOM with EAGLE3 speculative decoding (3 draft tokens) for MiniMax-M3-MXFP4 on MI355X" + - "Supports TP/EP/DP-attention combinations; search space mirrors minimaxm3-fp4-mi355x-atom (ISL=1024,8192 OSL=1024 TP2/TP4/TP8)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1866 + - config-keys: - kimik2.5-fp4-gb200-dynamo-trt description: From a1b42c4fe02fc1fe8145af79a86e2ab6ce992a86 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:29:52 +0900 Subject: [PATCH 03/13] fix: update minimaxm3-fp4-mi355x-atom-mtp image to MiniMax-M3-20260619 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d2ccbeb2c..bd13cd743 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2877,7 +2877,7 @@ minimaxm3-fp4-mi355x-atom: - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp4-mi355x-atom-mtp: - image: rocm/atom-dev:M3 + image: rocm/atom-dev:MiniMax-M3-20260619 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x From ae97ec64e34a4d410f5bbab7365578ff143fccc3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:36:44 +0900 Subject: [PATCH 04/13] fix: add spec-decoding: mtp to minimaxm3-fp4-mi355x-atom-mtp search space; remove stray \$EP arg Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 10 +++++----- .../fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index bd13cd743..96e29427a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2889,14 +2889,14 @@ minimaxm3-fp4-mi355x-atom-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256 } - - { tp: 4, conc-start: 1, conc-end: 256 } - - { tp: 8, conc-start: 1, conc-end: 2 } + - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } + - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index 53238959a..7276bcb4f 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -43,7 +43,7 @@ python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ -tp $TP \ - --max-model-len $MAX_MODEL_LEN $EP \ + --max-model-len $MAX_MODEL_LEN \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ --block-size 128 \ From 4fcaa1ef3f9b93baed13f486b0d4e2b5930041c3 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 13:42:47 +0900 Subject: [PATCH 05/13] fix: remove duplicate -tp and --max-model-len from minimaxm3_fp4_mi355x_atom_mtp.sh server args Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index 7276bcb4f..120a801e2 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -42,8 +42,6 @@ set -x python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ - -tp $TP \ - --max-model-len $MAX_MODEL_LEN \ "${PARALLEL_ARGS[@]}" \ "${SPEC_ARGS[@]}" \ --block-size 128 \ From d806d96d19b27df6b976c1b9385bc9a6ef6e1664 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Sat, 20 Jun 2026 15:30:12 +0900 Subject: [PATCH 06/13] fix: trim minimaxm3-fp4-mi355x-atom-mtp ISL=8192 search space; remove TP8, narrow TP2 to conc=128 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 96e29427a..5c047cc5a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2894,9 +2894,8 @@ minimaxm3-fp4-mi355x-atom-mtp: - isl: 8192 osl: 1024 search-space: - - { tp: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 2, conc-start: 128, conc-end: 128, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 2, spec-decoding: mtp } # MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and # MI355X serving shape, but retain the default BF16 KV cache because this From 86308ff5423c3c3e3255794cb1e518fd612125a7 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 10:46:49 +0900 Subject: [PATCH 07/13] fix: bump minimaxm3-fp4-mi355x-atom-mtp image to MiniMax-M3-20260622 Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 30d7da3c4..706ca31d2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2613,7 +2613,7 @@ minimaxm3-fp4-mi355x-atom: - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp4-mi355x-atom-mtp: - image: rocm/atom-dev:MiniMax-M3-20260619 + image: rocm/atom-dev:MiniMax-M3-20260622 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x From 4458bbe355121e1a6bb89956557d84bbc1524db2 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 12:02:40 +0900 Subject: [PATCH 08/13] fix: disable prefix caching for minimaxm3-fp4-mi355x-atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index 120a801e2..3b2a28907 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -47,6 +47,7 @@ python3 -m atom.entrypoints.openai_server \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ --trust-remote-code \ + --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & SERVER_PID=$! From bb638f914e4282ab64c15f7ee1a851d6871a5171 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 12:27:42 +0900 Subject: [PATCH 09/13] fix: update minimaxm3-fp4-mi355x-atom scripts and image bump - minimaxm3_fp4_mi355x_atom.sh: refactor parallel args, set MAX_MODEL_LEN/ MAX_NUM_BATCHED_TOKENS/MAX_NUM_SEQS, disable prefix caching, conditional --use-chat-template, enable AITER_QUICK_REDUCE_QUANTIZATION=INT4 - minimaxm3_fp4_mi355x_atom_mtp.sh: same server arg additions and conditional --use-chat-template - amd-master.yaml: bump minimaxm3-fp4-mi355x-atom image to MiniMax-M3-20260622, drop tp8 search-space entry Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 3 +- .../minimaxm3_fp4_mi355x_atom.sh | 48 +++++++++---------- .../minimaxm3_fp4_mi355x_atom_mtp.sh | 10 +++- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3a61aed78..3a41a9e21 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2591,7 +2591,7 @@ minimaxm3-fp8-mi355x-vllm-mtp: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:M3 + image: rocm/atom-dev:MiniMax-M3-20260622 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2610,7 +2610,6 @@ minimaxm3-fp4-mi355x-atom: search-space: - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } - - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp4-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260622 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh index 80f230f28..9bf74b6e6 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh @@ -11,8 +11,7 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ EP_SIZE \ - DP_ATTENTION \ - MAX_MODEL_LEN + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -22,41 +21,40 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO SERVER_LOG=/workspace/server.log -export OMP_NUM_THREADS=1 +PARALLEL_ARGS=(-tp "$TP") #TP +if [ "$DP_ATTENTION" = "true" ]; then + if [ "$EP_SIZE" -gt 1 ]; then #DP+EP + PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) + else #DP+TP + PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) + fi +fi -# Use the matrix-supplied MAX_MODEL_LEN (isl + osl + 256). Eval-only jobs need a -# larger window for the eval prompts, so override it from the eval context. -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" -fi - -if [ "$EP_SIZE" -gt 1 ]; then - EP=" --enable-expert-parallel" -else - EP=" " -fi +SPEC_ARGS=() # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor MEM_FRAC_STATIC=0.8 set -x - -# Flags follow the ATOM MiniMax-M3 MXFP4 recipe (FP4 on 4xMI355 section): -# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md -# --block-size 128 is mandatory for MiniMax MSA. KV cache is left at the default -# dtype: amd/MiniMax-M3-MXFP4 ships no calibrated FP8 KV scales, so -# --kv_cache_dtype fp8 trips an assertion (k_scale is None) in the MSA -# fused_qknorm kernel during init. +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export MAX_MODEL_LEN=32768 +export MAX_NUM_BATCHED_TOKENS=32768 +export MAX_NUM_SEQS=128 +# (srok), not yet +# --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ --model $MODEL \ --server-port $PORT \ - -tp $TP \ - --max-model-len $MAX_MODEL_LEN $EP \ + "${PARALLEL_ARGS[@]}" \ + "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ + --max-model-len $MAX_MODEL_LEN \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --max-num-seqs $MAX_NUM_SEQS \ --trust-remote-code \ + --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -76,7 +74,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ - --trust-remote-code + --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index 3b2a28907..0ee78f504 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -37,6 +37,10 @@ start_gpu_monitor MEM_FRAC_STATIC=0.8 set -x +export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +export MAX_MODEL_LEN=32768 +export MAX_NUM_BATCHED_TOKENS=32768 +export MAX_NUM_SEQS=128 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \ @@ -46,6 +50,9 @@ python3 -m atom.entrypoints.openai_server \ "${SPEC_ARGS[@]}" \ --block-size 128 \ --gpu-memory-utilization $MEM_FRAC_STATIC \ + --max-model-len $MAX_MODEL_LEN \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --max-num-seqs $MAX_NUM_SEQS \ --trust-remote-code \ --no-enable_prefix_caching \ > $SERVER_LOG 2>&1 & @@ -67,8 +74,7 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ \ - --trust-remote-code \ - --use-chat-template + --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) # After throughput, run evaluation only if RUN_EVAL is true if [ "${RUN_EVAL}" = "true" ]; then From a592385a9b4ea81b158605a719b32b930380998d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 12:30:54 +0900 Subject: [PATCH 10/13] fix: add minimaxm3-fp4-mi355x-atom to perf-changelog entry Co-Authored-By: Claude Sonnet 4.6 --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9934e1f94..82201851b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3985,6 +3985,7 @@ - config-keys: - minimaxm3-fp4-mi355x-atom-mtp + - minimaxm3-fp4-mi355x-atom description: - "Add minimaxm3-fp4-mi355x-atom-mtp CI recipe: single-node ATOM with EAGLE3 speculative decoding (3 draft tokens) for MiniMax-M3-MXFP4 on MI355X" - "Supports TP/EP/DP-attention combinations; search space mirrors minimaxm3-fp4-mi355x-atom (ISL=1024,8192 OSL=1024 TP2/TP4/TP8)" From 182decd12ae646ee71f8901fbc3263c7d8cd3562 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 13:04:18 +0900 Subject: [PATCH 11/13] fix: revert minimaxm3-fp4-mi355x-atom image/search-space, delete script, update perf-changelog - amd-master.yaml: revert image to rocm/atom-dev:M3, restore tp8 search-space entry - perf-changelog.yaml: remove minimaxm3-fp4-mi355x-atom from mtp entry - delete minimaxm3_fp4_mi355x_atom.sh Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 3 +- .../minimaxm3_fp4_mi355x_atom.sh | 87 ------------------- perf-changelog.yaml | 1 - 3 files changed, 2 insertions(+), 89 deletions(-) delete mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 3a41a9e21..3a61aed78 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2591,7 +2591,7 @@ minimaxm3-fp8-mi355x-vllm-mtp: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md # block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe. minimaxm3-fp4-mi355x-atom: - image: rocm/atom-dev:MiniMax-M3-20260622 + image: rocm/atom-dev:M3 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x @@ -2610,6 +2610,7 @@ minimaxm3-fp4-mi355x-atom: search-space: - { tp: 2, conc-start: 128, conc-end: 256 } - { tp: 4, conc-start: 1, conc-end: 256 } + - { tp: 8, conc-start: 1, conc-end: 2 } minimaxm3-fp4-mi355x-atom-mtp: image: rocm/atom-dev:MiniMax-M3-20260622 diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh deleted file mode 100644 index 9bf74b6e6..000000000 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE \ - DP_ATTENTION - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" - -SERVER_LOG=/workspace/server.log - -PARALLEL_ARGS=(-tp "$TP") #TP -if [ "$DP_ATTENTION" = "true" ]; then - if [ "$EP_SIZE" -gt 1 ]; then #DP+EP - PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) - else #DP+TP - PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) - fi -fi - -SPEC_ARGS=() - -# Start GPU monitoring (power, temperature, clocks every second) -start_gpu_monitor -MEM_FRAC_STATIC=0.8 - -set -x -export AITER_QUICK_REDUCE_QUANTIZATION=INT4 -export MAX_MODEL_LEN=32768 -export MAX_NUM_BATCHED_TOKENS=32768 -export MAX_NUM_SEQS=128 -# (srok), not yet -# --kv_cache_dtype fp8 \ -python3 -m atom.entrypoints.openai_server \ - --model $MODEL \ - --server-port $PORT \ - "${PARALLEL_ARGS[@]}" \ - "${SPEC_ARGS[@]}" \ - --block-size 128 \ - --gpu-memory-utilization $MEM_FRAC_STATIC \ - --max-model-len $MAX_MODEL_LEN \ - --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ - --max-num-seqs $MAX_NUM_SEQS \ - --trust-remote-code \ - --no-enable_prefix_caching \ - > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -# Wait for server to be ready -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -export PYTHONDONTWRITEBYTECODE=1 -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ \ - --trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" ) - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -# Stop GPU monitoring -stop_gpu_monitor -set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 82201851b..9934e1f94 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3985,7 +3985,6 @@ - config-keys: - minimaxm3-fp4-mi355x-atom-mtp - - minimaxm3-fp4-mi355x-atom description: - "Add minimaxm3-fp4-mi355x-atom-mtp CI recipe: single-node ATOM with EAGLE3 speculative decoding (3 draft tokens) for MiniMax-M3-MXFP4 on MI355X" - "Supports TP/EP/DP-attention combinations; search space mirrors minimaxm3-fp4-mi355x-atom (ISL=1024,8192 OSL=1024 TP2/TP4/TP8)" From e8b2abacfb5127af564b55b72ec349495673566f Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 13:13:49 +0900 Subject: [PATCH 12/13] fix: restore minimaxm3_fp4_mi355x_atom.sh Co-Authored-By: Claude Sonnet 4.6 --- .../minimaxm3_fp4_mi355x_atom.sh | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh new file mode 100644 index 000000000..80f230f28 --- /dev/null +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE \ + DP_ATTENTION \ + MAX_MODEL_LEN + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +SERVER_LOG=/workspace/server.log + +export OMP_NUM_THREADS=1 + +# Use the matrix-supplied MAX_MODEL_LEN (isl + osl + 256). Eval-only jobs need a +# larger window for the eval prompts, so override it from the eval context. +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +fi + +if [ "$EP_SIZE" -gt 1 ]; then + EP=" --enable-expert-parallel" +else + EP=" " +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor +MEM_FRAC_STATIC=0.8 + +set -x + +# Flags follow the ATOM MiniMax-M3 MXFP4 recipe (FP4 on 4xMI355 section): +# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md +# --block-size 128 is mandatory for MiniMax MSA. KV cache is left at the default +# dtype: amd/MiniMax-M3-MXFP4 ships no calibrated FP8 KV scales, so +# --kv_cache_dtype fp8 trips an assertion (k_scale is None) in the MSA +# fused_qknorm kernel during init. +python3 -m atom.entrypoints.openai_server \ + --model $MODEL \ + --server-port $PORT \ + -tp $TP \ + --max-model-len $MAX_MODEL_LEN $EP \ + --block-size 128 \ + --gpu-memory-utilization $MEM_FRAC_STATIC \ + --trust-remote-code \ + > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +export PYTHONDONTWRITEBYTECODE=1 +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x From c6e8d15770bd349cdb0dc1aae2618d6decaa0ec2 Mon Sep 17 00:00:00 2001 From: seungrokj Date: Tue, 23 Jun 2026 14:48:25 +0900 Subject: [PATCH 13/13] fix: bump MAX_NUM_SEQS to 256 for minimaxm3-fp4-mi355x-atom-mtp Co-Authored-By: Claude Sonnet 4.6 --- .../single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh index 0ee78f504..fc0619b7b 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom_mtp.sh @@ -40,7 +40,7 @@ set -x export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export MAX_MODEL_LEN=32768 export MAX_NUM_BATCHED_TOKENS=32768 -export MAX_NUM_SEQS=128 +export MAX_NUM_SEQS=256 # (srok), not yet # --kv_cache_dtype fp8 \ python3 -m atom.entrypoints.openai_server \