Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
4d484fb
feat: add minimaxm3-fp8-mi355x-atom and minimaxm3-fp8-mi355x-atom-mtp…
seungrokj Jun 20, 2026
45b360c
chore: add perf-changelog entries for minimaxm3-fp8-mi355x-atom and a…
seungrokj Jun 20, 2026
4700bd4
fix: remove stray GitHub review comment artifacts from minimaxm3_fp8 …
seungrokj Jun 20, 2026
e17cbaa
fix: use MiniMaxAI/MiniMax-M3-MXFP8 model id for minimaxm3-fp8-mi355x…
seungrokj Jun 20, 2026
f48400e
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 20, 2026
a068fde
fix: correct model id in amd-master.yaml for minimaxm3-fp8-mi355x-ato…
seungrokj Jun 20, 2026
196961a
fix: trim minimaxm3-fp8-mi355x-atom and atom-mtp search spaces to TP4…
seungrokj Jun 20, 2026
e016169
fix: correct minimaxm3-fp8-mi355x-atom-mtp ISL=1024 search space to tp4
seungrokj Jun 20, 2026
3913e9a
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 20, 2026
f3def1c
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 21, 2026
9f9c9a7
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
5c44abf
fix: bump minimaxm3-fp8-mi355x-atom and atom-mtp images to MiniMax-M3…
seungrokj Jun 23, 2026
d2cd74e
fix: disable prefix caching for minimaxm3-fp8-mi355x-atom and atom-mtp
seungrokj Jun 23, 2026
5a0d2cf
fix: update minimaxm3-fp8-mi355x-atom scripts to align with fp4 recipe
seungrokj Jun 23, 2026
ecd0bef
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
45dfb13
fix: bump MAX_NUM_SEQS to 256 for minimaxm3-fp8-mi355x-atom scripts
seungrokj Jun 23, 2026
60deebe
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
7557212
fix: cap minimaxm3-fp8-mi355x-atom search-space conc-end to 128
seungrokj Jun 23, 2026
0d1ed51
fix: remove minimaxm3-fp8-mi355x-atom recipe, script, and perf-change…
seungrokj Jun 23, 2026
3ac8eeb
Merge branch 'main' into amd/m3_atom_fp8
seungrokj Jun 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2876,6 +2876,44 @@ minimaxm3-fp4-mi355x-atom:
- { tp: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 2 }

# MiniMax-M3 MXFP8 on MI355X served through ATOM (single node, no speculative
# decoding). Both fixed-sequence-length scenarios sweep concurrency 1..256 at TP4.
minimaxm3-fp8-mi355x-atom:
  image: rocm/atom-dev:MiniMax-M3-20260619
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: mi355x
  precision: fp8
  framework: atom
  multinode: false
  scenarios:
    fixed-seq-len:
    - isl: 1024
      osl: 1024
      search-space:
      - { tp: 4, conc-start: 1, conc-end: 256 }
    - isl: 8192
      osl: 1024
      search-space:
      - { tp: 4, conc-start: 1, conc-end: 256 }

# MiniMax-M3 MXFP8 on MI355X served through ATOM with speculative decoding
# enabled (spec-decoding: mtp). Same image, model and TP4 concurrency sweep as
# the non-speculative minimaxm3-fp8-mi355x-atom entry.
minimaxm3-fp8-mi355x-atom-mtp:
  image: rocm/atom-dev:MiniMax-M3-20260619
  model: MiniMaxAI/MiniMax-M3-MXFP8
  model-prefix: minimaxm3
  runner: mi355x
  precision: fp8
  framework: atom
  multinode: false
  scenarios:
    fixed-seq-len:
    - isl: 1024
      osl: 1024
      search-space:
      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
    - isl: 8192
      osl: 1024
      search-space:
      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
Comment thread
seungrokj marked this conversation as resolved.

# MiniMax-M3 MXFP8 MI300X day-zero recipe. Reuse the dedicated ROCm image and
# MI355X serving shape, but retain the default BF16 KV cache because this
# checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
#
# Single-node, fixed-sequence-length benchmark for the ATOM server.
#
# Flow: start `atom.entrypoints.openai_server` in the background, wait until it
# is ready, drive it with run_benchmark_serving, optionally run an lm-eval
# pass, then stop GPU monitoring.
#
# Required environment:
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION, PORT
# Optional environment:
#   RUN_EVAL      - "true" runs the evaluation step after the throughput run
#   SLURM_JOB_ID  - when set, the job id and node name are logged
#
# check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, append_lm_eval_summary and stop_gpu_monitor
# are not defined here; presumably they come from benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

# PORT is included because the server, the readiness probe and the benchmark
# client all depend on it; an empty value would silently shift the arguments.
check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION \
  PORT

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log

# Parallelism flags: plain TP by default; DP attention adds
# --enable-dp-attention, preceded by --enable-expert-parallel when EP_SIZE > 1.
# NOTE(review): EP_SIZE has no effect unless DP_ATTENTION is "true" - confirm
# that this is intended.
PARALLEL_ARGS=(-tp "$TP")
if [ "$DP_ATTENTION" = "true" ]; then
  if [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=(--enable-expert-parallel)
  fi
  PARALLEL_ARGS+=(--enable-dp-attention)
fi

# No speculative decoding in this variant; the array stays empty so the server
# command line keeps the same shape as the speculative-decoding script.
SPEC_ARGS=()

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x
# TODO(srok): FP8 KV cache is not enabled yet; add the flag below once ready.
#   --kv_cache_dtype fp8 \
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  "${SPEC_ARGS[@]}" \
  --block-size 128 \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
#
# Single-node, fixed-sequence-length benchmark for the ATOM server with
# speculative decoding (EAGLE3 draft model, 3 speculative tokens).
#
# Flow: start `atom.entrypoints.openai_server` in the background, wait until it
# is ready, drive it with run_benchmark_serving (using the chat template),
# optionally run an lm-eval pass, then stop GPU monitoring.
#
# Required environment:
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME,
#   EP_SIZE, DP_ATTENTION, PORT
# Optional environment:
#   RUN_EVAL      - "true" runs the evaluation step after the throughput run
#   SLURM_JOB_ID  - when set, the job id and node name are logged
#
# check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, append_lm_eval_summary and stop_gpu_monitor
# are not defined here; presumably they come from benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

# PORT is included because the server, the readiness probe and the benchmark
# client all depend on it; an empty value would silently shift the arguments.
check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION \
  PORT

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log

# Parallelism flags: plain TP by default; DP attention adds
# --enable-dp-attention, preceded by --enable-expert-parallel when EP_SIZE > 1.
# NOTE(review): EP_SIZE has no effect unless DP_ATTENTION is "true" - confirm
# that this is intended.
PARALLEL_ARGS=(-tp "$TP")
if [ "$DP_ATTENTION" = "true" ]; then
  if [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=(--enable-expert-parallel)
  fi
  PARALLEL_ARGS+=(--enable-dp-attention)
fi

# Speculative decoding: EAGLE3 method with a dedicated draft model.
SPEC_ARGS=(
  --method eagle3
  --draft-model Inferact/MiniMax-M3-EAGLE3
  --num-speculative-tokens 3
)

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x
# TODO(srok): FP8 KV cache is not enabled yet; add the flag below once ready.
#   --kv_cache_dtype fp8 \
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  "${SPEC_ARGS[@]}" \
  --block-size 128 \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --trust-remote-code \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code \
  --use-chat-template

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3964,6 +3964,15 @@
- "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843

- config-keys:
    - minimaxm3-fp8-mi355x-atom
    - minimaxm3-fp8-mi355x-atom-mtp
  description:
    - "Add minimaxm3-fp8-mi355x-atom CI recipe: single-node ATOM benchmark for MiniMax-M3-MXFP8 on MI355X"
    - "Add minimaxm3-fp8-mi355x-atom-mtp: same with EAGLE3 speculative decoding (3 draft tokens)"
    - "Both use rocm/atom-dev:MiniMax-M3-20260619; search space is ISL=1024,8192 OSL=1024 at TP4, concurrency 1-256"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1867

- config-keys:
- kimik2.5-fp4-gb200-dynamo-trt
description:
Expand Down
Loading