-
Notifications
You must be signed in to change notification settings - Fork 207
[AMD] Add MiniMax-M3-FP8 MI355X ATOM EAGLE3 / non-EAGLE3 update 0623 #1916
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
6a1a203
03f5752
a245590
25b8fab
d8f360d
68aba47
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,87 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| source "$(dirname "$0")/../../benchmark_lib.sh" | ||
|
|
||
| # Validate the variables this script expands further down. | ||
| # check_env_vars is provided by benchmark_lib.sh; presumably it aborts when a | ||
| # listed variable is unset (confirm against the library). | ||
| # PORT is listed because the server launch, the readiness wait, the benchmark | ||
| # client and the eval step all expand "$PORT" and nothing here gives it a default. | ||
| check_env_vars \ | ||
| MODEL \ | ||
| TP \ | ||
| CONC \ | ||
| ISL \ | ||
| OSL \ | ||
| RANDOM_RANGE_RATIO \ | ||
| RESULT_FILENAME \ | ||
| EP_SIZE \ | ||
| DP_ATTENTION \ | ||
| PORT | ||
|
|
||
| # Print the job/node banner only when SLURM_JOB_ID is non-empty. | ||
| [[ -z "$SLURM_JOB_ID" ]] || echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" | ||
|
|
||
||
| # Log the parallelism and workload settings used for this run. | ||
| echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" | ||
|
|
||
| # File that receives the server's stdout and stderr. | ||
| SERVER_LOG=/workspace/server.log | ||
|
|
||
||
| # Parallelism flags for the server. -tp is always passed. When DP_ATTENTION is | ||
| # "true", --enable-dp-attention is added, plus --enable-expert-parallel if | ||
| # EP_SIZE is greater than 1. EP_SIZE has no effect when DP_ATTENTION is not "true". | ||
| PARALLEL_ARGS=(-tp "$TP") #TP | ||
| if [ "$DP_ATTENTION" = "true" ]; then | ||
| if [ "$EP_SIZE" -gt 1 ]; then #DP+EP | ||
| PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention ) | ||
| else #DP+TP | ||
| PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) | ||
| fi | ||
| fi | ||
|
|
||
||
| # Extra server flags; left empty in this script (presumably reserved for | ||
| # speculative-decoding options - confirm against the EAGLE3 variant). | ||
| SPEC_ARGS=() | ||
|
|
||
||
| # Start GPU monitoring (power, temperature, clocks every second) | ||
| start_gpu_monitor | ||
| # Value passed to --gpu-memory-utilization below. | ||
| MEM_FRAC_STATIC=0.8 | ||
|
|
||
||
| set -x | ||
| #export AITER_QUICK_REDUCE_QUANTIZATION=INT4 | ||
| export ATOM_M3_SPARSE_USE_ASM_PA=1 | ||
| export MAX_MODEL_LEN=32768 | ||
| export MAX_NUM_BATCHED_TOKENS=32768 | ||
| export MAX_NUM_SEQS=256 | ||
| # Launch the server in the background. Every expansion is quoted so an empty | ||
| # or space-containing value cannot shift or split the following arguments. | ||
| python3 -m atom.entrypoints.openai_server \ | ||
| --model "$MODEL" \ | ||
| --server-port "$PORT" \ | ||
| "${PARALLEL_ARGS[@]}" \ | ||
| "${SPEC_ARGS[@]}" \ | ||
| --block-size 128 \ | ||
| --gpu-memory-utilization "$MEM_FRAC_STATIC" \ | ||
| --max-model-len "$MAX_MODEL_LEN" \ | ||
| --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ | ||
| --max-num-seqs "$MAX_NUM_SEQS" \ | ||
| --kv_cache_dtype fp8 \ | ||
| --trust-remote-code \ | ||
| --no-enable_prefix_caching \ | ||
| > "$SERVER_LOG" 2>&1 & | ||
|
|
||
||
| # Must directly follow the background launch: $! is the PID of the last | ||
| # background job. | ||
| SERVER_PID=$! | ||
|
|
||
||
| # Wait for server to be ready | ||
| wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" | ||
|
|
||
| export PYTHONDONTWRITEBYTECODE=1 | ||
| # Optional client flags are collected in an array so they expand to zero words | ||
| # when empty, instead of relying on an unquoted command substitution. | ||
| # --use-chat-template is added only when SPEC_ARGS carries at least one flag. | ||
| BENCHMARK_EXTRA_ARGS=() | ||
| if [[ ${#SPEC_ARGS[@]} -gt 0 ]]; then | ||
| BENCHMARK_EXTRA_ARGS+=(--use-chat-template) | ||
| fi | ||
| # Send CONC * 10 prompts at a maximum concurrency of CONC; results are written | ||
| # under /workspace/ as RESULT_FILENAME. | ||
| run_benchmark_serving \ | ||
| --model "$MODEL" \ | ||
| --port "$PORT" \ | ||
| --backend vllm \ | ||
| --input-len "$ISL" \ | ||
| --output-len "$OSL" \ | ||
| --random-range-ratio "$RANDOM_RANGE_RATIO" \ | ||
| --num-prompts "$((CONC * 10))" \ | ||
| --max-concurrency "$CONC" \ | ||
| --result-filename "$RESULT_FILENAME" \ | ||
| --result-dir /workspace/ \ | ||
| --trust-remote-code \ | ||
| "${BENCHMARK_EXTRA_ARGS[@]}" | ||
|
Check warning on line 77 in benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_atom.sh
|
||
|
|
||
| # Optional accuracy pass: runs only when RUN_EVAL is exactly "true". | ||
| if [[ "${RUN_EVAL}" == "true" ]]; then | ||
| run_eval --framework lm-eval --port "$PORT" | ||
| append_lm_eval_summary | ||
| fi | ||
|
|
||
||
| # Shut down the GPU monitor started earlier, then turn command tracing off. | ||
| stop_gpu_monitor | ||
| set +x | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4001,6 +4001,13 @@ | |
| - "Remove the runtime SupportsEagle3 source patch now included in the pinned nightly" | ||
| pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1843 | ||
|
|
||
| - config-keys: | ||
| - minimaxm3-fp8-mi355x-atom | ||
| description: | ||
| - "Add minimaxm3-fp8-mi355x-atom: MiniMax-M3 MXFP8 single-node benchmark on MI355X using ATOM framework" | ||
| - "Uses rocm/atom-dev:MiniMax-M3-20260623; TP4, block size 128, ISL=1024,8192 OSL=1024" | ||
| pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1916 | ||
|
Check warning on line 4009 in perf-changelog.yaml
|
||
|
seungrokj marked this conversation as resolved.
Outdated
|
||
|
|
||
|
Check warning on line 4010 in perf-changelog.yaml
|
||
|
seungrokj marked this conversation as resolved.
|
||
| - config-keys: | ||
| - minimaxm3-fp8-mi355x-atom-mtp | ||
| description: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.