diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4eddd318f..23f52948e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11187,6 +11187,230 @@ glm5-fp4-gb300-dynamo-sglang: ep: 1 dp-attn: false +glm5-fp8-gb200-dynamo-sglang: + image: lmsysorg/sglang:v0.5.12 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: gb200 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + # ---------- 8k1k high-throughput (wide-EP decode) ---------- + - isl: 8192 + osl: 1024 + search-space: + # 3p1d wide-EP (dep32, mrr256). 14 nodes. + - conc-list: [519] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + # 4p1d wide-EP (dep16). 12 nodes. + - conc-list: [1484] + prefill: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 5p1d wide-EP (dep32). 18 nodes. + - conc-list: [1688] + prefill: + num-worker: 5 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + # 7p1d wide-EP (dep16). 18 nodes. + - conc-list: [2699] + prefill: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # ---------- 8k1k low-latency (per-node TP=8 decode workers) ---------- + - isl: 8192 + osl: 1024 + search-space: + # 1p8d, mrr1 (single-stream). 18 nodes. 
+ - conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + # 1p8d, mrr16. 18 nodes. + - conc-list: [90] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + # 1p8d, mrr4. 18 nodes. + - conc-list: [9] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + decode: + num-worker: 8 + tp: 8 + ep: 1 + dp-attn: false + # ---------- 1k1k high-throughput (wide-EP decode) ---------- + - isl: 1024 + osl: 1024 + search-space: + # 2p1d wide-EP (dep32, mrr1024). 12 nodes. + - conc-list: [2161] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + # ---------- 1k1k low-latency ---------- + - isl: 1024 + osl: 1024 + search-space: + # 1p1d wide-EP (dep16). 6 nodes. + - conc-list: [1955] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 1p1d wide-EP (dep16, mrr1024). 6 nodes. + - conc-list: [1170] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 1p1d wide-EP (dep16, mrr256). 6 nodes. 
+ - conc-list: [298] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 1p6d per-node TP=8, mrr1 (single-stream). 14 nodes. + - conc-list: [8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # 1p6d per-node TP=8, mrr16. 14 nodes. + - conc-list: [72] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # 1p6d per-node TP=8, mrr4. 14 nodes. + - conc-list: [20] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + glm5-fp8-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: zai-org/GLM-5-FP8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml new file mode 100644 index 000000000..f5e837fd0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml @@ -0,0 +1,132 @@ +name: gb200-fp8-glm5_1k1k_hightpt_0 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). 
All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 2 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + 
enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 32 + data-parallel-size: 32 + expert-parallel-size: 32 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.886 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '2161' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..02ce0b956 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml @@ -0,0 +1,130 @@ +name: 
gb200-fp8-glm5_1k1k_lowlat_0 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl 
+ tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 1815 + cuda-graph-max-bs: 1815 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '1955' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..2e6ff233f --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml @@ -0,0 +1,130 @@ +name: gb200-fp8-glm5_1k1k_lowlat_1 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 
trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '1170' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml new file mode 100644 index 000000000..9fe47d8d8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml @@ -0,0 +1,130 @@ +name: gb200-fp8-glm5_1k1k_lowlat_2 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 256 + cuda-graph-max-bs: 256 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + 
concurrencies: '298' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml new file mode 100644 index 000000000..61c53d704 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml @@ -0,0 +1,121 @@ +name: gb200-fp8-glm5_1k1k_lowlat_3 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 12 + decode_workers: 6 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' 
NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.942 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 1 + cuda-graph-max-bs: 1 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '8' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml new file mode 100644 index 000000000..b8f303698 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml @@ -0,0 +1,121 @@ +name: gb200-fp8-glm5_1k1k_lowlat_4 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 12 + decode_workers: 6 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.942 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 16 + cuda-graph-max-bs: 16 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '72' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml new file mode 100644 index 
000000000..1c89a8eaa --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml @@ -0,0 +1,121 @@ +name: gb200-fp8-glm5_1k1k_lowlat_5 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 12 + decode_workers: 6 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true 
quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 2112 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.942 + context-length: 2112 + chunked-prefill-size: 64 + max-running-requests: 4 + cuda-graph-max-bs: 4 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '20' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml new file mode 100644 index 000000000..375b38878 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml @@ -0,0 +1,132 @@ +name: 
gb200-fp8-glm5_8k1k_hightpt_0 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 6 + prefill_workers: 3 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + 
disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 32 + data-parallel-size: 32 + expert-parallel-size: 32 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 256 + cuda-graph-max-bs: 256 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '519' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml new 
file mode 100644 index 000000000..5d4faed21 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml @@ -0,0 +1,132 @@ +name: gb200-fp8-glm5_8k1k_hightpt_1 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 4 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 802 + cuda-graph-max-bs: 802 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1484' + random_range_ratio: 1.0 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml new file mode 100644 index 000000000..1a20b38a0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml @@ -0,0 +1,132 @@ +name: gb200-fp8-glm5_8k1k_hightpt_2 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 5 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + 
NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 32 + data-parallel-size: 32 + expert-parallel-size: 32 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 828 + cuda-graph-max-bs: 828 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + 
+health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1688' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml new file mode 100644 index 000000000..2c8b7987d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml @@ -0,0 +1,132 @@ +name: gb200-fp8-glm5_8k1k_hightpt_3 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 14 + prefill_workers: 7 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + disable-radix-cache: true + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: deep_gemm + moe-a2a-backend: deepep + moe-dense-tp-size: 1 + ep-dispatch-algorithm: static + ep-num-redundant-experts: 0 + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + mem-fraction-static: 0.89 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 1514 + 
cuda-graph-max-bs: 1514 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '2699' + random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..80f4aa9de --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml @@ -0,0 +1,119 @@ +name: gb200-fp8-glm5_8k1k_lowlat_0 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. 
+ +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 16 + decode_workers: 8 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 
0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.943 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 1 + cuda-graph-max-bs: 1 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '8' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..65681de1c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml @@ -0,0 +1,119 @@ +name: gb200-fp8-glm5_8k1k_lowlat_1 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. 
+ +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 16 + decode_workers: 8 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 
0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.943 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 16 + cuda-graph-max-bs: 16 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '90' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml new file mode 100644 index 000000000..dd5eea2d6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml @@ -0,0 +1,119 @@ +name: gb200-fp8-glm5_8k1k_lowlat_2 + +# Combined upstream srt-slurm recipe split into one flat yaml per concrete +# topology to match the InferenceX glm5 sglang convention +# (see ../../../gb300-fp8/). All shared base envs and the prefill +# sglang_config are inlined here verbatim; the decode block is the shared +# base plus the topology-specific override for this concrete shape. 
+ +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.12" + precision: fp8 + +resources: + gpu_type: gb200 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 1 + decode_nodes: 16 + decode_workers: 8 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + sglang_config: + prefill: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + enable-flashinfer-allreduce-fusion: false + chunked-prefill-size: 131072 + max-prefill-tokens: 16384 + context-length: 9280 + nsa-prefill-backend: trtllm + nsa-decode-backend: trtllm + moe-runner-backend: flashinfer_trtllm + max-running-requests: 128 + cuda-graph-max-bs: 128 + mem-fraction-static: 
0.7 + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + enable-flashinfer-allreduce-fusion: false + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + mem-fraction-static: 0.943 + context-length: 9280 + chunked-prefill-size: 64 + max-running-requests: 4 + cuda-graph-max-bs: 4 + skip-tokenizer-init: true + stream-interval: 30 + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '9' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f26f1a72a..a2a660b9c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4124,3 +4124,11 @@ - "For DP-attention runs, use TP-sized data parallelism with DP attention local-control broadcast, DP LM head, prefill delayer, scheduler recv interval 1, chunked prefill size 32768, and schedule conservativeness 3.33." - "Set SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache, and explicitly pass --disable-piecewise-cuda-graph." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1792 + +- config-keys: + - glm5-fp8-gb200-dynamo-sglang + description: + - "Add GLM-5-FP8 GB200 disaggregated multinode SGLang benchmarks via Dynamo" + - "Image: lmsysorg/sglang:v0.5.12" + - "14 topologies across 1k/1k and 8k/1k: prefill TP8 STP + decode wide-EP (DEP16/DEP32 high-throughput) and per-node TP8 low-latency, recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1895 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 4017b1fd2..9d7ca1a10 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -24,6 +24,11 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then elif [[ $MODEL_PREFIX == "qwen3.5" && $PRECISION == "fp8" ]]; then export MODEL_PATH="/mnt/lustre01/models/Qwen3.5-397B-A17B-FP8" export SRT_SLURM_MODEL_PREFIX="qwen3.5-fp8" + elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then + # SRT_SLURM_MODEL_PREFIX matches the model.path alias ("glm-5-fp8") + # in our GLM-5 sglang recipes. + export MODEL_PATH="/mnt/lustre01/models/GLM-5-FP8" + export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" else export MODEL_PATH=$MODEL fi @@ -263,6 +268,11 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then cd "$SRT_REPO_DIR" mkdir -p recipes/sglang/qwen3.5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + mkdir -p recipes/sglang/glm5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 cd "$SRT_REPO_DIR" || exit 1