Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11186,6 +11186,230 @@ glm5-fp4-gb300-dynamo-sglang:
ep: 1
dp-attn: false

# Benchmark entry: GLM-5 FP8 served by dynamo-sglang on GB200, run multinode
# with disaggregated prefill and decode workers. Each search-space entry below
# describes one concrete topology, names its recipe through a CONFIG_FILE=...
# setting, and lists the concurrency to benchmark in conc-list.
#
# Shorthand used in the per-entry comments (presumably; inferred from the
# 1k1k_stp_hightpt_0 recipe, whose decode block sets tp = dp = ep = 32 and
# max-running-requests = 1024 for the "dep32, mrr1024" entry -- confirm):
#   NpMd = N prefill workers and M decode workers (matches the num-worker values)
#   depK = decode worker with data/expert-parallel size K
#   mrrK = max-running-requests K
# The node counts in the comments are consistent with 4 GPUs per node, e.g.
# 3 prefill workers x tp8 = 6 nodes plus one tp32 decode worker = 8 nodes -> 14.
glm5-fp8-gb200-dynamo-sglang:
# NOTE(review): the recipes pin their own container string; keep it in step
# with this image tag.
image: lmsysorg/sglang:v0.5.12
model: zai-org/GLM-5-FP8
model-prefix: glm5
runner: gb200
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# ---------- 8k1k high-throughput (wide-EP decode) ----------
- isl: 8192
osl: 1024
search-space:
# 3p1d wide-EP (dep32, mrr256). 14 nodes.
- conc-list: [519]
prefill:
num-worker: 3
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# 4p1d wide-EP (dep16). 12 nodes.
- conc-list: [1484]
prefill:
num-worker: 4
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 5p1d wide-EP (dep32). 18 nodes.
- conc-list: [1688]
prefill:
num-worker: 5
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# 7p1d wide-EP (dep16). 18 nodes.
- conc-list: [2699]
prefill:
num-worker: 7
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# ---------- 8k1k low-latency (per-node TP=8 decode workers) ----------
# NOTE(review): this is a second list item with the same isl/osl (8192/1024)
# as the high-throughput item above -- confirm the consumer accepts repeated
# isl/osl pairs rather than expecting a single merged search-space.
# NOTE(review): with 4 GPUs per node a tp8 worker spans two nodes
# (1 x 2 prefill + 8 x 2 decode = 18 nodes), so "per-node TP=8" presumably
# means one TP=8 group per decode worker -- confirm.
- isl: 8192
osl: 1024
search-space:
# 1p8d, mrr1 (single-stream). 18 nodes.
- conc-list: [8]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
decode:
num-worker: 8
tp: 8
ep: 1
dp-attn: false
# 1p8d, mrr16. 18 nodes.
- conc-list: [90]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
decode:
num-worker: 8
tp: 8
ep: 1
dp-attn: false
# 1p8d, mrr4. 18 nodes.
- conc-list: [9]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
decode:
num-worker: 8
tp: 8
ep: 1
dp-attn: false
# ---------- 1k1k high-throughput (wide-EP decode) ----------
- isl: 1024
osl: 1024
search-space:
# 2p1d wide-EP (dep32, mrr1024). 12 nodes.
- conc-list: [2161]
prefill:
num-worker: 2
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# ---------- 1k1k low-latency ----------
# NOTE(review): second list item with isl/osl 1024/1024 (same pair as the
# high-throughput item above) -- same confirmation as for the 8k1k pair.
# The first three entries use a single dep16 wide-EP decode worker; the last
# three use six tp8 decode workers without dp-attention.
- isl: 1024
osl: 1024
search-space:
# 1p1d wide-EP (dep16). 6 nodes.
- conc-list: [1955]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 1p1d wide-EP (dep16, mrr1024). 6 nodes.
- conc-list: [1170]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 1p1d wide-EP (dep16, mrr256). 6 nodes.
- conc-list: [298]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 1p6d per-node TP=8, mrr1 (single-stream). 14 nodes.
- conc-list: [8]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# 1p6d per-node TP=8, mrr16. 14 nodes.
- conc-list: [72]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false
# 1p6d per-node TP=8, mrr4. 14 nodes.
- conc-list: [20]
prefill:
num-worker: 1
tp: 8
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml"
decode:
num-worker: 6
tp: 8
ep: 1
dp-attn: false

glm5-fp8-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.11-cu130
model: zai-org/GLM-5-FP8
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Recipe for the 1k1k high-throughput point on GB200: two tp8 prefill workers
# feeding one wide-EP (tp = dp = ep = 32) decode worker, benchmarked with
# sa-bench at concurrency 2161.
name: gb200-fp8-glm5_1k1k_hightpt_0

# Combined upstream srt-slurm recipe split into one flat yaml per concrete
# topology to match the InferenceX glm5 sglang convention
# (see ../../../gb300-fp8/). All shared base envs and the prefill
# sglang_config are inlined here verbatim; the decode block is the shared
# base plus the topology-specific override for this concrete shape.
# NOTE(review): if this file lives at
# recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/ (the path the master config
# uses for it), the gb300-fp8 tree is four levels up
# (../../../../gb300-fp8/), not three -- confirm and adjust the reference.

model:
path: glm-5-fp8
# Same image string as the master-config entry that references this recipe.
container: "lmsysorg/sglang:v0.5.12"
precision: fp8

# 4 prefill nodes / 2 prefill workers = 2 nodes (8 GPUs) per worker, matching
# the prefill tensor-parallel-size of 8 below. 8 decode nodes x 4 GPUs = 32
# GPUs for the single decode worker (tensor-parallel-size 32). 12 nodes total.
resources:
gpu_type: gb200
gpus_per_node: 4
prefill_nodes: 4
prefill_workers: 2
decode_nodes: 8
decode_workers: 1
frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 9
dynamo:
# Plain scalar with two dots, so YAML loads it as a string, not a number.
version: 1.1.0

backend:
# Environment for the prefill workers. Numeric- and boolean-looking values
# are quoted so they are loaded as strings.
# NOTE(review): SGLANG_MOONCAKE_CUSTOM_MEM_POOL is set although
# disaggregation-transfer-backend below is nixl -- confirm it is still needed.
# NOTE(review): the spelling "INBALANCE" is reproduced as found; confirm it
# matches the variable name sglang actually reads before "correcting" it.
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
PYTHONUNBUFFERED: '1'
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
MC_TE_METRIC: 'true'
MC_FORCE_MNNVL: '1'
NCCL_MNNVL_ENABLE: '1'
NCCL_CUMEM_ENABLE: '1'
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
DYN_REQUEST_PLANE: nats

# Environment for the decode worker: the same variables and values as
# prefill_environment, plus SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK on
# the last line.
decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
PYTHONUNBUFFERED: '1'
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
MC_TE_METRIC: 'true'
MC_FORCE_MNNVL: '1'
NCCL_MNNVL_ENABLE: '1'
NCCL_CUMEM_ENABLE: '1'
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
DYN_REQUEST_PLANE: nats
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'

sglang_config:
# Prefill worker flags: tp 8 / dp 8 / ep 1 with dp-attention enabled, which
# mirrors the prefill tp/ep/dp-attn values of the master-config entry.
prefill:
served-model-name: GLM-5-FP8
trust-remote-code: true
quantization: fp8
kv-cache-dtype: fp8_e4m3
disaggregation-mode: prefill
disaggregation-transfer-backend: nixl
tensor-parallel-size: 8
data-parallel-size: 8
expert-parallel-size: 1
enable-dp-attention: true
enable-dp-lm-head: true
enable-flashinfer-allreduce-fusion: false
chunked-prefill-size: 131072
max-prefill-tokens: 16384
# 2112 = isl 1024 + osl 1024 + 64 (presumably headroom -- confirm).
context-length: 2112
nsa-prefill-backend: trtllm
nsa-decode-backend: trtllm
moe-runner-backend: flashinfer_trtllm
max-running-requests: 128
cuda-graph-max-bs: 128
mem-fraction-static: 0.7
weight-loader-prefetch-checkpoints: true
model-loader-extra-config: '{"enable_multithread_load": true}'
disable-radix-cache: true

# Decode worker flags: tp = dp = ep = 32 with DeepEP all-to-all. Presumably
# this is the "dep32" shape and max-running-requests 1024 the "mrr1024" named
# in the master-config comment for this recipe -- confirm.
decode:
served-model-name: GLM-5-FP8
trust-remote-code: true
quantization: fp8
kv-cache-dtype: fp8_e4m3
disaggregation-mode: decode
disaggregation-transfer-backend: nixl
tensor-parallel-size: 32
data-parallel-size: 32
expert-parallel-size: 32
enable-dp-attention: true
enable-dp-lm-head: true
enable-flashinfer-allreduce-fusion: false
nsa-decode-backend: trtllm
nsa-prefill-backend: trtllm
moe-runner-backend: deep_gemm
moe-a2a-backend: deepep
moe-dense-tp-size: 1
ep-dispatch-algorithm: static
ep-num-redundant-experts: 0
deepep-mode: low_latency
deepep-config: /configs/deepep_config.json
mem-fraction-static: 0.886
# Same 2112-token limit as the prefill side.
context-length: 2112
chunked-prefill-size: 64
# cuda-graph-max-bs is set to the same value as max-running-requests.
max-running-requests: 1024
cuda-graph-max-bs: 1024
skip-tokenizer-init: true
stream-interval: 30
disable-radix-cache: true
weight-loader-prefetch-checkpoints: true
model-loader-extra-config: '{"enable_multithread_load": true}'

# 360 attempts at 10-second intervals, i.e. about one hour at most if the
# attempts run back to back.
health_check:
max_attempts: 360
interval_seconds: 10

# isl/osl and the concurrency match the 1k1k high-throughput entry
# (conc-list [2161]) in the master config. `inf` is a plain scalar, so YAML
# loads it as the string "inf" (only `.inf` resolves to a float), and
# concurrencies is a quoted string rather than an integer.
benchmark:
type: sa-bench
req_rate: inf
isl: 1024
osl: 1024
concurrencies: '2161'
random_range_ratio: 1.0
Loading