diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4eddd318f..23f52948e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11187,6 +11187,230 @@ glm5-fp4-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
 
+glm5-fp8-gb200-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.12
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: gb200
+  precision: fp8
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    # ---------- 8k1k high-throughput (wide-EP decode) ----------
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 3p1d wide-EP (dep32, mrr256). 14 nodes.
+      - conc-list: [519]
+        prefill:
+          num-worker: 3
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 4p1d wide-EP (dep16). 12 nodes.
+      - conc-list: [1484]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 5p1d wide-EP (dep32). 18 nodes.
+      - conc-list: [1688]
+        prefill:
+          num-worker: 5
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 7p1d wide-EP (dep16). 18 nodes.
+      - conc-list: [2699]
+        prefill:
+          num-worker: 7
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+    # ---------- 8k1k low-latency (TP=8 decode workers, 2 nodes each) ----------
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1p8d, mrr1 (single-stream). 18 nodes.
+      - conc-list: [8]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
+        decode:
+          num-worker: 8
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # 1p8d, mrr16. 18 nodes.
+      - conc-list: [90]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
+        decode:
+          num-worker: 8
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # 1p8d, mrr4. 18 nodes.
+      - conc-list: [9]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
+        decode:
+          num-worker: 8
+          tp: 8
+          ep: 1
+          dp-attn: false
+    # ---------- 1k1k high-throughput (wide-EP decode) ----------
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 2p1d wide-EP (dep32, mrr1024). 12 nodes.
+      - conc-list: [2161]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+    # ---------- 1k1k low-latency ----------
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1p1d wide-EP (dep16). 6 nodes.
+      - conc-list: [1955]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 1p1d wide-EP (dep16, mrr1024). 6 nodes.
+      - conc-list: [1170]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 1p1d wide-EP (dep16, mrr256). 6 nodes.
+      - conc-list: [298]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+      # 1p6d TP=8 decode (2 nodes per worker), mrr1 (single-stream). 14 nodes.
+      - conc-list: [8]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml"
+        decode:
+          num-worker: 6
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # 1p6d TP=8 decode (2 nodes per worker), mrr16. 14 nodes.
+      - conc-list: [72]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml"
+        decode:
+          num-worker: 6
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # 1p6d TP=8 decode (2 nodes per worker), mrr4. 14 nodes.
+      - conc-list: [20]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml"
+        decode:
+          num-worker: 6
+          tp: 8
+          ep: 1
+          dp-attn: false
+
 glm5-fp8-gb300-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: zai-org/GLM-5-FP8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml
new file mode 100644
index 000000000..f5e837fd0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml
@@ -0,0 +1,132 @@
+name: gb200-fp8-glm5_1k1k_hightpt_0
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 4
+  prefill_workers: 2
+  decode_nodes: 8
+  decode_workers: 1
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 32
+      data-parallel-size: 32
+      expert-parallel-size: 32
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json
+      mem-fraction-static: 0.886
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '2161'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml
new file mode 100644
index 000000000..02ce0b956
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml
@@ -0,0 +1,130 @@
+name: gb200-fp8-glm5_1k1k_lowlat_0
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 1
+  decode_nodes: 4
+  decode_workers: 1
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json
+      mem-fraction-static: 0.89
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 1815
+      cuda-graph-max-bs: 1815
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '1955'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml
new file mode 100644
index 000000000..2e6ff233f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml
@@ -0,0 +1,130 @@
+name: gb200-fp8-glm5_1k1k_lowlat_1
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 1
+  decode_nodes: 4
+  decode_workers: 1
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json
+      mem-fraction-static: 0.89
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '1170'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml
new file mode 100644
index 000000000..9fe47d8d8
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_2.yaml
@@ -0,0 +1,130 @@
+name: gb200-fp8-glm5_1k1k_lowlat_2
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 1
+  decode_nodes: 4
+  decode_workers: 1
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json
+      mem-fraction-static: 0.89
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '298'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml
new file mode 100644
index 000000000..61c53d704
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_3.yaml
@@ -0,0 +1,121 @@
+name: gb200-fp8-glm5_1k1k_lowlat_3
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 1
+  decode_nodes: 12
+  decode_workers: 6
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.942
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 1
+      cuda-graph-max-bs: 1
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '8'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml
new file mode 100644
index 000000000..b8f303698
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_4.yaml
@@ -0,0 +1,121 @@
+name: gb200-fp8-glm5_1k1k_lowlat_4
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 1
+  decode_nodes: 12
+  decode_workers: 6
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.942
+      context-length: 2112
+      chunked-prefill-size: 64
+      max-running-requests: 16
+      cuda-graph-max-bs: 16
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 1024
+  osl: 1024
+  concurrencies: '72'
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml
new file mode 100644
index 000000000..1c89a8eaa
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_5.yaml
@@ -0,0 +1,121 @@
+name: gb200-fp8-glm5_1k1k_lowlat_5
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = 8 GPUs = 1 worker x prefill TP 8
+  prefill_workers: 1
+  decode_nodes: 12  # 12 nodes x 4 GPUs = 48 GPUs = 6 workers x decode TP 8
+  decode_workers: 6
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # same keys and values as prefill_environment
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 2112  # = benchmark isl 1024 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1  # no enable-dp-attention key in this block, unlike prefill
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.942
+      context-length: 2112  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 4  # x 6 decode workers = 24; benchmark concurrency is 20
+      cuda-graph-max-bs: 4  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 1024
+  osl: 1024
+  concurrencies: '20'  # quoted, so a string rather than an int
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml
new file mode 100644
index 000000000..375b38878
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml
@@ -0,0 +1,132 @@
+name: gb200-fp8-glm5_8k1k_hightpt_0  # referenced via CONFIG_FILE in .github/configs/nvidia-master.yaml
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 6  # 6 nodes x 4 GPUs = 24 GPUs = 3 workers x prefill TP 8
+  prefill_workers: 3
+  decode_nodes: 8  # 8 nodes x 4 GPUs = 32 GPUs = 1 worker x decode TP 32; 6 + 8 = 14 nodes total
+  decode_workers: 1
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true  # the lowlat recipes visible in this change set only type: dynamo
+  num_additional_frontends: 9
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # prefill_environment plus one extra key (last line)
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'  # decode-only; absent from prefill_environment
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 32
+      data-parallel-size: 32
+      expert-parallel-size: 32  # TP = DP = EP = 32 ("dep32" in the nvidia-master.yaml comment)
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm  # prefill block uses flashinfer_trtllm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json  # NOTE(review): absolute path not defined by this recipe; confirm it exists in the container
+      mem-fraction-static: 0.89
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 256  # "mrr256" in nvidia-master.yaml; benchmark concurrency is 519
+      cuda-graph-max-bs: 256  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '519'  # matches conc-list [519] in nvidia-master.yaml; quoted, so a string
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml
new file mode 100644
index 000000000..5d4faed21
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml
@@ -0,0 +1,132 @@
+name: gb200-fp8-glm5_8k1k_hightpt_1  # referenced via CONFIG_FILE in .github/configs/nvidia-master.yaml
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 8  # 8 nodes x 4 GPUs = 32 GPUs = 4 workers x prefill TP 8
+  prefill_workers: 4
+  decode_nodes: 4  # 4 nodes x 4 GPUs = 16 GPUs = 1 worker x decode TP 16; 8 + 4 = 12 nodes total
+  decode_workers: 1
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true  # the lowlat recipes visible in this change set only type: dynamo
+  num_additional_frontends: 9
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # prefill_environment plus one extra key (last line)
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'  # decode-only; absent from prefill_environment
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16  # TP = DP = EP = 16 ("dep16" in the nvidia-master.yaml comment)
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm  # prefill block uses flashinfer_trtllm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json  # NOTE(review): absolute path not defined by this recipe; confirm it exists in the container
+      mem-fraction-static: 0.89
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 802  # benchmark concurrency is 1484
+      cuda-graph-max-bs: 802  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '1484'  # matches conc-list [1484] in nvidia-master.yaml; quoted, so a string
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml
new file mode 100644
index 000000000..1a20b38a0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml
@@ -0,0 +1,132 @@
+name: gb200-fp8-glm5_8k1k_hightpt_2  # referenced via CONFIG_FILE in .github/configs/nvidia-master.yaml
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 10  # 10 nodes x 4 GPUs = 40 GPUs = 5 workers x prefill TP 8
+  prefill_workers: 5
+  decode_nodes: 8  # 8 nodes x 4 GPUs = 32 GPUs = 1 worker x decode TP 32; 10 + 8 = 18 nodes total
+  decode_workers: 1
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true  # the lowlat recipes visible in this change set only type: dynamo
+  num_additional_frontends: 9
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # prefill_environment plus one extra key (last line)
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'  # decode-only; absent from prefill_environment
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 32
+      data-parallel-size: 32
+      expert-parallel-size: 32  # TP = DP = EP = 32 ("dep32" in the nvidia-master.yaml comment)
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm  # prefill block uses flashinfer_trtllm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json  # NOTE(review): absolute path not defined by this recipe; confirm it exists in the container
+      mem-fraction-static: 0.89
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 828  # benchmark concurrency is 1688
+      cuda-graph-max-bs: 828  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '1688'  # matches conc-list [1688] in nvidia-master.yaml; quoted, so a string
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml
new file mode 100644
index 000000000..2c8b7987d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml
@@ -0,0 +1,132 @@
+name: gb200-fp8-glm5_8k1k_hightpt_3
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 14  # 14 nodes x 4 GPUs = 56 GPUs = 7 workers x prefill TP 8
+  prefill_workers: 7
+  decode_nodes: 4  # 4 nodes x 4 GPUs = 16 GPUs = 1 worker x decode TP 16; 14 + 4 = 18 nodes total
+  decode_workers: 1
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true  # the lowlat recipes visible in this change set only type: dynamo
+  num_additional_frontends: 9
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # prefill_environment plus one extra key (last line)
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'  # decode-only; absent from prefill_environment
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+      disable-radix-cache: true
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16  # TP = DP = EP = 16 ("dep16" in the nvidia-master.yaml comment)
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: deep_gemm  # prefill block uses flashinfer_trtllm
+      moe-a2a-backend: deepep
+      moe-dense-tp-size: 1
+      ep-dispatch-algorithm: static
+      ep-num-redundant-experts: 0
+      deepep-mode: low_latency
+      deepep-config: /configs/deepep_config.json  # NOTE(review): absolute path not defined by this recipe; confirm it exists in the container
+      mem-fraction-static: 0.89
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 1514  # benchmark concurrency is 2699
+      cuda-graph-max-bs: 1514  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '2699'  # matches conc-list [2699] in nvidia-master.yaml; quoted, so a string
+  random_range_ratio: 1.0
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml
new file mode 100644
index 000000000..80f4aa9de
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml
@@ -0,0 +1,119 @@
+name: gb200-fp8-glm5_8k1k_lowlat_0
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = 8 GPUs = 1 worker x prefill TP 8
+  prefill_workers: 1
+  decode_nodes: 16  # 16 nodes x 4 GPUs = 64 GPUs = 8 workers x decode TP 8
+  decode_workers: 8
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # same keys and values as prefill_environment
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:  # NOTE(review): no disable-radix-cache key here, unlike this file's decode block and the 8k1k hightpt prefill blocks; confirm intended
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1  # no enable-dp-attention key in this block, unlike prefill
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.943
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 1  # x 8 decode workers = 8, equal to the benchmark concurrency
+      cuda-graph-max-bs: 1  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '8'  # NOTE(review): no random_range_ratio key follows, unlike the 8k1k hightpt recipes (1.0); confirm intended
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml
new file mode 100644
index 000000000..65681de1c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml
@@ -0,0 +1,119 @@
+name: gb200-fp8-glm5_8k1k_lowlat_1
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"  # same image as glm5-fp8-gb200-dynamo-sglang in nvidia-master.yaml
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = 8 GPUs = 1 worker x prefill TP 8
+  prefill_workers: 1
+  decode_nodes: 16  # 16 nodes x 4 GPUs = 64 GPUs = 8 workers x decode TP 8
+  decode_workers: 8
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0  # two dots, so YAML reads this as a string, not a float
+
+backend:
+  prefill_environment:  # digit- and bool-looking values are quoted so they stay strings
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'  # NOTE(review): transfer backend below is nixl; confirm this is still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:  # same keys and values as prefill_environment
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:  # NOTE(review): no disable-radix-cache key here, unlike this file's decode block and the 8k1k hightpt prefill blocks; confirm intended
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8  # equals tensor-parallel-size; dp-attention is enabled below
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280  # = benchmark isl 8192 + osl 1024 + 64
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128  # same value as max-running-requests
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'  # JSON object kept as a single-quoted string
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1  # no enable-dp-attention key in this block, unlike prefill
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.943
+      context-length: 9280  # same as prefill
+      chunked-prefill-size: 64
+      max-running-requests: 16  # x 8 decode workers = 128; benchmark concurrency is 90
+      cuda-graph-max-bs: 16  # same value as max-running-requests
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360  # 360 x 10 s = 3600 s, assuming one attempt per interval; confirm
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf  # plain `inf` (no leading dot) is a YAML string, not a float
+  isl: 8192
+  osl: 1024
+  concurrencies: '90'  # NOTE(review): no random_range_ratio key follows, unlike the 8k1k hightpt recipes (1.0); confirm intended
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml
new file mode 100644
index 000000000..dd5eea2d6
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml
@@ -0,0 +1,119 @@
+name: gb200-fp8-glm5_8k1k_lowlat_2
+
+# Combined upstream srt-slurm recipe split into one flat yaml per concrete
+# topology to match the InferenceX glm5 sglang convention
+# (see ../../../../gb300-fp8/). All shared base envs and the prefill
+# sglang_config are inlined here verbatim; the decode block is the shared
+# base plus the topology-specific override for this concrete shape.
+
+model:
+  path: glm-5-fp8
+  container: "lmsysorg/sglang:v0.5.12"
+  precision: fp8
+
+resources:
+  gpu_type: gb200
+  gpus_per_node: 4
+  prefill_nodes: 2  # 2 nodes x 4 GPUs = 8 GPUs for the one TP8 prefill worker
+  prefill_workers: 1
+  decode_nodes: 16  # 64 GPUs / 8 workers works out to 8 GPUs (2 nodes) per TP8 decode worker
+  decode_workers: 8
+frontend:
+  type: dynamo
+dynamo:
+  version: 1.1.0
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
+    PYTHONUNBUFFERED: '1'
+    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
+    MC_TE_METRIC: 'true'
+    MC_FORCE_MNNVL: '1'
+    NCCL_MNNVL_ENABLE: '1'
+    NCCL_CUMEM_ENABLE: '1'
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
+    DYN_REQUEST_PLANE: nats
+
+  sglang_config:
+    prefill:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: prefill
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      enable-flashinfer-allreduce-fusion: false
+      chunked-prefill-size: 131072
+      max-prefill-tokens: 16384
+      context-length: 9280
+      nsa-prefill-backend: trtllm
+      nsa-decode-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+      mem-fraction-static: 0.7
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+    decode:
+      served-model-name: GLM-5-FP8
+      trust-remote-code: true
+      quantization: fp8
+      kv-cache-dtype: fp8_e4m3
+      disaggregation-mode: decode
+      disaggregation-transfer-backend: nixl
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: false
+      nsa-decode-backend: trtllm
+      nsa-prefill-backend: trtllm
+      moe-runner-backend: flashinfer_trtllm
+      mem-fraction-static: 0.943
+      context-length: 9280
+      chunked-prefill-size: 64
+      max-running-requests: 4
+      cuda-graph-max-bs: 4
+      skip-tokenizer-init: true
+      stream-interval: 30
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: sa-bench
+  req_rate: inf
+  isl: 8192
+  osl: 1024
+  concurrencies: '9'
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f26f1a72a..a2a660b9c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4124,3 +4124,11 @@
     - "For DP-attention runs, use TP-sized data parallelism with DP attention local-control broadcast, DP LM head, prefill delayer, scheduler recv interval 1, chunked prefill size 32768, and schedule conservativeness 3.33."
     - "Set SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache, and explicitly pass --disable-piecewise-cuda-graph."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1792
+
+- config-keys:
+    - glm5-fp8-gb200-dynamo-sglang
+  description:
+    - "Add GLM-5-FP8 GB200 disaggregated multinode SGLang benchmarks via Dynamo"
+    - "Image: lmsysorg/sglang:v0.5.12"
+    - "14 topologies across 1k/1k and 8k/1k: prefill TP8 STP + decode wide-EP (DEP16/DEP32 high-throughput) and two-node TP8 low-latency, recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb200-fp8/"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1895
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 4017b1fd2..9d7ca1a10 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -24,6 +24,11 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
     elif [[ $MODEL_PREFIX == "qwen3.5" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/Qwen3.5-397B-A17B-FP8"
         export SRT_SLURM_MODEL_PREFIX="qwen3.5-fp8"
+    elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then
+        # SRT_SLURM_MODEL_PREFIX matches the model.path alias ("glm-5-fp8")
+        # in our GLM-5 sglang recipes.
+        export MODEL_PATH="/mnt/lustre01/models/GLM-5-FP8"
+        export SRT_SLURM_MODEL_PREFIX="glm-5-fp8"
     else
         export MODEL_PATH=$MODEL
     fi
@@ -263,6 +268,11 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "qwen3.5" ]]; then
     cd "$SRT_REPO_DIR"
     mkdir -p recipes/sglang/qwen3.5
     cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/qwen3.5" recipes/sglang/qwen3.5
+elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1  # no checkout, nothing to stage into
+    cd "$SRT_REPO_DIR" || exit 1  # otherwise the recipes below would land in the wrong directory
+    mkdir -p recipes/sglang/glm5
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5
 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
     cd "$SRT_REPO_DIR" || exit 1