docs: update Qwen3-235B-A22B-FP8 recipes (#5254)

Signed-off-by: Elnifio <elnifio0519@gmail.com>

docs: update Qwen3-235B-A22B-FP8 recipes (#5254)
Signed-off-by: Elnifio <elnifio0519@gmail.com>
b43a131c · Yunzhou Liu · GitHub · 84b5e9b5 · b43a131c · b43a131c
Unverified commit b43a131c, authored Feb 03, 2026 by Yunzhou Liu; committed by GitHub on Feb 03, 2026.
2 changed files
--- a/recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
+++ b/recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
@@ -13,10 +13,6 @@ data:
    moe_tensor_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
-    build_config:
-      max_batch_size: 128
-      max_num_tokens: 8192
-      max_seq_len: 8192
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.8
@@ -91,6 +87,9 @@ spec:
              python3 -m dynamo.trtllm \
              --model-path "${MODEL_PATH}" \
              --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
+              --max-batch-size 128 \
+              --max-num-tokens 8192 \
+              --max-seq-len 8192 \
              --extra-engine-args "${ENGINE_ARGS}"
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
          workingDir: /workspace/components/backends/trtllm

--- a/recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
+++ b/recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
@@ -13,10 +13,6 @@ data:
    moe_expert_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
-    build_config:
-      max_batch_size: 2
-      max_num_tokens: 8192
-      max_seq_len: 8192
    kv_cache_config:
      enable_block_reuse: true
      free_gpu_memory_fraction: 0.7
@@ -42,10 +38,6 @@ data:
    moe_tensor_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: false
-    build_config:
-      max_batch_size: 512
-      max_num_tokens: 1024
-      max_seq_len: 8192
    kv_cache_config:
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.95
@@ -127,9 +119,11 @@ spec:
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
+                --max-batch-size 2 \
+                --max-num-tokens 8192 \
+                --max-seq-len 8192 \
                --extra-engine-args "${ENGINE_ARGS}" \
-                --disaggregation-mode prefill \
-                --disaggregation-strategy prefill_first
+                --disaggregation-mode prefill
          volumeMounts:
            - name: prefill-config
              mountPath: /engine_configs
@@ -180,9 +174,11 @@ spec:
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
+                --max-batch-size 512 \
+                --max-num-tokens 1024 \
+                --max-seq-len 8192 \
                --extra-engine-args "${ENGINE_ARGS}" \
-                --disaggregation-mode decode \
-                --disaggregation-strategy prefill_first
+                --disaggregation-mode decode
          volumeMounts:
            - name: decode-config
              mountPath: /engine_configs