Unverified commit b43a131c, authored by Yunzhou Liu and committed by GitHub
Browse files

docs: update Qwen3-235B-A22B-FP8 recipes (#5254)


Signed-off-by: Elnifio <elnifio0519@gmail.com>
parent 84b5e9b5
...@@ -13,10 +13,6 @@ data: ...@@ -13,10 +13,6 @@ data:
moe_tensor_parallel_size: 1 moe_tensor_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
enable_chunked_prefill: true enable_chunked_prefill: true
build_config:
max_batch_size: 128
max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config: kv_cache_config:
enable_block_reuse: true enable_block_reuse: true
free_gpu_memory_fraction: 0.8 free_gpu_memory_fraction: 0.8
...@@ -91,6 +87,9 @@ spec: ...@@ -91,6 +87,9 @@ spec:
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "${MODEL_PATH}" \ --model-path "${MODEL_PATH}" \
--served-model-name "Qwen/Qwen3-235B-A22B-FP8" \ --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
--max-batch-size 128 \
--max-num-tokens 8192 \
--max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}" --extra-engine-args "${ENGINE_ARGS}"
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
......
...@@ -13,10 +13,6 @@ data: ...@@ -13,10 +13,6 @@ data:
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
enable_chunked_prefill: false enable_chunked_prefill: false
build_config:
max_batch_size: 2
max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config: kv_cache_config:
enable_block_reuse: true enable_block_reuse: true
free_gpu_memory_fraction: 0.7 free_gpu_memory_fraction: 0.7
...@@ -42,10 +38,6 @@ data: ...@@ -42,10 +38,6 @@ data:
moe_tensor_parallel_size: 1 moe_tensor_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
enable_chunked_prefill: false enable_chunked_prefill: false
build_config:
max_batch_size: 512
max_num_tokens: 1024
max_seq_len: 8192
kv_cache_config: kv_cache_config:
enable_block_reuse: false enable_block_reuse: false
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
...@@ -127,9 +119,11 @@ spec: ...@@ -127,9 +119,11 @@ spec:
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "${MODEL_PATH}" \ --model-path "${MODEL_PATH}" \
--served-model-name "Qwen/Qwen3-235B-A22B-FP8" \ --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
--max-batch-size 2 \
--max-num-tokens 8192 \
--max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}" \ --extra-engine-args "${ENGINE_ARGS}" \
--disaggregation-mode prefill \ --disaggregation-mode prefill
--disaggregation-strategy prefill_first
volumeMounts: volumeMounts:
- name: prefill-config - name: prefill-config
mountPath: /engine_configs mountPath: /engine_configs
...@@ -180,9 +174,11 @@ spec: ...@@ -180,9 +174,11 @@ spec:
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
--model-path "${MODEL_PATH}" \ --model-path "${MODEL_PATH}" \
--served-model-name "Qwen/Qwen3-235B-A22B-FP8" \ --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
--max-batch-size 512 \
--max-num-tokens 1024 \
--max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}" \ --extra-engine-args "${ENGINE_ARGS}" \
--disaggregation-mode decode \ --disaggregation-mode decode
--disaggregation-strategy prefill_first
volumeMounts: volumeMounts:
- name: decode-config - name: decode-config
mountPath: /engine_configs mountPath: /engine_configs
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment