Unverified Commit 48733546 authored by Jacky's avatar Jacky Committed by GitHub
Browse files

docs: Benchmarking guide updates (#678)

parent a2709c62
...@@ -26,6 +26,15 @@ This guide provides detailed steps on benchmarking Large Language Models (LLMs) ...@@ -26,6 +26,15 @@ This guide provides detailed steps on benchmarking Large Language Models (LLMs)
H100 80GB x8 node(s) are required for benchmarking. H100 80GB x8 node(s) are required for benchmarking.
> [!NOTE]
> This guide was tested on node(s) with the following hardware configuration:
> * **GPUs**: 8xH100 80GB HBM3 (GPU Memory Bandwidth 3.2 TB/s)
> * **CPU**: 2x Intel Sapphire Rapids, Intel(R) Xeon(R) Platinum 8480CL E5, 112 cores (56 cores per CPU), 2.00 GHz (Base), 3.8 GHz (Max boost), PCIe Gen5
> * **NVLink**: NVLink 4th Generation, 900 GB/s (GPU to GPU NVLink bidirectional bandwidth), 18 Links per GPU
> * **InfiniBand**: 8x 400 Gbit/s (Compute Links), 2x 400 Gbit/s (Storage Links)
>
> Benchmarking with a different hardware configuration may yield suboptimal results.
1\. Build benchmarking image 1\. Build benchmarking image
```bash ```bash
./container/build.sh ./container/build.sh
...@@ -43,7 +52,7 @@ docker compose -f deploy/docker_compose.yml up -d ...@@ -43,7 +52,7 @@ docker compose -f deploy/docker_compose.yml up -d
## Disaggregated Single Node Benchmarking ## Disaggregated Single Node Benchmarking
*One H100 80GB x8 node is required for this setup.* One H100 80GB x8 node is required for this setup.
In the following setup we compare Dynamo disaggregated vLLM performance to In the following setup we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize
...@@ -72,12 +81,7 @@ Collect the performance numbers as shown on the [Collecting Performance Numbers] ...@@ -72,12 +81,7 @@ Collect the performance numbers as shown on the [Collecting Performance Numbers]
## Disaggregated Multi Node Benchmarking ## Disaggregated Multi Node Benchmarking
*Two H100 80GB x8 nodes are required for this setup.* Two H100 80GB x8 nodes are required for this setup.
> [!Note]
> Nodes used for benchmarking were part of a cluster connected via InfiniBand
> NDR with 8 connections for compute and 2 for storage. Both fabrics were on
> their own fat tree non-blocking topology.
In the following steps we compare Dynamo disaggregated vLLM performance to In the following steps we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize
......
...@@ -13,6 +13,16 @@ ...@@ -13,6 +13,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
Common:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend: Frontend:
# This model was chosen for its 70B size and FP8 precision, which the TP and # This model was chosen for its 70B size and FP8 precision, which the TP and
# DP configurations were tuned for its size, and its precision reduces model # DP configurations were tuned for its size, and its precision reduces model
...@@ -22,23 +32,17 @@ Frontend: ...@@ -22,23 +32,17 @@ Frontend:
port: 8000 port: 8000
Processor: Processor:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, router]
router: round-robin
# x1 process with 4 GPUs generating output tokens (the "decode" phase). # x1 process with 4 GPUs generating output tokens (the "decode" phase).
VllmWorker: VllmWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
# Enable prefill at different workers. # Enable prefill at different workers.
remote-prefill: true remote-prefill: true
# Disable local prefill so only disaggregated prefill is used. # Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false conditional-disagg: false
tensor-parallel-size: 4
gpu-memory-utilization: 0.95 gpu-memory-utilization: 0.95
disable-log-requests: true tensor-parallel-size: 4
ServiceArgs: ServiceArgs:
workers: 1 workers: 1
resources: resources:
...@@ -46,14 +50,8 @@ VllmWorker: ...@@ -46,14 +50,8 @@ VllmWorker:
# x4 processes each with 1 GPU handling the initial prefill (context embedding) phase. # x4 processes each with 1 GPU handling the initial prefill (context embedding) phase.
PrefillWorker: PrefillWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, gpu-memory-utilization, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
tensor-parallel-size: 1 tensor-parallel-size: 1
gpu-memory-utilization: 0.95
disable-log-requests: true
ServiceArgs: ServiceArgs:
workers: 4 workers: 4
resources: resources:
......
...@@ -13,15 +13,9 @@ ...@@ -13,15 +13,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
Frontend: Common:
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
block-size: 128 kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
max-model-len: 3500
# Routing policy determines how remote workers are selected for processing # Routing policy determines how remote workers are selected for processing
# prefill requests # prefill requests
# 1. random: randomly select workers for prefill requests # 1. random: randomly select workers for prefill requests
...@@ -31,39 +25,43 @@ Processor: ...@@ -31,39 +25,43 @@ Processor:
# 3. kv: finding prefill workers by KV cache is not beneficial when caching is # 3. kv: finding prefill workers by KV cache is not beneficial when caching is
# disabled on this setup # disabled on this setup
router: round-robin router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend:
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, block-size, max-model-len, router]
Router: Router:
model-name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model]
min-workers: 1 min-workers: 1
VllmWorker: VllmWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
# Enable prefill at different workers. # Enable prefill at different workers.
remote-prefill: true remote-prefill: true
# Disable local prefill so only disaggregated prefill is used. # Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false conditional-disagg: false
# The GPU memory utilization do not have to match between VllmWorker and PrefillWorker.
gpu-memory-utilization: 0.95
# TP size is doubled from single node setup # TP size is doubled from single node setup
tensor-parallel-size: 8 tensor-parallel-size: 8
gpu-memory-utilization: 0.95
disable-log-requests: true
router: round-robin
ServiceArgs: ServiceArgs:
workers: 1 workers: 1
resources: resources:
gpu: 8 gpu: 8
PrefillWorker: PrefillWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
tensor-parallel-size: 1
gpu-memory-utilization: 0.95 gpu-memory-utilization: 0.95
disable-log-requests: true tensor-parallel-size: 1
ServiceArgs: ServiceArgs:
# DP size is doubled from single node setup # DP size is doubled from single node setup
workers: 8 workers: 8
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment