Unverified Commit 48733546 authored by Jacky's avatar Jacky Committed by GitHub
Browse files

docs: Benchmarking guide updates (#678)

parent a2709c62
...@@ -26,6 +26,15 @@ This guide provides detailed steps on benchmarking Large Language Models (LLMs) ...@@ -26,6 +26,15 @@ This guide provides detailed steps on benchmarking Large Language Models (LLMs)
H100 80GB x8 node(s) are required for benchmarking. H100 80GB x8 node(s) are required for benchmarking.
> [!NOTE]
> This guide was tested on node(s) with the following hardware configuration:
> * **GPUs**: 8xH100 80GB HBM3 (GPU Memory Bandwidth 3.2 TB/s)
> * **CPU**: 2x Intel Sapphire Rapids, Intel(R) Xeon(R) Platinum 8480CL E5, 112 cores (56 cores per CPU), 2.00 GHz (Base), 3.8 GHz (Max boost), PCIe Gen5
> * **NVLink**: NVLink 4th Generation, 900 GB/s (GPU to GPU NVLink bidirectional bandwidth), 18 Links per GPU
> * **InfiniBand**: 8x 400 Gbit/s (Compute Links), 2x 400 Gbit/s (Storage Links)
>
> Benchmarking with a different hardware configuration may yield suboptimal results.
1\. Build benchmarking image 1\. Build benchmarking image
```bash ```bash
./container/build.sh ./container/build.sh
...@@ -43,7 +52,7 @@ docker compose -f deploy/docker_compose.yml up -d ...@@ -43,7 +52,7 @@ docker compose -f deploy/docker_compose.yml up -d
## Disaggregated Single Node Benchmarking ## Disaggregated Single Node Benchmarking
*One H100 80GB x8 node is required for this setup.* One H100 80GB x8 node is required for this setup.
In the following setup we compare Dynamo disaggregated vLLM performance to In the following setup we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on a single node. These were chosen to optimize
...@@ -72,12 +81,7 @@ Collect the performance numbers as shown on the [Collecting Performance Numbers] ...@@ -72,12 +81,7 @@ Collect the performance numbers as shown on the [Collecting Performance Numbers]
## Disaggregated Multi Node Benchmarking ## Disaggregated Multi Node Benchmarking
*Two H100 80GB x8 nodes are required for this setup.* Two H100 80GB x8 nodes are required for this setup.
> [!Note]
> Nodes used for benchmarking were part of a cluster connected via InfiniBand
> NDR with 8 connections for compute and 2 for storage. Both fabrics were on
> their own fat tree non-blocking topology.
In the following steps we compare Dynamo disaggregated vLLM performance to In the following steps we compare Dynamo disaggregated vLLM performance to
[native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize [native vLLM Aggregated Baseline](#vllm-aggregated-baseline-benchmarking) on two nodes. These were chosen to optimize
......
...@@ -13,6 +13,16 @@ ...@@ -13,6 +13,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
Common:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend: Frontend:
# This model was chosen for its 70B size and FP8 precision, which the TP and # This model was chosen for its 70B size and FP8 precision, which the TP and
# DP configurations were tuned for its size, and its precision reduces model # DP configurations were tuned for its size, and its precision reduces model
...@@ -22,23 +32,17 @@ Frontend: ...@@ -22,23 +32,17 @@ Frontend:
port: 8000 port: 8000
Processor: Processor:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, router]
router: round-robin
# x1 process with 4 GPUs generating output tokens (the "decode" phase). # x1 process with 4 GPUs generating output tokens (the "decode" phase).
VllmWorker: VllmWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
# Enable prefill at different workers. # Enable prefill at different workers.
remote-prefill: true remote-prefill: true
# Disable local prefill so only disaggregated prefill is used. # Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false conditional-disagg: false
tensor-parallel-size: 4
gpu-memory-utilization: 0.95 gpu-memory-utilization: 0.95
disable-log-requests: true tensor-parallel-size: 4
ServiceArgs: ServiceArgs:
workers: 1 workers: 1
resources: resources:
...@@ -46,14 +50,8 @@ VllmWorker: ...@@ -46,14 +50,8 @@ VllmWorker:
# x4 processes each with 1 GPU handling the initial prefill (context embedding) phase. # x4 processes each with 1 GPU handling the initial prefill (context embedding) phase.
PrefillWorker: PrefillWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, gpu-memory-utilization, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
tensor-parallel-size: 1 tensor-parallel-size: 1
gpu-memory-utilization: 0.95
disable-log-requests: true
ServiceArgs: ServiceArgs:
workers: 4 workers: 4
resources: resources:
......
...@@ -13,15 +13,9 @@ ...@@ -13,15 +13,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
Frontend: Common:
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
block-size: 128 kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
max-model-len: 3500
# Routing policy determines how remote workers are selected for processing # Routing policy determines how remote workers are selected for processing
# prefill requests # prefill requests
# 1. random: randomly select workers for prefill requests # 1. random: randomly select workers for prefill requests
...@@ -31,39 +25,43 @@ Processor: ...@@ -31,39 +25,43 @@ Processor:
# 3. kv: finding prefill workers by KV cache is not beneficial when caching is # 3. kv: finding prefill workers by KV cache is not beneficial when caching is
# disabled on this setup # disabled on this setup
router: round-robin router: round-robin
# Number of tokens in a batch for more efficient chunked transfers to GPUs.
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
disable-log-requests: true
Frontend:
served_model_name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, block-size, max-model-len, router]
Router: Router:
model-name: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model]
min-workers: 1 min-workers: 1
VllmWorker: VllmWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, router, block-size, max-model-len, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
# Enable prefill at different workers. # Enable prefill at different workers.
remote-prefill: true remote-prefill: true
# Disable local prefill so only disaggregated prefill is used. # Disable local prefill so only disaggregated prefill is used.
conditional-disagg: false conditional-disagg: false
# The GPU memory utilization do not have to match between VllmWorker and PrefillWorker.
gpu-memory-utilization: 0.95
# TP size is doubled from single node setup # TP size is doubled from single node setup
tensor-parallel-size: 8 tensor-parallel-size: 8
gpu-memory-utilization: 0.95
disable-log-requests: true
router: round-robin
ServiceArgs: ServiceArgs:
workers: 1 workers: 1
resources: resources:
gpu: 8 gpu: 8
PrefillWorker: PrefillWorker:
model: neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic common-configs: [model, kv-transfer-config, block-size, max-model-len, max-num-batched-tokens, disable-log-requests]
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 128
max-model-len: 3500
max-num-batched-tokens: 3500
tensor-parallel-size: 1
gpu-memory-utilization: 0.95 gpu-memory-utilization: 0.95
disable-log-requests: true tensor-parallel-size: 1
ServiceArgs: ServiceArgs:
# DP size is doubled from single node setup # DP size is doubled from single node setup
workers: 8 workers: 8
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment