Update default max_num_batched_tokens for chunked prefill to 2048 (#10544)

02a43f82 · Michael Goin · GitHub · cfea9c04 · 02a43f82
Unverified Commit 02a43f82 authored Nov 23, 2024 by Michael Goin Committed by GitHub Nov 22, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

vllm/config.py vllm/config.py +3 -3

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1133,9 +1133,9 @@ class SchedulerConfig:
                    # max_num_batched_tokens.
                    self.max_num_batched_tokens = max(self.max_model_len, 2048)
                else:
-                    # It is the values that have the best balance between ITL
-                    # and TTFT on A100. Note it is not optimized for throughput.
-                    self.max_num_batched_tokens = 512
+                    # This value is chosen to have a balance between ITL
+                    # and TTFT. Note it is not optimized for throughput.
+                    self.max_num_batched_tokens = 2048
            else:
                # If max_model_len is too short, use 2048 as the default value
                # for higher throughput.