Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
46e678bc
Unverified
Commit
46e678bc
authored
Apr 23, 2025
by
Woosuk Kwon
Committed by
GitHub
Apr 23, 2025
Browse files
[Minor] Use larger batch sizes for A100/B100/B200/MI300x (#17073)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
6b2427f9
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
5 additions
and
5 deletions
+5
-5
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+5
-5
No files found.
vllm/engine/arg_utils.py
View file @
46e678bc
...
@@ -35,7 +35,7 @@ from vllm.reasoning import ReasoningParserManager
...
@@ -35,7 +35,7 @@ from vllm.reasoning import ReasoningParserManager
from
vllm.test_utils
import
MODEL_WEIGHTS_S3_BUCKET
,
MODELS_ON_S3
from
vllm.test_utils
import
MODEL_WEIGHTS_S3_BUCKET
,
MODELS_ON_S3
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
,
is_in_ray_actor
from
vllm.utils
import
FlexibleArgumentParser
,
GiB_bytes
,
is_in_ray_actor
# yapf: enable
# yapf: enable
...
@@ -1625,13 +1625,13 @@ class EngineArgs:
...
@@ -1625,13 +1625,13 @@ class EngineArgs:
# values for non-H100/H200 GPUs.
# values for non-H100/H200 GPUs.
try
:
try
:
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
device_name = current_platform.get_device_name().lower()
device_memory = current_platform.get_device_total_memory()
except
Exception
:
except
Exception
:
# This is only used to set default_max_num_batched_tokens
# This is only used to set default_max_num_batched_tokens
device_name = "no-device"
device_memory = 0
if "h100" in device_name or "h200" in device_name:
if device_memory >= 70 * GiB_bytes:
# For H100 and H200, we use larger default values.
# For GPUs like H100 and MI300x, use larger default values.
default_max_num_batched_tokens
=
{
default_max_num_batched_tokens
=
{
UsageContext
.
LLM_CLASS
:
16384
,
UsageContext
.
LLM_CLASS
:
16384
,
UsageContext
.
OPENAI_API_SERVER
:
8192
,
UsageContext
.
OPENAI_API_SERVER
:
8192
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment