Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e88db68c
Unverified
Commit
e88db68c
authored
Dec 17, 2024
by
wangxiyuan
Committed by
GitHub
Dec 16, 2024
Browse files
[Platform] platform agnostic for EngineArgs initialization (#11225)
Signed-off-by:
wangxiyuan
<
wangxiyuan1007@gmail.com
>
parent
59c9b6eb
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
37 additions
and
6 deletions
+37
-6
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+2
-6
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+3
-0
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+4
-0
vllm/platforms/hpu.py
vllm/platforms/hpu.py
+6
-0
vllm/platforms/neuron.py
vllm/platforms/neuron.py
+6
-0
vllm/platforms/openvino.py
vllm/platforms/openvino.py
+3
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+4
-0
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+5
-0
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+4
-0
No files found.
vllm/engine/arg_utils.py
View file @
e88db68c
...
...
@@ -112,9 +112,7 @@ class EngineArgs:
pipeline_parallel_size
:
int
=
1
tensor_parallel_size
:
int
=
1
max_parallel_loading_workers
:
Optional
[
int
]
=
None
# NOTE(kzawora): default block size for Gaudi should be 128
# smaller sizes still work, but very inefficiently
block_size
:
int
=
16
if
not
current_platform
.
is_hpu
()
else
128
block_size
:
Optional
[
int
]
=
None
enable_prefix_caching
:
Optional
[
bool
]
=
None
disable_sliding_window
:
bool
=
False
use_v2_block_manager
:
bool
=
True
...
...
@@ -1036,9 +1034,7 @@ class EngineArgs:
self
.
enable_prefix_caching
=
False
cache_config
=
CacheConfig
(
# neuron needs block_size = max_model_len
block_size
=
self
.
block_size
if
self
.
device
!=
"neuron"
else
(
self
.
max_model_len
if
self
.
max_model_len
is
not
None
else
0
),
block_size
=
self
.
block_size
,
gpu_memory_utilization
=
self
.
gpu_memory_utilization
,
swap_space
=
self
.
swap_space
,
cache_dtype
=
self
.
kv_cache_dtype
,
...
...
vllm/platforms/cpu.py
View file @
e88db68c
...
...
@@ -60,6 +60,9 @@ class CpuPlatform(Platform):
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
kv_cache_space
=
envs
.
VLLM_CPU_KVCACHE_SPACE
if
kv_cache_space
>=
0
:
...
...
vllm/platforms/cuda.py
View file @
e88db68c
...
...
@@ -137,6 +137,10 @@ class CudaPlatformBase(Platform):
else
:
parallel_config
.
worker_cls
=
"vllm.worker.worker.Worker"
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
...
...
vllm/platforms/hpu.py
View file @
e88db68c
...
...
@@ -48,6 +48,12 @@ class HpuPlatform(Platform):
if
parallel_config
.
worker_cls
==
"auto"
:
parallel_config
.
worker_cls
=
"vllm.worker.hpu_worker.HPUWorker"
# NOTE(kzawora): default block size for Gaudi should be 128
# smaller sizes still work, but very inefficiently
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
128
@
classmethod
def
is_pin_memory_available
(
cls
):
logger
.
warning
(
"Pin memory is not supported on HPU."
)
...
...
vllm/platforms/neuron.py
View file @
e88db68c
...
...
@@ -33,6 +33,12 @@ class NeuronPlatform(Platform):
parallel_config
.
worker_cls
=
\
"vllm.worker.neuron_worker.NeuronWorker"
cache_config
=
vllm_config
.
cache_config
if
cache_config
:
# neuron needs block_size = max_model_len
vllm_config
.
cache_config
.
block_size
=
\
vllm_config
.
model_config
.
max_model_len
@
classmethod
def
is_pin_memory_available
(
cls
)
->
bool
:
logger
.
warning
(
"Pin memory is not supported on Neuron."
)
...
...
vllm/platforms/openvino.py
View file @
e88db68c
...
...
@@ -87,6 +87,9 @@ class OpenVinoPlatform(Platform):
# check and update cache config
ov_core
=
ov
.
Core
()
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
if
envs
.
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION
==
"u8"
:
if
not
OpenVinoPlatform
.
is_openvino_cpu
():
logger
.
info
(
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
...
...
vllm/platforms/rocm.py
View file @
e88db68c
...
...
@@ -84,6 +84,10 @@ class RocmPlatform(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
parallel_config
=
vllm_config
.
parallel_config
scheduler_config
=
vllm_config
.
scheduler_config
if
parallel_config
.
worker_cls
==
"auto"
:
...
...
vllm/platforms/tpu.py
View file @
e88db68c
...
...
@@ -46,6 +46,11 @@ class TpuPlatform(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
from
vllm.config
import
CompilationLevel
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
compilation_config
=
vllm_config
.
compilation_config
if
compilation_config
.
level
==
CompilationLevel
.
NO_COMPILATION
:
# TPU does not support NO_COMPILATION
...
...
vllm/platforms/xpu.py
View file @
e88db68c
...
...
@@ -51,6 +51,10 @@ class XPUPlatform(Platform):
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
# check and update model config
model_config
=
vllm_config
.
model_config
if
model_config
.
dtype
==
torch
.
bfloat16
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment