Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
077f0a2e
Unverified
Commit
077f0a2e
authored
Apr 22, 2024
by
Tao He
Committed by
GitHub
Apr 22, 2024
Browse files
[Frontend] Enable support for CPU backend in AsyncLLMEngine. (#3993)
Signed-off-by:
Tao He
<
sighingnow@gmail.com
>
parent
e73ed0f1
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
30 additions
and
2 deletions
+30
-2
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+5
-0
vllm/executor/cpu_executor.py
vllm/executor/cpu_executor.py
+25
-2
No files found.
vllm/engine/async_llm_engine.py
View file @
077f0a2e
...
...
@@ -343,6 +343,11 @@ class AsyncLLMEngine:
if
engine_config
.
device_config
.
device_type
==
"neuron"
:
from
vllm.executor.neuron_executor
import
NeuronExecutorAsync
executor_class
=
NeuronExecutorAsync
elif
engine_config
.
device_config
.
device_type
==
"cpu"
:
assert
not
engine_config
.
parallel_config
.
worker_use_ray
,
(
"Ray is not supported with the CPU backend."
)
from
vllm.executor.cpu_executor
import
CPUExecutorAsync
executor_class
=
CPUExecutorAsync
elif
engine_config
.
parallel_config
.
worker_use_ray
:
initialize_ray_cluster
(
engine_config
.
parallel_config
)
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutorAsync
...
...
vllm/executor/cpu_executor.py
View file @
077f0a2e
...
...
@@ -4,11 +4,12 @@ from typing import Dict, List, Set, Tuple
import
torch
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SchedulerConfig
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorAsyncBase
,
ExecutorBase
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
SamplerOutput
,
SequenceGroupMetadata
from
vllm.utils
import
get_distributed_init_method
,
get_ip
,
get_open_port
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
logger
=
init_logger
(
__name__
)
...
...
@@ -100,6 +101,28 @@ class CPUExecutor(ExecutorBase):
return
class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
    """Awaitable variant of ``CPUExecutor``.

    Combines the CPU executor with ``ExecutorAsyncBase`` and exposes
    coroutine versions of model execution and the health check.
    """

    async def execute_model_async(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        """Run one model step on the driver worker and await its result.

        All four arguments are forwarded, by keyword and unchanged, to
        ``self.driver_worker.execute_model``.

        Returns:
            Whatever the wrapped ``execute_model`` call produces.
        """
        # NOTE(review): make_async is imported from vllm.utils; presumably it
        # wraps the blocking call so it can be awaited without stalling the
        # event loop -- confirm against its definition.
        run_step = make_async(self.driver_worker.execute_model)
        return await run_step(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
        )

    async def check_health_async(self) -> None:
        """Health probe; performs no checks and always returns ``None``."""
        # CPUExecutor will always be healthy as long as
        # it's running.
        return None
def
_verify_and_get_model_config
(
config
:
ModelConfig
)
->
ModelConfig
:
if
config
.
dtype
==
torch
.
float16
:
logger
.
warning
(
"float16 is not supported on CPU, casting to bfloat16."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment