Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc5ebbd1
Unverified
Commit
fc5ebbd1
authored
Aug 23, 2024
by
Kunshang Ji
Committed by
GitHub
Aug 22, 2024
Browse files
[Hardware][Intel GPU] refactor xpu_model_runner for tp (#7712)
parent
c01a6cb2
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
370 additions
and
650 deletions
+370
-650
vllm/executor/ray_xpu_executor.py
vllm/executor/ray_xpu_executor.py
+17
-366
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+350
-278
vllm/worker/xpu_worker.py
vllm/worker/xpu_worker.py
+3
-6
No files found.
vllm/executor/ray_xpu_executor.py
View file @
fc5ebbd1
import
asyncio
import
asyncio
import
os
from
typing
import
List
,
Optional
from
collections
import
defaultdict
from
itertools
import
islice
,
repeat
from
typing
import
(
TYPE_CHECKING
,
Any
,
Awaitable
,
Dict
,
List
,
Optional
,
Set
,
Tuple
,
Union
)
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
from
vllm.executor.ray_gpu_executor
import
RayGPUExecutor
,
RayGPUExecutorAsync
ModelConfig
,
ParallelConfig
,
PromptAdapterConfig
,
from
vllm.executor.xpu_executor
import
XPUExecutor
SchedulerConfig
,
SpeculativeConfig
)
from
vllm.executor.distributed_gpu_executor
import
(
# yapf: disable
DistributedGPUExecutor
,
DistributedGPUExecutorAsync
)
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.utils
import
get_vllm_instance_id
,
make_async
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
make_async
)
if
ray
is
not
None
:
from
ray.util.scheduling_strategies
import
PlacementGroupSchedulingStrategy
if
TYPE_CHECKING
:
from
ray.util.placement_group
import
PlacementGroup
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
USE_RAY_COMPILED_DAG
=
envs
.
VLLM_USE_RAY_COMPILED_DAG
class
RayXPUExecutor
(
DistributedGPUExecutor
):
uses_ray
:
bool
=
True
def
__init__
(
self
,
model_config
:
ModelConfig
,
cache_config
:
CacheConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
load_config
:
LoadConfig
,
lora_config
:
Optional
[
LoRAConfig
],
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
],
speculative_config
:
Optional
[
SpeculativeConfig
],
)
->
None
:
assert
device_config
.
device_type
==
"xpu"
assert
(
not
speculative_config
),
"Speculative decoding not yet supported for XPU backend"
self
.
model_config
=
model_config
self
.
cache_config
=
cache_config
self
.
load_config
=
load_config
self
.
lora_config
=
lora_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
prompt_adapter_config
=
prompt_adapter_config
placement_group
=
self
.
parallel_config
.
placement_group
# Disable Ray usage stats collection.
ray_usage
=
os
.
environ
.
get
(
"RAY_USAGE_STATS_ENABLED"
,
"0"
)
if
ray_usage
!=
"1"
:
os
.
environ
[
"RAY_USAGE_STATS_ENABLED"
]
=
"0"
# Create the parallel GPU workers.
self
.
_init_workers_ray
(
placement_group
)
self
.
forward_dag
=
None
if
USE_RAY_COMPILED_DAG
:
self
.
forward_dag
=
self
.
_compiled_ray_dag
(
enable_asyncio
=
False
)
# This is non-None when the execute model loop is running
# in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
self
.
parallel_worker_tasks
:
Optional
[
Union
[
Any
,
Awaitable
[
Any
]]]
=
None
# Updated by implementations that require additional args to be passed
# to the _run_workers execute_model call
self
.
extra_execute_model_run_workers_kwargs
:
Dict
[
str
,
Any
]
=
{}
def
_init_executor
(
self
)
->
None
:
pass
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
"""Determine the number of available KV blocks.
This invokes `determine_num_available_blocks` on each worker and takes
class
RayXPUExecutor
(
RayGPUExecutor
,
XPUExecutor
):
the min of the results, guaranteeing that the selected cache sizes are
compatible with all workers.
Returns:
- Tuple[num_gpu_blocks, num_cpu_blocks]
"""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks
=
self
.
_run_workers
(
"determine_num_available_blocks"
,
)
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# operators can be applied to all workers.
num_gpu_blocks
=
min
(
b
[
0
]
for
b
in
num_blocks
)
num_cpu_blocks
=
min
(
b
[
1
]
for
b
in
num_blocks
)
return
num_gpu_blocks
,
num_cpu_blocks
def
_get_worker_wrapper_args
(
self
)
->
Dict
[
str
,
Any
]:
return
dict
(
worker_module_name
=
"vllm.worker.xpu_worker"
,
worker_class_name
=
"XPUWorker"
,
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
)
def
_init_workers_ray
(
self
,
placement_group
:
"PlacementGroup"
,
**
ray_remote_kwargs
):
if
self
.
parallel_config
.
tensor_parallel_size
==
1
:
# For single GPU case, we use a ray worker with constrained memory.
num_gpus
=
self
.
cache_config
.
gpu_memory_utilization
else
:
# Otherwise, the ray workers are allocated with a full GPU.
num_gpus
=
1
# The driver dummy worker does not actually use any resources.
# It holds the resource for the driver worker.
self
.
driver_dummy_worker
:
Optional
[
RayWorkerWrapper
]
=
None
# The remaining workers are the actual ray actors.
self
.
workers
:
List
[
RayWorkerWrapper
]
=
[]
# Create the workers.
driver_ip
=
get_ip
()
worker_wrapper_kwargs
=
self
.
_get_worker_wrapper_args
()
for
bundle_id
,
bundle
in
enumerate
(
placement_group
.
bundle_specs
):
if
not
bundle
.
get
(
"GPU"
,
0
):
continue
scheduling_strategy
=
PlacementGroupSchedulingStrategy
(
placement_group
=
placement_group
,
placement_group_capture_child_tasks
=
True
,
placement_group_bundle_index
=
bundle_id
,
)
worker
=
ray
.
remote
(
num_cpus
=
0
,
num_gpus
=
num_gpus
,
scheduling_strategy
=
scheduling_strategy
,
**
ray_remote_kwargs
,
)(
RayWorkerWrapper
).
remote
(
**
worker_wrapper_kwargs
)
worker_ip
=
ray
.
get
(
worker
.
get_node_ip
.
remote
())
if
worker_ip
==
driver_ip
and
self
.
driver_dummy_worker
is
None
:
# If the worker is on the same node as the driver, we use it
# as the resource holder for the driver process.
self
.
driver_dummy_worker
=
worker
self
.
driver_worker
=
RayWorkerWrapper
(
**
worker_wrapper_kwargs
)
else
:
# Else, added to the list of workers.
self
.
workers
.
append
(
worker
)
if
self
.
driver_dummy_worker
is
None
:
raise
ValueError
(
"Ray does not allocate any GPUs on the driver node. Consider "
"adjusting the Ray placement group or running the driver on a "
"GPU node."
)
def
_get_env_vars_to_be_updated
(
self
):
# Get the set of GPU IDs used on each node.
# Get the set of GPU IDs used on each node.
worker_node_and_gpu_ids
=
self
.
_run_workers
(
"get_node_and_gpu_ids"
,
worker_node_and_gpu_ids
=
self
.
_run_workers
(
"get_node_and_gpu_ids"
,
use_dummy_driver
=
True
)
use_dummy_driver
=
True
)
node_workers
=
defaultdict
(
list
)
VLLM_INSTANCE_ID
=
get_vllm_instance_id
()
node_gpus
=
defaultdict
(
list
)
for
i
,
(
node_id
,
gpu_ids
)
in
enumerate
(
worker_node_and_gpu_ids
):
node_workers
[
node_id
].
append
(
i
)
node_gpus
[
node_id
].
extend
(
gpu_ids
)
for
node_id
,
gpu_ids
in
node_gpus
.
items
():
node_gpus
[
node_id
]
=
sorted
(
gpu_ids
)
# TODO: add env var for xpu
distributed_init_method
=
get_distributed_init_method
(
driver_ip
,
get_open_port
())
def
collect_arg_helper_func
(
**
kwargs
):
# avoid writing `{"name": value}` manually
return
kwargs
init_worker_all_kwargs
=
[]
# Initialize the actual workers inside worker wrapper.
for
rank
,
(
node_id
,
_
)
in
enumerate
(
worker_node_and_gpu_ids
,
):
local_rank
=
node_workers
[
node_id
].
index
(
rank
)
init_worker_all_kwargs
.
append
(
collect_arg_helper_func
(
model_config
=
self
.
model_config
,
parallel_config
=
self
.
parallel_config
,
scheduler_config
=
self
.
scheduler_config
,
device_config
=
self
.
device_config
,
cache_config
=
self
.
cache_config
,
load_config
=
self
.
load_config
,
local_rank
=
local_rank
,
rank
=
rank
,
distributed_init_method
=
distributed_init_method
,
lora_config
=
self
.
lora_config
,
is_driver_worker
=
rank
==
0
,
))
self
.
_run_workers
(
"init_worker"
,
all_kwargs
=
init_worker_all_kwargs
)
self
.
_run_workers
(
"init_device"
)
# Set environment variables for the driver and workers.
self
.
_run_workers
(
all_args_to_update_environment_variables
=
[({
"load_model"
,
"VLLM_INSTANCE_ID"
:
max_concurrent_workers
=
self
.
parallel_config
.
VLLM_INSTANCE_ID
,
max_parallel_loading_workers
,
"VLLM_TRACE_FUNCTION"
:
)
str
(
envs
.
VLLM_TRACE_FUNCTION
),
},
)
for
(
_
,
_
)
in
worker_node_and_gpu_ids
]
return
all_args_to_update_environment_variables
def
initialize_cache
(
self
,
num_gpu_blocks
:
int
,
num_cpu_blocks
:
int
)
->
None
:
"""Initialize the KV cache in all workers.
"""
# NOTE: We log here to avoid multiple logs when number of workers is
class
RayXPUExecutorAsync
(
RayXPUExecutor
,
RayGPUExecutorAsync
):
# greater than one. We could log in the engine, but not all executors
# have GPUs.
logger
.
info
(
"# GPU blocks: %d, "
"# CPU blocks: %d"
,
num_gpu_blocks
,
num_cpu_blocks
)
self
.
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
self
.
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
self
.
_run_workers
(
"initialize_cache"
,
num_gpu_blocks
=
num_gpu_blocks
,
num_cpu_blocks
=
num_cpu_blocks
)
def
_driver_execute_model
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
List
[
SamplerOutput
]:
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
"""
return
self
.
driver_worker
.
execute_method
(
"execute_model"
,
execute_model_req
)
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
assert
lora_request
.
lora_int_id
>
0
,
"lora_id must be greater than 0."
return
self
.
_run_workers
(
"add_lora"
,
lora_request
=
lora_request
,
)
def
remove_lora
(
self
,
lora_id
:
int
)
->
bool
:
assert
lora_id
>
0
,
"lora_id must be greater than 0."
return
self
.
_run_workers
(
"remove_lora"
,
lora_id
=
lora_id
,
)
def
list_loras
(
self
)
->
Set
[
int
]:
return
self
.
_run_workers
(
"list_loras"
)
def
_run_workers
(
self
,
method
:
str
,
*
args
,
async_run_remote_workers_only
:
bool
=
False
,
all_args
:
Optional
[
List
[
Tuple
[
Any
,
...]]]
=
None
,
all_kwargs
:
Optional
[
List
[
Dict
[
str
,
Any
]]]
=
None
,
use_dummy_driver
:
bool
=
False
,
max_concurrent_workers
:
Optional
[
int
]
=
None
,
**
kwargs
,
)
->
Any
:
"""Runs the given method on all workers. Can be used in the following
ways:
- args/kwargs: All workers share the same args/kwargs
- args/kwargs and driver_args/driver_kwargs: Driver worker has
different args
- all_args/all_kwargs: args/kwargs for each worker are specified
individually
"""
if
max_concurrent_workers
:
raise
NotImplementedError
(
"max_concurrent_workers is not supported yet."
)
count
=
len
(
self
.
workers
)
all_worker_args
=
repeat
(
args
,
count
)
if
all_args
is
None
\
else
islice
(
all_args
,
1
,
None
)
all_worker_kwargs
=
repeat
(
kwargs
,
count
)
if
all_kwargs
is
None
\
else
islice
(
all_kwargs
,
1
,
None
)
# Start the ray workers first.
ray_worker_outputs
=
[
worker
.
execute_method
.
remote
(
method
,
*
worker_args
,
**
worker_kwargs
)
for
(
worker
,
worker_args
,
worker_kwargs
)
in
zip
(
self
.
workers
,
all_worker_args
,
all_worker_kwargs
)
]
if
async_run_remote_workers_only
:
# Just return futures
return
ray_worker_outputs
driver_worker_output
=
[]
driver_args
=
args
if
all_args
is
None
else
all_args
[
0
]
driver_kwargs
=
kwargs
if
all_kwargs
is
None
else
all_kwargs
[
0
]
# Start the driver worker after all the ray workers.
if
not
use_dummy_driver
:
driver_worker_output
=
self
.
driver_worker
.
execute_method
(
method
,
*
driver_args
,
**
driver_kwargs
)
else
:
assert
self
.
driver_dummy_worker
is
not
None
driver_worker_output
=
ray
.
get
(
self
.
driver_dummy_worker
.
execute_method
.
remote
(
method
,
*
driver_args
,
**
driver_kwargs
))
# Get the results of the ray workers.
if
self
.
workers
:
ray_worker_outputs
=
ray
.
get
(
ray_worker_outputs
)
return
driver_worker_output
+
ray_worker_outputs
def
_wait_for_tasks_completion
(
self
,
parallel_worker_tasks
:
Any
)
->
None
:
"""Wait for futures returned from _run_workers() with
async_run_remote_workers_only to complete."""
ray
.
get
(
parallel_worker_tasks
)
def
_compiled_ray_dag
(
self
,
enable_asyncio
:
bool
):
import
pkg_resources
from
packaging
import
version
required_version
=
version
.
parse
(
"2.32"
)
current_version
=
version
.
parse
(
pkg_resources
.
get_distribution
(
"ray"
).
version
)
if
current_version
<
required_version
:
raise
ValueError
(
f
"Ray version
{
required_version
}
or greater is "
f
"required, but found
{
current_version
}
"
)
from
ray.dag
import
InputNode
,
MultiOutputNode
assert
self
.
parallel_config
.
use_ray
# Right now, compiled DAG requires at least 1 arg. We send
# a dummy value for now. It will be fixed soon.
with
InputNode
()
as
input_data
:
forward_dag
=
MultiOutputNode
([
worker
.
execute_model_compiled_dag_remote
.
bind
(
# type: ignore[attr-defined]
input_data
)
for
worker
in
self
.
workers
])
return
forward_dag
.
experimental_compile
(
enable_asyncio
=
enable_asyncio
)
def
check_health
(
self
)
->
None
:
"""Raises an error if engine is unhealthy."""
self
.
_check_if_any_actor_is_dead
()
def
_check_if_any_actor_is_dead
(
self
):
if
not
self
.
workers
:
return
dead_actors
=
[]
for
actor
in
self
.
workers
:
actor_state
=
ray
.
state
.
actors
(
actor
.
_ray_actor_id
.
hex
())
# pylint: disable=protected-access
if
actor_state
[
"State"
]
==
"DEAD"
:
dead_actors
.
append
(
actor
)
if
dead_actors
:
raise
RuntimeError
(
"At least one Worker is dead. "
f
"Dead Workers:
{
dead_actors
}
. "
)
class
RayXPUExecutorAsync
(
RayXPUExecutor
,
DistributedGPUExecutorAsync
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
driver_exec_method
=
make_async
(
self
.
driver_worker
.
execute_method
)
self
.
driver_exec_method
=
make_async
(
self
.
driver_worker
.
execute_method
)
self
.
pp_locks
:
Optional
[
List
[
asyncio
.
Lock
]]
=
None
async
def
_driver_execute_model_async
(
self
,
execute_model_req
:
Optional
[
ExecuteModelRequest
]
=
None
)
->
List
[
SamplerOutput
]:
return
await
self
.
driver_exec_method
(
"execute_model"
,
execute_model_req
)
async
def
_start_worker_execution_loop
(
self
):
coros
=
[
worker
.
execute_method
.
remote
(
"start_worker_execution_loop"
)
for
worker
in
self
.
workers
]
return
await
asyncio
.
gather
(
*
coros
)
vllm/worker/xpu_model_runner.py
View file @
fc5ebbd1
This diff is collapsed.
Click to expand it.
vllm/worker/xpu_worker.py
View file @
fc5ebbd1
...
@@ -9,8 +9,8 @@ import torch
...
@@ -9,8 +9,8 @@ import torch
import
torch.distributed
import
torch.distributed
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
MultiModalConfig
,
ObservabilityConfig
,
ModelConfig
,
ObservabilityConfig
,
ParallelConfig
,
ParallelConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
PromptAdapterConfig
,
SchedulerConfig
,
SpeculativeConfig
)
SpeculativeConfig
)
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
init_distributed_environment
)
...
@@ -46,7 +46,6 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
...
@@ -46,7 +46,6 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
rank
:
int
,
rank
:
int
,
distributed_init_method
:
str
,
distributed_init_method
:
str
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
multimodal_config
:
Optional
[
MultiModalConfig
]
=
None
,
speculative_config
:
Optional
[
SpeculativeConfig
]
=
None
,
speculative_config
:
Optional
[
SpeculativeConfig
]
=
None
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
prompt_adapter_config
:
Optional
[
PromptAdapterConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
is_driver_worker
:
bool
=
False
,
...
@@ -73,8 +72,6 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
...
@@ -73,8 +72,6 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
assert
rank
%
parallel_config
.
tensor_parallel_size
==
0
,
\
assert
rank
%
parallel_config
.
tensor_parallel_size
==
0
,
\
"Driver worker should be rank 0 of tensor parallel group."
"Driver worker should be rank 0 of tensor parallel group."
self
.
multimodal_config
=
multimodal_config
self
.
model_runner
=
XPUModelRunner
(
# type: ignore
self
.
model_runner
=
XPUModelRunner
(
# type: ignore
model_config
,
model_config
,
parallel_config
,
parallel_config
,
...
@@ -85,7 +82,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
...
@@ -85,7 +82,7 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker):
lora_config
=
self
.
lora_config
,
lora_config
=
self
.
lora_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
is_driver_worker
=
is_driver_worker
,
is_driver_worker
=
is_driver_worker
,
multimodal_config
=
multimodal
_config
,
observability_config
=
self
.
observability
_config
,
)
)
# Uninitialized cache engine. Will be initialized by
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
# initialize_cache.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment