Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d345f409
Unverified
Commit
d345f409
authored
Nov 23, 2024
by
Zhonghua Deng
Committed by
GitHub
Nov 22, 2024
Browse files
[V1] EngineCore supports profiling (#10564)
Signed-off-by:
Abatom
<
abzhonghua@gmail.com
>
parent
28598f39
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
68 additions
and
9 deletions
+68
-9
vllm/v1/engine/__init__.py
vllm/v1/engine/__init__.py
+6
-0
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+2
-2
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+12
-2
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+23
-5
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+25
-0
No files found.
vllm/v1/engine/__init__.py
View file @
d345f409
...
...
@@ -68,6 +68,11 @@ class EngineCoreOutputs(msgspec.Struct,
outputs
:
List
[
EngineCoreOutput
]
@dataclass
class EngineCoreProfile:
    """Request payload that carries a single profiling on/off flag."""

    # True when the sender passed is_start=True, False otherwise.
    is_start: bool
class
EngineCoreRequestType
(
enum
.
Enum
):
"""
Request types defined as hex byte strings, so it can be sent over sockets
...
...
@@ -75,3 +80,4 @@ class EngineCoreRequestType(enum.Enum):
"""
ADD
=
b
'
\x00
'
ABORT
=
b
'
\x01
'
PROFILE
=
b
'
\x02
'
vllm/v1/engine/async_llm.py
View file @
d345f409
...
...
@@ -346,10 +346,10 @@ class AsyncLLM(EngineClient):
logger
.
debug
(
"Called check_health."
)
async def start_profile(self) -> None:
    """Forward a "start profiling" request to the engine core.

    Awaits ``self.engine_core.profile(True)``. The stale
    ``raise ValueError("Not supported on V1 yet.")`` placeholder that used to
    precede this call is removed, because it made the call unreachable.
    """
    await self.engine_core.profile(True)
async def stop_profile(self) -> None:
    """Forward a "stop profiling" request to the engine core.

    Awaits ``self.engine_core.profile(False)``. The stale
    ``raise ValueError("Not supported on V1 yet.")`` placeholder that used to
    precede this call is removed, because it made the call unreachable.
    """
    await self.engine_core.profile(False)
@
property
def
is_running
(
self
)
->
bool
:
...
...
vllm/v1/engine/core.py
View file @
d345f409
import
multiprocessing
import
pickle
import
queue
import
threading
import
time
...
...
@@ -16,7 +17,8 @@ from vllm.logger import init_logger
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.v1.core.scheduler
import
Scheduler
from
vllm.v1.engine
import
(
EngineCoreOutput
,
EngineCoreOutputs
,
EngineCoreRequest
,
EngineCoreRequestType
)
EngineCoreProfile
,
EngineCoreRequest
,
EngineCoreRequestType
)
from
vllm.v1.engine.mm_input_mapper
import
MMInputMapper
from
vllm.v1.executor.gpu_executor
import
GPUExecutor
from
vllm.v1.request
import
Request
,
RequestStatus
...
...
@@ -126,6 +128,9 @@ class EngineCore:
scheduler_output
,
output
)
return
engine_core_outputs
def profile(self, is_start=True):
    """Pass the profiling flag on to the executor's worker.

    Args:
        is_start: Forwarded unchanged to ``worker.profile``; defaults to True.
    """
    worker = self.model_executor.worker
    worker.profile(is_start)
class
EngineCoreProc
(
EngineCore
):
"""ZMQ-wrapper for running EngineCore in background process."""
...
...
@@ -312,11 +317,14 @@ class EngineCoreProc(EngineCore):
self
.
_last_logging_time
=
now
def
_handle_client_request
(
self
,
request
:
Union
[
EngineCoreRequest
,
List
[
str
]])
->
None
:
self
,
request
:
Union
[
EngineCoreRequest
,
EngineCoreProfile
,
List
[
str
]])
->
None
:
"""Handle EngineCoreRequest or EngineCoreABORT from Client."""
if
isinstance
(
request
,
EngineCoreRequest
):
self
.
add_request
(
request
)
elif
isinstance
(
request
,
EngineCoreProfile
):
self
.
model_executor
.
worker
.
profile
(
request
.
is_start
)
else
:
# TODO: make an EngineCoreAbort wrapper
assert
isinstance
(
request
,
list
)
...
...
@@ -341,6 +349,8 @@ class EngineCoreProc(EngineCore):
request
=
decoder_add_req
.
decode
(
request_data
)
elif
request_type
==
EngineCoreRequestType
.
ABORT
.
value
:
request
=
decoder_abort_req
.
decode
(
request_data
)
elif
request_type
==
EngineCoreRequestType
.
PROFILE
.
value
:
request
=
pickle
.
loads
(
request_data
)
else
:
raise
ValueError
(
f
"Unknown RequestType:
{
request_type
}
"
)
...
...
vllm/v1/engine/core_client.py
View file @
d345f409
...
...
@@ -9,7 +9,8 @@ import zmq.asyncio
from
vllm.logger
import
init_logger
from
vllm.utils
import
get_open_zmq_ipc_path
from
vllm.v1.engine
import
(
EngineCoreOutput
,
EngineCoreOutputs
,
EngineCoreRequest
,
EngineCoreRequestType
)
EngineCoreProfile
,
EngineCoreRequest
,
EngineCoreRequestType
)
from
vllm.v1.engine.core
import
EngineCore
,
EngineCoreProc
from
vllm.v1.serial_utils
import
PickleEncoder
...
...
@@ -58,6 +59,9 @@ class EngineCoreClient:
def add_request(self, request: EngineCoreRequest) -> None:
    """Interface stub: always raises ``NotImplementedError``."""
    raise NotImplementedError()
async def profile(self, is_start=True) -> None:
    """Interface stub: awaiting it always raises ``NotImplementedError``."""
    raise NotImplementedError()
def abort_requests(self, request_ids: List[str]) -> None:
    """Interface stub: always raises ``NotImplementedError``."""
    raise NotImplementedError()
...
...
@@ -95,6 +99,9 @@ class InprocClient(EngineCoreClient):
def abort_requests(self, request_ids: List[str]) -> None:
    """Hand the given request ids to ``self.engine_core.abort_requests``."""
    core = self.engine_core
    core.abort_requests(request_ids)
async def profile(self, is_start=True) -> None:
    """Call ``self.engine_core.profile`` directly with the given flag.

    The engine-core call itself is a plain (non-awaited) call.
    """
    core = self.engine_core
    core.profile(is_start)
class
MPClient
(
EngineCoreClient
):
"""
...
...
@@ -177,8 +184,10 @@ class SyncMPClient(MPClient):
engine_core_outputs
=
self
.
decoder
.
decode
(
frame
.
buffer
).
outputs
return
engine_core_outputs
def
_send_input
(
self
,
request_type
:
EngineCoreRequestType
,
request
:
Union
[
EngineCoreRequest
,
List
[
str
]])
->
None
:
def
_send_input
(
self
,
request_type
:
EngineCoreRequestType
,
request
:
Union
[
EngineCoreRequest
,
EngineCoreProfile
,
List
[
str
]])
->
None
:
# (RequestType, SerializedRequest)
msg
=
(
request_type
.
value
,
self
.
encoder
.
encode
(
request
))
...
...
@@ -190,6 +199,10 @@ class SyncMPClient(MPClient):
def abort_requests(self, request_ids: List[str]) -> None:
    """Send the request ids through ``_send_input`` tagged as ABORT."""
    abort_type = EngineCoreRequestType.ABORT
    self._send_input(abort_type, request_ids)
async def profile(self, is_start=True) -> None:
    """Wrap the flag in ``EngineCoreProfile`` and send it tagged as PROFILE.

    ``_send_input`` is invoked as a plain (non-awaited) call here.
    """
    profile_request = EngineCoreProfile(is_start)
    self._send_input(EngineCoreRequestType.PROFILE, profile_request)
class
AsyncMPClient
(
MPClient
):
"""Asyncio-compatible client for multi-proc EngineCore."""
...
...
@@ -205,8 +218,9 @@ class AsyncMPClient(MPClient):
return
engine_core_outputs
async def _send_input(
        self, request_type: EngineCoreRequestType,
        request: Union[EngineCoreRequest, EngineCoreProfile,
                       List[str]]) -> None:
    """Encode a request and send it on the input socket.

    Only the parameter list that accepts ``EngineCoreProfile`` is kept; the
    superseded parameter list without it is removed, since two parameter
    lists on one ``def`` are not a valid signature.

    Args:
        request_type: Its ``.value`` becomes the first message frame.
        request: Object passed to ``self.encoder.encode`` to build the
            second message frame.
    """
    # Two-part message: (request type value, encoded request).
    msg = (request_type.value, self.encoder.encode(request))
    await self.input_socket.send_multipart(msg, copy=False)
...
...
@@ -217,3 +231,7 @@ class AsyncMPClient(MPClient):
async def abort_requests_async(self, request_ids: List[str]) -> None:
    """Send an ABORT message for the given ids; do nothing for an empty list."""
    if len(request_ids) == 0:
        return
    await self._send_input(EngineCoreRequestType.ABORT, request_ids)
async def profile(self, is_start=True) -> None:
    """Wrap the flag in ``EngineCoreProfile`` and await sending it as PROFILE."""
    profile_request = EngineCoreProfile(is_start)
    await self._send_input(EngineCoreRequestType.PROFILE, profile_request)
vllm/v1/worker/gpu_worker.py
View file @
d345f409
...
...
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Optional, Tuple
import
torch
import
torch.distributed
import
vllm.envs
as
envs
from
vllm.config
import
CacheConfig
,
ModelConfig
,
ParallelConfig
,
VllmConfig
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
,
...
...
@@ -56,6 +57,22 @@ class Worker:
init_cached_hf_modules
()
self
.
model_runner
=
GPUModelRunner
(
vllm_config
)
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
torch_profiler_trace_dir
=
envs
.
VLLM_TORCH_PROFILER_DIR
logger
.
info
(
"Profiling enabled. Traces will be saved to: %s"
,
torch_profiler_trace_dir
)
self
.
profiler
=
torch
.
profiler
.
profile
(
activities
=
[
torch
.
profiler
.
ProfilerActivity
.
CPU
,
torch
.
profiler
.
ProfilerActivity
.
CUDA
,
],
with_stack
=
True
,
on_trace_ready
=
torch
.
profiler
.
tensorboard_trace_handler
(
torch_profiler_trace_dir
,
use_gzip
=
True
))
else
:
self
.
profiler
=
None
def
initialize
(
self
):
if
self
.
device_config
.
device
.
type
==
"cuda"
:
...
...
@@ -184,6 +201,14 @@ class Worker:
# TODO(woosuk): Send the output to the engine process.
return
output
def profile(self, is_start=True):
    """Start or stop ``self.profiler`` depending on ``is_start``.

    Args:
        is_start: Truthy calls ``profiler.start()``; falsy calls
            ``profiler.stop()``.

    Raises:
        RuntimeError: If ``self.profiler`` is None.
    """
    profiler = self.profiler
    if profiler is None:
        raise RuntimeError("Profiler is not enabled.")
    toggle = profiler.start if is_start else profiler.stop
    toggle()
def
init_worker_distributed_environment
(
parallel_config
:
ParallelConfig
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment