Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1543914c
Unverified
Commit
1543914c
authored
Jan 03, 2025
by
Robert Shaw
Committed by
GitHub
Jan 03, 2025
Browse files
[V1] Improve TP>1 Error Handling + Stack Trace (#11721)
Co-authored-by:
Tyler Michael Smith
<
tyler@neuralmagic.com
>
parent
61fed92c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
40 additions
and
21 deletions
+40
-21
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+0
-16
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+1
-1
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+18
-1
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+21
-3
No files found.
vllm/v1/engine/async_llm.py
View file @
1543914c
import
asyncio
import
os
import
signal
from
typing
import
AsyncGenerator
,
Dict
,
List
,
Mapping
,
Optional
,
Type
,
Union
from
vllm.config
import
ModelConfig
,
VllmConfig
...
...
@@ -42,21 +41,6 @@ class AsyncLLM(EngineClient):
start_engine_loop
:
bool
=
True
,
)
->
None
:
# The child processes will send SIGQUIT when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO: rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
def
sigquit_handler
(
signum
,
frame
):
logger
.
fatal
(
"AsyncLLM got SIGQUIT from worker processes, shutting "
"down. See stack trace above for root cause issue."
)
kill_process_tree
(
os
.
getpid
())
signal
.
signal
(
signal
.
SIGQUIT
,
sigquit_handler
)
assert
start_engine_loop
self
.
log_requests
=
log_requests
...
...
vllm/v1/engine/core.py
View file @
1543914c
...
...
@@ -198,7 +198,7 @@ class EngineCoreProc(EngineCore):
except
Exception
:
traceback
=
get_exception_traceback
()
logger
.
error
(
"EngineCore hit an exception: %s"
,
traceback
)
parent_process
.
send_signal
(
signal
.
SIG
QUIT
)
parent_process
.
send_signal
(
signal
.
SIG
USR1
)
finally
:
if
engine_core
is
not
None
:
...
...
vllm/v1/engine/core_client.py
View file @
1543914c
import
os
import
signal
import
weakref
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
,
Type
...
...
@@ -8,7 +10,8 @@ import zmq.asyncio
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.utils
import
get_open_zmq_ipc_path
,
make_zmq_socket
from
vllm.utils
import
(
get_open_zmq_ipc_path
,
kill_process_tree
,
make_zmq_socket
)
from
vllm.v1.engine
import
(
EngineCoreOutput
,
EngineCoreOutputs
,
EngineCoreProfile
,
EngineCoreRequest
,
EngineCoreRequestType
,
EngineCoreRequestUnion
)
...
...
@@ -134,6 +137,20 @@ class MPClient(EngineCoreClient):
executor_class
:
Type
[
Executor
],
log_stats
:
bool
=
False
,
):
# The child processes will send SIGUSR1 when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
def
sigusr1_handler
(
signum
,
frame
):
logger
.
fatal
(
"Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue."
)
kill_process_tree
(
os
.
getpid
())
signal
.
signal
(
signal
.
SIGUSR1
,
sigusr1_handler
)
# Serialization setup.
self
.
encoder
=
PickleEncoder
()
self
.
decoder
=
msgspec
.
msgpack
.
Decoder
(
EngineCoreOutputs
)
...
...
vllm/v1/executor/multiproc_executor.py
View file @
1543914c
...
...
@@ -9,6 +9,7 @@ from enum import Enum, auto
from
multiprocessing.process
import
BaseProcess
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
import
psutil
import
zmq
from
vllm.config
import
VllmConfig
...
...
@@ -38,6 +39,19 @@ class MultiprocExecutor(Executor):
# and ensure workers will be terminated.
self
.
_finalizer
=
weakref
.
finalize
(
self
,
self
.
shutdown
)
# The child processes will send SIGUSR1 when unrecoverable
# errors happen.
def
sigusr1_handler
(
signum
,
frame
):
logger
.
fatal
(
"MulitprocExecutor got fatal signal from worker processes, "
"shutting down. See stack trace above for root cause issue."
)
# Propagate error up to parent process.
parent_process
=
psutil
.
Process
().
parent
()
parent_process
.
send_signal
(
signal
.
SIGUSR1
)
self
.
shutdown
()
signal
.
signal
(
signal
.
SIGUSR1
,
sigusr1_handler
)
self
.
vllm_config
=
vllm_config
self
.
parallel_config
=
vllm_config
.
parallel_config
...
...
@@ -335,8 +349,11 @@ class WorkerProc:
except
SystemExit
:
logger
.
debug
(
"Worker interrupted."
)
except
BaseException
as
e
:
logger
.
exception
(
e
)
except
Exception
:
# worker_busy_loop sends exceptions to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC itself, we need to alert the parent.
psutil
.
Process
().
parent
().
send_signal
(
signal
.
SIGUSR1
)
raise
finally
:
...
...
@@ -377,9 +394,10 @@ class WorkerProc:
try
:
output
=
getattr
(
self
.
worker
,
method
)(
*
args
,
**
kwargs
)
except
Base
Exception
as
e
:
except
Exception
as
e
:
self
.
worker_response_mq
.
enqueue
(
(
WorkerProc
.
ResponseStatus
.
FAILURE
,
e
))
logger
.
exception
(
"WorkerProc hit an exception: %s"
,
exc_info
=
e
)
continue
self
.
worker_response_mq
.
enqueue
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment