Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9b018700
Unverified
Commit
9b018700
authored
Aug 27, 2025
by
Li, Jiang
Committed by
GitHub
Aug 26, 2025
Browse files
[Bugfix] Fix cuda event usage with CPU model runner (#23643)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
parent
44ac25ea
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
4 deletions
+26
-4
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+25
-3
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
No files found.
vllm/v1/worker/cpu_model_runner.py
View file @
9b018700
...
...
@@ -11,6 +11,7 @@ from vllm.logger import init_logger
from
vllm.model_executor.model_loader
import
get_model
from
vllm.v1.attention.backends.cpu_attn
import
TorchSDPAMetadataBuilderV1
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
from
vllm.v1.worker.utils
import
CpuGpuBuffer
if
TYPE_CHECKING
:
from
vllm.v1.core.sched.output
import
SchedulerOutput
...
...
@@ -21,6 +22,7 @@ logger = init_logger(__name__)
class
CPUModelRunner
(
GPUModelRunner
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
device
:
torch
.
device
):
with
_torch_cuda_wrapper
():
super
().
__init__
(
vllm_config
,
device
)
assert
device
==
torch
.
device
(
"cpu"
)
...
...
@@ -71,8 +73,8 @@ class CPUModelRunner(GPUModelRunner):
setattr
(
obj
,
device_attr_name
,
cpu_tensor
)
for
k
,
v
in
vars
(
self
).
items
():
if
k
.
endswith
(
"_cpu"
)
and
isinstance
(
v
,
torch
.
Tensor
):
replace_tensor
(
self
,
k
,
k
[:
-
4
])
if
isinstance
(
v
,
CpuGpuBuffer
):
v
.
gpu
=
v
.
cpu
for
k
,
v
in
vars
(
self
.
input_batch
).
items
():
if
k
.
endswith
(
"_cpu_tensor"
)
and
isinstance
(
v
,
torch
.
Tensor
):
...
...
@@ -108,6 +110,26 @@ class CPUModelRunner(GPUModelRunner):
def
_sync_device
(
self
)
->
None
:
pass
def
_to_list
(
self
,
sampled_token_ids
:
torch
.
Tensor
)
->
list
[
list
[
int
]]:
return
sampled_token_ids
.
tolist
()
@
contextmanager
def
_torch_cuda_wrapper
():
class
_EventPlaceholder
:
def
__init__
(
self
,
*
args
,
**
kwargs
)
->
None
:
self
.
record
=
lambda
:
None
self
.
synchronize
=
lambda
:
None
try
:
cuda_event
=
torch
.
cuda
.
Event
torch
.
cuda
.
Event
=
_EventPlaceholder
yield
finally
:
torch
.
cuda
.
Event
=
cuda_event
@
contextmanager
def
_set_global_compilation_settings
(
config
:
VllmConfig
):
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
9b018700
...
...
@@ -321,7 +321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
(
self
.
max_model_len
,
1
),
dtype
=
torch
.
int64
,
device
=
"cpu"
,
pin_memory
=
True
)
pin_memory
=
self
.
pin_memory
)
def
_make_buffer
(
self
,
*
args
,
dtype
:
torch
.
dtype
)
->
CpuGpuBuffer
:
return
CpuGpuBuffer
(
*
args
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment