Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9fccd04e
Unverified
Commit
9fccd04e
authored
Sep 17, 2025
by
Li, Jiang
Committed by
GitHub
Sep 17, 2025
Browse files
[Bugfix] Fix Stream usage in CPU model runner and OneDNN kernel check (#25046)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
252ada55
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
14 additions
and
1 deletion
+14
-1
csrc/cpu/dnnl_kernels.cpp
csrc/cpu/dnnl_kernels.cpp
+1
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+5
-0
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+8
-0
No files found.
csrc/cpu/dnnl_kernels.cpp
View file @
9fccd04e
...
@@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
...
@@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
CPU_KERNEL_GUARD_IN
(
onednn_mm
)
CPU_KERNEL_GUARD_IN
(
onednn_mm
)
TORCH_CHECK
(
a
.
dim
()
==
2
);
TORCH_CHECK
(
a
.
dim
()
==
2
);
TORCH_CHECK
(
a
.
stride
(
-
1
)
==
1
);
TORCH_CHECK
(
a
.
stride
(
-
1
)
==
1
);
TORCH_CHECK
(
c
.
is_contiguous
()
);
TORCH_CHECK
(
c
.
stride
(
-
1
)
==
1
);
MatMulPrimitiveHandler
*
ptr
=
MatMulPrimitiveHandler
*
ptr
=
reinterpret_cast
<
MatMulPrimitiveHandler
*>
(
handler
);
reinterpret_cast
<
MatMulPrimitiveHandler
*>
(
handler
);
...
...
vllm/platforms/cpu.py
View file @
9fccd04e
...
@@ -185,6 +185,11 @@ class CpuPlatform(Platform):
...
@@ -185,6 +185,11 @@ class CpuPlatform(Platform):
parallel_config
.
distributed_executor_backend
=
"mp"
parallel_config
.
distributed_executor_backend
=
"mp"
if
parallel_config
.
worker_cls
==
"auto"
:
if
parallel_config
.
worker_cls
==
"auto"
:
parallel_config
.
worker_cls
=
"vllm.v1.worker.cpu_worker.CPUWorker"
parallel_config
.
worker_cls
=
"vllm.v1.worker.cpu_worker.CPUWorker"
# Disable DBO
if
parallel_config
.
enable_dbo
:
logger
.
warning
(
"Dual-Batch Overlap is not supported on CPU, disabled."
)
parallel_config
.
enable_dbo
=
False
# Note: workaround for v1 gpu_model_runner
# Note: workaround for v1 gpu_model_runner
from
vllm.config
import
CompilationLevel
from
vllm.config
import
CompilationLevel
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
9fccd04e
...
@@ -145,12 +145,20 @@ def _torch_cuda_wrapper():
...
@@ -145,12 +145,20 @@ def _torch_cuda_wrapper():
self
.
record
=
lambda
:
None
self
.
record
=
lambda
:
None
self
.
synchronize
=
lambda
:
None
self
.
synchronize
=
lambda
:
None
class
_StreamPlaceholder
:
def
__init__
(
self
,
*
args
,
**
kwargs
)
->
None
:
pass
cuda_event
=
torch
.
cuda
.
Event
cuda_event
=
torch
.
cuda
.
Event
cuda_stream
=
torch
.
cuda
.
Stream
try
:
try
:
torch
.
cuda
.
Event
=
_EventPlaceholder
torch
.
cuda
.
Event
=
_EventPlaceholder
torch
.
cuda
.
Stream
=
_StreamPlaceholder
yield
yield
finally
:
finally
:
torch
.
cuda
.
Event
=
cuda_event
torch
.
cuda
.
Event
=
cuda_event
torch
.
cuda
.
Stream
=
cuda_stream
@
contextmanager
@
contextmanager
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment