Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
352b90c4
Unverified
Commit
352b90c4
authored
Mar 24, 2026
by
Li, Jiang
Committed by
GitHub
Mar 24, 2026
Browse files
[Bugfix] Add replacement of _compute_slot_mapping_kernel on CPU (#37987)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
1c0aabde
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
105 additions
and
9 deletions
+105
-9
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/cpu.yaml
+0
-7
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+12
-0
csrc/cpu/utils.cpp
csrc/cpu/utils.cpp
+35
-0
docker/Dockerfile.cpu
docker/Dockerfile.cpu
+1
-1
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_marlin.py
+1
-1
vllm/utils/cpu_triton_utils.py
vllm/utils/cpu_triton_utils.py
+47
-0
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+9
-0
No files found.
.buildkite/hardware_tests/cpu.yaml
View file @
352b90c4
...
...
@@ -3,7 +3,6 @@ depends_on: []
steps
:
-
label
:
CPU-Kernel Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -23,7 +22,6 @@ steps:
-
label
:
CPU-Compatibility Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -37,7 +35,6 @@ steps:
-
label
:
CPU-Language Generation and Pooling Model Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -53,7 +50,6 @@ steps:
-
label
:
CPU-Quantization Model Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -73,7 +69,6 @@ steps:
-
label
:
CPU-Distributed Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -92,7 +87,6 @@ steps:
-
label
:
CPU-Multi-Modal Model Tests %N
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
...
...
@@ -107,7 +101,6 @@ steps:
-
label
:
"
Arm
CPU
Test"
depends_on
:
[]
soft_fail
:
true
device
:
arm_cpu
no_plugin
:
true
commands
:
...
...
csrc/cpu/torch_bindings.cpp
View file @
352b90c4
...
...
@@ -126,6 +126,12 @@ void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input,
const
torch
::
Tensor
&
topk_id
,
const
bool
skip_weighted
,
const
std
::
string
&
act
,
const
std
::
string
&
isa
);
void
compute_slot_mapping_kernel_impl
(
const
torch
::
Tensor
query_start_loc
,
const
torch
::
Tensor
positions
,
const
torch
::
Tensor
block_table
,
torch
::
Tensor
slot_mapping
,
const
int64_t
block_size
);
TORCH_LIBRARY_EXPAND
(
TORCH_EXTENSION_NAME
,
ops
)
{
// vLLM custom ops
...
...
@@ -334,6 +340,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! out, Tensor query, Tensor kv_cache,"
" float scale, Tensor block_tables, Tensor seq_lens) -> ()"
);
ops
.
impl
(
"mla_decode_kvcache"
,
torch
::
kCPU
,
&
mla_decode_kvcache
);
ops
.
def
(
"compute_slot_mapping_kernel_impl(Tensor query_start_loc, Tensor "
"positions, Tensor block_table, Tensor(a3!) slot_mapping, SymInt "
"block_size) -> ()"
,
&
compute_slot_mapping_kernel_impl
);
}
REGISTER_EXTENSION
(
TORCH_EXTENSION_NAME
)
csrc/cpu/utils.cpp
View file @
352b90c4
...
...
@@ -189,3 +189,38 @@ ScratchPadManager* ScratchPadManager::get_scratchpad_manager() {
return
&
manager
;
}
}
// namespace cpu_utils
void
compute_slot_mapping_kernel_impl
(
const
torch
::
Tensor
query_start_loc
,
const
torch
::
Tensor
positions
,
const
torch
::
Tensor
block_table
,
torch
::
Tensor
slot_mapping
,
const
int64_t
block_size
)
{
const
int32_t
req_num
=
query_start_loc
.
size
(
0
)
-
1
;
const
int64_t
block_table_stride
=
block_table
.
stride
(
0
);
const
int32_t
*
__restrict__
query_start_loc_ptr
=
query_start_loc
.
data_ptr
<
int32_t
>
();
const
int64_t
*
__restrict__
positions_ptr
=
positions
.
data_ptr
<
int64_t
>
();
const
int32_t
*
__restrict__
blocktable_ptr
=
block_table
.
data_ptr
<
int32_t
>
();
int64_t
*
__restrict__
slot_mapping_ptr
=
slot_mapping
.
data_ptr
<
int64_t
>
();
#pragma omp parallel for
for
(
int32_t
req_idx
=
0
;
req_idx
<
req_num
;
++
req_idx
)
{
int32_t
token_start_idx
=
query_start_loc_ptr
[
req_idx
];
int32_t
token_end_idx
=
query_start_loc_ptr
[
req_idx
+
1
];
int32_t
token_num
=
token_end_idx
-
token_start_idx
;
const
int64_t
*
__restrict__
curr_position_ptr
=
positions_ptr
+
token_start_idx
;
int64_t
*
__restrict__
curr_slot_mapping_ptr
=
slot_mapping_ptr
+
token_start_idx
;
const
int32_t
*
__restrict__
curr_block_table_ptr
=
blocktable_ptr
+
req_idx
*
block_table_stride
;
for
(
int32_t
token_idx
=
0
;
token_idx
<
token_num
;
++
token_idx
)
{
int64_t
token_position
=
curr_position_ptr
[
token_idx
];
int64_t
block_id
=
curr_block_table_ptr
[
token_position
/
block_size
];
curr_slot_mapping_ptr
[
token_idx
]
=
block_id
*
block_size
+
token_position
%
block_size
;
}
}
}
docker/Dockerfile.cpu
View file @
352b90c4
...
...
@@ -161,7 +161,7 @@ RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
uv pip install
--no-build-isolation
-e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
...
...
vllm/model_executor/layers/quantization/awq_marlin.py
View file @
352b90c4
...
...
@@ -309,7 +309,7 @@ class AWQMarlinConfig(QuantizationConfig):
group_size
=
quant_config
.
get
(
"group_size"
)
zero_point
=
quant_config
.
get
(
"zero_point"
)
if
not
(
current_platform
.
is_cuda_alike
()
or
current_platform
.
is_cpu
())
:
if
not
current_platform
.
is_cuda_alike
():
return
False
if
quant_method
!=
"awq"
:
...
...
vllm/utils/cpu_triton_utils.py
0 → 100644
View file @
352b90c4
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Contains replacement functions to fallback Triton usages in CPU backend
"""
from
collections.abc
import
Callable
import
torch
class
_FuncWrapper
:
def
__init__
(
self
,
func
:
Callable
)
->
None
:
self
.
func
=
func
def
__getitem__
(
self
,
*
args
,
**
kwargs
)
->
Callable
:
return
self
.
func
# For _compute_slot_mapping_kernel in vllm/v1/worker/block_table.py
def
_compute_slot_mapping_kernel_impl
(
num_tokens
:
int
,
max_num_tokens
:
int
,
query_start_loc
:
torch
.
Tensor
,
# [num_reqs + 1], int32
positions
:
torch
.
Tensor
,
# [num_tokens], int64
block_table
:
torch
.
Tensor
,
# [max_num_reqs, max_num_blocks_per_req], int32
block_table_stride
:
int
,
# max_num_blocks_per_req
block_size
:
int
,
slot_mapping
:
torch
.
Tensor
,
# [max_num_tokens], int64
TOTAL_CP_WORLD_SIZE
:
int
,
TOTAL_CP_RANK
:
int
,
CP_KV_CACHE_INTERLEAVE_SIZE
:
int
,
PAD_ID
:
int
,
BLOCK_SIZE
:
int
,
)
->
None
:
assert
TOTAL_CP_WORLD_SIZE
==
1
,
"Context Parallelism is not supported on CPU."
torch
.
ops
.
_C
.
compute_slot_mapping_kernel_impl
(
query_start_loc
,
positions
,
block_table
,
slot_mapping
,
block_size
,
)
compute_slot_mapping_kernel
=
_FuncWrapper
(
_compute_slot_mapping_kernel_impl
)
vllm/v1/worker/cpu_model_runner.py
View file @
352b90c4
...
...
@@ -6,6 +6,7 @@ from typing import Any
import
torch
import
torch.nn
as
nn
import
vllm.utils.cpu_triton_utils
as
cpu_tl
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.model_loader
import
get_model
...
...
@@ -28,6 +29,7 @@ class CPUModelRunner(GPUModelRunner):
self
.
cascade_attn_enabled
=
False
self
.
_postprocess_tensors
()
self
.
_postprocess_triton
()
def
_postprocess_tensors
(
self
)
->
None
:
# Note: replace device tensors with cpu tensors
...
...
@@ -52,6 +54,13 @@ class CPUModelRunner(GPUModelRunner):
if
isinstance
(
v
,
CpuGpuBuffer
):
v
.
gpu
=
v
.
cpu
def
_postprocess_triton
(
self
)
->
None
:
import
vllm.v1.worker.block_table
vllm
.
v1
.
worker
.
block_table
.
_compute_slot_mapping_kernel
=
(
cpu_tl
.
compute_slot_mapping_kernel
)
@
instrument
(
span_name
=
"Loading (CPU)"
)
def
load_model
(
self
,
load_dummy_weights
:
bool
=
False
)
->
None
:
if
load_dummy_weights
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment