Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6fa64fbe
Commit
6fa64fbe
authored
Jan 16, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0rc2' into v0.14.0rc2-ori
parents
7aa5c03c
7f42dc20
Changes
40
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
305 additions
and
51 deletions
+305
-51
tests/v1/kv_connector/unit/test_config.py
tests/v1/kv_connector/unit/test_config.py
+18
-1
tools/vllm-rocm/pin_rocm_dependencies.py
tools/vllm-rocm/pin_rocm_dependencies.py
+221
-0
vllm/config/cache.py
vllm/config/cache.py
+5
-5
vllm/config/pooler.py
vllm/config/pooler.py
+16
-9
vllm/config/vllm.py
vllm/config/vllm.py
+4
-6
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
...tributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+1
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-3
vllm/entrypoints/pooling/embed/protocol.py
vllm/entrypoints/pooling/embed/protocol.py
+2
-2
vllm/entrypoints/pooling/pooling/protocol.py
vllm/entrypoints/pooling/pooling/protocol.py
+0
-2
vllm/model_executor/layers/pooler/seqwise/heads.py
vllm/model_executor/layers/pooler/seqwise/heads.py
+1
-1
vllm/model_executor/layers/pooler/seqwise/poolers.py
vllm/model_executor/layers/pooler/seqwise/poolers.py
+2
-2
vllm/model_executor/layers/pooler/tokwise/heads.py
vllm/model_executor/layers/pooler/tokwise/heads.py
+3
-3
vllm/model_executor/layers/pooler/tokwise/poolers.py
vllm/model_executor/layers/pooler/tokwise/poolers.py
+2
-2
vllm/model_executor/layers/quantization/input_quant_fp8.py
vllm/model_executor/layers/quantization/input_quant_fp8.py
+3
-1
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert.py
+1
-1
vllm/model_executor/models/modernbert.py
vllm/model_executor/models/modernbert.py
+2
-1
vllm/pooling_params.py
vllm/pooling_params.py
+8
-9
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+1
-1
vllm/v1/attention/backends/rocm_attn.py
vllm/v1/attention/backends/rocm_attn.py
+10
-1
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/sample/ops/topk_topp_sampler.py
+4
-0
No files found.
tests/v1/kv_connector/unit/test_config.py
View file @
6fa64fbe
...
...
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
(
"lmcache"
,
4.0
,
1
,
1
,
"LMCacheConnectorV1"
,
4.0
),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
(
"lmcache"
,
8.0
,
2
,
2
,
"LMCacheConnectorV1"
,
2.0
),
(
None
,
None
,
1
,
1
,
None
,
None
),
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
(
"native"
,
None
,
1
,
1
,
None
,
None
),
],
)
def
test_kv_connector
(
...
...
@@ -62,3 +63,19 @@ def test_kv_connector(
assert
kv_connector_extra_config
[
"lmcache.max_local_cpu_size"
]
==
expected_bytes
# Existing config should be replaced
assert
"existing_key"
not
in
kv_connector_extra_config
def test_kv_offloading_size_only_uses_native_default():
    """Check that kv_offloading_size alone is enough to enable native offloading."""
    # kv_offloading_backend is deliberately left unset so its default is used.
    cache_cfg = CacheConfig(kv_offloading_size=4.0)
    transfer_cfg = VllmConfig(cache_config=cache_cfg).kv_transfer_config
    extra_cfg = transfer_cfg.kv_connector_extra_config

    bytes_per_gib = 1 << 30
    assert transfer_cfg.kv_connector == "OffloadingConnector"
    assert transfer_cfg.kv_role == "kv_both"
    assert extra_cfg["cpu_bytes_to_use"] == 4.0 * bytes_per_gib
tools/vllm-rocm/pin_rocm_dependencies.py
0 → 100644
View file @
6fa64fbe
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pin vLLM dependencies to exact versions of custom ROCm wheels.
This script modifies vLLM's requirements files to replace version constraints
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
This ensures that 'pip install vllm' automatically installs the correct custom wheels
instead of allowing pip to download different versions from PyPI.
"""
import
re
import
sys
from
pathlib
import
Path
def extract_version_from_wheel(wheel_name: str) -> str:
    """
    Extract the version field from a wheel filename.

    Wheel filenames follow the layout
    ``{distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl``,
    so the version is the second dash-separated field.

    Example:
        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0

    Args:
        wheel_name: The wheel filename to parse.

    Returns:
        The version string exactly as it appears in the filename.

    Raises:
        ValueError: If the name has fewer than five dash-separated fields.
    """
    # Strip only the trailing extension; str.replace would also delete a
    # ".whl" that happens to occur inside one of the fields.
    parts = wheel_name.removesuffix(".whl").split("-")
    if len(parts) < 5:
        raise ValueError(f"Invalid wheel filename format: {wheel_name}")
    return parts[1]
def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
    """
    Scan a directory for custom wheels and collect their exact versions.

    Only ``*.whl`` files directly inside ``install_dir`` whose filename starts
    with one of the known prefixes are considered. Progress and warnings are
    written to stderr.

    Args:
        install_dir: Directory containing the custom-built wheel files.

    Returns:
        Dict mapping package names to exact versions, ordered as in
        ``package_mapping`` below. Empty if no known wheel was found.

    Exits the process with status 1 if ``install_dir`` does not exist.
    """
    install_path = Path(install_dir)
    if not install_path.exists():
        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
        sys.exit(1)

    # Map wheel prefixes to package names
    # IMPORTANT: Use dashes to avoid matching substrings
    # (e.g., 'torch' would match 'torchvision')
    # ORDER MATTERS: This order is preserved when pinning dependencies
    # in requirements files
    package_mapping = [
        ("torch-", "torch"),  # Match torch- (not torchvision)
        ("triton-", "triton"),  # Match triton- (not triton_kernels)
        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
        ("torchvision-", "torchvision"),  # Match torchvision-
        ("torchaudio-", "torchaudio"),  # Match torchaudio-
        ("amdsmi-", "amdsmi"),  # Match amdsmi-
        ("flash_attn-", "flash-attn"),  # Match flash_attn-
        ("aiter-", "aiter"),  # Match aiter-
    ]

    versions = {}
    # Path.glob yields entries in arbitrary order; sorting makes the result
    # reproducible when several wheels of the same package are present (the
    # lexicographically last filename wins).
    for wheel_file in sorted(install_path.glob("*.whl")):
        wheel_name = wheel_file.name
        for prefix, package_name in package_mapping:
            if not wheel_name.startswith(prefix):
                continue
            try:
                version = extract_version_from_wheel(wheel_name)
            except ValueError as e:
                # A malformed filename is reported and skipped, not fatal.
                print(
                    f"WARNING: Could not extract version from {wheel_name}: {e}",
                    file=sys.stderr,
                )
            else:
                versions[package_name] = version
                print(f"Found {package_name}=={version}", file=sys.stderr)
            # Prefixes are mutually exclusive, so stop after the first hit.
            break

    # Return versions in the order defined by package_mapping
    return {
        package_name: versions[package_name]
        for _, package_name in package_mapping
        if package_name in versions
    }
def _requirement_name_pattern(package_name: str) -> re.Pattern:
    """Compile a regex matching a requirement line that names ``package_name``."""
    # "-", "_" and "." are interchangeable in distribution names, so accept
    # any of them wherever the package name contains a separator.
    name_regex = "[-_.]+".join(
        re.escape(part) for part in re.split(r"[-_.]+", package_name)
    )
    # The lookahead requires the name to end here, so "torch" does not match
    # "torchvision" and "triton" does not match "triton-kernels".
    return re.compile(rf"^{name_regex}(?![A-Za-z0-9._-])", re.IGNORECASE)


def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
    """
    Insert custom wheel pins at the TOP of requirements file.

    This ensures that when setup.py processes the file line-by-line,
    custom wheels (torch, triton, etc.) are encountered FIRST, before
    any `-r common.txt` includes that might pull in other dependencies.

    Any existing requirement line for one of the pinned packages is dropped,
    whatever specifier it uses (``==``, ``~=``, ``!=``, extras, direct URL or
    a bare name), so that it cannot conflict with the inserted pin. Comment
    lines and option lines (starting with ``-``) are always kept.

    Creates:
        # Custom ROCm wheel pins (auto-generated)
        torch==2.9.0a0+git1c57644
        triton==3.4.0
        torchvision==0.23.0a0+824e8c8
        amdsmi==26.1.0+5df6c765

        -r common.txt
        ... rest of file ...

    Args:
        requirements_path: Path of the requirements file to rewrite in place.
            The previous content is saved next to it with a ``.bak`` suffix
            appended.
        versions: Mapping of package name to exact version; pins are written
            in the mapping's iteration order.

    Exits the process with status 1 if the requirements file does not exist.
    """
    requirements_file = Path(requirements_path)
    if not requirements_file.exists():
        print(
            f"ERROR: Requirements file not found: {requirements_path}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Backup original file
    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
    with open(requirements_file, encoding="utf-8") as f:
        original_lines = f.readlines()
    with open(backup_file, "w", encoding="utf-8") as f:
        f.writelines(original_lines)

    # Build header with pinned custom wheels
    header_lines = [
        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
        "# These must come FIRST to ensure correct dependency resolution\n",
    ]
    header_lines.extend(
        f"{package_name}=={exact_version}\n"
        for package_name, exact_version in versions.items()
    )
    header_lines.append("\n")  # Blank line separator

    # Compile one matcher per pinned package up front instead of per line.
    matchers = {
        package_name: _requirement_name_pattern(package_name)
        for package_name in versions
    }

    # Filter out any existing entries for custom packages from original file
    filtered_lines = []
    removed_packages = []
    for line in original_lines:
        stripped = line.strip()
        # Blank lines, comments and pip options are never requirement entries.
        if not stripped or stripped.startswith(("#", "-")):
            filtered_lines.append(line)
            continue
        matched_name = next(
            (name for name, matcher in matchers.items() if matcher.match(stripped)),
            None,
        )
        if matched_name is None:
            filtered_lines.append(line)
        else:
            removed_packages.append(f"{matched_name}: {stripped}")

    # Combine: header + filtered original content
    with open(requirements_file, "w", encoding="utf-8") as f:
        f.writelines(header_lines + filtered_lines)

    # Print summary
    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
    for package_name, exact_version in versions.items():
        print(f" - {package_name}=={exact_version}", file=sys.stderr)

    if removed_packages:
        print("\n✓ Removed old package entries:", file=sys.stderr)
        for pkg in removed_packages:
            print(f" - {pkg}", file=sys.stderr)

    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
    print(f" Backup saved: {backup_file}", file=sys.stderr)
def main():
    """
    Command-line entry point: pin a requirements file to the custom wheels.

    Expects exactly two arguments, ``<install_dir>`` and
    ``<requirements_file>``. All output goes to stderr. Exits with status 1
    on a usage error or when no known wheel is found in ``install_dir``, and
    with status 0 after the requirements file has been patched.
    """
    if len(sys.argv) != 3:
        print(
            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>",
            file=sys.stderr,
        )
        print(
            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    install_dir, requirements_path = sys.argv[1], sys.argv[2]
    banner = "=" * 70

    print(banner, file=sys.stderr)
    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
    print(banner, file=sys.stderr)

    # Get versions from custom wheels
    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
    versions = get_custom_wheel_versions(install_dir)

    if not versions:
        # Report the directory that was actually scanned rather than a
        # hard-coded "/install", which is misleading for any other path.
        print(
            f"\nERROR: No custom wheels found in {install_dir}!",
            file=sys.stderr,
        )
        sys.exit(1)

    # Pin dependencies in requirements file
    print(f"\nPatching {requirements_path}...", file=sys.stderr)
    pin_dependencies_in_requirements(requirements_path, versions)

    print("\n" + banner, file=sys.stderr)
    print("✓ Dependency pinning complete!", file=sys.stderr)
    print(banner, file=sys.stderr)

    sys.exit(0)


if __name__ == "__main__":
    main()
vllm/config/cache.py
View file @
6fa64fbe
...
...
@@ -152,13 +152,13 @@ class CacheConfig:
kv_offloading_size
:
float
|
None
=
None
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is
the total buffer size summed across all TP ranks. By default, this is set
to None, which means no KV offloading is enabled. When set wi
th
kv_offloading_backend, vLLM will
enable KV cache offloading to CPU"""
to None, which means no KV offloading is enabled. When set
, vLLM
wi
ll
enable KV cache offloading to CPU
using the kv_offloading_backend.
"""
kv_offloading_backend
:
KVOffloadingBackend
|
None
=
None
kv_offloading_backend
:
KVOffloadingBackend
=
"native"
"""The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache'
This option must be used
together with
kv_offloading_size."""
'native' (vLLM native CPU offloading), 'lmcache'
.
KV offloading is only activated when
kv_offloading_size
is set
."""
def
compute_hash
(
self
)
->
str
:
"""
...
...
vllm/config/pooler.py
View file @
6fa64fbe
...
...
@@ -48,7 +48,7 @@ class PoolerConfig:
## for embeddings models
normalize
:
bool
|
None
=
None
"""
Whether to normalize the embeddings outputs. Defaults to True
.
DEPRECATED: please use `use_activation` instead
.
"""
dimensions
:
int
|
None
=
None
"""
...
...
@@ -75,11 +75,11 @@ class PoolerConfig:
## for classification models
softmax
:
float
|
None
=
None
"""
softmax will be deprecated,
please use use_activation instead.
DEPRECATED:
please use
`
use_activation
`
instead.
"""
activation
:
float
|
None
=
None
"""
activation will be deprecated,
please use use_activation instead.
DEPRECATED:
please use
`
use_activation
`
instead.
"""
use_activation
:
bool
|
None
=
None
"""
...
...
@@ -164,17 +164,24 @@ class PoolerConfig:
def
get_use_activation
(
o
:
object
):
if
softmax
:
=
getattr
(
o
,
"
softmax
"
,
None
)
is
not
None
:
if
(
normalize
:
=
getattr
(
o
,
"
normalize
"
,
None
)
)
is
not
None
:
logger
.
warning_once
(
"softmax will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
"`normalize` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
)
return
normalize
if
(
softmax
:
=
getattr
(
o
,
"softmax"
,
None
))
is
not
None
:
logger
.
warning_once
(
"`softmax` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
)
return
softmax
if
activation
:
=
getattr
(
o
,
"activation"
,
None
)
is
not
None
:
if
(
activation
:
=
getattr
(
o
,
"activation"
,
None
)
)
is
not
None
:
logger
.
warning_once
(
"activation
will be
deprecated and will be removed in v0.15. "
"Please use use_activation instead."
"
`
activation
` is
deprecated and will be removed in v0.15. "
"Please use
`
use_activation
`
instead."
)
return
activation
...
...
vllm/config/vllm.py
View file @
6fa64fbe
...
...
@@ -498,17 +498,15 @@ class VllmConfig:
Right now, this function reads the offloading settings from
CacheConfig and configures the KVTransferConfig accordingly.
"""
if
(
kv_offloading_backend
:
=
self
.
cache_config
.
kv_offloading_backend
)
is
None
:
# KV offloading is only activated when kv_offloading_size is set.
if
(
kv_offloading_size
:
=
self
.
cache_config
.
kv_offloading_size
)
is
None
:
return
kv_offloading_backend
=
self
.
cache_config
.
kv_offloading_backend
# If no KVTransferConfig is provided, create a default one.
if
self
.
kv_transfer_config
is
None
:
self
.
kv_transfer_config
=
KVTransferConfig
()
if
(
kv_offloading_size
:
=
self
.
cache_config
.
kv_offloading_size
)
is
None
:
raise
ValueError
(
"You must set kv_offloading_size when kv_offloading_backend is set."
)
num_kv_ranks
=
(
self
.
parallel_config
.
tensor_parallel_size
*
self
.
parallel_config
.
pipeline_parallel_size
...
...
vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
View file @
6fa64fbe
...
...
@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
lora_id
=
e
.
lora_id
,
block_size
=
e
.
block_size
,
medium
=
e
.
medium
,
lora_name
=
e
.
lora_name
,
lora_name
=
getattr
(
e
,
"
lora_name
"
,
None
)
,
)
for
e
in
events
]
...
...
vllm/engine/arg_utils.py
View file @
6fa64fbe
...
...
@@ -578,9 +578,7 @@ class EngineArgs:
optimization_level
:
OptimizationLevel
=
VllmConfig
.
optimization_level
kv_offloading_size
:
float
|
None
=
CacheConfig
.
kv_offloading_size
kv_offloading_backend
:
KVOffloadingBackend
|
None
=
(
CacheConfig
.
kv_offloading_backend
)
kv_offloading_backend
:
KVOffloadingBackend
=
CacheConfig
.
kv_offloading_backend
tokens_only
:
bool
=
False
def
__post_init__
(
self
):
...
...
vllm/entrypoints/pooling/embed/protocol.py
View file @
6fa64fbe
...
...
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
return
PoolingParams
(
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
,
dimensions
=
self
.
dimensions
,
normalize
=
self
.
normalize
,
use_activation
=
self
.
normalize
,
)
...
...
@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
return
PoolingParams
(
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
,
dimensions
=
self
.
dimensions
,
normalize
=
self
.
normalize
,
use_activation
=
self
.
normalize
,
)
...
...
vllm/entrypoints/pooling/pooling/protocol.py
View file @
6fa64fbe
...
...
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
return
PoolingParams
(
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
,
dimensions
=
self
.
dimensions
,
normalize
=
self
.
normalize
,
use_activation
=
get_use_activation
(
self
),
)
...
...
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
return
PoolingParams
(
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
,
dimensions
=
self
.
dimensions
,
normalize
=
self
.
normalize
,
use_activation
=
get_use_activation
(
self
),
)
...
...
vllm/model_executor/layers/pooler/seqwise/heads.py
View file @
6fa64fbe
...
...
@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
# for normalize
if
self
.
activation
is
not
None
:
flags
=
[
p
.
normalize
for
p
in
pooling_params
]
flags
=
[
p
.
use_activation
for
p
in
pooling_params
]
if
len
(
set
(
flags
))
==
1
:
if
flags
[
0
]:
pooled_data
=
self
.
activation
(
pooled_data
)
...
...
vllm/model_executor/layers/pooler/seqwise/poolers.py
View file @
6fa64fbe
...
...
@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
vllm_config
=
get_current_vllm_config
()
model_config
=
vllm_config
.
model_config
head
=
EmbeddingPoolerHead
(
projector
=
_load_st_projector
(
model_config
),
head_dtype
=
model_config
.
head_dtype
,
projector
=
_load_st_projector
(
model_config
),
activation
=
PoolerNormalize
(),
)
...
...
@@ -116,9 +116,9 @@ def pooler_for_classify(
vllm_config
=
get_current_vllm_config
()
model_config
=
vllm_config
.
model_config
head
=
ClassifierPoolerHead
(
head_dtype
=
model_config
.
head_dtype
,
classifier
=
classifier
,
logit_bias
=
model_config
.
pooler_config
.
logit_bias
,
head_dtype
=
model_config
.
head_dtype
,
activation
=
resolve_classifier_act_fn
(
model_config
,
static_num_labels
=
True
,
act_fn
=
act_fn
),
...
...
vllm/model_executor/layers/pooler/tokwise/heads.py
View file @
6fa64fbe
...
...
@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
class
TokenEmbeddingPoolerHead
(
TokenPoolerHead
):
def
__init__
(
self
,
projector
:
ProjectorFn
|
None
=
None
,
head_dtype
:
torch
.
dtype
|
str
|
None
=
None
,
projector
:
ProjectorFn
|
None
=
None
,
activation
:
ActivationFn
|
None
=
None
,
)
->
None
:
super
().
__init__
()
self
.
projector
=
projector
self
.
head_dtype
=
head_dtype
self
.
projector
=
projector
self
.
activation
=
activation
def
get_supported_tasks
(
self
)
->
Set
[
PoolingTask
]:
...
...
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
pooled_data
=
pooled_data
[...,
:
pooling_param
.
dimensions
]
# for normalize
if
self
.
activation
is
not
None
and
pooling_param
.
normalize
:
if
self
.
activation
is
not
None
and
pooling_param
.
use_activation
:
pooled_data
=
self
.
activation
(
pooled_data
)
# pooled_data shape: [n_tokens, embedding_dimension]
...
...
vllm/model_executor/layers/pooler/tokwise/poolers.py
View file @
6fa64fbe
...
...
@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
vllm_config
=
get_current_vllm_config
()
model_config
=
vllm_config
.
model_config
head
=
TokenEmbeddingPoolerHead
(
projector
=
_load_st_projector
(
model_config
),
head_dtype
=
model_config
.
head_dtype
,
projector
=
_load_st_projector
(
model_config
),
activation
=
PoolerNormalize
(),
)
...
...
@@ -116,9 +116,9 @@ def pooler_for_token_classify(
vllm_config
=
get_current_vllm_config
()
model_config
=
vllm_config
.
model_config
head
=
TokenClassifierPoolerHead
(
head_dtype
=
model_config
.
head_dtype
,
classifier
=
classifier
,
logit_bias
=
model_config
.
pooler_config
.
logit_bias
,
head_dtype
=
model_config
.
head_dtype
,
activation
=
resolve_classifier_act_fn
(
model_config
,
static_num_labels
=
False
,
act_fn
=
act_fn
),
...
...
vllm/model_executor/layers/quantization/input_quant_fp8.py
View file @
6fa64fbe
...
...
@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
num_token_padding
=
self
.
num_token_padding
,
scale_ub
=
scale_ub
,
use_per_token_if_dynamic
=
self
.
use_per_token_if_dynamic
,
group_shape
=
self
.
group_shape
if
self
.
static
else
None
,
group_shape
=
(
self
.
group_shape
.
row
,
self
.
group_shape
.
col
)
if
self
.
static
else
None
,
)
def
forward_hip
(
...
...
vllm/model_executor/models/bert.py
View file @
6fa64fbe
...
...
@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
# Use lambdas so that weights are not registered under `self.head`
self
.
head
=
EmbeddingPoolerHead
(
projector
=
lambda
x
:
self
.
dense
(
x
),
head_dtype
=
head_dtype
,
projector
=
lambda
x
:
self
.
dense
(
x
),
activation
=
LambdaPoolerActivation
(
self
.
act_fn
),
)
...
...
vllm/model_executor/models/modernbert.py
View file @
6fa64fbe
...
...
@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
config
.
hidden_size
,
eps
=
config
.
norm_eps
,
bias
=
config
.
norm_bias
,
dtype
=
head_dtype
,
)
# Use lambdas so that weights are not registered under `self.head`
self
.
head
=
EmbeddingPoolerHead
(
projector
=
lambda
x
:
self
.
dense
(
x
),
head_dtype
=
head_dtype
,
projector
=
lambda
x
:
self
.
dense
(
x
),
activation
=
LambdaPoolerActivation
(
lambda
x
:
self
.
norm
(
self
.
act
(
x
))),
)
...
...
vllm/pooling_params.py
View file @
6fa64fbe
...
...
@@ -26,9 +26,9 @@ class PoolingParams(
Set to None to disable truncation.
dimensions: Reduce the dimensions of embeddings
if model support matryoshka representation.
normalize:
Whether to normalize the embeddings outputs
.
softmax:
softmax will be d
eprecated, please use use_activation instead.
activation:
activation will be d
eprecated, please use use_activation instead.
normalize:
Deprecated, please use use_activation instead
.
softmax:
D
eprecated, please use use_activation instead.
activation:
D
eprecated, please use use_activation instead.
use_activation: Whether to apply activation function to
the classification outputs.
"""
...
...
@@ -63,15 +63,15 @@ class PoolingParams(
@
property
def
all_parameters
(
self
)
->
list
[
str
]:
return
[
"dimensions"
,
"normalize"
,
"use_activation"
]
return
[
"dimensions"
,
"use_activation"
]
@
property
def
valid_parameters
(
self
):
return
{
"embed"
:
[
"dimensions"
,
"
normalize
"
],
"embed"
:
[
"dimensions"
,
"
use_activation
"
],
"classify"
:
[
"use_activation"
],
"score"
:
[
"use_activation"
],
"token_embed"
:
[
"dimensions"
,
"
normalize
"
],
"token_embed"
:
[
"dimensions"
,
"
use_activation
"
],
"token_classify"
:
[
"use_activation"
],
}
...
...
@@ -162,8 +162,8 @@ class PoolingParams(
def
_set_default_parameters
(
self
,
model_config
:
Optional
[
"ModelConfig"
]):
if
self
.
task
in
[
"embed"
,
"token_embed"
]:
if
self
.
normalize
is
None
:
self
.
normalize
=
True
if
self
.
use_activation
is
None
:
self
.
use_activation
=
True
if
self
.
dimensions
is
not
None
and
model_config
is
not
None
:
if
not
model_config
.
is_matryoshka
:
...
...
@@ -213,7 +213,6 @@ class PoolingParams(
return
(
f
"PoolingParams("
f
"task=
{
self
.
task
}
, "
f
"normalize=
{
self
.
normalize
}
, "
f
"dimensions=
{
self
.
dimensions
}
, "
f
"use_activation=
{
self
.
use_activation
}
, "
f
"step_tag_id=
{
self
.
step_tag_id
}
, "
...
...
vllm/transformers_utils/config.py
View file @
6fa64fbe
...
...
@@ -801,7 +801,7 @@ def get_pooling_config(
logger
.
info
(
"Found pooling configuration."
)
config
:
dict
[
str
,
Any
]
=
{
"
normalize
"
:
normalize
}
config
:
dict
[
str
,
Any
]
=
{
"
use_activation
"
:
normalize
}
for
key
,
val
in
pooling_dict
.
items
():
if
val
is
True
:
pooling_type
=
parse_pooling_type
(
key
)
...
...
vllm/v1/attention/backends/rocm_attn.py
View file @
6fa64fbe
...
...
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
# ROCM paged attention kernel only supports block sizes 16 and 32
# due to shared memory (LDS) constraints on AMD GPUs.
# See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
return
[
16
,
32
]
# However, while restricting block sizes to [16, 32] is reasonable for the
# native C++ kernel, vLLM should also support non-standard sizes via the Triton path,
# as addressed in this PR: https://github.com/vllm-project/vllm/pull/31380,
# where the Triton kernel under rocm_attn did not support inference
# for a non-standard qwen3-next model with a block_size of 544.
# We have fixed the Triton kernel so that the standard model uses the original
# bit-addressing logic, while the non-standard model
# uses our optimized kernel logic.
return
[
16
,
32
,
544
]
@
classmethod
def
get_supported_head_sizes
(
cls
)
->
list
[
int
]:
...
...
vllm/v1/sample/ops/topk_topp_sampler.py
View file @
6fa64fbe
...
...
@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
k
:
torch
.
Tensor
|
None
,
p
:
torch
.
Tensor
|
None
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
|
None
]:
# FIXME: Fix aiter_sampler's accuracy issue and remove this flag
DISABLE_AITER_SAMPLER
=
True
"""Optimized ROCm/aiter path (same structure as forward_cuda)."""
if
(
k
is
None
and
p
is
None
)
or
generators
:
if
generators
:
...
...
@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
"processed_logits"
,
"processed_logprobs"
,
),
"aiter sampler does not support returning logits/logprobs."
if
DISABLE_AITER_SAMPLER
:
return
self
.
forward_native
(
logits
,
generators
,
k
,
p
)
return
self
.
aiter_sample
(
logits
,
k
,
p
,
generators
),
None
def
aiter_sample
(
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment