Commit 6fa64fbe authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc2' into v0.14.0rc2-ori

parents 7aa5c03c 7f42dc20
...@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test ...@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0), ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0), ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
(None, None, 1, 1, None, None), # When kv_offloading_size is None, offloading is disabled (backend is ignored)
("native", None, 1, 1, None, None),
], ],
) )
def test_kv_connector( def test_kv_connector(
...@@ -62,3 +63,19 @@ def test_kv_connector( ...@@ -62,3 +63,19 @@ def test_kv_connector(
assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
# Existing config should be replaced # Existing config should be replaced
assert "existing_key" not in kv_connector_extra_config assert "existing_key" not in kv_connector_extra_config
def test_kv_offloading_size_only_uses_native_default():
    """Check that kv_offloading_size alone is enough to enable native offloading."""
    # kv_offloading_backend is intentionally left unset; it should default
    # to "native".
    cache_config = CacheConfig(kv_offloading_size=4.0)
    vllm_config = VllmConfig(cache_config=cache_config)

    transfer_cfg = vllm_config.kv_transfer_config
    extra_cfg = transfer_cfg.kv_connector_extra_config

    assert transfer_cfg.kv_connector == "OffloadingConnector"
    assert transfer_cfg.kv_role == "kv_both"
    # 4.0 GiB expressed in bytes.
    assert extra_cfg["cpu_bytes_to_use"] == 4.0 * (1 << 30)
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pin vLLM dependencies to exact versions of custom ROCm wheels.
This script modifies vLLM's requirements files to replace version constraints
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
This ensures that 'pip install vllm' automatically installs the correct custom wheels
instead of allowing pip to download different versions from PyPI.
"""
import re
import sys
from pathlib import Path
def extract_version_from_wheel(wheel_name: str) -> str:
    """
    Extract the version component from a wheel filename.

    Wheel filenames follow the layout
    ``{distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl``,
    so the version is always the second dash-separated field.

    Example:
        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0

    Args:
        wheel_name: Wheel filename (file name only, not a full path).

    Returns:
        The version string exactly as it appears in the filename.

    Raises:
        ValueError: If the filename has fewer than five dash-separated fields.
    """
    # Strip only the trailing extension. str.replace() would also remove a
    # ".whl" embedded elsewhere in the name (e.g. a local version label such
    # as "1.0+local.whl") and silently corrupt the extracted version.
    stem = wheel_name.removesuffix(".whl")
    parts = stem.split("-")
    if len(parts) < 5:
        raise ValueError(f"Invalid wheel filename format: {wheel_name}")
    # Version is the second field, right after the distribution name.
    return parts[1]
def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
    """
    Scan ``install_dir`` for custom wheels and extract their versions.

    Only ``*.whl`` files located directly inside ``install_dir`` whose names
    start with one of the known prefixes are considered. A wheel whose
    filename cannot be parsed is reported on stderr and skipped. If the
    directory does not exist, an error is printed to stderr and the process
    exits with status 1.

    Args:
        install_dir: Directory containing the custom-built wheel files.

    Returns:
        Dict mapping package names to exact versions, ordered as in
        ``package_mapping`` below.
    """
    install_path = Path(install_dir)
    if not install_path.exists():
        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
        sys.exit(1)

    # Map wheel prefixes to package names
    # IMPORTANT: Use dashes to avoid matching substrings
    # (e.g., 'torch' would match 'torchvision')
    # ORDER MATTERS: This order is preserved when pinning dependencies
    # in requirements files
    package_mapping = [
        ("torch-", "torch"),  # Match torch- (not torchvision)
        ("triton-", "triton"),  # Match triton- (not triton_kernels)
        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
        ("torchvision-", "torchvision"),  # Match torchvision-
        ("torchaudio-", "torchaudio"),  # Match torchaudio-
        ("amdsmi-", "amdsmi"),  # Match amdsmi-
        ("flash_attn-", "flash-attn"),  # Match flash_attn-
        ("aiter-", "aiter"),  # Match aiter-
    ]

    versions: dict[str, str] = {}
    # glob() yields files in filesystem order; sort so the outcome is
    # deterministic when several wheels of the same package are present
    # (the lexicographically last filename wins).
    for wheel_file in sorted(install_path.glob("*.whl")):
        wheel_name = wheel_file.name
        for prefix, package_name in package_mapping:
            if not wheel_name.startswith(prefix):
                continue
            try:
                version = extract_version_from_wheel(wheel_name)
            except ValueError as e:
                # Best effort: a malformed filename must not abort the scan.
                print(
                    f"WARNING: Could not extract version from {wheel_name}: {e}",
                    file=sys.stderr,
                )
            else:
                if versions.get(package_name, version) != version:
                    print(
                        f"WARNING: Multiple wheels found for {package_name}; "
                        f"using {version} (from {wheel_name})",
                        file=sys.stderr,
                    )
                versions[package_name] = version
                print(f"Found {package_name}=={version}", file=sys.stderr)
            # Prefixes are mutually exclusive, so stop at the first match.
            break

    # Return versions in the order defined by package_mapping
    return {
        package_name: versions[package_name]
        for _, package_name in package_mapping
        if package_name in versions
    }
def _requirement_names_package(requirement_line: str, package_name: str) -> bool:
    """Return True if ``requirement_line`` is a requirement for ``package_name``."""
    # Treat "-", "_" and "." as interchangeable separators inside the name.
    name_pattern = "[-_.]+".join(
        re.escape(part) for part in re.split(r"[-_.]+", package_name)
    )
    # The name must end at a non-name character so that 'torch' does not match
    # 'torchvision' and 'triton' does not match 'triton_kernels'. Anything may
    # follow: end of line, extras, any version operator, a marker or a URL.
    pattern = rf"^{name_pattern}(?![A-Za-z0-9._-])"
    return re.match(pattern, requirement_line, re.IGNORECASE) is not None


def pin_dependencies_in_requirements(
    requirements_path: str, versions: dict[str, str]
) -> None:
    """
    Insert custom wheel pins at the TOP of requirements file.

    This ensures that when setup.py processes the file line-by-line,
    custom wheels (torch, triton, etc.) are encountered FIRST, before
    any `-r common.txt` includes that might pull in other dependencies.

    Any existing requirement line for one of the pinned packages is removed,
    whatever its form (``torch==...``, ``torch~=...``, bare ``torch``,
    ``torch[extra]>=...``), so that it cannot conflict with the exact pin.
    Comment lines and option lines (starting with ``-``) are always kept.
    The original file content is saved next to it with a ``.bak`` suffix
    appended. If the requirements file does not exist, an error is printed
    to stderr and the process exits with status 1.

    Creates:
        # Custom ROCm wheel pins (auto-generated)
        torch==2.9.0a0+git1c57644
        triton==3.4.0
        torchvision==0.23.0a0+824e8c8
        amdsmi==26.1.0+5df6c765
        -r common.txt
        ... rest of file ...

    Args:
        requirements_path: Path of the requirements file to patch in place.
        versions: Mapping of package name to exact version; iteration order
            determines the order of the inserted pins.
    """
    requirements_file = Path(requirements_path)
    if not requirements_file.exists():
        print(
            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
        )
        sys.exit(1)

    # Backup original file
    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
    with open(requirements_file, encoding="utf-8") as f:
        original_lines = f.readlines()
    with open(backup_file, "w", encoding="utf-8") as f:
        f.writelines(original_lines)

    # Build header with pinned custom wheels
    header_lines = [
        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
        "# These must come FIRST to ensure correct dependency resolution\n",
    ]
    header_lines.extend(
        f"{package_name}=={exact_version}\n"
        for package_name, exact_version in versions.items()
    )
    header_lines.append("\n")  # Blank line separator

    # Filter out any existing entries for custom packages from original file
    filtered_lines = []
    removed_packages = []
    for line in original_lines:
        stripped = line.strip()
        matched_package = None
        # Only real requirement lines are candidates: blank lines, comments
        # and pip options (-r, -c, --extra-index-url, ...) are kept verbatim.
        if stripped and not stripped.startswith(("#", "-")):
            matched_package = next(
                (
                    package_name
                    for package_name in versions
                    if _requirement_names_package(stripped, package_name)
                ),
                None,
            )
        if matched_package is None:
            filtered_lines.append(line)
        else:
            removed_packages.append(f"{matched_package}: {stripped}")

    # Write modified content: header + filtered original content
    with open(requirements_file, "w", encoding="utf-8") as f:
        f.writelines(header_lines + filtered_lines)

    # Print summary
    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
    for package_name, exact_version in versions.items():
        print(f" - {package_name}=={exact_version}", file=sys.stderr)
    if removed_packages:
        print("\n✓ Removed old package entries:", file=sys.stderr)
        for pkg in removed_packages:
            print(f" - {pkg}", file=sys.stderr)
    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
    print(f" Backup saved: {backup_file}", file=sys.stderr)
def main():
    """
    Command-line entry point.

    Expects exactly two arguments: the directory containing the custom
    wheels and the requirements file to patch. Progress is reported on
    stderr. Exits with status 0 on success and 1 on a usage error or when
    no custom wheels are found.
    """
    if len(sys.argv) != 3:
        print(
            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
        )
        print(
            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    install_dir = sys.argv[1]
    requirements_path = sys.argv[2]

    print("=" * 70, file=sys.stderr)
    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
    print("=" * 70, file=sys.stderr)

    # Get versions from custom wheels
    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
    versions = get_custom_wheel_versions(install_dir)
    if not versions:
        # Report the directory that was actually scanned, not a fixed path.
        print(
            f"\nERROR: No custom wheels found in {install_dir}!", file=sys.stderr
        )
        sys.exit(1)

    # Pin dependencies in requirements file
    print(f"\nPatching {requirements_path}...", file=sys.stderr)
    pin_dependencies_in_requirements(requirements_path, versions)

    print("\n" + "=" * 70, file=sys.stderr)
    print("✓ Dependency pinning complete!", file=sys.stderr)
    print("=" * 70, file=sys.stderr)
    sys.exit(0)


if __name__ == "__main__":
    main()
...@@ -152,13 +152,13 @@ class CacheConfig: ...@@ -152,13 +152,13 @@ class CacheConfig:
kv_offloading_size: float | None = None kv_offloading_size: float | None = None
"""Size of the KV cache offloading buffer in GiB. When TP > 1, this is """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
the total buffer size summed across all TP ranks. By default, this is set the total buffer size summed across all TP ranks. By default, this is set
to None, which means no KV offloading is enabled. When set with to None, which means no KV offloading is enabled. When set, vLLM will
kv_offloading_backend, vLLM will enable KV cache offloading to CPU""" enable KV cache offloading to CPU using the kv_offloading_backend."""
kv_offloading_backend: KVOffloadingBackend | None = None kv_offloading_backend: KVOffloadingBackend = "native"
"""The backend to use for KV cache offloading. Supported backends include """The backend to use for KV cache offloading. Supported backends include
'native' (vLLM native CPU offloading), 'lmcache' This option must be used 'native' (vLLM native CPU offloading), 'lmcache'.
together with kv_offloading_size.""" KV offloading is only activated when kv_offloading_size is set."""
def compute_hash(self) -> str: def compute_hash(self) -> str:
""" """
......
...@@ -48,7 +48,7 @@ class PoolerConfig: ...@@ -48,7 +48,7 @@ class PoolerConfig:
## for embeddings models ## for embeddings models
normalize: bool | None = None normalize: bool | None = None
""" """
Whether to normalize the embeddings outputs. Defaults to True. DEPRECATED: please use `use_activation` instead.
""" """
dimensions: int | None = None dimensions: int | None = None
""" """
...@@ -75,11 +75,11 @@ class PoolerConfig: ...@@ -75,11 +75,11 @@ class PoolerConfig:
## for classification models ## for classification models
softmax: float | None = None softmax: float | None = None
""" """
softmax will be deprecated, please use use_activation instead. DEPRECATED: please use `use_activation` instead.
""" """
activation: float | None = None activation: float | None = None
""" """
activation will be deprecated, please use use_activation instead. DEPRECATED: please use `use_activation` instead.
""" """
use_activation: bool | None = None use_activation: bool | None = None
""" """
...@@ -164,17 +164,24 @@ class PoolerConfig: ...@@ -164,17 +164,24 @@ class PoolerConfig:
def get_use_activation(o: object): def get_use_activation(o: object):
if softmax := getattr(o, "softmax", None) is not None: if (normalize := getattr(o, "normalize", None)) is not None:
logger.warning_once( logger.warning_once(
"softmax will be deprecated and will be removed in v0.15. " "`normalize` is deprecated and will be removed in v0.15. "
"Please use use_activation instead." "Please use `use_activation` instead."
)
return normalize
if (softmax := getattr(o, "softmax", None)) is not None:
logger.warning_once(
"`softmax` is deprecated and will be removed in v0.15. "
"Please use `use_activation` instead."
) )
return softmax return softmax
if activation := getattr(o, "activation", None) is not None: if (activation := getattr(o, "activation", None)) is not None:
logger.warning_once( logger.warning_once(
"activation will be deprecated and will be removed in v0.15. " "`activation` is deprecated and will be removed in v0.15. "
"Please use use_activation instead." "Please use `use_activation` instead."
) )
return activation return activation
......
...@@ -498,17 +498,15 @@ class VllmConfig: ...@@ -498,17 +498,15 @@ class VllmConfig:
Right now, this function reads the offloading settings from Right now, this function reads the offloading settings from
CacheConfig and configures the KVTransferConfig accordingly. CacheConfig and configures the KVTransferConfig accordingly.
""" """
if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None: # KV offloading is only activated when kv_offloading_size is set.
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
return return
kv_offloading_backend = self.cache_config.kv_offloading_backend
# If no KVTransferConfig is provided, create a default one. # If no KVTransferConfig is provided, create a default one.
if self.kv_transfer_config is None: if self.kv_transfer_config is None:
self.kv_transfer_config = KVTransferConfig() self.kv_transfer_config = KVTransferConfig()
if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
raise ValueError(
"You must set kv_offloading_size when kv_offloading_backend is set."
)
num_kv_ranks = ( num_kv_ranks = (
self.parallel_config.tensor_parallel_size self.parallel_config.tensor_parallel_size
* self.parallel_config.pipeline_parallel_size * self.parallel_config.pipeline_parallel_size
......
...@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): ...@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
lora_id=e.lora_id, lora_id=e.lora_id,
block_size=e.block_size, block_size=e.block_size,
medium=e.medium, medium=e.medium,
lora_name=e.lora_name, lora_name=getattr(e, "lora_name", None),
) )
for e in events for e in events
] ]
......
...@@ -578,9 +578,7 @@ class EngineArgs: ...@@ -578,9 +578,7 @@ class EngineArgs:
optimization_level: OptimizationLevel = VllmConfig.optimization_level optimization_level: OptimizationLevel = VllmConfig.optimization_level
kv_offloading_size: float | None = CacheConfig.kv_offloading_size kv_offloading_size: float | None = CacheConfig.kv_offloading_size
kv_offloading_backend: KVOffloadingBackend | None = ( kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
CacheConfig.kv_offloading_backend
)
tokens_only: bool = False tokens_only: bool = False
def __post_init__(self): def __post_init__(self):
......
...@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): ...@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
return PoolingParams( return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens, truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions, dimensions=self.dimensions,
normalize=self.normalize, use_activation=self.normalize,
) )
...@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): ...@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
return PoolingParams( return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens, truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions, dimensions=self.dimensions,
normalize=self.normalize, use_activation=self.normalize,
) )
......
...@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest): ...@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
return PoolingParams( return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens, truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions, dimensions=self.dimensions,
normalize=self.normalize,
use_activation=get_use_activation(self), use_activation=get_use_activation(self),
) )
...@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest): ...@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
return PoolingParams( return PoolingParams(
truncate_prompt_tokens=self.truncate_prompt_tokens, truncate_prompt_tokens=self.truncate_prompt_tokens,
dimensions=self.dimensions, dimensions=self.dimensions,
normalize=self.normalize,
use_activation=get_use_activation(self), use_activation=get_use_activation(self),
) )
......
...@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead): ...@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
# for normalize # for normalize
if self.activation is not None: if self.activation is not None:
flags = [p.normalize for p in pooling_params] flags = [p.use_activation for p in pooling_params]
if len(set(flags)) == 1: if len(set(flags)) == 1:
if flags[0]: if flags[0]:
pooled_data = self.activation(pooled_data) pooled_data = self.activation(pooled_data)
......
...@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig): ...@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config model_config = vllm_config.model_config
head = EmbeddingPoolerHead( head = EmbeddingPoolerHead(
projector=_load_st_projector(model_config),
head_dtype=model_config.head_dtype, head_dtype=model_config.head_dtype,
projector=_load_st_projector(model_config),
activation=PoolerNormalize(), activation=PoolerNormalize(),
) )
...@@ -116,9 +116,9 @@ def pooler_for_classify( ...@@ -116,9 +116,9 @@ def pooler_for_classify(
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config model_config = vllm_config.model_config
head = ClassifierPoolerHead( head = ClassifierPoolerHead(
head_dtype=model_config.head_dtype,
classifier=classifier, classifier=classifier,
logit_bias=model_config.pooler_config.logit_bias, logit_bias=model_config.pooler_config.logit_bias,
head_dtype=model_config.head_dtype,
activation=resolve_classifier_act_fn( activation=resolve_classifier_act_fn(
model_config, static_num_labels=True, act_fn=act_fn model_config, static_num_labels=True, act_fn=act_fn
), ),
......
...@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC): ...@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
class TokenEmbeddingPoolerHead(TokenPoolerHead): class TokenEmbeddingPoolerHead(TokenPoolerHead):
def __init__( def __init__(
self, self,
projector: ProjectorFn | None = None,
head_dtype: torch.dtype | str | None = None, head_dtype: torch.dtype | str | None = None,
projector: ProjectorFn | None = None,
activation: ActivationFn | None = None, activation: ActivationFn | None = None,
) -> None: ) -> None:
super().__init__() super().__init__()
self.projector = projector
self.head_dtype = head_dtype self.head_dtype = head_dtype
self.projector = projector
self.activation = activation self.activation = activation
def get_supported_tasks(self) -> Set[PoolingTask]: def get_supported_tasks(self) -> Set[PoolingTask]:
...@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead): ...@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
pooled_data = pooled_data[..., : pooling_param.dimensions] pooled_data = pooled_data[..., : pooling_param.dimensions]
# for normalize # for normalize
if self.activation is not None and pooling_param.normalize: if self.activation is not None and pooling_param.use_activation:
pooled_data = self.activation(pooled_data) pooled_data = self.activation(pooled_data)
# pooled_data shape: [n_tokens, embedding_dimension] # pooled_data shape: [n_tokens, embedding_dimension]
......
...@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig): ...@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config model_config = vllm_config.model_config
head = TokenEmbeddingPoolerHead( head = TokenEmbeddingPoolerHead(
projector=_load_st_projector(model_config),
head_dtype=model_config.head_dtype, head_dtype=model_config.head_dtype,
projector=_load_st_projector(model_config),
activation=PoolerNormalize(), activation=PoolerNormalize(),
) )
...@@ -116,9 +116,9 @@ def pooler_for_token_classify( ...@@ -116,9 +116,9 @@ def pooler_for_token_classify(
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
model_config = vllm_config.model_config model_config = vllm_config.model_config
head = TokenClassifierPoolerHead( head = TokenClassifierPoolerHead(
head_dtype=model_config.head_dtype,
classifier=classifier, classifier=classifier,
logit_bias=model_config.pooler_config.logit_bias, logit_bias=model_config.pooler_config.logit_bias,
head_dtype=model_config.head_dtype,
activation=resolve_classifier_act_fn( activation=resolve_classifier_act_fn(
model_config, static_num_labels=False, act_fn=act_fn model_config, static_num_labels=False, act_fn=act_fn
), ),
......
...@@ -98,7 +98,9 @@ class QuantFP8(CustomOp): ...@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
num_token_padding=self.num_token_padding, num_token_padding=self.num_token_padding,
scale_ub=scale_ub, scale_ub=scale_ub,
use_per_token_if_dynamic=self.use_per_token_if_dynamic, use_per_token_if_dynamic=self.use_per_token_if_dynamic,
group_shape=self.group_shape if self.static else None, group_shape=(self.group_shape.row, self.group_shape.col)
if self.static
else None,
) )
def forward_hip( def forward_hip(
......
...@@ -116,8 +116,8 @@ class BertPooler(SequencePooler): ...@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
# Use lambdas so that weights are not registered under `self.head` # Use lambdas so that weights are not registered under `self.head`
self.head = EmbeddingPoolerHead( self.head = EmbeddingPoolerHead(
projector=lambda x: self.dense(x),
head_dtype=head_dtype, head_dtype=head_dtype,
projector=lambda x: self.dense(x),
activation=LambdaPoolerActivation(self.act_fn), activation=LambdaPoolerActivation(self.act_fn),
) )
......
...@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler): ...@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
config.hidden_size, config.hidden_size,
eps=config.norm_eps, eps=config.norm_eps,
bias=config.norm_bias, bias=config.norm_bias,
dtype=head_dtype,
) )
# Use lambdas so that weights are not registered under `self.head` # Use lambdas so that weights are not registered under `self.head`
self.head = EmbeddingPoolerHead( self.head = EmbeddingPoolerHead(
projector=lambda x: self.dense(x),
head_dtype=head_dtype, head_dtype=head_dtype,
projector=lambda x: self.dense(x),
activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))), activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
) )
......
...@@ -26,9 +26,9 @@ class PoolingParams( ...@@ -26,9 +26,9 @@ class PoolingParams(
Set to None to disable truncation. Set to None to disable truncation.
dimensions: Reduce the dimensions of embeddings dimensions: Reduce the dimensions of embeddings
if model support matryoshka representation. if model support matryoshka representation.
normalize: Whether to normalize the embeddings outputs. normalize: Deprecated, please use use_activation instead.
softmax: softmax will be deprecated, please use use_activation instead. softmax: Deprecated, please use use_activation instead.
activation: activation will be deprecated, please use use_activation instead. activation: Deprecated, please use use_activation instead.
use_activation: Whether to apply activation function to use_activation: Whether to apply activation function to
the classification outputs. the classification outputs.
""" """
...@@ -63,15 +63,15 @@ class PoolingParams( ...@@ -63,15 +63,15 @@ class PoolingParams(
@property @property
def all_parameters(self) -> list[str]: def all_parameters(self) -> list[str]:
return ["dimensions", "normalize", "use_activation"] return ["dimensions", "use_activation"]
@property @property
def valid_parameters(self): def valid_parameters(self):
return { return {
"embed": ["dimensions", "normalize"], "embed": ["dimensions", "use_activation"],
"classify": ["use_activation"], "classify": ["use_activation"],
"score": ["use_activation"], "score": ["use_activation"],
"token_embed": ["dimensions", "normalize"], "token_embed": ["dimensions", "use_activation"],
"token_classify": ["use_activation"], "token_classify": ["use_activation"],
} }
...@@ -162,8 +162,8 @@ class PoolingParams( ...@@ -162,8 +162,8 @@ class PoolingParams(
def _set_default_parameters(self, model_config: Optional["ModelConfig"]): def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
if self.task in ["embed", "token_embed"]: if self.task in ["embed", "token_embed"]:
if self.normalize is None: if self.use_activation is None:
self.normalize = True self.use_activation = True
if self.dimensions is not None and model_config is not None: if self.dimensions is not None and model_config is not None:
if not model_config.is_matryoshka: if not model_config.is_matryoshka:
...@@ -213,7 +213,6 @@ class PoolingParams( ...@@ -213,7 +213,6 @@ class PoolingParams(
return ( return (
f"PoolingParams(" f"PoolingParams("
f"task={self.task}, " f"task={self.task}, "
f"normalize={self.normalize}, "
f"dimensions={self.dimensions}, " f"dimensions={self.dimensions}, "
f"use_activation={self.use_activation}, " f"use_activation={self.use_activation}, "
f"step_tag_id={self.step_tag_id}, " f"step_tag_id={self.step_tag_id}, "
......
...@@ -801,7 +801,7 @@ def get_pooling_config( ...@@ -801,7 +801,7 @@ def get_pooling_config(
logger.info("Found pooling configuration.") logger.info("Found pooling configuration.")
config: dict[str, Any] = {"normalize": normalize} config: dict[str, Any] = {"use_activation": normalize}
for key, val in pooling_dict.items(): for key, val in pooling_dict.items():
if val is True: if val is True:
pooling_type = parse_pooling_type(key) pooling_type = parse_pooling_type(key)
......
...@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend): ...@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
# ROCM paged attention kernel only supports block sizes 16 and 32 # ROCM paged attention kernel only supports block sizes 16 and 32
# due to shared memory (LDS) constraints on AMD GPUs. # due to shared memory (LDS) constraints on AMD GPUs.
# See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro. # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
return [16, 32]
# However, while restricting block sizes to [16, 32] is reasonable for the
# native C++ kernel, vLLM should also support non-standard sizes via the Triton
# path. See this PR: https://github.com/vllm-project/vllm/pull/31380,
# which addresses the Triton kernel under rocm_atten not supporting inference
# for a non-standard qwen3-next model with a block_size of 544.
# We have fixed the Triton kernel so that the standard model uses the original
# bit-addressing logic, while the non-standard model
# uses our optimized kernel logic.
return [16, 32, 544]
@classmethod @classmethod
def get_supported_head_sizes(cls) -> list[int]: def get_supported_head_sizes(cls) -> list[int]:
......
...@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module): ...@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
k: torch.Tensor | None, k: torch.Tensor | None,
p: torch.Tensor | None, p: torch.Tensor | None,
) -> tuple[torch.Tensor, torch.Tensor | None]: ) -> tuple[torch.Tensor, torch.Tensor | None]:
# FIXME: Fix aiter_sampler's accuracy issue and remove this flag
DISABLE_AITER_SAMPLER = True
"""Optimized ROCm/aiter path (same structure as forward_cuda).""" """Optimized ROCm/aiter path (same structure as forward_cuda)."""
if (k is None and p is None) or generators: if (k is None and p is None) or generators:
if generators: if generators:
...@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module): ...@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
"processed_logits", "processed_logits",
"processed_logprobs", "processed_logprobs",
), "aiter sampler does not support returning logits/logprobs." ), "aiter sampler does not support returning logits/logprobs."
if DISABLE_AITER_SAMPLER:
return self.forward_native(logits, generators, k, p)
return self.aiter_sample(logits, k, p, generators), None return self.aiter_sample(logits, k, p, generators), None
def aiter_sample( def aiter_sample(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment