Merge tag 'v0.14.0rc2' into v0.14.0rc2-ori

6fa64fbe · zhuwenwen · 7aa5c03c · 7f42dc20 · 6fa64fbe · 6fa64fbe
Commit 6fa64fbe authored Jan 16, 2026 by zhuwenwen
20 changed files
--- a/tests/v1/kv_connector/unit/test_config.py
+++ b/tests/v1/kv_connector/unit/test_config.py
@@ -19,7 +19,8 @@ pytestmark = pytest.mark.cpu_test
        ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
        # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
        ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
-        (None, None, 1, 1, None, None),
+        # When kv_offloading_size is None, offloading is disabled (backend is ignored)
+        ("native", None, 1, 1, None, None),
    ],
 )
 def test_kv_connector(
@@ -62,3 +63,19 @@ def test_kv_connector(
        assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
        # Existing config should be replaced
        assert "existing_key" not in kv_connector_extra_config
+def test_kv_offloading_size_only_uses_native_default():
+    """Check that kv_offloading_size alone switches on native offloading."""
+    # kv_offloading_backend is deliberately left unset so that its default
+    # ("native") is the backend under test.
+    cache_config = CacheConfig(kv_offloading_size=4.0)
+    transfer_config = VllmConfig(cache_config=cache_config).kv_transfer_config
+    extra_config = transfer_config.kv_connector_extra_config
+    assert transfer_config.kv_connector == "OffloadingConnector"
+    assert transfer_config.kv_role == "kv_both"
+    # 4.0 GiB expressed in bytes.
+    assert extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)
--- a/tools/vllm-rocm/pin_rocm_dependencies.py
+++ b/tools/vllm-rocm/pin_rocm_dependencies.py
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Pin vLLM dependencies to exact versions of custom ROCm wheels.
+This script modifies vLLM's requirements files to replace version constraints
+with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).
+This ensures that 'pip install vllm' automatically installs the correct custom wheels
+instead of allowing pip to download different versions from PyPI.
+"""
+import re
+import sys
+from pathlib import Path
+def extract_version_from_wheel(wheel_name: str) -> str:
+    """
+    Extract the version field from a wheel filename.
+    Wheel filenames have the layout
+    ``{distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl``,
+    so the version is the second dash-separated field.
+    Example:
+        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
+        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0
+    Args:
+        wheel_name: File name of the wheel (without any directory part).
+    Returns:
+        The version string exactly as it appears in the file name.
+    Raises:
+        ValueError: If the name has fewer than five dash-separated fields or
+            the version field is empty.
+    """
+    # Strip only a trailing ".whl". str.replace() would also delete the
+    # substring inside the name, e.g. in a local version like "1.0+x.whl.y".
+    parts = wheel_name.removesuffix(".whl").split("-")
+    if len(parts) < 5:
+        raise ValueError(f"Invalid wheel filename format: {wheel_name}")
+    # Version is the second part
+    version = parts[1]
+    if not version:
+        # An empty field would turn into an unusable "package==" pin.
+        raise ValueError(f"Invalid wheel filename format: {wheel_name}")
+    return version
+def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
+    """
+    Scan ``install_dir`` for custom wheels and collect their versions.
+    Exits the process with status 1 if ``install_dir`` does not exist.
+    Wheels whose version cannot be extracted are reported on stderr and skipped.
+    Args:
+        install_dir: Directory that holds the custom-built ``*.whl`` files.
+    Returns:
+        Dict mapping package names to exact versions, ordered as listed in
+        ``package_mapping``.
+    """
+    install_path = Path(install_dir)
+    if not install_path.exists():
+        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
+        sys.exit(1)
+    # Map wheel prefixes to package names
+    # IMPORTANT: Use dashes to avoid matching substrings
+    #            (e.g., 'torch' would match 'torchvision')
+    # ORDER MATTERS: This order is preserved when pinning dependencies
+    #               in requirements files
+    package_mapping = [
+        ("torch-", "torch"),  # Match torch- (not torchvision)
+        ("triton-", "triton"),  # Match triton- (not triton_kernels)
+        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
+        ("torchvision-", "torchvision"),  # Match torchvision-
+        ("torchaudio-", "torchaudio"),  # Match torchaudio-
+        ("amdsmi-", "amdsmi"),  # Match amdsmi-
+        ("flash_attn-", "flash-attn"),  # Match flash_attn-
+        ("aiter-", "aiter"),  # Match aiter-
+    ]
+    versions: dict[str, str] = {}
+    # glob() order depends on the file system. Sorting makes the outcome
+    # reproducible when several wheels of one package are present: the last
+    # file name in sorted order wins.
+    for wheel_file in sorted(install_path.glob("*.whl")):
+        wheel_name = wheel_file.name
+        for prefix, package_name in package_mapping:
+            if not wheel_name.startswith(prefix):
+                continue
+            try:
+                version = extract_version_from_wheel(wheel_name)
+            except ValueError as e:
+                # Best effort: a malformed file name must not abort the scan.
+                print(
+                    f"WARNING: Could not extract version from {wheel_name}: {e}",
+                    file=sys.stderr,
+                )
+            else:
+                versions[package_name] = version
+                print(f"Found {package_name}=={version}", file=sys.stderr)
+            # Prefixes are mutually exclusive, so stop at the first hit.
+            break
+    # Return versions in the order defined by package_mapping
+    return {name: versions[name] for _, name in package_mapping if name in versions}
+def pin_dependencies_in_requirements(
+    requirements_path: str, versions: dict[str, str]
+) -> None:
+    """
+    Insert custom wheel pins at the TOP of requirements file.
+    This ensures that when setup.py processes the file line-by-line,
+    custom wheels (torch, triton, etc.) are encountered FIRST, before
+    any `-r common.txt` includes that might pull in other dependencies.
+    Creates:
+        # Custom ROCm wheel pins (auto-generated)
+        torch==2.9.0a0+git1c57644
+        triton==3.4.0
+        torchvision==0.23.0a0+824e8c8
+        amdsmi==26.1.0+5df6c765
+        -r common.txt
+        ... rest of file ...
+    Every existing requirement line for a pinned package is removed, whatever
+    form it takes (bare name, any version specifier, extras, markers), so the
+    file never carries two entries for one package. A header written by an
+    earlier run is replaced rather than stacked. The file content as it was
+    before this call is saved next to it with a ``.bak`` suffix appended.
+    Exits the process with status 1 if the requirements file does not exist.
+    Args:
+        requirements_path: Path of the requirements file to rewrite in place.
+        versions: Package name -> exact version; pins are written in this order.
+    """
+    requirements_file = Path(requirements_path)
+    if not requirements_file.exists():
+        print(
+            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
+        )
+        sys.exit(1)
+    with open(requirements_file, encoding="utf-8") as f:
+        original_lines = f.readlines()
+    # Backup original file
+    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
+    with open(backup_file, "w", encoding="utf-8") as f:
+        f.writelines(original_lines)
+    # Build header with pinned custom wheels
+    header_comments = [
+        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
+        "# These must come FIRST to ensure correct dependency resolution\n",
+    ]
+    header_lines = [
+        *header_comments,
+        *(f"{name}=={version}\n" for name, version in versions.items()),
+        "\n",  # Blank line separator
+    ]
+    generated_comments = {comment.strip() for comment in header_comments}
+    # One matcher per package: the escaped name with "-", "_" and "." treated
+    # as interchangeable separators, followed by anything that cannot continue
+    # a project name. "torch" therefore matches "torch", "torch>=2",
+    # "torch~=2.0" and "torch[extra]", but not "torchvision" or "torch-foo".
+    matchers = {
+        package_name: re.compile(
+            "[-_.]+".join(map(re.escape, re.split(r"[-_.]+", package_name)))
+            + r"(?![A-Za-z0-9._-])",
+            re.IGNORECASE,
+        )
+        for package_name in versions
+    }
+    # Filter out any existing entries for custom packages from original file
+    filtered_lines = []
+    removed_packages = []
+    had_previous_header = False
+    for line in original_lines:
+        stripped = line.strip()
+        if stripped in generated_comments:
+            # Header comment left by an earlier run; it is regenerated above.
+            had_previous_header = True
+            continue
+        matched_package = None
+        # Comments and pip options ("-r", "-e", "--index-url", ...) are kept.
+        if stripped and not stripped.startswith(("#", "-")):
+            matched_package = next(
+                (
+                    name
+                    for name, matcher in matchers.items()
+                    if matcher.match(stripped)
+                ),
+                None,
+            )
+        if matched_package is None:
+            filtered_lines.append(line)
+        else:
+            removed_packages.append(f"{matched_package}: {stripped}")
+    if had_previous_header and filtered_lines and not filtered_lines[0].strip():
+        # Drop the separator of the previous header so that re-running the
+        # script does not add one more blank line each time.
+        del filtered_lines[0]
+    # Combine: header + filtered original content
+    final_lines = header_lines + filtered_lines
+    # Write modified content
+    with open(requirements_file, "w", encoding="utf-8") as f:
+        f.writelines(final_lines)
+    # Print summary
+    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
+    for package_name, exact_version in versions.items():
+        print(f"  - {package_name}=={exact_version}", file=sys.stderr)
+    if removed_packages:
+        print("\n✓ Removed old package entries:", file=sys.stderr)
+        for pkg in removed_packages:
+            print(f"  - {pkg}", file=sys.stderr)
+    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
+    print(f"  Backup saved: {backup_file}", file=sys.stderr)
+def main():
+    """
+    Command-line entry point: ``<script> <install_dir> <requirements_file>``.
+    Reads the custom wheel versions from ``install_dir`` and pins them at the
+    top of ``requirements_file``. Progress is written to stderr. Exits with
+    status 1 on a usage error or when no custom wheels are found, and with
+    status 0 on success.
+    """
+    if len(sys.argv) != 3:
+        print(
+            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
+        )
+        print(
+            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    install_dir = sys.argv[1]
+    requirements_path = sys.argv[2]
+    print("=" * 70, file=sys.stderr)
+    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
+    print("=" * 70, file=sys.stderr)
+    # Get versions from custom wheels
+    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
+    versions = get_custom_wheel_versions(install_dir)
+    if not versions:
+        # Report the directory that was actually scanned, not a fixed path.
+        print(f"\nERROR: No custom wheels found in {install_dir}!", file=sys.stderr)
+        sys.exit(1)
+    # Pin dependencies in requirements file
+    print(f"\nPatching {requirements_path}...", file=sys.stderr)
+    pin_dependencies_in_requirements(requirements_path, versions)
+    print("\n" + "=" * 70, file=sys.stderr)
+    print("✓ Dependency pinning complete!", file=sys.stderr)
+    print("=" * 70, file=sys.stderr)
+    sys.exit(0)
+if __name__ == "__main__":
+    main()
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -152,13 +152,13 @@ class CacheConfig:
    kv_offloading_size: float | None = None
    """Size of the KV cache offloading buffer in GiB. When TP > 1, this is
    the total buffer size summed across all TP ranks. By default, this is set
-    to None, which means no KV offloading is enabled. When set with
+    to None, which means no KV offloading is enabled. When set, vLLM will
-    kv_offloading_backend, vLLM will enable KV cache offloading to CPU"""
+    enable KV cache offloading to CPU using the kv_offloading_backend."""
-    kv_offloading_backend: KVOffloadingBackend | None = None
+    kv_offloading_backend: KVOffloadingBackend = "native"
    """The backend to use for KV cache offloading. Supported backends include
-    'native' (vLLM native CPU offloading), 'lmcache' This option must be used
+    'native' (vLLM native CPU offloading), 'lmcache'.
-    together with kv_offloading_size."""
+    KV offloading is only activated when kv_offloading_size is set."""
    def compute_hash(self) -> str:
        """

--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -48,7 +48,7 @@ class PoolerConfig:
    ## for embeddings models
    normalize: bool | None = None
    """
-    Whether to normalize the embeddings outputs. Defaults to True.
+    DEPRECATED: please use `use_activation` instead.
    """
    dimensions: int | None = None
    """
@@ -75,11 +75,11 @@ class PoolerConfig:
    ## for classification models
    softmax: float | None = None
    """
-    softmax will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
    """
    activation: float | None = None
    """
-    activation will be deprecated, please use use_activation instead.
+    DEPRECATED: please use `use_activation` instead.
    """
    use_activation: bool | None = None
    """
@@ -164,17 +164,24 @@ class PoolerConfig:
 def get_use_activation(o: object):
-    if softmax := getattr(o, "softmax", None) is not None:
+    if (normalize := getattr(o, "normalize", None)) is not None:
        logger.warning_once(
-            "softmax will be deprecated and will be removed in v0.15. "
+            "`normalize` is deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "Please use `use_activation` instead."
+        )
+        return normalize
+    if (softmax := getattr(o, "softmax", None)) is not None:
+        logger.warning_once(
+            "`softmax` is deprecated and will be removed in v0.15. "
+            "Please use `use_activation` instead."
        )
        return softmax
-    if activation := getattr(o, "activation", None) is not None:
+    if (activation := getattr(o, "activation", None)) is not None:
        logger.warning_once(
-            "activation will be deprecated and will be removed in v0.15. "
+            "`activation` is deprecated and will be removed in v0.15. "
-            "Please use use_activation instead."
+            "Please use `use_activation` instead."
        )
        return activation

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -498,17 +498,15 @@ class VllmConfig:
        Right now, this function reads the offloading settings from
        CacheConfig and configures the KVTransferConfig accordingly.
        """
-        if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None:
+        # KV offloading is only activated when kv_offloading_size is set.
+        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
            return
+        kv_offloading_backend = self.cache_config.kv_offloading_backend
        # If no KVTransferConfig is provided, create a default one.
        if self.kv_transfer_config is None:
            self.kv_transfer_config = KVTransferConfig()
-        if (kv_offloading_size := self.cache_config.kv_offloading_size) is None:
-            raise ValueError(
-                "You must set kv_offloading_size when kv_offloading_backend is set."
-            )
        num_kv_ranks = (
            self.parallel_config.tensor_parallel_size
            * self.parallel_config.pipeline_parallel_size

--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -234,7 +234,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
                lora_id=e.lora_id,
                block_size=e.block_size,
                medium=e.medium,
-                lora_name=e.lora_name,
+                lora_name=getattr(e, "lora_name", None),
            )
            for e in events
        ]

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -578,9 +578,7 @@ class EngineArgs:
    optimization_level: OptimizationLevel = VllmConfig.optimization_level
    kv_offloading_size: float | None = CacheConfig.kv_offloading_size
-    kv_offloading_backend: KVOffloadingBackend | None = (
+    kv_offloading_backend: KVOffloadingBackend = CacheConfig.kv_offloading_backend
-        CacheConfig.kv_offloading_backend
-    )
    tokens_only: bool = False
    def __post_init__(self):

--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -75,7 +75,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
        )
@@ -189,7 +189,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
+            use_activation=self.normalize,
        )

--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -40,7 +40,6 @@ class PoolingCompletionRequest(EmbeddingCompletionRequest):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )
@@ -66,7 +65,6 @@ class PoolingChatRequest(EmbeddingChatRequest):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
-            normalize=self.normalize,
            use_activation=get_use_activation(self),
        )

--- a/vllm/model_executor/layers/pooler/seqwise/heads.py
+++ b/vllm/model_executor/layers/pooler/seqwise/heads.py
@@ -83,7 +83,7 @@ class EmbeddingPoolerHead(SequencePoolerHead):
        # for normalize
        if self.activation is not None:
-            flags = [p.normalize for p in pooling_params]
+            flags = [p.use_activation for p in pooling_params]
            if len(set(flags)) == 1:
                if flags[0]:
                    pooled_data = self.activation(pooled_data)

--- a/vllm/model_executor/layers/pooler/seqwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py
@@ -95,8 +95,8 @@ def pooler_for_embed(pooler_config: PoolerConfig):
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = EmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
        activation=PoolerNormalize(),
    )
@@ -116,9 +116,9 @@ def pooler_for_classify(
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = ClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
        classifier=classifier,
        logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
        activation=resolve_classifier_act_fn(
            model_config, static_num_labels=True, act_fn=act_fn
        ),

--- a/vllm/model_executor/layers/pooler/tokwise/heads.py
+++ b/vllm/model_executor/layers/pooler/tokwise/heads.py
@@ -44,14 +44,14 @@ class TokenPoolerHead(nn.Module, ABC):
 class TokenEmbeddingPoolerHead(TokenPoolerHead):
    def __init__(
        self,
-        projector: ProjectorFn | None = None,
        head_dtype: torch.dtype | str | None = None,
+        projector: ProjectorFn | None = None,
        activation: ActivationFn | None = None,
    ) -> None:
        super().__init__()
-        self.projector = projector
        self.head_dtype = head_dtype
+        self.projector = projector
        self.activation = activation
    def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -79,7 +79,7 @@ class TokenEmbeddingPoolerHead(TokenPoolerHead):
        pooled_data = pooled_data[..., : pooling_param.dimensions]
        # for normalize
-        if self.activation is not None and pooling_param.normalize:
+        if self.activation is not None and pooling_param.use_activation:
            pooled_data = self.activation(pooled_data)
        # pooled_data shape: [n_tokens, embedding_dimension]

--- a/vllm/model_executor/layers/pooler/tokwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py
@@ -95,8 +95,8 @@ def pooler_for_token_embed(pooler_config: PoolerConfig):
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = TokenEmbeddingPoolerHead(
-        projector=_load_st_projector(model_config),
        head_dtype=model_config.head_dtype,
+        projector=_load_st_projector(model_config),
        activation=PoolerNormalize(),
    )
@@ -116,9 +116,9 @@ def pooler_for_token_classify(
    vllm_config = get_current_vllm_config()
    model_config = vllm_config.model_config
    head = TokenClassifierPoolerHead(
+        head_dtype=model_config.head_dtype,
        classifier=classifier,
        logit_bias=model_config.pooler_config.logit_bias,
-        head_dtype=model_config.head_dtype,
        activation=resolve_classifier_act_fn(
            model_config, static_num_labels=False, act_fn=act_fn
        ),

--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -98,7 +98,9 @@ class QuantFP8(CustomOp):
            num_token_padding=self.num_token_padding,
            scale_ub=scale_ub,
            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-            group_shape=self.group_shape if self.static else None,
+            group_shape=(self.group_shape.row, self.group_shape.col)
+            if self.static
+            else None,
        )
    def forward_hip(

--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -116,8 +116,8 @@ class BertPooler(SequencePooler):
        # Use lambdas so that weights are not registered under `self.head`
        self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
            activation=LambdaPoolerActivation(self.act_fn),
        )

--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -309,12 +309,13 @@ class ModernBertPooler(SequencePooler):
            config.hidden_size,
            eps=config.norm_eps,
            bias=config.norm_bias,
+            dtype=head_dtype,
        )
        # Use lambdas so that weights are not registered under `self.head`
        self.head = EmbeddingPoolerHead(
-            projector=lambda x: self.dense(x),
            head_dtype=head_dtype,
+            projector=lambda x: self.dense(x),
            activation=LambdaPoolerActivation(lambda x: self.norm(self.act(x))),
        )

--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -26,9 +26,9 @@ class PoolingParams(
            Set to None to disable truncation.
        dimensions: Reduce the dimensions of embeddings
            if model support matryoshka representation.
-        normalize: Whether to normalize the embeddings outputs.
+        normalize: Deprecated, please use use_activation instead.
-        softmax: softmax will be deprecated, please use use_activation instead.
+        softmax: Deprecated, please use use_activation instead.
-        activation: activation will be deprecated, please use use_activation instead.
+        activation: Deprecated, please use use_activation instead.
        use_activation: Whether to apply activation function to
            the classification outputs.
    """
@@ -63,15 +63,15 @@ class PoolingParams(
    @property
    def all_parameters(self) -> list[str]:
-        return ["dimensions", "normalize", "use_activation"]
+        return ["dimensions", "use_activation"]
    @property
    def valid_parameters(self):
        return {
-            "embed": ["dimensions", "normalize"],
+            "embed": ["dimensions", "use_activation"],
            "classify": ["use_activation"],
            "score": ["use_activation"],
-            "token_embed": ["dimensions", "normalize"],
+            "token_embed": ["dimensions", "use_activation"],
            "token_classify": ["use_activation"],
        }
@@ -162,8 +162,8 @@ class PoolingParams(
    def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
        if self.task in ["embed", "token_embed"]:
-            if self.normalize is None:
+            if self.use_activation is None:
-                self.normalize = True
+                self.use_activation = True
            if self.dimensions is not None and model_config is not None:
                if not model_config.is_matryoshka:
@@ -213,7 +213,6 @@ class PoolingParams(
        return (
            f"PoolingParams("
            f"task={self.task}, "
-            f"normalize={self.normalize}, "
            f"dimensions={self.dimensions}, "
            f"use_activation={self.use_activation}, "
            f"step_tag_id={self.step_tag_id}, "

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -801,7 +801,7 @@ def get_pooling_config(
        logger.info("Found pooling configuration.")
-        config: dict[str, Any] = {"normalize": normalize}
+        config: dict[str, Any] = {"use_activation": normalize}
        for key, val in pooling_dict.items():
            if val is True:
                pooling_type = parse_pooling_type(key)

--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
        # ROCM paged attention kernel only supports block sizes 16 and 32
        # due to shared memory (LDS) constraints on AMD GPUs.
        # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
-        return [16, 32]
+        # However, while the [16, 32] limitation is reasonable for the native C++
+        # kernel, vLLM should allow non-standard sizes via the Triton path.
+        # This is addressed in https://github.com/vllm-project/vllm/pull/31380,
+        # where the Triton kernel under rocm_attn does not support inference
+        # for a non-standard qwen3-next model with a block_size of 544.
+        # We have fixed the Triton kernel so that standard models use the
+        # original bit-addressing logic, while non-standard models use our
+        # optimized kernel logic.
+        return [16, 32, 544]
    @classmethod
    def get_supported_head_sizes(cls) -> list[int]:

--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -174,6 +174,8 @@ class TopKTopPSampler(nn.Module):
        k: torch.Tensor | None,
        p: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # FIXME: Fix aiter_sampler's accuracy issue and remove this flag
+        DISABLE_AITER_SAMPLER = True
        """Optimized ROCm/aiter path (same structure as forward_cuda)."""
        if (k is None and p is None) or generators:
            if generators:
@@ -186,6 +188,8 @@ class TopKTopPSampler(nn.Module):
            "processed_logits",
            "processed_logprobs",
        ), "aiter sampler does not support returning logits/logprobs."
+        if DISABLE_AITER_SAMPLER:
+            return self.forward_native(logits, generators, k, p)
        return self.aiter_sample(logits, k, p, generators), None
    def aiter_sample(