Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

48a9e546 · 王敏 · 6372a1f3 · c11b09df · 48a9e546 · 48a9e546
Commit 48a9e546 authored Sep 07, 2025 by 王敏
20 changed files
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
  }
 }
+template <typename scalar_t, int TOPK, int SPLIT_D, int BLOCK_DIM>
+__global__ void moe_sum_sharedmem_topk8(
+    scalar_t* __restrict__ out,          
+    const scalar_t* __restrict__ input,  
+    const int d) {
+    const int token_idx = blockIdx.x / SPLIT_D;  
+    const int sub_block = blockIdx.x % SPLIT_D;  
+    const int d_per_block = (d + SPLIT_D - 1) / SPLIT_D;
+    const int64_t d_start = sub_block * d_per_block;
+    const int64_t token_offset = token_idx * TOPK * d;
+    const int64_t d_end = min(d_start + d_per_block, d);  
+    __shared__ __align__(16) scalar_t sem_input[TOPK][BLOCK_DIM];
+    for (int64_t idx = d_start + threadIdx.x; idx < d_end; idx += blockDim.x) {
+        sem_input[0][threadIdx.x] = input[token_offset + 0 * d + idx];
+        sem_input[1][threadIdx.x] = input[token_offset + 1 * d + idx];
+        sem_input[2][threadIdx.x] = input[token_offset + 2 * d + idx];
+        sem_input[3][threadIdx.x] = input[token_offset + 3 * d + idx];
+        sem_input[4][threadIdx.x] = input[token_offset + 4 * d + idx];
+        sem_input[5][threadIdx.x] = input[token_offset + 5 * d + idx];
+        sem_input[6][threadIdx.x] = input[token_offset + 6 * d + idx];
+        sem_input[7][threadIdx.x] = input[token_offset + 7 * d + idx];
+        __syncthreads();
+        scalar_t x = sem_input[0][threadIdx.x] + sem_input[1][threadIdx.x] + sem_input[2][threadIdx.x] + 
+          sem_input[3][threadIdx.x] + sem_input[4][threadIdx.x] + sem_input[5][threadIdx.x] + 
+          sem_input[6][threadIdx.x] + sem_input[7][threadIdx.x];
+        out[token_idx * d + idx] = x;
+    }
+}
 template <typename scalar_t>
 __global__ void moe_align_block_size_small_batch_expert_kernel(
    const scalar_t* __restrict__ topk_ids,
@@ -353,6 +382,67 @@ void moe_sum(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
      });
      break;
+    default:
+      at::sum_out(output, input, 1);
+      break;
+  }
+}
+void moe_sum_opt1(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
+             torch::Tensor& output)  // [num_tokens, hidden_size]
+{
+  const int hidden_size = input.size(-1);
+  const auto num_tokens = output.numel() / hidden_size;
+  const int topk = input.size(1);
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  constexpr int splitD_ = 8;
+  const int TOPK8_GRID_DIM = num_tokens * splitD_;
+  constexpr int TOPK8_BLOCK_DIM = 256;
+  dim3 grid_8(TOPK8_GRID_DIM);
+  dim3 block_8(TOPK8_BLOCK_DIM);
+  switch (topk) {
+    case 2:
+      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+        vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
+            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+            hidden_size);
+      });
+      break;
+    case 3:
+      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+        vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
+            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+            hidden_size);
+      });
+      break;
+    case 4:
+      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+        vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
+            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+            hidden_size);
+      });
+      break;
+    case 8:
+      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_sharedmem_topk8", [&]{
+        vllm::moe::moe_sum_sharedmem_topk8<scalar_t, 8, splitD_, TOPK8_BLOCK_DIM><<<grid_8, block_8, 0, stream>>>(
+            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+            hidden_size);
+      });
+      break;
    default:
      at::sum_out(output, input, 1);
      break;

--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& gating_output);
 void moe_sum(torch::Tensor& input, torch::Tensor& output);
+void moe_sum_opt1(torch::Tensor& input, torch::Tensor& output);
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,

--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Calculate the result of moe by summing up the partial results
  // from all selected experts.
  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
+  m.def("moe_sum_opt1(Tensor input, Tensor! output) -> ()");
  m.impl("moe_sum", torch::kCUDA, &moe_sum);
+  m.impl("moe_sum_opt1", torch::kCUDA, &moe_sum_opt1);
  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size.
  m.def(

--- a/setup.py
+++ b/setup.py
@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
            if sha is None:
                sha = get_sha(vllm_root)
            if (major, minor) >= ('2', '5'):
-                version = 'das.opt1.' + sha[:7]
+                version = 'das.opt1.rc1.' + sha[:7]
    else:
        if (major, minor) >= ('2', '5'):
-            version = 'das.opt1'
+            version = 'das.opt1.rc1'
    # dtk version

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 import os
 from ..utils import models_path_prefix
-from vllm.utils import gpuname
-import vllm.envs as envs
 MODELS = [
    os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+    if not current_platform.is_rocm():
-        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
-    else:
        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
+    else:
+        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
    weak_llm = weakref.ref(llm)
    del llm
@@ -111,13 +109,12 @@ def test_models(
                    prompt_embeds = hf_model.get_prompt_embeddings(
                        example_prompts)
-        if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+        if not current_platform.is_rocm():
            with VllmRunner(model,
                        max_model_len=8192,
                        enforce_eager=enforce_eager,
                        enable_prompt_embeds=enable_prompt_embeds,
-                        gpu_memory_utilization=0.7,
+                        gpu_memory_utilization=0.7) as vllm_model:
-                        block_size=64) as vllm_model:
                if enable_prompt_embeds:
                    vllm_outputs = vllm_model.generate_greedy(
                        prompt_embeds, max_tokens)
@@ -131,7 +128,8 @@ def test_models(
                        max_model_len=8192,
                        enforce_eager=enforce_eager,
                        enable_prompt_embeds=enable_prompt_embeds,
-                        gpu_memory_utilization=0.7) as vllm_model:
+                        gpu_memory_utilization=0.7,
+                        block_size=64) as vllm_model:
                if enable_prompt_embeds:
                    vllm_outputs = vllm_model.generate_greedy(
                        prompt_embeds, max_tokens)

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -94,7 +94,7 @@ def test_models(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                block_size=64 if current_platform.is_rocm() else 16,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)
@@ -128,7 +128,7 @@ def test_models_distributed(
 ) -> None:
    with monkeypatch.context() as m:
        m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
-        if (model == "meta-llama/Llama-3.2-1B-Instruct"
+        if (model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
                and distributed_executor_backend == "ray"):
            # test Ray Compiled Graph
            m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
@@ -158,7 +158,7 @@ def test_models_distributed(
                enable_chunked_prefill=enable_chunked_prefill,
                max_num_batched_tokens=max_num_batched_tokens,
                distributed_executor_backend=distributed_executor_backend,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                block_size=64 if current_platform.is_rocm() else 16,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(
                example_prompts,
@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
            max_num_seqs=max_num_seqs,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
+            block_size=64 if current_platform.is_rocm() else 16,
    ) as vllm_model:
        no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)
    with vllm_runner(
-            model,
+        model,
-            max_num_batched_tokens=max_num_batched_tokens,
+        max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=True,
+        enable_chunked_prefill=True,
-            tensor_parallel_size=tensor_parallel_size,
+        tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
+        enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
+        max_num_seqs=max_num_seqs,
-            kv_cache_dtype=kv_cache_dtype,
+        kv_cache_dtype=kv_cache_dtype,
-            disable_async_output_proc=disable_async_output_proc,
+        disable_async_output_proc=disable_async_output_proc,
+        block_size=64 if current_platform.is_rocm() else 16,
    ) as vllm_model:
        chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)
    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
@@ -286,7 +289,7 @@ def test_with_prefix_caching(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                block_size=64 if current_platform.is_rocm() else 16,
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:
@@ -303,7 +306,7 @@ def test_with_prefix_caching(
    )
-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
 Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
 pytest tests/basic_correctness/test_preemption.py`.
 """
+import os
 import pytest
 from prometheus_client import REGISTRY
@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
 from ..models.utils import check_outputs_equal
 from ..utils import models_path_prefix
-import os
+from vllm.platforms import current_platform
 MODELS = [
    os.path.join(models_path_prefix, "distilbert/distilgpt2"),
@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    with vllm_runner(
+    if not current_platform.is_rocm():
-            model,
+        with vllm_runner(
-            dtype=dtype,
+                model,
-            max_num_batched_tokens=max_num_batched_tokens,
+                dtype=dtype,
-            enable_chunked_prefill=enable_chunked_prefill,
+                max_num_batched_tokens=max_num_batched_tokens,
-            max_num_seqs=max_num_seqs,
+                enable_chunked_prefill=enable_chunked_prefill,
-            distributed_executor_backend=distributed_executor_backend,
+                max_num_seqs=max_num_seqs,
-            disable_log_stats=False,
+                distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
+                disable_log_stats=False,
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        ) as vllm_model:
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
+            assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                    < ARTIFICIAL_PREEMPTION_MAX_CNT)
+    else:
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                max_num_batched_tokens=max_num_batched_tokens,
+                enable_chunked_prefill=enable_chunked_prefill,
+                max_num_seqs=max_num_seqs,
+                distributed_executor_backend=distributed_executor_backend,
+                disable_log_stats=False,
+                block_size=64,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                    < ARTIFICIAL_PREEMPTION_MAX_CNT)
    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@@ -115,17 +131,31 @@ def test_preemption(
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    with vllm_runner(
+    if not current_platform.is_rocm():
-            model,
+        with vllm_runner(
-            dtype=dtype,
+                model,
-            disable_log_stats=False,
+                dtype=dtype,
-            distributed_executor_backend=distributed_executor_backend,
+                disable_log_stats=False,
-    ) as vllm_model:
+                distributed_executor_backend=distributed_executor_backend,
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        ) as vllm_model:
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
+            assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
-        total_preemption = (
+                    < ARTIFICIAL_PREEMPTION_MAX_CNT)
-            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+            total_preemption = (
+                vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+    else:
+        with vllm_runner(
+                model,
+                dtype=dtype,
+                disable_log_stats=False,
+                distributed_executor_backend=distributed_executor_backend,
+                block_size=64,
+        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+            assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                    < ARTIFICIAL_PREEMPTION_MAX_CNT)
+            total_preemption = (
+                vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
    check_outputs_equal(
        outputs_0_lst=hf_outputs,
@@ -163,7 +193,7 @@ def test_preemption_infeasible(
    distributed_executor_backend: str,
 ) -> None:
    """Verify infeasible preemption request will be ignored."""
-    BLOCK_SIZE = 16
+    BLOCK_SIZE = 16 if not current_platform.is_rocm() else 64
    prefill_blocks = 2
    decode_blocks = max_tokens // BLOCK_SIZE
    with vllm_runner(

--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import json
 import pytest
@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import (compare_two_settings, create_new_process_for_each_test,
                     multi_gpu_test)
 from .backend import TestBackend
+from ..utils import models_path_prefix
 prompts = [
    "Hello, my name is",
@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")])
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("async_tp_enabled", [True])
 @pytest.mark.parametrize("distributed_backend", ["mp"])

--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -84,16 +84,17 @@ class TestSetting:
        #     method="encode",
        #     fullgraph=True,
        # ),
+        # TODO
        # vision language model
-        TestSetting(
+        # TestSetting(
-            model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
+        #     model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
-            model_args=["--trust-remote-code", "--max-model-len", "2048"],
+        #     model_args=["--trust-remote-code", "--max-model-len", "2048"],
-            pp_size=2,
+        #     pp_size=2,
-            tp_size=1,
+        #     tp_size=1,
-            attn_backend="FLASH_ATTN",
+        #     attn_backend="FLASH_ATTN",
-            method="generate_with_image",
+        #     method="generate_with_image",
-            fullgraph=False,
+        #     fullgraph=False,
-        ),
+        # ),
    ])
 def test_compile_correctness(
    monkeypatch: pytest.MonkeyPatch,

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import pytest
 import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
 from vllm.utils import _is_torch_equal_or_newer
+from ..utils import models_path_prefix
 def test_version():
    assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
    assert not vllm_config.compilation_config.use_cudagraph
-@pytest.mark.parametrize("enabled", [True, False])
+# TODO: when True num_cudagraph_captured=13 
+# @pytest.mark.parametrize("enabled", [True, False])
+@pytest.mark.parametrize("enabled", [False])
 def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
    assert vllm.envs.VLLM_USE_V1
@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
                num_cudagraph_captured=13 if enabled else 0,
            ),
            # loading the model causes compilation (if enabled) to happen
-            vllm_runner('facebook/opt-125m',
+            vllm_runner(os.path.join(models_path_prefix, 'facebook/opt-125m'),
                        compilation_config=compilation_config,
                        gpu_memory_utilization=0.4) as _):
        pass
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
--- a/tests/compile/test_silu_mul_quant_fusion.py
+++ b/tests/compile/test_silu_mul_quant_fusion.py
--- a/tests/config/test_mp_reducer.py
+++ b/tests/config/test_mp_reducer.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 import sys
 from unittest.mock import patch
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.v1.engine.async_llm import AsyncLLM
+from ..utils import models_path_prefix
 def test_mp_reducer(monkeypatch):
@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
    with patch('multiprocessing.reducer.register') as mock_register:
        engine_args = AsyncEngineArgs(
-            model="facebook/opt-125m",
+            model=os.path.join(models_path_prefix, "facebook/opt-125m"),
            max_model_len=32,
            gpu_memory_utilization=0.1,
            disable_log_stats=True,

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.utils import maybe_model_redirect
 from .utils import models_path_prefix
+from vllm.platforms import current_platform
 logger = init_logger(__name__)
@@ -783,7 +784,7 @@ class VllmRunner:
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
-        block_size: int = 16,
+        block_size: int = 16 if not current_platform.is_rocm() else 64,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
 from itertools import cycle
 import pytest
@@ -8,10 +9,8 @@ import pytest
 from vllm import SamplingParams
 from .conftest import get_token_ids_from_llm_generator
-import os
 from ....utils import models_path_prefix
-import vllm.envs as envs
+from vllm.platforms import current_platform
-from vllm.utils import SUPPORT_TC, gpuname
 @pytest.mark.parametrize(
@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
        "enforce_eager": True,
        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if current_platform.is_rocm() else 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
    "per_test_common_llm_kwargs",
    [
        {
-            "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+            "block_size": 64 if current_platform.is_rocm() else 16,
            # Allow only 2 sequences of ~128 tokens in worst case.
            # Note 8 = 128/block_size
@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs",
                         [{
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if current_platform.is_rocm() else 16,
                             "max_num_batched_tokens": 2,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if current_platform.is_rocm() else 16,
                             "max_num_batched_tokens": 3,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if current_platform.is_rocm() else 16,
                             "max_num_batched_tokens": 256,
                             "max_num_seqs": 10,
                         }])
@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
        "enforce_eager": True,
        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if current_platform.is_rocm() else 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
        # Enable prefill cache
@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
        "enforce_eager": True,
        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if current_platform.is_rocm() else 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
        # we keep the blocks small, so that hit eviction quickly
        "max_model_len": 48,
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if current_platform.is_rocm() else 16,
        "num_gpu_blocks_override": 3,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
 from .utils import create_dummy_prompt
 from ..utils import models_path_prefix
-from vllm.utils import SUPPORT_TC, gpuname
+from vllm.platforms import current_platform
-import vllm.envs as envs
 def get_sequence_groups(scheduler_output):
@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
        max_num_seqs=8,
        enable_chunked_prefill=True,
        gpu_memory_utilization=0.8,
-        block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        block_size=64 if current_platform.is_rocm() else 16,
    )
    engine = LLMEngine.from_engine_args(engine_args)

--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
 from vllm.sequence import SequenceGroup
 from ..utils import models_path_prefix
-from vllm.utils import SUPPORT_TC, gpuname
-import vllm.envs as envs
 MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager,
-                        block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
+                        block_size=64 if current_platform.is_rocm() else 16)
    engine: LLMEngine = runner.model.llm_engine
    # In multi-step + chunked-prefill there is no separate single prompt step.

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SequenceGroup, SequenceStatus
+from vllm.platforms import current_platform
 from .utils import (append_new_token, append_new_token_seq,
                    append_new_token_seq_group, create_dummy_prompt,
@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
 def test_scheduler_add_seq_group():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
 def test_scheduler_abort_seq_group():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
 def test_scheduler_schedule_simple():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
 def test_scheduler_prefill_prioritized():
    """Verify running batched tokens are not applied to prefill requests."""
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    max_model_len = 30
    max_batched_num_tokens = 30
    scheduler_config = SchedulerConfig(
@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
 def test_scheduler_schedule_preempt_abort():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    max_model_len = 16
    scheduler_config = SchedulerConfig(
        "generate",
@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
 def test_scheduler_max_seqs():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    num_seq_group = 4
    max_seq_group = 2
    max_model_len = 16
@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
 def test_scheduler_delay_factor():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler_config = SchedulerConfig(
        "generate",
        max_num_batched_tokens=100,
@@ -306,7 +307,7 @@ def initialize_scheduler(
    max_token_budget=1000,
    max_model_len=1000,
    lora_config=None,
-    block_size=4,
+    block_size=4 if not current_platform.is_rocm() else 64,
    num_cpu_blocks=8,
    num_gpu_blocks=8,
    enable_prefix_caching=False,
@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
    """
    Test prompt longer than max_prompt_len is aborted.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
    _, seq_group = create_dummy_prompt("0",
                                       prompt_length=60,
@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
    """
    Test token budget respected.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
    """
    Test max seq respected.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
    """
    Test max lora is respected and prioritized.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
                                     block_size=block_size,
@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
    """
    Test sequence cannot be scheduled due to block manager has no capacity.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_gpu_blocks=128,
                                     num_cpu_blocks=128)
@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
    """
    Test decodes cannot be scheduled and preempted.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64)
@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
    """
    Verify blocks_to_copy is updated.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=4,
                                     num_cpu_blocks=16,
                                     num_gpu_blocks=16)
@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
 def test_schedule_swapped_max_loras():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
                                     block_size=block_size,
@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
 def test_schedule_swapped_cannot_swap_in():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
 def test_infeasible_swap():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
@@ -740,7 +741,7 @@ def test_infeasible_swap():
 def test_schedule_swapped_blocks_to_copy():
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    scheduler = initialize_scheduler(block_size=block_size,
                                     num_cpu_blocks=32,
                                     num_gpu_blocks=32)
@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
    considering prefix caching.
    """
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    max_num_batched_tokens = 12
    max_seq_group = 3
    scheduler = initialize_scheduler(
@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
        block-size aligned).
    """
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_num_batched_tokens = 4
    max_seq_group = 3
    scheduler = initialize_scheduler(
@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
    Test that the scheduler does not schedule batches with prompt tokens and 
    prompt embeddings co-mingled.
    """
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    _seq_id_to_num_tokens_computed.
    """
    # Budget can not schedule in swapped
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    seq_tokens_with_swapped: list[list[int]] = []
    blocks_to_swap_out: list[tuple[int, int]] = []
@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    # Prefill schedule don't have a space for another LoRA, so
    # we ignore this request for now.
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
                                     block_size=block_size,
@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    # Prefill scheduler does not schedule batches with prompt tokens and
    # prompt embeddings co-mingled.
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    #  Prefill scheduler budget num_batched_tokens
    #  >= scheduler_config max_num_batched_tokens
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    seq_tokens_prefill_budget: list[list[int]] = []
@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    assert seq_id_to_num_tokens_computed is None
    # Budget can not schedule in waiting
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    assert seq_id_to_num_tokens_computed is None
    # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    assert seq_id_to_num_tokens_computed is None
    # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
    assert seq_id_to_num_tokens_computed is None
    # Budget can not allocate, AllocStatus is LATER
-    block_size = 2
+    block_size = 2 if not current_platform.is_rocm() else 64
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,

--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -6,6 +6,7 @@ import pytest  # noqa
 from vllm.config import CacheConfig, SchedulerConfig
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import SequenceGroup
+from vllm.platforms import current_platform
 from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
                    get_sequence_groups, schedule_and_update_computed_tokens)
@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
      cross-attention block table
    '''
-    block_size = 4
+    block_size = 4 if not current_platform.is_rocm() else 64
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(

--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -7,8 +7,7 @@ import pytest
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
 from ..utils import models_path_prefix
-import vllm.envs as envs
+from vllm.platforms import current_platform
-from vllm.utils import SUPPORT_TC, gpuname
 @pytest.mark.skip_v1
@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")
-    llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
+    llm = LLM(model=model, block_size=64 if current_platform.is_rocm() else 16)
    sampling_params = SamplingParams(max_tokens=10,
                                     temperature=0.0,
                                     detokenize=False)