remove unused code

84dfdb17 · zhuwenwen · f137e58c · 84dfdb17 · 84dfdb17 · f137e58c
Commit 84dfdb17 authored Jul 31, 2025 by zhuwenwen
20 changed files
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -106,79 +106,6 @@ __global__ void count_and_sort_expert_tokens_kernel(
  }
 }

-// taken from
-// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
-template <typename scalar_t>
-__global__ void sgl_ep_moe_align_block_size_kernel(
-    scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
-    int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
-    int32_t block_size, size_t numel, int32_t* cumsum,
-    int32_t start_expert, int32_t end_expert) {
-  __shared__ int32_t shared_counts[32][8];
-  __shared__ int32_t local_offsets[256];
-
-  const int warp_id = threadIdx.x / 32;
-  const int lane_id = threadIdx.x % 32;
-  const int experts_per_warp = 8;
-  const int my_expert_start = warp_id * experts_per_warp;
-
-  for (int i = 0; i < experts_per_warp; ++i) {
-    if (my_expert_start + i < num_experts) {
-      shared_counts[warp_id][i] = 0;
-    }
-  }
-
-  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
-  const size_t start_idx = threadIdx.x * tokens_per_thread;
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int expert_id = topk_ids[i];
-    if (expert_id >= start_expert && expert_id < end_expert) {
-      expert_id -= start_expert;
-      int warp_idx = expert_id / experts_per_warp;
-      int expert_offset = expert_id % experts_per_warp;
-      atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
-    }
-  }
-
-  __syncthreads();
-
-  if (threadIdx.x == 0) {
-    cumsum[0] = 0;
-    for (int i = 1; i <= num_experts; ++i) {
-      int expert_count = 0;
-      int warp_idx = (i - 1) / experts_per_warp;
-      int expert_offset = (i - 1) % experts_per_warp;
-      expert_count = shared_counts[warp_idx][expert_offset];
-
-      cumsum[i] =
-          cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
-    }
-    *total_tokens_post_pad = cumsum[num_experts];
-  }
-
-  __syncthreads();
-
-  if (threadIdx.x < num_experts) {
-    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
-         i += block_size) {
-      expert_ids[i / block_size] = threadIdx.x;
-    }
-    local_offsets[threadIdx.x] = cumsum[threadIdx.x];
-  }
-
-  __syncthreads();
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
-    int32_t expert_id = topk_ids[i];
-    if (expert_id >= start_expert && expert_id < end_expert) {
-      expert_id -= start_expert;
-      int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
-      sorted_token_ids[rank_post_pad] = i;
-    }
-  }
-}
-
 template <typename scalar_t, int TOPK>
 __global__ void moe_sum_kernel(
    scalar_t* __restrict__ out,          // [..., d]

--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -250,13 +250,6 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                              int64_t head_size, torch::Tensor& cos_sin_cache,
                              bool is_neox, int64_t rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);
-void rotary_embedding_tgi(
-  torch::Tensor& query,
-  torch::Tensor& key,
-  int64_t head_size,
-  torch::Tensor& cos_cache,
-  torch::Tensor& sin_cache,
-  bool is_neox);

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);


--- a/csrc/pos_encoding_tgi_kernels.cu
+++ b/csrc/pos_encoding_tgi_kernels.cu
-#include <torch/all.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
-
-namespace vllm {
-
-template<typename scalar_t, bool IS_NEOX>
-inline __device__ void apply_token_rotary_embedding_tgi(
-  scalar_t* __restrict__ arr,
-  const float* __restrict__ cos_ptr,
-  const float* __restrict__ sin_ptr,
-  int rot_offset,
-  int embed_dim)
-{
-  int x_index, y_index;
-  float cos, sin;
-  if (IS_NEOX) {
-    // GPT-NeoX style rotary embedding.
-    x_index = rot_offset;
-    y_index = embed_dim + rot_offset;
-    cos = VLLM_LDG(cos_ptr + x_index);
-    sin = VLLM_LDG(sin_ptr + x_index);
-  } else {
-    // GPT-J style rotary embedding.
-    x_index = 2 * rot_offset;
-    y_index = 2 * rot_offset + 1;
-    cos = VLLM_LDG(cos_ptr + x_index / 2);
-    sin = VLLM_LDG(sin_ptr + x_index / 2);
-  }
-
-  const scalar_t x = arr[x_index];
-  const scalar_t y = arr[y_index];
-  arr[x_index] = x * cos - y * sin;
-  arr[y_index] = y * cos + x * sin;
-}
-
-template<typename scalar_t, bool IS_NEOX>
-inline __device__ void apply_rotary_embedding_tgi(
-  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
-  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
-  const float* __restrict__ cos_ptr,   // [max_position, 1, rot_dim]
-  const float* __restrict__ sin_ptr,   // [max_position, 1, rot_dim]
-  const int head_size,
-  const int num_heads,
-  const int num_kv_heads,
-  const int rot_dim,
-  const int token_idx,
-  const int64_t query_stride,
-  const int64_t key_stride)
-{
-  const int nq = num_heads * rot_dim;
-  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
-    const int head_idx = i / rot_dim;
-    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
-    const int rot_offset = i % rot_dim;
-    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
-                                              sin_ptr, rot_offset, rot_dim);
-  }
-
-  const int nk = num_kv_heads * rot_dim;
-  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
-    const int head_idx = i / rot_dim;
-    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-    const int rot_offset = i % rot_dim;
-    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
-                                              sin_ptr, rot_offset, rot_dim);
-  }
-}
-
-template<typename scalar_t, bool IS_NEOX>
-__global__ void rotary_embedding_tgi_kernel(
-  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
-  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
-  const float* __restrict__ cos_cache,   // [max_position, 1, rot_dim]
-  const float* __restrict__ sin_cache,   // [max_position, 1, rot_dim]
-  const int rot_dim,
-  const int64_t query_stride,
-  const int64_t key_stride,
-  const int num_heads,
-  const int num_kv_heads,
-  const int head_size) {
-  // Each thread block is responsible for one token.
-  const int token_idx = blockIdx.x;
-
-  const float* cos_ptr = cos_cache + token_idx * rot_dim;
-  const float* sin_ptr = sin_cache + token_idx * rot_dim;
-
-  apply_rotary_embedding_tgi<scalar_t, IS_NEOX>(query, key, cos_ptr, sin_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
-}
-
-} // namespace vllm
-
-void rotary_embedding_tgi(
-  torch::Tensor& query,             // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
-  torch::Tensor& key,               // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
-  int64_t head_size,
-  torch::Tensor& cos_cache,
-  torch::Tensor& sin_cache,
-  bool is_neox) {
-  int num_tokens = query.size(0);
-  int rot_dim = cos_cache.size(2);
-  int num_heads = query.size(1);
-  int num_kv_heads = key.size(1);
-  int query_stride = query.stride(0);
-  int key_stride = key.stride(0);
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * rot_dim / 2, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
-    query.scalar_type(),
-    "rotary_embedding_tgi",
-    [&] {
-      if (is_neox) {
-        vllm::rotary_embedding_tgi_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
-          query.data_ptr<scalar_t>(),
-          key.data_ptr<scalar_t>(),
-          cos_cache.data_ptr<float>(),
-          sin_cache.data_ptr<float>(),
-          rot_dim,
-          query_stride,
-          key_stride,
-          num_heads,
-          num_kv_heads,
-          head_size);
-      } else {
-        vllm::rotary_embedding_tgi_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
-          query.data_ptr<scalar_t>(),
-          key.data_ptr<scalar_t>(),
-          cos_cache.data_ptr<float>(),
-          sin_cache.data_ptr<float>(),
-          rot_dim,
-          query_stride,
-          key_stride,
-          num_heads,
-          num_kv_heads,
-          head_size);
-      }
-    });
-}
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -391,15 +391,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);

-  // Rotary embedding TGI for TGI
-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
-  ops.def(
-      "rotary_embedding_tgi(Tensor! query, Tensor! key,"
-      "                 int head_size, Tensor cos_cache,"
-      "                 Tensor sin_cache, bool is_neox) -> ()");
-//   ops.def("rotary_embedding_tgi",&rotary_embedding_tgi);
-  ops.impl("rotary_embedding_tgi", torch::kCUDA, &rotary_embedding_tgi);
-
  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key
  // (supports multiple loras).
  ops.def(

--- a/examples/offline_inference/qwen3_reranker.py
+++ b/examples/offline_inference/qwen3_reranker.py
@@ -35,6 +35,7 @@ def get_llm() -> LLM:
        },
    )

+
 # Why do we need hf_overrides for the official original version:
 # vllm converts it to Qwen3ForSequenceClassification when loaded for
 # better performance.

--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -8,7 +8,7 @@ image:
  # -- Image tag
  tag: "latest"
  # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]

 # -- Container port
 containerPort: 8000

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -41,7 +41,7 @@ def v1(run_with_both_engines):

 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+    if envs.VLLM_USE_FLASH_ATTN_PA:
        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
    else:
        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
@@ -111,7 +111,7 @@ def test_models(
                    prompt_embeds = hf_model.get_prompt_embeddings(
                        example_prompts)

-        if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+        if envs.VLLM_USE_FLASH_ATTN_PA:
            with VllmRunner(model,
                        max_model_len=8192,
                        enforce_eager=enforce_eager,

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -86,6 +86,20 @@ def test_models(
        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

+        if envs.VLLM_USE_FLASH_ATTN_PA:
+            with vllm_runner(
+                    model,
+                    dtype=dtype,
+                    max_num_batched_tokens=max_num_batched_tokens,
+                    enable_chunked_prefill=True,
+                    tensor_parallel_size=tensor_parallel_size,
+                    enforce_eager=enforce_eager,
+                    max_num_seqs=max_num_seqs,
+                    block_size=64,
+            ) as vllm_model:
+                vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                        max_tokens)
+        else:
            with vllm_runner(
                    model,
                    dtype=dtype,
@@ -94,7 +108,6 @@ def test_models(
                    tensor_parallel_size=tensor_parallel_size,
                    enforce_eager=enforce_eager,
                    max_num_seqs=max_num_seqs,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
            ) as vllm_model:
                vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                        max_tokens)
@@ -150,6 +163,22 @@ def test_models_distributed(
        # will hurt multiprocessing backend with
        # fork method (the default method).

+        if envs.VLLM_USE_FLASH_ATTN_PA:
+            with vllm_runner(
+                    model,
+                    dtype=dtype,
+                    tensor_parallel_size=2,
+                    max_num_seqs=max_num_seqs,
+                    enable_chunked_prefill=enable_chunked_prefill,
+                    max_num_batched_tokens=max_num_batched_tokens,
+                    distributed_executor_backend=distributed_executor_backend,
+                    block_size=64,
+            ) as vllm_model:
+                vllm_outputs = vllm_model.generate_greedy(
+                    example_prompts,
+                    max_tokens,
+                )
+        else:
            with vllm_runner(
                model,
                dtype=dtype,
@@ -158,7 +187,6 @@ def test_models_distributed(
                enable_chunked_prefill=enable_chunked_prefill,
                max_num_batched_tokens=max_num_batched_tokens,
                distributed_executor_backend=distributed_executor_backend,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
            ) as vllm_model:
                vllm_outputs = vllm_model.generate_greedy(
                    example_prompts,
@@ -277,6 +305,25 @@ def test_with_prefix_caching(
    max_num_batched_tokens = max_num_seqs = chunk_size
    outputs = {}  # type: ignore
    for enable in (True, False):
+        if envs.VLLM_USE_FLASH_ATTN_PA:
+            with vllm_runner(
+                    model,
+                    dtype=dtype,
+                    max_num_batched_tokens=max_num_batched_tokens,
+                    enable_chunked_prefill=True,
+                    enable_prefix_caching=enable,
+                    tensor_parallel_size=tensor_parallel_size,
+                    enforce_eager=enforce_eager,
+                    max_num_seqs=max_num_seqs,
+                    block_size=64,
+            ) as vllm_model:
+                outputs[enable] = []
+                for prompt in full_prompts:
+                    outputs[enable] += vllm_model.generate_greedy(
+                        [prompt],
+                        max_tokens,
+                    )
+        else:
            with vllm_runner(
                model,
                dtype=dtype,
@@ -286,7 +333,6 @@ def test_with_prefix_caching(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
-                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
            ) as vllm_model:
                outputs[enable] = []
                for prompt in full_prompts:

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-import os
-from ..utils import models_path_prefix
-
-TEST_MODELS = [
-    (os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
-    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
-        "dtype": torch.float16,
-        "quantization": "compressed-tensors"
-    }),
-    # (os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"), {
-    #     "dtype": torch.float16,
-    #     "quantization": "fp8"
-    # }),
-    (os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"), {
-        "quantization": "compressed-tensors"
-    }),
-    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), {}),
-]
-
-if is_quant_method_supported("aqlm"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
-        "quantization": "aqlm"
-    }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
-        "quantization": "gguf"
-    }))
-
-if is_quant_method_supported("gptq"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), {
-        "quantization": "gptq"
-    }))
-
-# if is_quant_method_supported("gptq_marlin"):
-#     TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
-#         "quantization": "gptq_marlin"
-#     }))
-
-# if is_quant_method_supported("gptq_marlin_24"):
-#     TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
-#         "quantization": "gptq_marlin_24"
-#     }))
-
-# if is_quant_method_supported("marlin"):
-#     TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
-#         "quantization": "marlin"
-#     }))
-
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
-    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
-        "quantization": "AWQ"
-    }))
-
-
-def check_full_graph_support(model,
-                             model_kwargs,
-                             optimization_level,
-                             tp_size=1):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
-    print(f"MODEL={model}")
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model=model,
-              enforce_eager=True,
-              tensor_parallel_size=tp_size,
-              disable_custom_all_reduce=True,
-              compilation_config=optimization_level,
-              **model_kwargs)
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -24,7 +24,7 @@ from vllm.utils import SUPPORT_TC, gpuname
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@@ -107,7 +107,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
    "per_test_common_llm_kwargs",
    [
        {
-            "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+            "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,

            # Allow only 2 sequences of ~128 tokens in worst case.
            # Note 8 = 128/block_size
@@ -200,15 +200,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
    ])
 @pytest.mark.parametrize("per_test_common_llm_kwargs",
                         [{
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
                             "max_num_batched_tokens": 2,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
                             "max_num_batched_tokens": 3,
                             "max_num_seqs": 2,
                         }, {
-                             "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+                             "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
                             "max_num_batched_tokens": 256,
                             "max_num_seqs": 10,
                         }])
@@ -274,7 +274,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
        "num_gpu_blocks_override": 5 * (64 + 1),

        # Enable prefill cache
@@ -355,7 +355,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
        "num_gpu_blocks_override": 5 * (64 + 1),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@@ -430,7 +430,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,

        # we keep the blocks small, so that hit eviction quickly
        "max_model_len": 48,
-        "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        "block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
        "num_gpu_blocks_override": 3,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -844,7 +844,17 @@ def test_chunked_prefill_with_actual_engine(model: str,
    """

    prompt = "hello" * 40
-
+    if envs.VLLM_USE_FLASH_ATTN_PA:
+        engine_args = EngineArgs(
+            model=model,
+            max_num_partial_prefills=max_num_partial_prefills,
+            max_num_batched_tokens=40,
+            max_num_seqs=8,
+            enable_chunked_prefill=True,
+            gpu_memory_utilization=0.8,
+            block_size=64,
+        )
+    else:
        engine_args = EngineArgs(
        model=model,
        max_num_partial_prefills=max_num_partial_prefills,
@@ -852,7 +862,6 @@ def test_chunked_prefill_with_actual_engine(model: str,
        max_num_seqs=8,
        enable_chunked_prefill=True,
        gpu_memory_utilization=0.8,
-        block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
    )

    engine = LLMEngine.from_engine_args(engine_args)

--- a/tests/detokenizer/test_disable_detokenization.py
+++ b/tests/detokenizer/test_disable_detokenization.py
@@ -23,7 +23,7 @@ def test_computed_prefix_blocks(model: str):
        "paper clips? Is there an easy to follow video tutorial available "
        "online for free?")

-    llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
+    llm = LLM(model=model, block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16)
    sampling_params = SamplingParams(max_tokens=10,
                                     temperature=0.0,
                                     detokenize=False)

--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -13,7 +13,7 @@ import vllm.envs as envs


 @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
-@pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
+@pytest.mark.parametrize("block_size", [64] if envs.VLLM_USE_FLASH_ATTN_PA else [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
    # without triggering asserts.

--- a/tests/engine/test_executor.py
+++ b/tests/engine/test_executor.py
@@ -60,7 +60,7 @@ def test_custom_executor(model, tmp_path):
            model=model,
            distributed_executor_backend=CustomUniExecutor,
            enforce_eager=True,  # reduce test time
-            block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+            block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
        )
        engine = LLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)
@@ -84,7 +84,7 @@ def test_custom_executor_async(model, tmp_path):
            model=model,
            distributed_executor_backend=CustomUniExecutorAsync,
            enforce_eager=True,  # reduce test time
-            block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+            block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
        )
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        sampling_params = SamplingParams(max_tokens=1)
@@ -111,7 +111,7 @@ def test_respect_ray(model):
        model=model,
        distributed_executor_backend="ray",
        enforce_eager=True,  # reduce test time
-        block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
+        block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
    )
    engine = LLMEngine.from_engine_args(engine_args)
    assert engine.model_executor.uses_ray
\ No newline at end of file
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Any, List, Optional
-
-import pytest
-import os
-
-from vllm import CompletionOutput, LLMEngine, SamplingParams
-from ..utils import models_path_prefix
-
-MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
-MAX_TOKENS = 200
-
-IS_ASYNC = False
-
-
-@pytest.fixture(scope="session")
-def vllm_model(vllm_runner):
-    with vllm_runner(MODEL) as vllm_model:
-        yield vllm_model
-
-
-def _test_stopping(llm_engine: LLMEngine,
-                   expected_output: str,
-                   expected_reason: Any,
-                   stop: Optional[List[str]] = None,
-                   stop_token_ids: Optional[List[int]] = None,
-                   include_in_output: bool = False,
-                   use_async_output_proc: bool = False) -> None:
-    llm_engine.add_request(
-        "id", "A story about vLLM:\n",
-        SamplingParams(
-            temperature=0.0,
-            max_tokens=MAX_TOKENS,
-            stop=stop,
-            stop_token_ids=stop_token_ids,
-            include_stop_str_in_output=include_in_output,
-        ), None)
-
-    output: Optional[CompletionOutput] = None
-    output_text = ""
-    stop_reason = None
-
-    if use_async_output_proc:
-        llm_engine.step()
-
-    while llm_engine.has_unfinished_requests():
-        (request_output, ) = llm_engine.step()
-        (output, ) = request_output.outputs
-
-        # Ensure we don't backtrack
-        assert output.text.startswith(output_text)
-        output_text = output.text
-        stop_reason = output.stop_reason
-
-    assert output is not None
-    assert output_text == expected_output
-    assert stop_reason == expected_reason
-
-
-def _set_async_mode(llm_engine, is_async):
-    llm_engine.scheduler[0].use_async_output_proc = is_async
-
-
-def _stop_basic(llm_engine, is_async):
-    _test_stopping(llm_engine,
-                   stop=["."],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=".",
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop=["."],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization.",
-                   expected_reason=".",
-                   use_async_output_proc=is_async)
-
-
-def _stop_multi_tokens(llm_engine, is_async):
-    _test_stopping(
-        llm_engine,
-        stop=["group of peo", "short"],
-        include_in_output=False,
-        expected_output="VLLM is a 100% volunteer organization. We are a ",
-        expected_reason="group of peo",
-        use_async_output_proc=is_async)
-
-    _test_stopping(
-        llm_engine,
-        stop=["group of peo", "short"],
-        include_in_output=True,
-        expected_output=
-        "VLLM is a 100% volunteer organization. We are a group of peo",
-        expected_reason="group of peo",
-        use_async_output_proc=is_async)
-
-
-def _stop_partial_token(llm_engine, is_async):
-    _test_stopping(llm_engine,
-                   stop=["gani"],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer or",
-                   expected_reason="gani",
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop=["gani"],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organi",
-                   expected_reason="gani",
-                   use_async_output_proc=is_async)
-
-
-def _stop_token_id(llm_engine, is_async):
-    # token id 13013 => " organization"
-
-    _test_stopping(llm_engine,
-                   stop_token_ids=[13013],
-                   include_in_output=False,
-                   expected_output="VLLM is a 100% volunteer",
-                   expected_reason=13013,
-                   use_async_output_proc=is_async)
-
-    _test_stopping(llm_engine,
-                   stop_token_ids=[13013],
-                   include_in_output=True,
-                   expected_output="VLLM is a 100% volunteer organization",
-                   expected_reason=13013,
-                   use_async_output_proc=is_async)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_basic(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_basic(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_basic(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_multi_tokens(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_partial_token(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_partial_token(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_partial_token(vllm_model.model.llm_engine, is_async=False)
-
-
-@pytest.mark.skip_global_cleanup
-def test_stop_token_id(vllm_model):
-    _set_async_mode(vllm_model.model.llm_engine, True)
-    _stop_token_id(vllm_model.model.llm_engine, is_async=True)
-
-    _set_async_mode(vllm_model.model.llm_engine, False)
-    _stop_token_id(vllm_model.model.llm_engine, is_async=False)
--- a/tests/entrypoints/openai/test_lora_lineage.py
+++ b/tests/entrypoints/openai/test_lora_lineage.py
-import json
-
-import openai  # use the official client for correctness check
-import pytest
-import os
-import pytest_asyncio
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
-
-from ...utils import RemoteOpenAIServer, models_path_prefix
-
-# any model with a chat template should work here
-MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
-LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")
-
-
-@pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def server_with_lora_modules_json(zephyr_lora_files):
-    # Define the json format LoRA module configurations
-    lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        # lora config below
-        "--enable-lora",
-        "--lora-modules",
-        json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-        "--max-num-seqs",
-        "64",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client_for_lora_lineage(server_with_lora_modules_json):
-    async with server_with_lora_modules_json.get_async_client(
-    ) as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
-                                  zephyr_lora_files):
-    models = await client_for_lora_lineage.models.list()
-    models = models.data
-    served_model = models[0]
-    lora_models = models[1:]
-    assert served_model.id == MODEL_NAME
-    assert served_model.root == MODEL_NAME
-    assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files
-               for lora_model in lora_models)
-    assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
--- a/tests/kernels/attention/test_blocksparse_attention.py
+++ b/tests/kernels/attention/test_blocksparse_attention.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import random
-from typing import Optional
-
-import pytest
-import torch
-
-from tests.kernels.allclose_default import get_default_atol, get_default_rtol
-from vllm import _custom_ops as ops
-from vllm.attention.ops.blocksparse_attention.interface import (
-    LocalStridedBlockSparseAttn)
-from vllm.platforms import current_platform
-from vllm.utils import get_max_shared_memory_bytes
-
-FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
-# This will change depending on the compute capability.
-# - 512 as a buffer
-MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-# MAX_SEQ_LEN = 2771
-
-# There may not be enough gpu memory due to large NUM_BLOCKS.
-# Reduce NUM_BLOCKS when it happens.
-NUM_BLOCKS = 4321  # Arbitrary values for testing
-PARTITION_SIZE = 512
-DTYPES = [torch.half, torch.bfloat16]
-NUM_GEN_SEQS = [3]  # Arbitrary values for testing
-NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
-NUM_HEADS = [(40, 40)]  # Arbitrary values for testing
-
-HEAD_SIZES = [64, 112]
-BLOCK_SIZES = [16]
-USE_ALIBI = [False, True]
-KV_CACHE_DTYPE = ["auto", "fp8"] if not current_platform.is_rocm() else ["auto"]
-SEEDS = [0]
-CUDA_DEVICES = ['cuda:0']
-BLOCKSPARSE_LOCAL_BLOCKS = [16]
-BLOCKSPARSE_VERT_STRIDES = [8]
-
-BLOCKSPARSE_BLOCK_SIZES = [64]
-BLOCKSPARSE_HEADS_SLIDINGS = [2, -1]
-BLOCKSPARSE_HOMO_HEADS = [True, False]
-
-
-def ref_masked_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    scale: float,
-    attn_mask: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
-    if attn_mask is not None:
-        attn_weights = attn_weights + attn_mask.float()
-    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
-    out = torch.einsum("hqk,khd->qhd", attn_weights, value)
-    return out
-
-
-def ref_single_query_cached_kv_attention(
-    output: torch.Tensor,
-    query: torch.Tensor,
-    num_queries_per_kv: int,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    block_tables: torch.Tensor,
-    seq_lens: torch.Tensor,
-    scale: float,
-    alibi_slopes: Optional[torch.Tensor],
-    tp_rank: int = 0,
-    blocksparse_local_blocks: int = 0,
-    blocksparse_vert_stride: int = 1,
-    blocksparse_block_size: int = 64,
-    blocksparse_head_sliding_step: int = 0,
-) -> None:
-    num_query_heads = query.shape[1]
-    num_kv_heads = value_cache.shape[1]
-    head_size = value_cache.shape[2]
-    block_size = value_cache.shape[3]
-    num_seqs = query.shape[0]
-
-    block_tables_lst = block_tables.cpu().tolist()
-    seq_lens_lst = seq_lens.cpu().tolist()
-    for i in range(num_seqs):
-        q = query[i].unsqueeze(0)
-        block_table = block_tables_lst[i]
-        seq_len = int(seq_lens_lst[i])
-
-        keys_lst: list[torch.Tensor] = []
-        values_lst: list[torch.Tensor] = []
-        for j in range(seq_len):
-            block_number = int(block_table[j // block_size])
-            block_offset = j % block_size
-
-            k = key_cache[block_number, :, :, block_offset, :]
-            k = k.reshape(num_kv_heads, head_size)
-            keys_lst.append(k)
-
-            v = value_cache[block_number, :, :, block_offset]
-            values_lst.append(v)
-        keys = torch.stack(keys_lst, dim=0)
-        values = torch.stack(values_lst, dim=0)
-        if num_queries_per_kv > 1:
-            # Handle MQA and GQA
-            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
-            values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
-
-        alibi_bias = None
-        if alibi_slopes is not None:
-            # Create the ALiBi bias used in the paged attention kernel.
-            position_ids = torch.arange(seq_len).int()
-            alibi_bias = (position_ids - seq_len + 1).float()
-            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
-                1, 1, -1)
-
-        if blocksparse_vert_stride >= 1:
-            bsize = blocksparse_block_size
-            hsliding = blocksparse_head_sliding_step
-            vert = blocksparse_vert_stride
-            locals = blocksparse_local_blocks
-            qb = (seq_len - 1) // bsize
-            attn_mask = q.new_zeros(
-                (num_query_heads, 1, seq_len)).float() - torch.inf
-            for h in range(num_query_heads):
-                if hsliding >= 0:  # slide with q heads
-                    bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1
-                else:  # slide with kv heads
-                    bs_offset = (tp_rank * num_kv_heads +
-                                 h // num_queries_per_kv) * (-hsliding) + 1
-                for kb in range(qb + 1):
-                    kj = kb * bsize
-                    if (qb - kb) < locals or \
-                        (kb + bs_offset) % vert == 0:
-                        attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0
-            if alibi_bias is not None:
-                attn_mask += alibi_bias
-        else:
-            attn_mask = alibi_bias
-
-        out = ref_masked_attention(q, keys, values, scale, attn_mask=attn_mask)
-        out = out.view(num_query_heads, head_size)
-        output[i].copy_(out, non_blocking=True)
-
-
-@pytest.mark.parametrize("version", ["v1", "v2"])
-@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("use_alibi", USE_ALIBI)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
-@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
-@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
-@pytest.mark.parametrize("blocksparse_head_sliding_step",
-                         BLOCKSPARSE_HEADS_SLIDINGS)
-def test_paged_attention(
-    kv_cache_factory,
-    version: str,
-    num_seqs: int,
-    num_heads: tuple[int, int],
-    head_size: int,
-    use_alibi: bool,
-    block_size: int,
-    dtype: torch.dtype,
-    kv_cache_dtype: str,
-    seed: int,
-    device: str,
-    blocksparse_local_blocks: int,
-    blocksparse_vert_stride: int,
-    blocksparse_block_size: int,
-    blocksparse_head_sliding_step: int,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    scale = float(1.0 / (head_size**0.5))
-    num_query_heads, num_kv_heads = num_heads
-    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
-    query.uniform_(-scale, scale)
-
-    assert num_query_heads % num_kv_heads == 0
-    num_queries_per_kv = num_query_heads // num_kv_heads
-    alibi_slopes = None
-    if use_alibi:
-        alibi_slopes = torch.rand(num_query_heads, dtype=torch.float)
-
-    seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
-    seq_lens[-1] = MAX_SEQ_LEN
-    max_seq_len = max(seq_lens)
-    seq_lens = torch.tensor(seq_lens, dtype=torch.int)
-
-    # Create the block tables.
-    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
-    for _ in range(num_seqs):
-        block_table = [
-            random.randint(0, NUM_BLOCKS - 1)
-            for _ in range(max_num_blocks_per_seq)
-        ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int)
-
-    # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
-                                                num_kv_heads, head_size,
-                                                kv_cache_dtype, dtype, seed,
-                                                device)
-    key_cache, value_cache = key_caches[0], value_caches[0]
-
-    # Using default kv_scale
-    k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
-    tp_rank = 0
-
-    # Call the paged attention kernel.
-    output = torch.empty_like(query)
-    if version == "v1":
-        ops.paged_attention_v1(
-            output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            block_tables,
-            seq_lens,
-            block_size,
-            max_seq_len,
-            alibi_slopes,
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-            tp_rank=tp_rank,
-            blocksparse_local_blocks=blocksparse_local_blocks,
-            blocksparse_vert_stride=blocksparse_vert_stride,
-            blocksparse_block_size=blocksparse_block_size,
-            blocksparse_head_sliding_step=blocksparse_head_sliding_step,
-        )
-    elif version == "v2":
-        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
-        assert PARTITION_SIZE % block_size == 0
-        num_seqs, num_heads, head_size = output.shape
-        tmp_output = torch.empty(
-            size=(num_seqs, num_heads, num_partitions, head_size),
-            dtype=output.dtype,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_heads, num_partitions),
-            dtype=torch.float32,
-        )
-        max_logits = torch.empty_like(exp_sums)
-        ops.paged_attention_v2(
-            output,
-            exp_sums,
-            max_logits,
-            tmp_output,
-            query,
-            key_cache,
-            value_cache,
-            num_kv_heads,
-            scale,
-            block_tables,
-            seq_lens,
-            block_size,
-            max_seq_len,
-            alibi_slopes,
-            kv_cache_dtype,
-            k_scale,
-            v_scale,
-            tp_rank=tp_rank,
-            blocksparse_local_blocks=blocksparse_local_blocks,
-            blocksparse_vert_stride=blocksparse_vert_stride,
-            blocksparse_block_size=blocksparse_block_size,
-            blocksparse_head_sliding_step=blocksparse_head_sliding_step,
-        )
-    else:
-        raise AssertionError(f"Unknown version: {version}")
-
-    # Run the reference implementation.
-    if kv_cache_dtype == "fp8":
-        # Convert cache data back to dtype.
-        x = 16 // torch.tensor([], dtype=dtype).element_size()
-        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
-                           block_size, x)
-        dequantized_key_cache = torch.empty(size=key_cache_shape,
-                                            dtype=dtype,
-                                            device=device)
-        ops.convert_fp8(dequantized_key_cache, key_cache)
-        key_cache = dequantized_key_cache
-
-        value_cache_shape = value_cache.shape
-        dequantized_value_cache = torch.empty(size=value_cache_shape,
-                                              dtype=dtype,
-                                              device=device)
-        ops.convert_fp8(dequantized_value_cache, value_cache)
-        value_cache = dequantized_value_cache
-
-    ref_output = torch.empty_like(query)
-    ref_single_query_cached_kv_attention(
-        ref_output,
-        query,
-        num_queries_per_kv,
-        key_cache,
-        value_cache,
-        block_tables,
-        seq_lens,
-        scale,
-        alibi_slopes,
-        tp_rank,
-        blocksparse_local_blocks,
-        blocksparse_vert_stride,
-        blocksparse_block_size,
-        blocksparse_head_sliding_step,
-    )
-
-    # NOTE(woosuk): Due to the kernel-level differences in the two
-    # implementations, there is a small numerical difference in the two
-    # outputs. Thus, we use a relaxed tolerance for the test.
-    atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
-    rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
-
-    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
-    # so we use a relaxed tolerance for the test.
-    atol, rtol = 1e-3, 1e-5
-    if kv_cache_dtype == "fp8":
-        atol, rtol = 1e-2, 1e-5
-    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
-
-
-def ref_multi_query_kv_attention(
-    cu_seq_lens: list[int],
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    scale: float,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    num_seqs = len(cu_seq_lens) - 1
-    ref_outputs = []
-    for i in range(num_seqs):
-        start_idx = cu_seq_lens[i]
-        end_idx = cu_seq_lens[i + 1]
-        seq_len = end_idx - start_idx
-
-        # Create attention mask.
-        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
-                               diagonal=1)
-        attn_mask = attn_mask * torch.finfo(dtype).min
-        attn_mask = attn_mask.to(dtype=dtype)
-
-        ref_output = ref_masked_attention(
-            query[start_idx:end_idx],
-            key[start_idx:end_idx],
-            value[start_idx:end_idx],
-            scale,
-            attn_mask=attn_mask,
-        )
-        ref_outputs.append(ref_output)
-    ref_output = torch.cat(ref_outputs, dim=0)
-    return ref_output
-
-
-# @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
-# @pytest.mark.parametrize("num_heads", NUM_HEADS)
-# @pytest.mark.parametrize("head_size", HEAD_SIZES)
-# @pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
-# @pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
-# @pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
-# @pytest.mark.parametrize("blocksparse_homo_heads", BLOCKSPARSE_HOMO_HEADS)
-# @pytest.mark.parametrize("dtype", DTYPES)
-# @pytest.mark.parametrize("seed", SEEDS)
-# @pytest.mark.parametrize("device", CUDA_DEVICES)
-# @torch.inference_mode()
-# def test_varlen_blocksparse_attention_prefill(
-#     num_seqs: int,
-#     num_heads: tuple[int, int],
-#     head_size: int,
-#     blocksparse_local_blocks: int,
-#     blocksparse_vert_stride: int,
-#     blocksparse_block_size: int,
-#     blocksparse_homo_heads: bool,
-#     dtype: torch.dtype,
-#     seed: int,
-#     device: str,
-# ) -> None:
-#     current_platform.seed_everything(seed)
-#     torch.set_default_device(device)
-#     # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
-#     # As the xformers library is already tested with its own tests, we can use
-#     # a smaller MAX_SEQ_LEN here.
-#     max_len = min(MAX_SEQ_LEN, 4096)
-#     seq_lens = random.sample(range(1, max_len), num_seqs)
-#     cu_seq_lens = torch.cumsum(torch.tensor([0] + seq_lens), dim=0)
-#     num_tokens = sum(seq_lens)
-
-#     scale = float(1.0 / (head_size**0.5))
-#     num_query_heads, num_kv_heads = num_heads
-#     assert num_query_heads % num_kv_heads == 0
-#     num_queries_per_kv = num_query_heads // num_kv_heads
-
-#     qkv = torch.empty(num_tokens,
-#                       num_query_heads + 2 * num_kv_heads,
-#                       head_size,
-#                       dtype=dtype)
-#     qkv.uniform_(-scale, scale)
-#     query, key, value = qkv.split(
-#         [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
-
-#     bs_attn_op = LocalStridedBlockSparseAttn(
-#         num_query_heads,
-#         max_len,
-#         local_blocks=blocksparse_local_blocks,
-#         vert_stride=blocksparse_vert_stride,
-#         block_size=blocksparse_block_size,
-#         device=device,
-#         dtype=dtype,
-#         homo_head=blocksparse_homo_heads)
-
-#     output = bs_attn_op(query,
-#                         key,
-#                         value,
-#                         cu_seq_lens.to(device),
-#                         sm_scale=scale)
-
-#     if num_queries_per_kv > 1:
-#         # Handle MQA and GQA
-#         key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
-#         value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
-
-#     ref_output = ref_multi_query_kv_attention(
-#         cu_seq_lens.tolist(),
-#         query,
-#         key,
-#         value,
-#         scale,
-#         dtype,
-#     )
-#     torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
--- a/tests/lora/test_qwen.py
+++ b/tests/lora/test_qwen.py
-from typing import List
-import os
-
-import vllm
-from vllm.lora.request import LoRARequest
-from ..utils import models_path_prefix
-
-MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen1.5-32B-Chat")
-
-PROMPT_TEMPLATE = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    prompts = [
-        PROMPT_TEMPLATE.format(query="who are you?"),
-        PROMPT_TEMPLATE.format(
-            query="What is the capital city of China?"
-        ),
-        PROMPT_TEMPLATE.format(
-            query="What is the longest river in the world?"
-        ),
-    ]
-    print(prompts)
-    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-def test_qwen_lora(qwen_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True)
-
-    expected_lora_output = [
-        "I am a large language model created by Alibaba Cloud. I am called Qwen.",
-        "The capital city of China is Beijing.",
-        "The longest river in the world is the Nile, located in Africa. It stretches for approximately 4,135 miles (6,650 kilometers) from its source in the highlands of Rwanda, through Tanzania, Uganda, South Sudan, Sudan, and Egypt, before emptying into the Mediterranean Sea. The Nile is famous for its historical and cultural significance, particularly in relation to ancient Egyptian civilization.",
-    ]
-
-    output1 = do_sample(llm, qwen_lora_files, lora_id=1)
-    for i in range(len(expected_lora_output)):
-        assert output1[i] == expected_lora_output[i]
-    output2 = do_sample(llm, qwen_lora_files, lora_id=2)
-    for i in range(len(expected_lora_output)):
-        assert output2[i] == expected_lora_output[i]
--- a/tests/models/quantization/test_mamba.py
+++ b/tests/models/quantization/test_mamba.py
-# SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
-
-Run `pytest tests/models/test_mamba.py`.
-"""
-import os
-import pytest
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from vllm.engine.arg_utils import EngineArgs
-from vllm.sampling_params import SamplingParams
-
-from ...utils import check_outputs_equal
-from ....utils import models_path_prefix
-
-MODELS = [
-    os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
-    os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
-    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
-    # See https://github.com/huggingface/transformers/pull/35943
-    # "mistralai/Mamba-Codestral-7B-v0.1",
-]
-
-
-# Use lower-level interfaces to create this greedy generator, as mamba will
-# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
-def generate_greedy(model_name, example_prompts, max_tokens):
-    # Create a text generation pipeline
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-
-    # Set the device (GPU if available, else CPU)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-
-    # Generate texts from the prompts
-    outputs = []
-    for prompt in example_prompts:
-        # Tokenize the input prompt with truncation
-        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
-        input_ids = inputs["input_ids"].to(model.device)
-
-        # Generate text using the model's generate method directly
-        generated_ids = model.generate(input_ids,
-                                       max_new_tokens=max_tokens,
-                                       do_sample=False)
-        generated_text = tokenizer.decode(generated_ids[0],
-                                          skip_special_tokens=True)
-
-        outputs.append((generated_ids[0].tolist(), generated_text))
-
-    return outputs
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    hf_outputs = generate_greedy(model, example_prompts, max_tokens)
-
-    # Set max_num_seqs to keep Codestral from going OOM at fp32
-    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
-def test_batching(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    # To pass the small model tests, we need full precision.
-    for_loop_outputs = []
-    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
-        for prompt in example_prompts:
-            for_loop_outputs.append(
-                vllm_model.generate_greedy([prompt], max_tokens)[0])
-
-        batched_outputs = vllm_model.generate_greedy(example_prompts,
-                                                     max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=for_loop_outputs,
-        outputs_1_lst=batched_outputs,
-        name_0="for_loop_vllm",
-        name_1="batched_vllm",
-    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [10])
-def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
-                                                model: str, dtype: str,
-                                                max_tokens: int) -> None:
-    # Tests chunked prefill in conjunction with n>1. In this case, prefill is
-    # populated with decoding tokens and we test that it doesn't fail.
-    # This test might fail if cache is not allocated correctly for n > 1
-    # decoding steps inside a chunked prefill forward pass (where we have both
-    # prefill and decode together )
-    sampling_params = SamplingParams(n=3,
-                                     temperature=1,
-                                     seed=0,
-                                     max_tokens=max_tokens)
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            enable_chunked_prefill=True,
-            max_num_batched_tokens=30,
-            max_num_seqs=10  # forces prefill chunks with decoding
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
-def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
-                         max_tokens: int,
-                         chunked_prefill_token_size: int) -> None:
-    """
-    Checks exact match decode between huggingface model and vllm runner with
-    chunked prefill.
-    """
-    max_num_seqs = chunked_prefill_token_size
-    max_num_batched_tokens = chunked_prefill_token_size
-
-    non_chunked = generate_greedy(model, example_prompts, max_tokens)
-
-    with vllm_runner(model,
-                     dtype=dtype,
-                     enable_chunked_prefill=True,
-                     max_num_batched_tokens=max_num_batched_tokens,
-                     max_num_seqs=max_num_seqs) as vllm_model:
-        chunked = vllm_model.generate_greedy(example_prompts,
-                                             max_tokens=max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=chunked,
-        outputs_1_lst=non_chunked,
-        name_0="chunked",
-        name_1="non_chunked",
-    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [15])
-def test_parallel_sampling(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    # Numerical differences produce slightly different output for these
-    if 'state-spaces' in model:
-        example_prompts.pop(0)
-        example_prompts.pop(0)
-        example_prompts.pop(0)
-
-    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
-        for_loop_outputs = []
-        for _ in range(10):
-            for_loop_outputs.append(
-                vllm_model.generate_greedy(example_prompts, max_tokens)[0])
-        sampling_params = SamplingParams(n=10,
-                                         temperature=0.001,
-                                         seed=0,
-                                         max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
-    token_ids, texts = n_lt_1_outputs[0]
-    n_lt_1_outputs = [(token_id, text)
-                      for token_id, text in zip(token_ids, texts)]
-
-    check_outputs_equal(
-        outputs_0_lst=n_lt_1_outputs,
-        outputs_1_lst=for_loop_outputs,
-        name_0="vllm_n_lt_1_outputs",
-        name_1="vllm",
-    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [20])
-def test_mamba_cache_cg_padding(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    # This test is for verifying that mamba cache is padded to CG captured
-    # batch size. If it's not, a torch RuntimeError will be raised because
-    # tensor dimensions aren't compatible
-    vllm_config = EngineArgs(model=model).create_engine_config()
-    while len(example_prompts) == vllm_config.pad_for_cudagraph(
-            len(example_prompts)):
-        example_prompts.append(example_prompts[0])
-
-    try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
-            vllm_model.generate_greedy(example_prompts, max_tokens)
-    except RuntimeError:
-        pytest.fail(
-            "Couldn't run batch size which is not equal to a Cuda Graph "
-            "captured batch size. "
-            "Could be related to mamba cache not padded correctly")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [20])
-def test_models_preemption_recompute(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    # Tests that outputs are identical with and w/o preemtions (recompute)
-    assert dtype == "float"
-
-    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
-        vllm_model.model.llm_engine.scheduler[
-            0].ENABLE_ARTIFICIAL_PREEMPT = True
-        preempt_vllm_outputs = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-        vllm_model.model.llm_engine.scheduler[
-            0].ENABLE_ARTIFICIAL_PREEMPT = False
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=preempt_vllm_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="vllm_preepmtions",
-        name_1="vllm",
-    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
-    vllm_runner,
-    model: str,
-    dtype: str,
-    example_prompts,
-) -> None:
-    # This test is for verifying that the Mamba inner state management doesn't
-    # collapse in case where the number of incoming requests and
-    # finished_requests_ids is larger than the maximum Mamba block capacity.
-    # This could generally happen due to the fact that Mamba does support
-    # statelessness mechanism where it can cleanup new incoming requests in
-    # a single step.
-    try:
-        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
-            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
-    except ValueError:
-        pytest.fail("Mamba inner state wasn't cleaned up properly between"
-                    "steps finished requests registered unnecessarily ")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_state_cleanup(
-    vllm_runner,
-    model: str,
-    dtype: str,
-    example_prompts,
-) -> None:
-    # This test is for verifying that the Mamba state is cleaned up between
-    # steps, If its not cleaned, an error would be expected.
-    try:
-        with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
-            for _ in range(10):
-                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
-    except ValueError:
-        pytest.fail("Mamba inner state wasn't cleaned up between states, "
-                    "could be related to finished_requests_ids")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_multistep(
-    vllm_runner,
-    model: str,
-    dtype: str,
-    example_prompts,
-) -> None:
-    with vllm_runner(model, num_scheduler_steps=8,
-                     max_num_seqs=2) as vllm_model:
-        vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [64])
-def test_multistep_correctness(vllm_runner, model: str, dtype: str,
-                               max_tokens: int, example_prompts) -> None:
-    with vllm_runner(model, num_scheduler_steps=8,
-                     max_num_seqs=2) as vllm_model:
-        vllm_outputs_multistep = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    with vllm_runner(model, num_scheduler_steps=1,
-                     max_num_seqs=2) as vllm_model:
-        vllm_outputs_single_step = vllm_model.generate_greedy(
-            example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=vllm_outputs_multistep,
-        outputs_1_lst=vllm_outputs_single_step,
-        name_0="vllm_outputs_multistep",
-        name_1="vllm_outputs_single_step",
-    )
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
-from contextlib import nullcontext
-
-import numpy as np
-import pytest
-import os
-
-from transformers import LlavaNextImageProcessor
-
-from vllm.config import ModelConfig
-from vllm.multimodal import MultiModalRegistry
-from vllm.multimodal.image import rescale_image_size
-from ..utils import models_path_prefix
-
-
-@pytest.fixture
-def mm_registry():
-    return MultiModalRegistry()
-
-
-@pytest.mark.parametrize("dtype", ["half", "float"])
-@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
-def test_llava_next_image_processor(image_assets, mm_registry, dtype,
-                                    size_factor):
-    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-vicuna-7b-hf")
-
-    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
-    assert isinstance(hf_processor, LlavaNextImageProcessor)
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype=dtype,
-        revision=None,
-        limit_mm_per_prompt={"image": 1},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    for asset in image_assets:
-        image = rescale_image_size(asset.pil_image, size_factor)
-
-        hf_result = hf_processor.preprocess(
-            image,
-            return_tensors="pt",
-        )
-        vllm_result = mm_registry.map_input(
-            model_config,
-            {"image": image},
-        )
-
-        assert hf_result.keys() == vllm_result.keys()
-        for key, hf_tensor in hf_result.items():
-            hf_arr: np.ndarray = hf_tensor.numpy()
-            vllm_arr: np.ndarray = vllm_result[key].numpy()
-
-            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
-            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
-
-
-@pytest.mark.parametrize(
-    ("num_images", "limit", "is_valid"),
-    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
-     (2, 1, False), (2, 2, True)],
-)
-def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
-    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="half",
-        revision=None,
-        limit_mm_per_prompt={"image": limit},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    image = image_assets[0].pil_image
-    if num_images == 0:
-        mm_inputs = {}
-    elif num_images == 1:
-        mm_inputs = {"image": image}
-    else:
-        mm_inputs = {"image": [image] * num_images}
-
-    with nullcontext() if is_valid else pytest.raises(ValueError):
-        mm_registry.map_input(model_config, mm_inputs)
-
-
-# NOTE: We don't test zero images since the HF processor doesn't support it
-@pytest.mark.parametrize("num_images", [1, 2])
-def test_image_mapper_multi(image_assets, mm_registry, num_images):
-    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="half",
-        revision=None,
-        limit_mm_per_prompt={"image": num_images},
-    )
-
-    mm_registry.init_mm_limits_per_prompt(model_config)
-
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": [image] * num_images}
-
-    mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
-    assert len(mapped_inputs["pixel_values"]) == num_images