Commit 48a9e546 authored by 王敏's avatar 王敏
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

parents 6372a1f3 c11b09df
...@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel( ...@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
} }
} }
// Sums the TOPK per-expert partial results of each token into `out`.
//
// Indexing used below: `input` is read as a contiguous [num_tokens, TOPK, d]
// array and `out` is written as a contiguous [num_tokens, d] array.
//
// Launch contract: 1-D grid with num_tokens * SPLIT_D blocks. Block
// `blockIdx.x` handles token `blockIdx.x / SPLIT_D` and the
// `blockIdx.x % SPLIT_D`-th slice of ceil(d / SPLIT_D) hidden elements; the
// threads of the block stride over that slice by blockDim.x. Any 1-D block
// size is valid. No shared memory is required.
//
// Template parameters:
//   scalar_t  element type of `input` / `out`; the sum is accumulated in
//             scalar_t, left to right over the TOPK rows.
//   TOPK      number of expert rows summed per token (>= 1).
//   SPLIT_D   number of blocks cooperating on one token (>= 1).
//   BLOCK_DIM kept only for interface compatibility with existing launches;
//             it no longer sizes any buffer.
//
// Notes on changes relative to the previous implementation:
//   * The former shared-memory staging buffer was thread-private (each thread
//     wrote and read only its own column), so it and its __syncthreads() were
//     unnecessary. The barrier also sat inside a loop whose trip count differs
//     between threads when the slice length is not a multiple of blockDim.x,
//     which is undefined behaviour. Values are now kept in registers.
//   * Offsets are computed in 64-bit to avoid int overflow for large
//     num_tokens * TOPK * d.
//   * The row count follows TOPK instead of being hard-coded to 8.
template <typename scalar_t, int TOPK, int SPLIT_D, int BLOCK_DIM>
__global__ void moe_sum_sharedmem_topk8(
    scalar_t* __restrict__ out,
    const scalar_t* __restrict__ input,
    const int d) {
  static_assert(TOPK >= 1, "TOPK must be at least 1");
  static_assert(SPLIT_D >= 1, "SPLIT_D must be at least 1");

  const int token_idx = blockIdx.x / SPLIT_D;
  const int sub_block = blockIdx.x % SPLIT_D;
  const int d_per_block = (d + SPLIT_D - 1) / SPLIT_D;

  // Widen before multiplying so the products cannot overflow 32-bit int.
  const int64_t hidden = d;
  const int64_t d_start = static_cast<int64_t>(sub_block) * d_per_block;
  const int64_t d_limit = d_start + d_per_block;
  const int64_t d_end = d_limit < hidden ? d_limit : hidden;

  const scalar_t* __restrict__ token_in =
      input + static_cast<int64_t>(token_idx) * TOPK * hidden;
  scalar_t* __restrict__ token_out =
      out + static_cast<int64_t>(token_idx) * hidden;

  for (int64_t idx = d_start + threadIdx.x; idx < d_end; idx += blockDim.x) {
    // Issue all TOPK loads first so they can be in flight together.
    scalar_t vals[TOPK];
#pragma unroll
    for (int k = 0; k < TOPK; ++k) {
      vals[k] = token_in[k * hidden + idx];
    }

    // Same left-to-right accumulation order as the original expression.
    scalar_t acc = vals[0];
#pragma unroll
    for (int k = 1; k < TOPK; ++k) {
      acc = acc + vals[k];
    }
    token_out[idx] = acc;
  }
}
template <typename scalar_t> template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel( __global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids, const scalar_t* __restrict__ topk_ids,
...@@ -353,6 +382,67 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] ...@@ -353,6 +382,67 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
}); });
break; break;
default:
at::sum_out(output, input, 1);
break;
}
}
void moe_sum_opt1(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size]
{
const int hidden_size = input.size(-1);
const auto num_tokens = output.numel() / hidden_size;
const int topk = input.size(1);
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
constexpr int splitD_ = 8;
const int TOPK8_GRID_DIM = num_tokens * splitD_;
constexpr int TOPK8_BLOCK_DIM = 256;
dim3 grid_8(TOPK8_GRID_DIM);
dim3 block_8(TOPK8_BLOCK_DIM);
switch (topk) {
case 2:
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
hidden_size);
});
break;
case 3:
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
hidden_size);
});
break;
case 4:
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
hidden_size);
});
break;
case 8:
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_sharedmem_topk8", [&]{
vllm::moe::moe_sum_sharedmem_topk8<scalar_t, 8, splitD_, TOPK8_BLOCK_DIM><<<grid_8, block_8, 0, stream>>>(
output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
hidden_size);
});
break;
default: default:
at::sum_out(output, input, 1); at::sum_out(output, input, 1);
break; break;
......
...@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, ...@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& gating_output); torch::Tensor& gating_output);
void moe_sum(torch::Tensor& input, torch::Tensor& output); void moe_sum(torch::Tensor& input, torch::Tensor& output);
void moe_sum_opt1(torch::Tensor& input, torch::Tensor& output);
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids, int64_t block_size, torch::Tensor sorted_token_ids,
......
...@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ...@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Calculate the result of moe by summing up the partial results // Calculate the result of moe by summing up the partial results
// from all selected experts. // from all selected experts.
m.def("moe_sum(Tensor input, Tensor! output) -> ()"); m.def("moe_sum(Tensor input, Tensor! output) -> ()");
m.def("moe_sum_opt1(Tensor input, Tensor! output) -> ()");
m.impl("moe_sum", torch::kCUDA, &moe_sum); m.impl("moe_sum", torch::kCUDA, &moe_sum);
m.impl("moe_sum_opt1", torch::kCUDA, &moe_sum_opt1);
// Aligning the number of tokens to be processed by each expert such // Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size. // that it is divisible by the block size.
m.def( m.def(
......
...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if sha is None: if sha is None:
sha = get_sha(vllm_root) sha = get_sha(vllm_root)
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1.' + sha[:7] version = 'das.opt1.rc1.' + sha[:7]
else: else:
if (major, minor) >= ('2', '5'): if (major, minor) >= ('2', '5'):
version = 'das.opt1' version = 'das.opt1.rc1'
# dtk version # dtk version
......
...@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal ...@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "google/gemma-2-2b-it"), os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
...@@ -41,10 +39,10 @@ def v1(run_with_both_engines): ...@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2")) llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
weak_llm = weakref.ref(llm) weak_llm = weakref.ref(llm)
del llm del llm
...@@ -111,13 +109,12 @@ def test_models( ...@@ -111,13 +109,12 @@ def test_models(
prompt_embeds = hf_model.get_prompt_embeddings( prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts) example_prompts)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND: if not current_platform.is_rocm():
with VllmRunner(model, with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7) as vllm_model:
block_size=64) as vllm_model:
if enable_prompt_embeds: if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens) prompt_embeds, max_tokens)
...@@ -131,7 +128,8 @@ def test_models( ...@@ -131,7 +128,8 @@ def test_models(
max_model_len=8192, max_model_len=8192,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7,
block_size=64) as vllm_model:
if enable_prompt_embeds: if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens) prompt_embeds, max_tokens)
......
...@@ -94,7 +94,7 @@ def test_models( ...@@ -94,7 +94,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens) max_tokens)
...@@ -128,7 +128,7 @@ def test_models_distributed( ...@@ -128,7 +128,7 @@ def test_models_distributed(
) -> None: ) -> None:
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend) m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct" if (model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
and distributed_executor_backend == "ray"): and distributed_executor_backend == "ray"):
# test Ray Compiled Graph # test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
...@@ -158,7 +158,7 @@ def test_models_distributed( ...@@ -158,7 +158,7 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
example_prompts, example_prompts,
...@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache( ...@@ -220,22 +220,25 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
model, model,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True, enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs, outputs_0_lst=no_chunked_prefill_outputs,
...@@ -286,7 +289,7 @@ def test_with_prefix_caching( ...@@ -286,7 +289,7 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
...@@ -303,7 +306,7 @@ def test_with_prefix_caching( ...@@ -303,7 +306,7 @@ def test_with_prefix_caching(
) )
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"]) @pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
......
...@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. ...@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`. pytest tests/basic_correctness/test_preemption.py`.
""" """
import os
import pytest import pytest
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
...@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ...@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix from ..utils import models_path_prefix
import os from vllm.platforms import current_platform
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "distilbert/distilgpt2"),
...@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute( ...@@ -74,18 +75,33 @@ def test_chunked_prefill_recompute(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( if not current_platform.is_rocm():
model, with vllm_runner(
dtype=dtype, model,
max_num_batched_tokens=max_num_batched_tokens, dtype=dtype,
enable_chunked_prefill=enable_chunked_prefill, max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs, enable_chunked_prefill=enable_chunked_prefill,
distributed_executor_backend=distributed_executor_backend, max_num_seqs=max_num_seqs,
disable_log_stats=False, distributed_executor_backend=distributed_executor_backend,
) as vllm_model: disable_log_stats=False,
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) ) as vllm_model:
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
< ARTIFICIAL_PREEMPTION_MAX_CNT) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
else:
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
...@@ -115,17 +131,31 @@ def test_preemption( ...@@ -115,17 +131,31 @@ def test_preemption(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner( if not current_platform.is_rocm():
model, with vllm_runner(
dtype=dtype, model,
disable_log_stats=False, dtype=dtype,
distributed_executor_backend=distributed_executor_backend, disable_log_stats=False,
) as vllm_model: distributed_executor_backend=distributed_executor_backend,
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) ) as vllm_model:
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
< ARTIFICIAL_PREEMPTION_MAX_CNT) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
total_preemption = ( < ARTIFICIAL_PREEMPTION_MAX_CNT)
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
else:
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
...@@ -163,7 +193,7 @@ def test_preemption_infeasible( ...@@ -163,7 +193,7 @@ def test_preemption_infeasible(
distributed_executor_backend: str, distributed_executor_backend: str,
) -> None: ) -> None:
"""Verify infeasible preemption request will be ignored.""" """Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16 BLOCK_SIZE = 16 if not current_platform.is_rocm() else 64
prefill_blocks = 2 prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE decode_blocks = max_tokens // BLOCK_SIZE
with vllm_runner( with vllm_runner(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import json import json
import pytest import pytest
...@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS ...@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import (compare_two_settings, create_new_process_for_each_test, from ..utils import (compare_two_settings, create_new_process_for_each_test,
multi_gpu_test) multi_gpu_test)
from .backend import TestBackend from .backend import TestBackend
from ..utils import models_path_prefix
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
...@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int, ...@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
@create_new_process_for_each_test() @create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")])
@pytest.mark.parametrize("tp_size", [2]) @pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True]) @pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"]) @pytest.mark.parametrize("distributed_backend", ["mp"])
......
...@@ -84,16 +84,17 @@ class TestSetting: ...@@ -84,16 +84,17 @@ class TestSetting:
# method="encode", # method="encode",
# fullgraph=True, # fullgraph=True,
# ), # ),
# TODO
# vision language model # vision language model
TestSetting( # TestSetting(
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"), # model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"], # model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2, # pp_size=2,
tp_size=1, # tp_size=1,
attn_backend="FLASH_ATTN", # attn_backend="FLASH_ATTN",
method="generate_with_image", # method="generate_with_image",
fullgraph=False, # fullgraph=False,
), # ),
]) ])
def test_compile_correctness( def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch, monkeypatch: pytest.MonkeyPatch,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import vllm import vllm
from vllm.compilation.counter import compilation_counter from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.utils import _is_torch_equal_or_newer from vllm.utils import _is_torch_equal_or_newer
from ..utils import models_path_prefix
def test_version(): def test_version():
assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev') assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
...@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch): ...@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph assert not vllm_config.compilation_config.use_cudagraph
@pytest.mark.parametrize("enabled", [True, False]) # TODO: when True num_cudagraph_captured=13
# @pytest.mark.parametrize("enabled", [True, False])
@pytest.mark.parametrize("enabled", [False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1 assert vllm.envs.VLLM_USE_V1
...@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled): ...@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
num_cudagraph_captured=13 if enabled else 0, num_cudagraph_captured=13 if enabled else 0,
), ),
# loading the model causes compilation (if enabled) to happen # loading the model causes compilation (if enabled) to happen
vllm_runner('facebook/opt-125m', vllm_runner(os.path.join(models_path_prefix, 'facebook/opt-125m'),
compilation_config=compilation_config, compilation_config=compilation_config,
gpu_memory_utilization=0.4) as _): gpu_memory_utilization=0.4) as _):
pass pass
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys import sys
from unittest.mock import patch from unittest.mock import patch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
from ..utils import models_path_prefix
def test_mp_reducer(monkeypatch): def test_mp_reducer(monkeypatch):
...@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch): ...@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with patch('multiprocessing.reducer.register') as mock_register: with patch('multiprocessing.reducer.register') as mock_register:
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model="facebook/opt-125m", model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=32, max_model_len=32,
gpu_memory_utilization=0.1, gpu_memory_utilization=0.1,
disable_log_stats=True, disable_log_stats=True,
......
...@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams ...@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect from vllm.transformers_utils.utils import maybe_model_redirect
from .utils import models_path_prefix from .utils import models_path_prefix
from vllm.platforms import current_platform
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -783,7 +784,7 @@ class VllmRunner: ...@@ -783,7 +784,7 @@ class VllmRunner:
dtype: str = "auto", dtype: str = "auto",
disable_log_stats: bool = True, disable_log_stats: bool = True,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16, block_size: int = 16 if not current_platform.is_rocm() else 64,
enable_chunked_prefill: Optional[bool] = False, enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4, swap_space: int = 4,
enforce_eager: Optional[bool] = False, enforce_eager: Optional[bool] = False,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from itertools import cycle from itertools import cycle
import pytest import pytest
...@@ -8,10 +9,8 @@ import pytest ...@@ -8,10 +9,8 @@ import pytest
from vllm import SamplingParams from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix from ....utils import models_path_prefix
import vllm.envs as envs from vllm.platforms import current_platform
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname ...@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator, ...@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
# Allow only 2 sequences of ~128 tokens in worst case. # Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size # Note 8 = 128/block_size
...@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ...@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("per_test_common_llm_kwargs", @pytest.mark.parametrize("per_test_common_llm_kwargs",
[{ [{
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 2, "max_num_batched_tokens": 2,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 3, "max_num_batched_tokens": 3,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 256, "max_num_batched_tokens": 256,
"max_num_seqs": 10, "max_num_seqs": 10,
}]) }])
...@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator, ...@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache # Enable prefill cache
...@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption( ...@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, ...@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly # we keep the blocks small, so that hit eviction quickly
"max_model_len": 48, "max_model_len": 48,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, "block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 3, "num_gpu_blocks_override": 3,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
...@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup ...@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname from vllm.platforms import current_platform
import vllm.envs as envs
def get_sequence_groups(scheduler_output): def get_sequence_groups(scheduler_output):
...@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str, ...@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8, max_num_seqs=8,
enable_chunked_prefill=True, enable_chunked_prefill=True,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16, block_size=64 if current_platform.is_rocm() else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
......
...@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine ...@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m") MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
...@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, ...@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16) block_size=64 if current_platform.is_rocm() else 16)
engine: LLMEngine = runner.model.llm_engine engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step. # In multi-step + chunked-prefill there is no separate single prompt step.
......
...@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus ...@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroup, SequenceStatus from vllm.sequence import SequenceGroup, SequenceStatus
from vllm.platforms import current_platform
from .utils import (append_new_token, append_new_token_seq, from .utils import (append_new_token, append_new_token_seq,
append_new_token_seq_group, create_dummy_prompt, append_new_token_seq_group, create_dummy_prompt,
...@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq, ...@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
def test_scheduler_add_seq_group(): def test_scheduler_add_seq_group():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate", "generate",
max_num_batched_tokens=100, max_num_batched_tokens=100,
...@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group(): ...@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
def test_scheduler_abort_seq_group(): def test_scheduler_abort_seq_group():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate", "generate",
max_num_batched_tokens=100, max_num_batched_tokens=100,
...@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group(): ...@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
def test_scheduler_schedule_simple(): def test_scheduler_schedule_simple():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
...@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple(): ...@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
def test_scheduler_prefill_prioritized(): def test_scheduler_prefill_prioritized():
"""Verify running batched tokens are not applied to prefill requests.""" """Verify running batched tokens are not applied to prefill requests."""
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
max_model_len = 30 max_model_len = 30
max_batched_num_tokens = 30 max_batched_num_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
...@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized(): ...@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
def test_scheduler_schedule_preempt_abort(): def test_scheduler_schedule_preempt_abort():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate", "generate",
...@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort(): ...@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
def test_scheduler_max_seqs(): def test_scheduler_max_seqs():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4 num_seq_group = 4
max_seq_group = 2 max_seq_group = 2
max_model_len = 16 max_model_len = 16
...@@ -256,7 +257,7 @@ def test_scheduler_max_seqs(): ...@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor(): def test_scheduler_delay_factor():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate", "generate",
max_num_batched_tokens=100, max_num_batched_tokens=100,
...@@ -306,7 +307,7 @@ def initialize_scheduler( ...@@ -306,7 +307,7 @@ def initialize_scheduler(
max_token_budget=1000, max_token_budget=1000,
max_model_len=1000, max_model_len=1000,
lora_config=None, lora_config=None,
block_size=4, block_size=4 if not current_platform.is_rocm() else 64,
num_cpu_blocks=8, num_cpu_blocks=8,
num_gpu_blocks=8, num_gpu_blocks=8,
enable_prefix_caching=False, enable_prefix_caching=False,
...@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len(): ...@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
""" """
Test prompt longer than max_prompt_len is aborted. Test prompt longer than max_prompt_len is aborted.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size) scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
_, seq_group = create_dummy_prompt("0", _, seq_group = create_dummy_prompt("0",
prompt_length=60, prompt_length=60,
...@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget(): ...@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
""" """
Test token budget respected. Test token budget respected.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
...@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs(): ...@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
""" """
Test max seq respected. Test max seq respected.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
...@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora(): ...@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
""" """
Test max lora is respected and prioritized. Test max lora is respected and prioritized.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config, scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size, block_size=block_size,
...@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity(): ...@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
""" """
Test sequence cannot be scheduled due to block manager has no capacity. Test sequence cannot be scheduled due to block manager has no capacity.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=128, num_gpu_blocks=128,
num_cpu_blocks=128) num_cpu_blocks=128)
...@@ -570,7 +571,7 @@ def test_decode_schedule_preempted(): ...@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
""" """
Test decodes cannot be scheduled and preempted. Test decodes cannot be scheduled and preempted.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64, num_cpu_blocks=64,
num_gpu_blocks=64) num_gpu_blocks=64)
...@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update(): ...@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
""" """
Verify blocks_to_copy is updated. Verify blocks_to_copy is updated.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=4, scheduler = initialize_scheduler(block_size=4,
num_cpu_blocks=16, num_cpu_blocks=16,
num_gpu_blocks=16) num_gpu_blocks=16)
...@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update(): ...@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
def test_schedule_swapped_max_loras(): def test_schedule_swapped_max_loras():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config, scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size, block_size=block_size,
...@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras(): ...@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
def test_schedule_swapped_cannot_swap_in(): def test_schedule_swapped_cannot_swap_in():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
...@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in(): ...@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
def test_infeasible_swap(): def test_infeasible_swap():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
...@@ -740,7 +741,7 @@ def test_infeasible_swap(): ...@@ -740,7 +741,7 @@ def test_infeasible_swap():
def test_schedule_swapped_blocks_to_copy(): def test_schedule_swapped_blocks_to_copy():
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size, scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32, num_cpu_blocks=32,
num_gpu_blocks=32) num_gpu_blocks=32)
...@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching): ...@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
considering prefix caching. considering prefix caching.
""" """
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
max_num_batched_tokens = 12 max_num_batched_tokens = 12
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
...@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( ...@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
block-size aligned). block-size aligned).
""" """
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_num_batched_tokens = 4 max_num_batched_tokens = 4
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
...@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): ...@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
Test that the scheduler does not schedule batches with prompt tokens and Test that the scheduler does not schedule batches with prompt tokens and
prompt embeddings co-mingled. prompt embeddings co-mingled.
""" """
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
block_size=block_size, block_size=block_size,
...@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
_seq_id_to_num_tokens_computed. _seq_id_to_num_tokens_computed.
""" """
# Budget can not schedule in swapped # Budget can not schedule in swapped
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
seq_tokens_with_swapped: list[list[int]] = [] seq_tokens_with_swapped: list[list[int]] = []
blocks_to_swap_out: list[tuple[int, int]] = [] blocks_to_swap_out: list[tuple[int, int]] = []
...@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill schedule don't have a space for another LoRA, so # Prefill schedule don't have a space for another LoRA, so
# we ignore this request for now. # we ignore this request for now.
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config, scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size, block_size=block_size,
...@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler does not schedule batches with prompt tokens and # Prefill scheduler does not schedule batches with prompt tokens and
# prompt embeddings co-mingled. # prompt embeddings co-mingled.
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
block_size=block_size, block_size=block_size,
...@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler budget num_batched_tokens # Prefill scheduler budget num_batched_tokens
# >= scheduler_config max_num_batched_tokens # >= scheduler_config max_num_batched_tokens
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
seq_tokens_prefill_budget: list[list[int]] = [] seq_tokens_prefill_budget: list[list[int]] = []
...@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None assert seq_id_to_num_tokens_computed is None
# Budget can not schedule in waiting # Budget can not schedule in waiting
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
...@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None assert seq_id_to_num_tokens_computed is None
# Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
block_size=block_size, block_size=block_size,
...@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None assert seq_id_to_num_tokens_computed is None
# Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
block_size=block_size, block_size=block_size,
...@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker(): ...@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None assert seq_id_to_num_tokens_computed is None
# Budget can not allocate, AllocStatus is LATER # Budget can not allocate, AllocStatus is LATER
block_size = 2 block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3 max_seq_group = 3
scheduler = initialize_scheduler( scheduler = initialize_scheduler(
block_size=block_size, block_size=block_size,
......
...@@ -6,6 +6,7 @@ import pytest # noqa ...@@ -6,6 +6,7 @@ import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup
from vllm.platforms import current_platform
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder, from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
get_sequence_groups, schedule_and_update_computed_tokens) get_sequence_groups, schedule_and_update_computed_tokens)
...@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder(): ...@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cross-attention block table cross-attention block table
''' '''
block_size = 4 block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
......
...@@ -7,8 +7,7 @@ import pytest ...@@ -7,8 +7,7 @@ import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix from ..utils import models_path_prefix
import vllm.envs as envs from vllm.platforms import current_platform
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.skip_v1 @pytest.mark.skip_v1
...@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str): ...@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available " "paper clips? Is there an easy to follow video tutorial available "
"online for free?") "online for free?")
llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16) llm = LLM(model=model, block_size=64 if current_platform.is_rocm() else 16)
sampling_params = SamplingParams(max_tokens=10, sampling_params = SamplingParams(max_tokens=10,
temperature=0.0, temperature=0.0,
detokenize=False) detokenize=False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment