Commit 48a9e546 authored by 王敏
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

parents 6372a1f3 c11b09df
......@@ -173,6 +173,35 @@ __global__ void moe_sum_kernel(
}
}
// Sums the TOPK per-expert rows of every token into a single output row.
//
// Addressing used below: `input` is read as [token][k][d] (row stride `d`,
// token stride `TOPK * d`) and `out` is written as [token][d].
//
// Launch contract: 1-D grid of `num_tokens * SPLIT_D` blocks. Block `b` owns
// token `b / SPLIT_D` and the `(b % SPLIT_D)`-th slice of
// `ceil(d / SPLIT_D)` columns; threads walk that slice with a stride of
// `blockDim.x`, so any 1-D block size works.
//
// Notes on this implementation:
//  * The previous version staged values in a __shared__ tile and called
//    __syncthreads() inside the column loop. The loop trip count differs
//    between threads on a ragged tail, so that barrier sat in divergent
//    control flow (undefined behaviour, may hang). Each thread only ever read
//    back its own slots, so no barrier or shared memory is needed at all;
//    values are now staged in registers.
//  * All element offsets are computed in 64 bits; `token_idx * TOPK * d`
//    overflows a 32-bit int for large batches.
//  * The accumulation stays in `scalar_t` and keeps the original
//    left-to-right order, so results are bit-identical to the old kernel.
//  * BLOCK_DIM is unused now; it is kept in the template parameter list so
//    existing instantiations keep compiling.
template <typename scalar_t, int TOPK, int SPLIT_D, int BLOCK_DIM>
__global__ void moe_sum_sharedmem_topk8(
    scalar_t* __restrict__ out,
    const scalar_t* __restrict__ input,
    const int d) {
  static_assert(TOPK >= 1, "TOPK must be at least 1");
  static_assert(SPLIT_D >= 1, "SPLIT_D must be at least 1");

  const int token_idx = blockIdx.x / SPLIT_D;
  const int sub_block = blockIdx.x % SPLIT_D;
  const int d_per_block = (d + SPLIT_D - 1) / SPLIT_D;

  // Column range [d_start, d_end) handled by this block, clamped to d.
  const int64_t d_start = static_cast<int64_t>(sub_block) * d_per_block;
  const int64_t d_limit = d_start + d_per_block;
  const int64_t d_end = d_limit < d ? d_limit : static_cast<int64_t>(d);

  // Widen before multiplying so the products cannot wrap in 32 bits.
  const int64_t out_offset = static_cast<int64_t>(token_idx) * d;
  const int64_t token_offset = out_offset * TOPK;

  for (int64_t idx = d_start + threadIdx.x; idx < d_end; idx += blockDim.x) {
    const scalar_t* src = input + token_offset + idx;

    // Issue all TOPK loads first so they are independent of the adds.
    scalar_t vals[TOPK];
#pragma unroll
    for (int k = 0; k < TOPK; ++k) {
      vals[k] = src[static_cast<int64_t>(k) * d];
    }

    // Same association order as the original: ((v0 + v1) + v2) + ...
    scalar_t x = vals[0];
#pragma unroll
    for (int k = 1; k < TOPK; ++k) {
      x = x + vals[k];
    }

    out[out_offset + idx] = x;
  }
}
template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
......@@ -358,3 +387,64 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
break;
}
}
// Reduces the per-expert partial results of each token over the topk axis.
//
// For topk in {2, 3, 4} this launches `moe_sum_kernel` with one block per
// token; for topk == 8 it launches `moe_sum_sharedmem_topk8` with
// `splitD_` blocks per token; every other topk falls back to `at::sum_out`
// along dim 1. Kernels are enqueued on the current CUDA stream of `output`'s
// device.
//
// NOTE(review): the custom kernels index `input`/`output` as dense
// row-major buffers — presumably callers pass contiguous tensors; confirm.
void moe_sum_opt1(torch::Tensor& input,   // [num_tokens, topk, hidden_size]
                  torch::Tensor& output)  // [num_tokens, hidden_size]
{
  // Nothing to write. Returning here also avoids dividing by a zero
  // hidden_size and launching a grid with zero blocks, which is an invalid
  // launch configuration.
  if (output.numel() == 0) {
    return;
  }

  const int hidden_size = input.size(-1);
  const auto num_tokens = output.numel() / hidden_size;
  const int topk = input.size(1);

  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Launch shape for the generic kernel: one block per token.
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));

  // Launch shape for the top-8 kernel: each token's hidden dimension is
  // split across `splitD_` blocks of TOPK8_BLOCK_DIM threads.
  constexpr int splitD_ = 8;
  constexpr int TOPK8_BLOCK_DIM = 256;
  dim3 grid_8(static_cast<unsigned int>(num_tokens * splitD_));
  dim3 block_8(TOPK8_BLOCK_DIM);

  switch (topk) {
    case 2:
      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
        vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
            hidden_size);
      });
      break;

    case 3:
      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
        vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
            hidden_size);
      });
      break;

    case 4:
      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
        vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
            hidden_size);
      });
      break;

    case 8:
      VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_sharedmem_topk8", [&]{
        vllm::moe::moe_sum_sharedmem_topk8<scalar_t, 8, splitD_, TOPK8_BLOCK_DIM><<<grid_8, block_8, 0, stream>>>(
            output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
            hidden_size);
      });
      break;

    default:
      // No specialised kernel for this topk; let ATen do the reduction.
      at::sum_out(output, input, 1);
      break;
  }
}
\ No newline at end of file
......@@ -7,6 +7,7 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& gating_output);
void moe_sum(torch::Tensor& input, torch::Tensor& output);
void moe_sum_opt1(torch::Tensor& input, torch::Tensor& output);
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
......
......@@ -11,8 +11,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Calculate the result of moe by summing up the partial results
// from all selected experts.
m.def("moe_sum(Tensor input, Tensor! output) -> ()");
m.def("moe_sum_opt1(Tensor input, Tensor! output) -> ()");
m.impl("moe_sum", torch::kCUDA, &moe_sum);
m.impl("moe_sum_opt1", torch::kCUDA, &moe_sum_opt1);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
m.def(
......
......@@ -559,10 +559,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
if sha is None:
sha = get_sha(vllm_root)
if (major, minor) >= ('2', '5'):
version = 'das.opt1.' + sha[:7]
version = 'das.opt1.rc1.' + sha[:7]
else:
if (major, minor) >= ('2', '5'):
version = 'das.opt1'
version = 'das.opt1.rc1'
# dtk version
......
......@@ -20,8 +20,6 @@ from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
MODELS = [
os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
......@@ -41,10 +39,10 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
if not current_platform.is_rocm():
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
weak_llm = weakref.ref(llm)
del llm
......@@ -111,13 +109,12 @@ def test_models(
prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
if not current_platform.is_rocm():
with VllmRunner(model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
block_size=64) as vllm_model:
gpu_memory_utilization=0.7) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens)
......@@ -131,7 +128,8 @@ def test_models(
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7) as vllm_model:
gpu_memory_utilization=0.7,
block_size=64) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(
prompt_embeds, max_tokens)
......
......@@ -94,7 +94,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
......@@ -128,7 +128,7 @@ def test_models_distributed(
) -> None:
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
if (model == "meta-llama/Llama-3.2-1B-Instruct"
if (model == os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
and distributed_executor_backend == "ray"):
# test Ray Compiled Graph
m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
......@@ -158,7 +158,7 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
......@@ -220,6 +220,7 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model:
no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
......@@ -233,10 +234,12 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model:
chunked_prefill_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=no_chunked_prefill_outputs,
outputs_1_lst=chunked_prefill_outputs,
......@@ -286,7 +289,7 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if current_platform.is_rocm() else 16,
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
......@@ -303,7 +306,7 @@ def test_with_prefix_caching(
)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
......
......@@ -7,6 +7,7 @@ VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
pytest tests/basic_correctness/test_preemption.py`.
"""
import os
import pytest
from prometheus_client import REGISTRY
......@@ -18,7 +19,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
import os
from vllm.platforms import current_platform
MODELS = [
os.path.join(models_path_prefix, "distilbert/distilgpt2"),
......@@ -74,6 +75,7 @@ def test_chunked_prefill_recompute(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if not current_platform.is_rocm():
with vllm_runner(
model,
dtype=dtype,
......@@ -86,6 +88,20 @@ def test_chunked_prefill_recompute(
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
else:
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......@@ -115,11 +131,25 @@ def test_preemption(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if not current_platform.is_rocm():
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
else:
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
distributed_executor_backend=distributed_executor_backend,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
......@@ -163,7 +193,7 @@ def test_preemption_infeasible(
distributed_executor_backend: str,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
BLOCK_SIZE = 16 if not current_platform.is_rocm() else 64
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
with vllm_runner(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import json
import pytest
......@@ -21,6 +22,7 @@ from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import (compare_two_settings, create_new_process_for_each_test,
multi_gpu_test)
from .backend import TestBackend
from ..utils import models_path_prefix
prompts = [
"Hello, my name is",
......@@ -177,7 +179,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("async_tp_enabled", [True])
@pytest.mark.parametrize("distributed_backend", ["mp"])
......
......@@ -84,16 +84,17 @@ class TestSetting:
# method="encode",
# fullgraph=True,
# ),
# TODO
# vision language model
TestSetting(
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
attn_backend="FLASH_ATTN",
method="generate_with_image",
fullgraph=False,
),
# TestSetting(
# model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
# model_args=["--trust-remote-code", "--max-model-len", "2048"],
# pp_size=2,
# tp_size=1,
# attn_backend="FLASH_ATTN",
# method="generate_with_image",
# fullgraph=False,
# ),
])
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig
from vllm.utils import _is_torch_equal_or_newer
from ..utils import models_path_prefix
def test_version():
assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
......@@ -26,7 +27,9 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert not vllm_config.compilation_config.use_cudagraph
@pytest.mark.parametrize("enabled", [True, False])
# TODO: when True num_cudagraph_captured=13
# @pytest.mark.parametrize("enabled", [True, False])
@pytest.mark.parametrize("enabled", [False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1
......@@ -44,7 +47,7 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
num_cudagraph_captured=13 if enabled else 0,
),
# loading the model causes compilation (if enabled) to happen
vllm_runner('facebook/opt-125m',
vllm_runner(os.path.join(models_path_prefix, 'facebook/opt-125m'),
compilation_config=compilation_config,
gpu_memory_utilization=0.4) as _):
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys
from unittest.mock import patch
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
from ..utils import models_path_prefix
def test_mp_reducer(monkeypatch):
......@@ -24,7 +26,7 @@ def test_mp_reducer(monkeypatch):
with patch('multiprocessing.reducer.register') as mock_register:
engine_args = AsyncEngineArgs(
model="facebook/opt-125m",
model=os.path.join(models_path_prefix, "facebook/opt-125m"),
max_model_len=32,
gpu_memory_utilization=0.1,
disable_log_stats=True,
......
......@@ -40,6 +40,7 @@ from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
from .utils import models_path_prefix
from vllm.platforms import current_platform
logger = init_logger(__name__)
......@@ -783,7 +784,7 @@ class VllmRunner:
dtype: str = "auto",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16,
block_size: int = 16 if not current_platform.is_rocm() else 64,
enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from itertools import cycle
import pytest
......@@ -8,10 +9,8 @@ import pytest
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
from vllm.platforms import current_platform
@pytest.mark.parametrize(
......@@ -24,7 +23,7 @@ from vllm.utils import SUPPORT_TC, gpuname
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -107,7 +106,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs",
[
{
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
......@@ -200,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
......@@ -274,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache
......@@ -355,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -430,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
"max_model_len": 48,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if current_platform.is_rocm() else 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
......@@ -15,8 +15,7 @@ from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
from vllm.platforms import current_platform
def get_sequence_groups(scheduler_output):
......@@ -852,7 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if current_platform.is_rocm() else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
......
......@@ -10,8 +10,6 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
......@@ -41,7 +39,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
block_size=64 if current_platform.is_rocm() else 16)
engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
......
......@@ -15,6 +15,7 @@ from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroup, SequenceStatus
from vllm.platforms import current_platform
from .utils import (append_new_token, append_new_token_seq,
append_new_token_seq_group, create_dummy_prompt,
......@@ -22,7 +23,7 @@ from .utils import (append_new_token, append_new_token_seq,
def test_scheduler_add_seq_group():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
......@@ -45,7 +46,7 @@ def test_scheduler_add_seq_group():
def test_scheduler_abort_seq_group():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
......@@ -72,7 +73,7 @@ def test_scheduler_abort_seq_group():
def test_scheduler_schedule_simple():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
......@@ -117,7 +118,7 @@ def test_scheduler_schedule_simple():
def test_scheduler_prefill_prioritized():
"""Verify running batched tokens are not applied to prefill requests."""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
max_model_len = 30
max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(
......@@ -150,7 +151,7 @@ def test_scheduler_prefill_prioritized():
def test_scheduler_schedule_preempt_abort():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
max_model_len = 16
scheduler_config = SchedulerConfig(
"generate",
......@@ -208,7 +209,7 @@ def test_scheduler_schedule_preempt_abort():
def test_scheduler_max_seqs():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4
max_seq_group = 2
max_model_len = 16
......@@ -256,7 +257,7 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens=100,
......@@ -306,7 +307,7 @@ def initialize_scheduler(
max_token_budget=1000,
max_model_len=1000,
lora_config=None,
block_size=4,
block_size=4 if not current_platform.is_rocm() else 64,
num_cpu_blocks=8,
num_gpu_blocks=8,
enable_prefix_caching=False,
......@@ -354,7 +355,7 @@ def test_prefill_schedule_max_prompt_len():
"""
Test prompt longer than max_prompt_len is aborted.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(max_model_len=30, block_size=block_size)
_, seq_group = create_dummy_prompt("0",
prompt_length=60,
......@@ -374,7 +375,7 @@ def test_prefill_schedule_token_budget():
"""
Test token budget respected.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
......@@ -436,7 +437,7 @@ def test_prefill_schedule_max_seqs():
"""
Test max seq respected.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
......@@ -475,7 +476,7 @@ def test_prefill_schedule_max_lora():
"""
Test max lora is respected and prioritized.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size,
......@@ -528,7 +529,7 @@ def test_prefill_schedule_no_block_manager_capacity():
"""
Test sequence cannot be scheduled due to block manager has no capacity.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_gpu_blocks=128,
num_cpu_blocks=128)
......@@ -570,7 +571,7 @@ def test_decode_schedule_preempted():
"""
Test decodes cannot be scheduled and preempted.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=64,
num_gpu_blocks=64)
......@@ -614,7 +615,7 @@ def test_schedule_decode_blocks_to_copy_update():
"""
Verify blocks_to_copy is updated.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=4,
num_cpu_blocks=16,
num_gpu_blocks=16)
......@@ -646,7 +647,7 @@ def test_schedule_decode_blocks_to_copy_update():
def test_schedule_swapped_max_loras():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size,
......@@ -679,7 +680,7 @@ def test_schedule_swapped_max_loras():
def test_schedule_swapped_cannot_swap_in():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
......@@ -709,7 +710,7 @@ def test_schedule_swapped_cannot_swap_in():
def test_infeasible_swap():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
......@@ -740,7 +741,7 @@ def test_infeasible_swap():
def test_schedule_swapped_blocks_to_copy():
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
scheduler = initialize_scheduler(block_size=block_size,
num_cpu_blocks=32,
num_gpu_blocks=32)
......@@ -825,7 +826,7 @@ def test_prefix_caching_aware_prefills(enable_prefix_caching):
considering prefix caching.
"""
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
max_num_batched_tokens = 12
max_seq_group = 3
scheduler = initialize_scheduler(
......@@ -912,7 +913,7 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching(
block-size aligned).
"""
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_num_batched_tokens = 4
max_seq_group = 3
scheduler = initialize_scheduler(
......@@ -978,7 +979,7 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
Test that the scheduler does not schedule batches with prompt tokens and
prompt embeddings co-mingled.
"""
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
block_size=block_size,
......@@ -1057,7 +1058,7 @@ def test_remove_seq_from_computed_blocks_tracker():
_seq_id_to_num_tokens_computed.
"""
# Budget can not schedule in swapped
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
seq_tokens_with_swapped: list[list[int]] = []
blocks_to_swap_out: list[tuple[int, int]] = []
......@@ -1097,7 +1098,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill schedule don't have a space for another LoRA, so
# we ignore this request for now.
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
scheduler = initialize_scheduler(lora_config=lora_config,
block_size=block_size,
......@@ -1131,7 +1132,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler does not schedule batches with prompt tokens and
# prompt embeddings co-mingled.
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
block_size=block_size,
......@@ -1170,7 +1171,7 @@ def test_remove_seq_from_computed_blocks_tracker():
# Prefill scheduler budget num_batched_tokens
# >= scheduler_config max_num_batched_tokens
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
seq_tokens_prefill_budget: list[list[int]] = []
......@@ -1205,7 +1206,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None
# Budget can not schedule in waiting
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
......@@ -1241,7 +1242,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None
# Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
block_size=block_size,
......@@ -1269,7 +1270,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None
# Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
block_size=block_size,
......@@ -1303,7 +1304,7 @@ def test_remove_seq_from_computed_blocks_tracker():
assert seq_id_to_num_tokens_computed is None
# Budget can not allocate, AllocStatus is LATER
block_size = 2
block_size = 2 if not current_platform.is_rocm() else 64
max_seq_group = 3
scheduler = initialize_scheduler(
block_size=block_size,
......
......@@ -6,6 +6,7 @@ import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.scheduler import Scheduler
from vllm.sequence import SequenceGroup
from vllm.platforms import current_platform
from .utils import (append_new_token, create_dummy_prompt_encoder_decoder,
get_sequence_groups, schedule_and_update_computed_tokens)
......@@ -34,7 +35,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cross-attention block table
'''
block_size = 4
block_size = 4 if not current_platform.is_rocm() else 64
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
......
......@@ -7,8 +7,7 @@ import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
from vllm.platforms import current_platform
@pytest.mark.skip_v1
......@@ -23,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
llm = LLM(model=model, block_size=64 if current_platform.is_rocm() else 16)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment