Commit 84dfdb17 authored by zhuwenwen's avatar zhuwenwen
Browse files

remove unused code

parent f137e58c
......@@ -106,79 +106,6 @@ __global__ void count_and_sort_expert_tokens_kernel(
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
template <typename scalar_t>
__global__ void sgl_ep_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum,
int32_t start_expert, int32_t end_expert) {
__shared__ int32_t shared_counts[32][8];
__shared__ int32_t local_offsets[256];
const int warp_id = threadIdx.x / 32;
const int lane_id = threadIdx.x % 32;
const int experts_per_warp = 8;
const int my_expert_start = warp_id * experts_per_warp;
for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) {
shared_counts[warp_id][i] = 0;
}
}
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int expert_id = topk_ids[i];
if (expert_id >= start_expert && expert_id < end_expert) {
expert_id -= start_expert;
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
}
}
__syncthreads();
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx][expert_offset];
cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
local_offsets[threadIdx.x] = cumsum[threadIdx.x];
}
__syncthreads();
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
if (expert_id >= start_expert && expert_id < end_expert) {
expert_id -= start_expert;
int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
sorted_token_ids[rank_post_pad] = i;
}
}
}
template <typename scalar_t, int TOPK>
__global__ void moe_sum_kernel(
scalar_t* __restrict__ out, // [..., d]
......
......@@ -250,13 +250,6 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
int64_t head_size, torch::Tensor& cos_sin_cache,
bool is_neox, int64_t rot_dim,
torch::Tensor& cos_sin_cache_offsets);
void rotary_embedding_tgi(
torch::Tensor& query,
torch::Tensor& key,
int64_t head_size,
torch::Tensor& cos_cache,
torch::Tensor& sin_cache,
bool is_neox);
void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
......
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
namespace vllm {
template<typename scalar_t, bool IS_NEOX>
inline __device__ void apply_token_rotary_embedding_tgi(
scalar_t* __restrict__ arr,
const float* __restrict__ cos_ptr,
const float* __restrict__ sin_ptr,
int rot_offset,
int embed_dim)
{
int x_index, y_index;
float cos, sin;
if (IS_NEOX) {
// GPT-NeoX style rotary embedding.
x_index = rot_offset;
y_index = embed_dim + rot_offset;
cos = VLLM_LDG(cos_ptr + x_index);
sin = VLLM_LDG(sin_ptr + x_index);
} else {
// GPT-J style rotary embedding.
x_index = 2 * rot_offset;
y_index = 2 * rot_offset + 1;
cos = VLLM_LDG(cos_ptr + x_index / 2);
sin = VLLM_LDG(sin_ptr + x_index / 2);
}
const scalar_t x = arr[x_index];
const scalar_t y = arr[y_index];
arr[x_index] = x * cos - y * sin;
arr[y_index] = y * cos + x * sin;
}
template<typename scalar_t, bool IS_NEOX>
inline __device__ void apply_rotary_embedding_tgi(
scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
const float* __restrict__ cos_ptr, // [max_position, 1, rot_dim]
const float* __restrict__ sin_ptr, // [max_position, 1, rot_dim]
const int head_size,
const int num_heads,
const int num_kv_heads,
const int rot_dim,
const int token_idx,
const int64_t query_stride,
const int64_t key_stride)
{
const int nq = num_heads * rot_dim;
for (int i = threadIdx.x; i < nq; i += blockDim.x) {
const int head_idx = i / rot_dim;
const int64_t token_head = token_idx * query_stride + head_idx * head_size;
const int rot_offset = i % rot_dim;
apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
sin_ptr, rot_offset, rot_dim);
}
const int nk = num_kv_heads * rot_dim;
for (int i = threadIdx.x; i < nk; i += blockDim.x) {
const int head_idx = i / rot_dim;
const int64_t token_head = token_idx * key_stride + head_idx * head_size;
const int rot_offset = i % rot_dim;
apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
sin_ptr, rot_offset, rot_dim);
}
}
template<typename scalar_t, bool IS_NEOX>
__global__ void rotary_embedding_tgi_kernel(
scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
const float* __restrict__ cos_cache, // [max_position, 1, rot_dim]
const float* __restrict__ sin_cache, // [max_position, 1, rot_dim]
const int rot_dim,
const int64_t query_stride,
const int64_t key_stride,
const int num_heads,
const int num_kv_heads,
const int head_size) {
// Each thread block is responsible for one token.
const int token_idx = blockIdx.x;
const float* cos_ptr = cos_cache + token_idx * rot_dim;
const float* sin_ptr = sin_cache + token_idx * rot_dim;
apply_rotary_embedding_tgi<scalar_t, IS_NEOX>(query, key, cos_ptr, sin_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
}
} // namespace vllm
void rotary_embedding_tgi(
torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
int64_t head_size,
torch::Tensor& cos_cache,
torch::Tensor& sin_cache,
bool is_neox) {
int num_tokens = query.size(0);
int rot_dim = cos_cache.size(2);
int num_heads = query.size(1);
int num_kv_heads = key.size(1);
int query_stride = query.stride(0);
int key_stride = key.stride(0);
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * rot_dim / 2, 512));
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(),
"rotary_embedding_tgi",
[&] {
if (is_neox) {
vllm::rotary_embedding_tgi_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
query.data_ptr<scalar_t>(),
key.data_ptr<scalar_t>(),
cos_cache.data_ptr<float>(),
sin_cache.data_ptr<float>(),
rot_dim,
query_stride,
key_stride,
num_heads,
num_kv_heads,
head_size);
} else {
vllm::rotary_embedding_tgi_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
query.data_ptr<scalar_t>(),
key.data_ptr<scalar_t>(),
cos_cache.data_ptr<float>(),
sin_cache.data_ptr<float>(),
rot_dim,
query_stride,
key_stride,
num_heads,
num_kv_heads,
head_size);
}
});
}
......@@ -391,15 +391,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor cos_sin_cache, bool is_neox) -> ()");
ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);
// Rotary embedding TGI for TGI
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
ops.def(
"rotary_embedding_tgi(Tensor! query, Tensor! key,"
" int head_size, Tensor cos_cache,"
" Tensor sin_cache, bool is_neox) -> ()");
// ops.def("rotary_embedding_tgi",&rotary_embedding_tgi);
ops.impl("rotary_embedding_tgi", torch::kCUDA, &rotary_embedding_tgi);
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key
// (supports multiple loras).
ops.def(
......
......@@ -35,6 +35,7 @@ def get_llm() -> LLM:
},
)
# Why do we need hf_overrides for the official original version:
# vllm converts it to Qwen3ForSequenceClassification when loaded for
# better performance.
......
......@@ -8,7 +8,7 @@ image:
# -- Image tag
tag: "latest"
# -- Container launch command
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
......
......@@ -41,7 +41,7 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
if envs.VLLM_USE_FLASH_ATTN_PA:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
......@@ -111,7 +111,7 @@ def test_models(
prompt_embeds = hf_model.get_prompt_embeddings(
example_prompts)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
if envs.VLLM_USE_FLASH_ATTN_PA:
with VllmRunner(model,
max_model_len=8192,
enforce_eager=enforce_eager,
......
......@@ -86,6 +86,20 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if envs.VLLM_USE_FLASH_ATTN_PA:
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
else:
with vllm_runner(
model,
dtype=dtype,
......@@ -94,7 +108,6 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
......@@ -150,6 +163,22 @@ def test_models_distributed(
# will hurt multiprocessing backend with
# fork method (the default method).
if envs.VLLM_USE_FLASH_ATTN_PA:
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
block_size=64,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
max_tokens,
)
else:
with vllm_runner(
model,
dtype=dtype,
......@@ -158,7 +187,6 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
......@@ -277,6 +305,25 @@ def test_with_prefix_caching(
max_num_batched_tokens = max_num_seqs = chunk_size
outputs = {} # type: ignore
for enable in (True, False):
if envs.VLLM_USE_FLASH_ATTN_PA:
with vllm_runner(
model,
dtype=dtype,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=True,
enable_prefix_caching=enable,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64,
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
outputs[enable] += vllm_model.generate_greedy(
[prompt],
max_tokens,
)
else:
with vllm_runner(
model,
dtype=dtype,
......@@ -286,7 +333,6 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
......
# SPDX-License-Identifier: Apache-2.0
import os
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
import os
from ..utils import models_path_prefix
TEST_MODELS = [
(os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
# (os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic"), {
# "dtype": torch.float16,
# "quantization": "fp8"
# }),
(os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"), {
"quantization": "compressed-tensors"
}),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), {}),
]
if is_quant_method_supported("aqlm"):
TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
"quantization": "aqlm"
}))
# TODO: figure out why this fails.
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), {
"quantization": "gptq"
}))
# if is_quant_method_supported("gptq_marlin"):
# TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
# "quantization": "gptq_marlin"
# }))
# if is_quant_method_supported("gptq_marlin_24"):
# TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
# "quantization": "gptq_marlin_24"
# }))
# if is_quant_method_supported("marlin"):
# TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
# "quantization": "marlin"
# }))
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
"quantization": "AWQ"
}))
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
print(f"MODEL={model}")
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model=model,
enforce_eager=True,
tensor_parallel_size=tp_size,
disable_custom_all_reduce=True,
compilation_config=optimization_level,
**model_kwargs)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
......@@ -24,7 +24,7 @@ from vllm.utils import SUPPORT_TC, gpuname
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -107,7 +107,7 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs",
[
{
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
......@@ -200,15 +200,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
......@@ -274,7 +274,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache
......@@ -355,7 +355,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -430,7 +430,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
"max_model_len": 48,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"block_size": 64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
......@@ -844,7 +844,17 @@ def test_chunked_prefill_with_actual_engine(model: str,
"""
prompt = "hello" * 40
if envs.VLLM_USE_FLASH_ATTN_PA:
engine_args = EngineArgs(
model=model,
max_num_partial_prefills=max_num_partial_prefills,
max_num_batched_tokens=40,
max_num_seqs=8,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8,
block_size=64,
)
else:
engine_args = EngineArgs(
model=model,
max_num_partial_prefills=max_num_partial_prefills,
......@@ -852,7 +862,6 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
......
......@@ -23,7 +23,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
llm = LLM(model=model, block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
......
......@@ -13,7 +13,7 @@ import vllm.envs as envs
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
@pytest.mark.parametrize("block_size", [64] if envs.VLLM_USE_FLASH_ATTN_PA else [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
# without triggering asserts.
......
......@@ -60,7 +60,7 @@ def test_custom_executor(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -84,7 +84,7 @@ def test_custom_executor_async(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -111,7 +111,7 @@ def test_respect_ray(model):
model=model,
distributed_executor_backend="ray",
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
from typing import Any, List, Optional
import pytest
import os
from vllm import CompletionOutput, LLMEngine, SamplingParams
from ..utils import models_path_prefix
MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200
IS_ASYNC = False
@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
def _test_stopping(llm_engine: LLMEngine,
expected_output: str,
expected_reason: Any,
stop: Optional[List[str]] = None,
stop_token_ids: Optional[List[int]] = None,
include_in_output: bool = False,
use_async_output_proc: bool = False) -> None:
llm_engine.add_request(
"id", "A story about vLLM:\n",
SamplingParams(
temperature=0.0,
max_tokens=MAX_TOKENS,
stop=stop,
stop_token_ids=stop_token_ids,
include_stop_str_in_output=include_in_output,
), None)
output: Optional[CompletionOutput] = None
output_text = ""
stop_reason = None
if use_async_output_proc:
llm_engine.step()
while llm_engine.has_unfinished_requests():
(request_output, ) = llm_engine.step()
(output, ) = request_output.outputs
# Ensure we don't backtrack
assert output.text.startswith(output_text)
output_text = output.text
stop_reason = output.stop_reason
assert output is not None
assert output_text == expected_output
assert stop_reason == expected_reason
def _set_async_mode(llm_engine, is_async):
llm_engine.scheduler[0].use_async_output_proc = is_async
def _stop_basic(llm_engine, is_async):
_test_stopping(llm_engine,
stop=["."],
include_in_output=False,
expected_output="VLLM is a 100% volunteer organization",
expected_reason=".",
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop=["."],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organization.",
expected_reason=".",
use_async_output_proc=is_async)
def _stop_multi_tokens(llm_engine, is_async):
_test_stopping(
llm_engine,
stop=["group of peo", "short"],
include_in_output=False,
expected_output="VLLM is a 100% volunteer organization. We are a ",
expected_reason="group of peo",
use_async_output_proc=is_async)
_test_stopping(
llm_engine,
stop=["group of peo", "short"],
include_in_output=True,
expected_output=
"VLLM is a 100% volunteer organization. We are a group of peo",
expected_reason="group of peo",
use_async_output_proc=is_async)
def _stop_partial_token(llm_engine, is_async):
_test_stopping(llm_engine,
stop=["gani"],
include_in_output=False,
expected_output="VLLM is a 100% volunteer or",
expected_reason="gani",
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop=["gani"],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organi",
expected_reason="gani",
use_async_output_proc=is_async)
def _stop_token_id(llm_engine, is_async):
# token id 13013 => " organization"
_test_stopping(llm_engine,
stop_token_ids=[13013],
include_in_output=False,
expected_output="VLLM is a 100% volunteer",
expected_reason=13013,
use_async_output_proc=is_async)
_test_stopping(llm_engine,
stop_token_ids=[13013],
include_in_output=True,
expected_output="VLLM is a 100% volunteer organization",
expected_reason=13013,
use_async_output_proc=is_async)
@pytest.mark.skip_global_cleanup
def test_stop_basic(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_basic(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_basic(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_multi_tokens(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_multi_tokens(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_partial_token(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_partial_token(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_partial_token(vllm_model.model.llm_engine, is_async=False)
@pytest.mark.skip_global_cleanup
def test_stop_token_id(vllm_model):
_set_async_mode(vllm_model.model.llm_engine, True)
_stop_token_id(vllm_model.model.llm_engine, is_async=True)
_set_async_mode(vllm_model.model.llm_engine, False)
_stop_token_id(vllm_model.model.llm_engine, is_async=False)
import json
import openai # use the official client for correctness check
import pytest
import os
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer, models_path_prefix
# any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
# Define the json format LoRA module configurations
lora_module_1 = {
"name": "zephyr-lora",
"path": zephyr_lora_files,
"base_model_name": MODEL_NAME
}
lora_module_2 = {
"name": "zephyr-lora2",
"path": zephyr_lora_files,
"base_model_name": MODEL_NAME
}
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
json.dumps(lora_module_1),
json.dumps(lora_module_2),
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"64",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
async with server_with_lora_modules_json.get_async_client(
) as async_client:
yield async_client
@pytest.mark.asyncio
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
zephyr_lora_files):
models = await client_for_lora_lineage.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert served_model.root == MODEL_NAME
assert served_model.parent is None
assert all(lora_model.root == zephyr_lora_files
for lora_model in lora_models)
assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
from typing import Optional
import pytest
import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm import _custom_ops as ops
from vllm.attention.ops.blocksparse_attention.interface import (
LocalStridedBlockSparseAttn)
from vllm.platforms import current_platform
from vllm.utils import get_max_shared_memory_bytes
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
# MAX_SEQ_LEN = 2771
# There may not be enough gpu memory due to large NUM_BLOCKS.
# Reduce NUM_BLOCKS when it happens.
NUM_BLOCKS = 4321 # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16]
NUM_GEN_SEQS = [3] # Arbitrary values for testing
NUM_PREFILL_SEQS = [3] # Arbitrary values for testing
NUM_HEADS = [(40, 40)] # Arbitrary values for testing
HEAD_SIZES = [64, 112]
BLOCK_SIZES = [16]
USE_ALIBI = [False, True]
KV_CACHE_DTYPE = ["auto", "fp8"] if not current_platform.is_rocm() else ["auto"]
SEEDS = [0]
CUDA_DEVICES = ['cuda:0']
BLOCKSPARSE_LOCAL_BLOCKS = [16]
BLOCKSPARSE_VERT_STRIDES = [8]
BLOCKSPARSE_BLOCK_SIZES = [64]
BLOCKSPARSE_HEADS_SLIDINGS = [2, -1]
BLOCKSPARSE_HOMO_HEADS = [True, False]
def ref_masked_attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
scale: float,
attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
if attn_mask is not None:
attn_weights = attn_weights + attn_mask.float()
attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
out = torch.einsum("hqk,khd->qhd", attn_weights, value)
return out
def ref_single_query_cached_kv_attention(
output: torch.Tensor,
query: torch.Tensor,
num_queries_per_kv: int,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
block_tables: torch.Tensor,
seq_lens: torch.Tensor,
scale: float,
alibi_slopes: Optional[torch.Tensor],
tp_rank: int = 0,
blocksparse_local_blocks: int = 0,
blocksparse_vert_stride: int = 1,
blocksparse_block_size: int = 64,
blocksparse_head_sliding_step: int = 0,
) -> None:
num_query_heads = query.shape[1]
num_kv_heads = value_cache.shape[1]
head_size = value_cache.shape[2]
block_size = value_cache.shape[3]
num_seqs = query.shape[0]
block_tables_lst = block_tables.cpu().tolist()
seq_lens_lst = seq_lens.cpu().tolist()
for i in range(num_seqs):
q = query[i].unsqueeze(0)
block_table = block_tables_lst[i]
seq_len = int(seq_lens_lst[i])
keys_lst: list[torch.Tensor] = []
values_lst: list[torch.Tensor] = []
for j in range(seq_len):
block_number = int(block_table[j // block_size])
block_offset = j % block_size
k = key_cache[block_number, :, :, block_offset, :]
k = k.reshape(num_kv_heads, head_size)
keys_lst.append(k)
v = value_cache[block_number, :, :, block_offset]
values_lst.append(v)
keys = torch.stack(keys_lst, dim=0)
values = torch.stack(values_lst, dim=0)
if num_queries_per_kv > 1:
# Handle MQA and GQA
keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
values = torch.repeat_interleave(values, num_queries_per_kv, dim=1)
alibi_bias = None
if alibi_slopes is not None:
# Create the ALiBi bias used in the paged attention kernel.
position_ids = torch.arange(seq_len).int()
alibi_bias = (position_ids - seq_len + 1).float()
alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
1, 1, -1)
if blocksparse_vert_stride >= 1:
bsize = blocksparse_block_size
hsliding = blocksparse_head_sliding_step
vert = blocksparse_vert_stride
locals = blocksparse_local_blocks
qb = (seq_len - 1) // bsize
attn_mask = q.new_zeros(
(num_query_heads, 1, seq_len)).float() - torch.inf
for h in range(num_query_heads):
if hsliding >= 0: # slide with q heads
bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1
else: # slide with kv heads
bs_offset = (tp_rank * num_kv_heads +
h // num_queries_per_kv) * (-hsliding) + 1
for kb in range(qb + 1):
kj = kb * bsize
if (qb - kb) < locals or \
(kb + bs_offset) % vert == 0:
attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0
if alibi_bias is not None:
attn_mask += alibi_bias
else:
attn_mask = alibi_bias
out = ref_masked_attention(q, keys, values, scale, attn_mask=attn_mask)
out = out.view(num_query_heads, head_size)
output[i].copy_(out, non_blocking=True)
@pytest.mark.parametrize("version", ["v1", "v2"])
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
@pytest.mark.parametrize("blocksparse_head_sliding_step",
BLOCKSPARSE_HEADS_SLIDINGS)
def test_paged_attention(
kv_cache_factory,
version: str,
num_seqs: int,
num_heads: tuple[int, int],
head_size: int,
use_alibi: bool,
block_size: int,
dtype: torch.dtype,
kv_cache_dtype: str,
seed: int,
device: str,
blocksparse_local_blocks: int,
blocksparse_vert_stride: int,
blocksparse_block_size: int,
blocksparse_head_sliding_step: int,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads
query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
query.uniform_(-scale, scale)
assert num_query_heads % num_kv_heads == 0
num_queries_per_kv = num_query_heads // num_kv_heads
alibi_slopes = None
if use_alibi:
alibi_slopes = torch.rand(num_query_heads, dtype=torch.float)
seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
seq_lens[-1] = MAX_SEQ_LEN
max_seq_len = max(seq_lens)
seq_lens = torch.tensor(seq_lens, dtype=torch.int)
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
block_tables.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int)
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
num_kv_heads, head_size,
kv_cache_dtype, dtype, seed,
device)
key_cache, value_cache = key_caches[0], value_caches[0]
# Using default kv_scale
k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
tp_rank = 0
# Call the paged attention kernel.
output = torch.empty_like(query)
if version == "v1":
ops.paged_attention_v1(
output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
tp_rank=tp_rank,
blocksparse_local_blocks=blocksparse_local_blocks,
blocksparse_vert_stride=blocksparse_vert_stride,
blocksparse_block_size=blocksparse_block_size,
blocksparse_head_sliding_step=blocksparse_head_sliding_step,
)
elif version == "v2":
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
assert PARTITION_SIZE % block_size == 0
num_seqs, num_heads, head_size = output.shape
tmp_output = torch.empty(
size=(num_seqs, num_heads, num_partitions, head_size),
dtype=output.dtype,
)
exp_sums = torch.empty(
size=(num_seqs, num_heads, num_partitions),
dtype=torch.float32,
)
max_logits = torch.empty_like(exp_sums)
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
tp_rank=tp_rank,
blocksparse_local_blocks=blocksparse_local_blocks,
blocksparse_vert_stride=blocksparse_vert_stride,
blocksparse_block_size=blocksparse_block_size,
blocksparse_head_sliding_step=blocksparse_head_sliding_step,
)
else:
raise AssertionError(f"Unknown version: {version}")
# Run the reference implementation.
if kv_cache_dtype == "fp8":
# Convert cache data back to dtype.
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
block_size, x)
dequantized_key_cache = torch.empty(size=key_cache_shape,
dtype=dtype,
device=device)
ops.convert_fp8(dequantized_key_cache, key_cache)
key_cache = dequantized_key_cache
value_cache_shape = value_cache.shape
dequantized_value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
device=device)
ops.convert_fp8(dequantized_value_cache, value_cache)
value_cache = dequantized_value_cache
ref_output = torch.empty_like(query)
ref_single_query_cached_kv_attention(
ref_output,
query,
num_queries_per_kv,
key_cache,
value_cache,
block_tables,
seq_lens,
scale,
alibi_slopes,
tp_rank,
blocksparse_local_blocks,
blocksparse_vert_stride,
blocksparse_block_size,
blocksparse_head_sliding_step,
)
# NOTE(woosuk): Due to the kernel-level differences in the two
# implementations, there is a small numerical difference in the two
# outputs. Thus, we use a relaxed tolerance for the test.
atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
# so we use a relaxed tolerance for the test.
atol, rtol = 1e-3, 1e-5
if kv_cache_dtype == "fp8":
atol, rtol = 1e-2, 1e-5
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
def ref_multi_query_kv_attention(
cu_seq_lens: list[int],
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
scale: float,
dtype: torch.dtype,
) -> torch.Tensor:
num_seqs = len(cu_seq_lens) - 1
ref_outputs = []
for i in range(num_seqs):
start_idx = cu_seq_lens[i]
end_idx = cu_seq_lens[i + 1]
seq_len = end_idx - start_idx
# Create attention mask.
attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
diagonal=1)
attn_mask = attn_mask * torch.finfo(dtype).min
attn_mask = attn_mask.to(dtype=dtype)
ref_output = ref_masked_attention(
query[start_idx:end_idx],
key[start_idx:end_idx],
value[start_idx:end_idx],
scale,
attn_mask=attn_mask,
)
ref_outputs.append(ref_output)
ref_output = torch.cat(ref_outputs, dim=0)
return ref_output
# @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
# @pytest.mark.parametrize("num_heads", NUM_HEADS)
# @pytest.mark.parametrize("head_size", HEAD_SIZES)
# @pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS)
# @pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES)
# @pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES)
# @pytest.mark.parametrize("blocksparse_homo_heads", BLOCKSPARSE_HOMO_HEADS)
# @pytest.mark.parametrize("dtype", DTYPES)
# @pytest.mark.parametrize("seed", SEEDS)
# @pytest.mark.parametrize("device", CUDA_DEVICES)
# @torch.inference_mode()
# def test_varlen_blocksparse_attention_prefill(
# num_seqs: int,
# num_heads: tuple[int, int],
# head_size: int,
# blocksparse_local_blocks: int,
# blocksparse_vert_stride: int,
# blocksparse_block_size: int,
# blocksparse_homo_heads: bool,
# dtype: torch.dtype,
# seed: int,
# device: str,
# ) -> None:
# current_platform.seed_everything(seed)
# torch.set_default_device(device)
# # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# # As the xformers library is already tested with its own tests, we can use
# # a smaller MAX_SEQ_LEN here.
# max_len = min(MAX_SEQ_LEN, 4096)
# seq_lens = random.sample(range(1, max_len), num_seqs)
# cu_seq_lens = torch.cumsum(torch.tensor([0] + seq_lens), dim=0)
# num_tokens = sum(seq_lens)
# scale = float(1.0 / (head_size**0.5))
# num_query_heads, num_kv_heads = num_heads
# assert num_query_heads % num_kv_heads == 0
# num_queries_per_kv = num_query_heads // num_kv_heads
# qkv = torch.empty(num_tokens,
# num_query_heads + 2 * num_kv_heads,
# head_size,
# dtype=dtype)
# qkv.uniform_(-scale, scale)
# query, key, value = qkv.split(
# [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
# bs_attn_op = LocalStridedBlockSparseAttn(
# num_query_heads,
# max_len,
# local_blocks=blocksparse_local_blocks,
# vert_stride=blocksparse_vert_stride,
# block_size=blocksparse_block_size,
# device=device,
# dtype=dtype,
# homo_head=blocksparse_homo_heads)
# output = bs_attn_op(query,
# key,
# value,
# cu_seq_lens.to(device),
# sm_scale=scale)
# if num_queries_per_kv > 1:
# # Handle MQA and GQA
# key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
# value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
# ref_output = ref_multi_query_kv_attention(
# cu_seq_lens.tolist(),
# query,
# key,
# value,
# scale,
# dtype,
# )
# torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2)
from typing import List
import os
import vllm
from vllm.lora.request import LoRARequest
from ..utils import models_path_prefix
MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen1.5-32B-Chat")
PROMPT_TEMPLATE = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
prompts = [
PROMPT_TEMPLATE.format(query="who are you?"),
PROMPT_TEMPLATE.format(
query="What is the capital city of China?"
),
PROMPT_TEMPLATE.format(
query="What is the longest river in the world?"
),
]
print(prompts)
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: List[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
def test_qwen_lora(qwen_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True)
expected_lora_output = [
"I am a large language model created by Alibaba Cloud. I am called Qwen.",
"The capital city of China is Beijing.",
"The longest river in the world is the Nile, located in Africa. It stretches for approximately 4,135 miles (6,650 kilometers) from its source in the highlands of Rwanda, through Tanzania, Uganda, South Sudan, Sudan, and Egypt, before emptying into the Mediterranean Sea. The Nile is famous for its historical and cultural significance, particularly in relation to ancient Egyptian civilization.",
]
output1 = do_sample(llm, qwen_lora_files, lora_id=1)
for i in range(len(expected_lora_output)):
assert output1[i] == expected_lora_output[i]
output2 = do_sample(llm, qwen_lora_files, lora_id=2)
for i in range(len(expected_lora_output)):
assert output2[i] == expected_lora_output[i]
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
Run `pytest tests/models/test_mamba.py`.
"""
import os
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = [
os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]
# Use lower-level interfaces to create this greedy generator, as mamba will
# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
def generate_greedy(model_name, example_prompts, max_tokens):
# Create a text generation pipeline
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Set the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generate texts from the prompts
outputs = []
for prompt in example_prompts:
# Tokenize the input prompt with truncation
inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
input_ids = inputs["input_ids"].to(model.device)
# Generate text using the model's generate method directly
generated_ids = model.generate(input_ids,
max_new_tokens=max_tokens,
do_sample=False)
generated_text = tokenizer.decode(generated_ids[0],
skip_special_tokens=True)
outputs.append((generated_ids[0].tolist(), generated_text))
return outputs
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
hf_outputs = generate_greedy(model, example_prompts, max_tokens)
# Set max_num_seqs to keep Codestral from going OOM at fp32
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
# To pass the small model tests, we need full precision.
for_loop_outputs = []
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for prompt in example_prompts:
for_loop_outputs.append(
vllm_model.generate_greedy([prompt], max_tokens)[0])
batched_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal(
outputs_0_lst=for_loop_outputs,
outputs_1_lst=batched_outputs,
name_0="for_loop_vllm",
name_1="batched_vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
model: str, dtype: str,
max_tokens: int) -> None:
# Tests chunked prefill in conjunction with n>1. In this case, prefill is
# populated with decoding tokens and we test that it doesn't fail.
# This test might fail if cache is not allocated correctly for n > 1
# decoding steps inside a chunked prefill forward pass (where we have both
# prefill and decode together )
sampling_params = SamplingParams(n=3,
temperature=1,
seed=0,
max_tokens=max_tokens)
with vllm_runner(
model,
dtype=dtype,
enable_chunked_prefill=True,
max_num_batched_tokens=30,
max_num_seqs=10 # forces prefill chunks with decoding
) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
max_tokens: int,
chunked_prefill_token_size: int) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size
non_chunked = generate_greedy(model, example_prompts, max_tokens)
with vllm_runner(model,
dtype=dtype,
enable_chunked_prefill=True,
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_seqs) as vllm_model:
chunked = vllm_model.generate_greedy(example_prompts,
max_tokens=max_tokens)
check_outputs_equal(
outputs_0_lst=chunked,
outputs_1_lst=non_chunked,
name_0="chunked",
name_1="non_chunked",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
# Numerical differences produce slightly different output for these
if 'state-spaces' in model:
example_prompts.pop(0)
example_prompts.pop(0)
example_prompts.pop(0)
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for_loop_outputs = []
for _ in range(10):
for_loop_outputs.append(
vllm_model.generate_greedy(example_prompts, max_tokens)[0])
sampling_params = SamplingParams(n=10,
temperature=0.001,
seed=0,
max_tokens=max_tokens)
n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
token_ids, texts = n_lt_1_outputs[0]
n_lt_1_outputs = [(token_id, text)
for token_id, text in zip(token_ids, texts)]
check_outputs_equal(
outputs_0_lst=n_lt_1_outputs,
outputs_1_lst=for_loop_outputs,
name_0="vllm_n_lt_1_outputs",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
# This test is for verifying that mamba cache is padded to CG captured
# batch size. If it's not, a torch RuntimeError will be raised because
# tensor dimensions aren't compatible
vllm_config = EngineArgs(model=model).create_engine_config()
while len(example_prompts) == vllm_config.pad_for_cudagraph(
len(example_prompts)):
example_prompts.append(example_prompts[0])
try:
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
except RuntimeError:
pytest.fail(
"Couldn't run batch size which is not equal to a Cuda Graph "
"captured batch size. "
"Could be related to mamba cache not padded correctly")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
# Tests that outputs are identical with and w/o preemtions (recompute)
assert dtype == "float"
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_model.model.llm_engine.scheduler[
0].ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
vllm_model.model.llm_engine.scheduler[
0].ENABLE_ARTIFICIAL_PREEMPT = False
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=preempt_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="vllm_preepmtions",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
vllm_runner,
model: str,
dtype: str,
example_prompts,
) -> None:
# This test is for verifying that the Mamba inner state management doesn't
# collapse in case where the number of incoming requests and
# finished_requests_ids is larger than the maximum Mamba block capacity.
# This could generally happen due to the fact that Mamba does support
# statelessness mechanism where it can cleanup new incoming requests in
# a single step.
try:
with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Mamba inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
vllm_runner,
model: str,
dtype: str,
example_prompts,
) -> None:
# This test is for verifying that the Mamba state is cleaned up between
# steps, If its not cleaned, an error would be expected.
try:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Mamba inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_multistep(
vllm_runner,
model: str,
dtype: str,
example_prompts,
) -> None:
with vllm_runner(model, num_scheduler_steps=8,
max_num_seqs=2) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
def test_multistep_correctness(vllm_runner, model: str, dtype: str,
max_tokens: int, example_prompts) -> None:
with vllm_runner(model, num_scheduler_steps=8,
max_num_seqs=2) as vllm_model:
vllm_outputs_multistep = vllm_model.generate_greedy(
example_prompts, max_tokens)
with vllm_runner(model, num_scheduler_steps=1,
max_num_seqs=2) as vllm_model:
vllm_outputs_single_step = vllm_model.generate_greedy(
example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_outputs_multistep,
outputs_1_lst=vllm_outputs_single_step,
name_0="vllm_outputs_multistep",
name_1="vllm_outputs_single_step",
)
from contextlib import nullcontext
import numpy as np
import pytest
import os
from transformers import LlavaNextImageProcessor
from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.image import rescale_image_size
from ..utils import models_path_prefix
@pytest.fixture
def mm_registry():
return MultiModalRegistry()
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_llava_next_image_processor(image_assets, mm_registry, dtype,
size_factor):
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-vicuna-7b-hf")
hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, LlavaNextImageProcessor)
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype=dtype,
revision=None,
limit_mm_per_prompt={"image": 1},
)
mm_registry.init_mm_limits_per_prompt(model_config)
for asset in image_assets:
image = rescale_image_size(asset.pil_image, size_factor)
hf_result = hf_processor.preprocess(
image,
return_tensors="pt",
)
vllm_result = mm_registry.map_input(
model_config,
{"image": image},
)
assert hf_result.keys() == vllm_result.keys()
for key, hf_tensor in hf_result.items():
hf_arr: np.ndarray = hf_tensor.numpy()
vllm_arr: np.ndarray = vllm_result[key].numpy()
assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
@pytest.mark.parametrize(
("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
(2, 1, False), (2, 2, True)],
)
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
revision=None,
limit_mm_per_prompt={"image": limit},
)
mm_registry.init_mm_limits_per_prompt(model_config)
image = image_assets[0].pil_image
if num_images == 0:
mm_inputs = {}
elif num_images == 1:
mm_inputs = {"image": image}
else:
mm_inputs = {"image": [image] * num_images}
with nullcontext() if is_valid else pytest.raises(ValueError):
mm_registry.map_input(model_config, mm_inputs)
# NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images):
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")
model_config = ModelConfig(
model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
revision=None,
limit_mm_per_prompt={"image": num_images},
)
mm_registry.init_mm_limits_per_prompt(model_config)
image = image_assets[0].pil_image
mm_inputs = {"image": [image] * num_images}
mapped_inputs = mm_registry.map_input(model_config, mm_inputs)
assert len(mapped_inputs["pixel_values"]) == num_images
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment