Commit fcffb7c8 authored by zhuwenwen

Merge branch 'vllm-v0.2.7-dtk23.10'

Parents: eb181638 4095d0db
@@ -41,7 +41,7 @@ python3 setup.py install
+ If downloading with pip install is slow, you can add a mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/
## Verification
- python -c "import vllm; print(vllm.\_\_version__)"; the version number is kept in sync with the official release and reports this package's version, e.g. 0.2.6
- python -c "import vllm; print(vllm.\_\_version__)"; the version number is kept in sync with the official release and reports this package's version, e.g. 0.2.7
## Known Issue
-
......
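For reference, the same check can also be run as a short script instead of the one-liner above. A minimal sketch, assuming the install succeeded; `__dcu_version__` is the attribute written by the `get_version_add` change in setup.py further down, read defensively in case the build does not expose it:

```python
# Minimal verification sketch: mirrors the `python -c` command above.
import vllm

print(vllm.__version__)                             # expected to print 0.2.7
print(getattr(vllm, "__dcu_version__", "not set"))  # DCU build tag, if exposed
assert vllm.__version__ == "0.2.7"
```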
@@ -27,7 +27,7 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
vLLM is fast with:
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -36,6 +37,7 @@ void silu_and_mul(
  dim3 grid(num_tokens);
  dim3 block(std::min(d, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
@@ -71,6 +73,7 @@ __global__ void activation_kernel(
  int64_t num_tokens = input.numel() / d; \
  dim3 grid(num_tokens); \
  dim3 block(std::min(d, 1024)); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  VLLM_DISPATCH_FLOATING_TYPES( \
    input.scalar_type(), \
......
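The effect of the added `OptionalCUDAGuard` is easiest to see from the Python side. A minimal sketch, assuming a machine with at least two GPUs and that `SiluAndMul` is importable from `vllm.model_executor.layers.activation` (the same layer exercised by the kernel tests further down): with the guard in place, the custom kernel launches on the device of its input tensor even when a different device is current.

```python
import torch
from vllm.model_executor.layers.activation import SiluAndMul

# Input lives on cuda:1 while cuda:0 is still the current device.
x = torch.randn(8, 2 * 512, dtype=torch.half, device="cuda:1")
layer = SiluAndMul()
out = layer(x)  # silu_and_mul kernel now runs on cuda:1, guarded by device_of(input)
assert out.shape == (8, 512) and out.device == x.device
```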
@@ -21,6 +21,7 @@
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
@@ -616,6 +617,7 @@ void paged_attention_v1_launcher(
  dim3 grid(num_heads, num_seqs, 1);
  dim3 block(NUM_THREADS);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  switch (head_size) {
    // NOTE(woosuk): To reduce the compilation time, we only compile for the
@@ -784,6 +786,7 @@ void paged_attention_v2_launcher(
  int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
  dim3 block(NUM_THREADS);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  switch (head_size) {
    // NOTE(woosuk): To reduce the compilation time, we only compile for the
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -33,6 +34,7 @@ void swap_blocks(
  char *dst_ptr = static_cast<char*>(dst.data_ptr());
  const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
  const at::cuda::OptionalCUDAGuard device_guard(src_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // NOTE(woosuk): This can be slow if the number of blocks is large.
  for (const auto& pair : block_mapping) {
@@ -127,6 +129,7 @@ void copy_blocks(
  const int numel_per_block = key_caches[0][0].numel();
  dim3 grid(num_layers, num_pairs);
  dim3 block(std::min(1024, numel_per_block));
  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
@@ -207,6 +210,7 @@ void reshape_and_cache(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
@@ -367,6 +371,7 @@ void gather_cached_kv(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
......
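As a rough Python-level analogue of what the guarded `swap_blocks` path does per mapped block (illustrative shapes and block numbers only, not vLLM's API): select the source device, then issue asynchronous copies that ride on that device's current stream.

```python
import torch

src = torch.randn(4, 16, 128, device="cuda:0")  # stand-in for GPU cache blocks
dst = torch.empty(src.shape, dtype=src.dtype, device="cpu", pin_memory=True)
block_mapping = {0: 2, 1: 3}                    # src block -> dst block

with torch.cuda.device(src.device):             # plays the role of OptionalCUDAGuard(src_device)
    for src_block, dst_block in block_mapping.items():
        dst[dst_block].copy_(src[src_block], non_blocking=True)  # async on the current stream
torch.cuda.current_stream(src.device).synchronize()
```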
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "dispatch_utils.h"
#include "reduction_utils.cuh"
@@ -76,6 +77,7 @@ void rms_norm(
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
@@ -101,6 +103,7 @@ void fused_add_rms_norm(
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -94,6 +95,7 @@ void rotary_embedding(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    query.scalar_type(),
......
@@ -287,7 +287,8 @@ void gemm_half_q_half_cuda_part
  fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count);
  kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  kernel<<<gridDim, blockDim, 0, stream>>>
  (
    a,
    b_q_weight,
@@ -434,7 +435,8 @@ void reconstruct_exllama
  gridDim.y = DIVIDE(height, BLOCK_KN_SIZE);
  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
  reconstruct_exllama_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    b_q_weight,
    b_q_perm,
@@ -567,7 +569,8 @@ void gemm_half_q_half_alt
  gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX);
  gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
  gemm_half_q_half_alt_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  gemm_half_q_half_alt_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    (const half2*) a,
    b_q_weight,
@@ -639,7 +642,8 @@ void reconstruct_gptq
  blockDim.y = 1;
  gridDim.y = DIVIDE(height, 8);
  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
  reconstruct_gptq_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  reconstruct_gptq_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    b_q_weight,
    b_gptq_scales,
@@ -794,7 +798,8 @@ void shuffle_exllama_weight
  gridDim.x = DIVIDE(width, THREADS_X);
  gridDim.y = height / 8;
  make_sequential_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  make_sequential_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    q_weight,
    new_qweight,
@@ -813,7 +818,8 @@ void shuffle_exllama_weight
  blockDim.y = 1;
  gridDim.x = DIVIDE(width, THREADS_X);
  gridDim.y = 1;
  shuffle_kernel<<<gridDim, blockDim>>>(q_weight, height, width);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width);
}
} // namespace gptq
......
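The q_gemm changes above all follow one pattern: fetch `at::cuda::getCurrentCUDAStream()` and pass it as the fourth launch argument instead of launching on the legacy default stream. A pure-torch sketch of why that matters (illustration only, no vLLM API involved): work issued while a non-default stream is current must be launched on that stream, otherwise synchronizing the stream does not cover it. The patched GPTQ/exllama kernels now get the same ordering guarantees as regular PyTorch ops issued in the block below.

```python
import torch

s = torch.cuda.Stream()
a = torch.randn(1024, 1024, device="cuda", dtype=torch.half)
with torch.cuda.stream(s):
    b = a @ a      # issued on stream s while it is the current stream
s.synchronize()    # waiting on s covers everything launched on it
print(b.float().norm().item())
```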
@@ -7,6 +7,7 @@
// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>
#include <c10/cuda/CUDAGuard.h>
#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16
@@ -200,7 +201,9 @@ void squeezellm_gemm(
  );
  dim3 threads(BLOCKWIDTH);
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
#ifndef USE_ROCM
    (half2*) vec.data<at::Half>(),
#else
......
@@ -58,11 +58,10 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
+ positions: torch.Tensor,
+ kv_caches: List[KVCache],
+ input_metadata: InputMetadata,
+ cache_events: Optional[List[torch.cuda.Event]],
+) -> SamplerOutput:
+) -> Optional[SamplerOutput]:
3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
4. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
.. note::
    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
......
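For clarity, the signature the revised doc asks for looks like the sketch below: `cache_events` is gone and the return type is now `Optional[SamplerOutput]`. The model class name is a placeholder; `InputMetadata` and `SamplerOutput` are the vLLM types the doc refers to, left as forward references here.

```python
from typing import List, Optional, Tuple

import torch
from torch import nn

KVCache = Tuple[torch.Tensor, torch.Tensor]  # (key_cache, value_cache), as in vLLM


class MyModelForCausalLM(nn.Module):  # placeholder name
    def forward(
        self,
        input_ids: torch.Tensor,        # flattened token ids
        positions: torch.Tensor,        # flattened positions
        kv_caches: List[KVCache],
        input_metadata: "InputMetadata",
    ) -> Optional["SamplerOutput"]:
        ...
```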
@@ -47,6 +47,6 @@ if __name__ == "__main__":
    args = parser.parse_args()
    demo = build_demo()
    demo.queue(concurrency_count=100).launch(server_name=args.host,
    demo.queue().launch(server_name=args.host,
                        server_port=args.port,
                        share=True)
@@ -3,8 +3,6 @@ typing-extensions>=4.8.0
starlette
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
......
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
......
@@ -300,8 +300,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
    version += ".torch" + torch.__version__[:3]
    with open(add_version_path, encoding="utf-8",mode="w") as file:
        file.write("__version__='0.2.6'\n")
        file.write("__dcu_version__='0.2.6+{}'\n".format(version))
        file.write("__version__='0.2.7'\n")
        file.write("__dcu_version__='0.2.7+{}'\n".format(version))
        file.close()
......
@@ -8,11 +8,11 @@ import pytest
import requests
def _query_server(prompt: str) -> dict:
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": 100,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
@@ -20,6 +20,10 @@ def _query_server(prompt: str) -> dict:
    return response.json()
def _query_server_long(prompt: str) -> dict:
    return _query_server(prompt, max_tokens=500)
@pytest.fixture
def api_server():
    script_path = Path(__file__).parent.joinpath(
@@ -44,13 +48,14 @@ def test_api_server(api_server):
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["Hello world"] * 1
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for _ in pool.map(_query_server, prompts):
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except Exception:
            except requests.exceptions.ConnectionError:
                time.sleep(1)
        # Actual tests start here
@@ -63,12 +68,14 @@ def test_api_server(api_server):
        assert num_aborted_requests == 0
        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
    with Pool(32) as pool:
        # Cancel requests
        pool.map_async(_query_server, prompts)
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()
@@ -81,6 +88,6 @@ def test_api_server(api_server):
    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
@@ -8,8 +8,9 @@ from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
_TEST_PROMPTS = ["prompts/example.txt"]
_LONG_PROMPTS = ["prompts/summary.txt"]
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
def _read_prompts(filename: str) -> str:
@@ -24,7 +25,7 @@ def _read_prompts(filename: str) -> str:
def example_prompts() -> List[str]:
    prompts = []
    for filename in _TEST_PROMPTS:
        prompts += _read_prompts(os.path.join("tests", filename))
        prompts += _read_prompts(filename)
    return prompts
@@ -32,7 +33,7 @@ def example_prompts() -> List[str]:
def example_long_prompts() -> List[str]:
    prompts = []
    for filename in _LONG_PROMPTS:
        prompts += _read_prompts(os.path.join("tests", filename))
        prompts += _read_prompts(filename)
    return prompts
......
@@ -8,7 +8,7 @@ import pytest
import torch
from vllm.config import ParallelConfig
from vllm.engine.ray_utils import get_open_port
from vllm.utils import get_open_port
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce,
    tensor_model_parallel_all_gather,
......
@@ -12,6 +12,7 @@ def create_kv_caches(
    head_size: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
@@ -23,7 +24,7 @@ def create_kv_caches(
    for _ in range(num_layers):
        key_cache = torch.empty(size=key_cache_shape,
                                dtype=dtype,
                                device='cuda')
                                device=device)
        key_cache.uniform_(-scale, scale)
        key_caches.append(key_cache)
@@ -32,7 +33,7 @@ def create_kv_caches(
    for _ in range(num_layers):
        value_cache = torch.empty(size=value_cache_shape,
                                  dtype=dtype,
                                  device='cuda')
                                  device=device)
        value_cache.uniform_(-scale, scale)
        value_caches.append(value_cache)
    return key_caches, value_caches
......
@@ -7,22 +7,26 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_silu_and_mul(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id)
    layer = SiluAndMul()
    out = layer(x)
    ref_out = layer._forward(x)
@@ -33,16 +37,19 @@ def test_silu_and_mul(
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_gelu_new(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
    layer = NewGELU()
    out = layer(x)
    ref_out = layer._forward(x)
@@ -53,15 +60,18 @@ def test_gelu_new(
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
def test_gelu_fast(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
    layer = FastGELU()
    out = layer(x)
    ref_out = layer._forward(x)
......
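The test changes above all introduce the same `DEVICES` parametrization. Distilled into a minimal standalone test (a sketch assuming at least one CUDA device is available), the pattern is:

```python
import pytest
import torch

# Exercise two GPUs when available, otherwise fall back to a single device.
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]


@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_runs_on_each_device(device: int) -> None:
    gpu_id = f"cuda:{device}"
    x = torch.randn(16, 16, device=gpu_id)
    assert x.device == torch.device(gpu_id)
```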
@@ -24,6 +24,7 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
def ref_masked_attention(
@@ -87,7 +88,7 @@ def ref_single_query_cached_kv_attention(
        alibi_bias = None
        if alibi_slopes is not None:
            # Create the ALiBi bias used in the paged attention kernel.
            position_ids = torch.arange(context_len, device="cuda").int()
            position_ids = torch.arange(context_len, device=query.device).int()
            alibi_bias = (position_ids - context_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                1, 1, -1)
@@ -105,6 +106,7 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
def test_paged_attention(
    kv_cache_factory,
    version: str,
@@ -115,18 +117,19 @@ def test_paged_attention(
    block_size: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    gpu_id = f"cuda:{device}"
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
    query = torch.empty(num_seqs,
                        num_query_heads,
                        head_size,
                        dtype=dtype,
                        device="cuda")
                        device=gpu_id)
    query.uniform_(-scale, scale)
    assert num_query_heads % num_kv_heads == 0
@@ -135,12 +138,12 @@ def test_paged_attention(
    if use_alibi:
        alibi_slopes = torch.randn(num_query_heads,
                                   dtype=torch.float,
                                   device="cuda")
                                   device=gpu_id)
    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
    context_lens[-1] = MAX_SEQ_LEN
    max_context_len = max(context_lens)
    context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
    context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id)
    # Create the block tables.
    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
@@ -151,12 +154,12 @@ def test_paged_attention(
            for _ in range(max_num_blocks_per_seq)
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
    block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id)
    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
                                                num_kv_heads, head_size, dtype,
                                                seed)
                                                seed, gpu_id)
    key_cache, value_cache = key_caches[0], value_caches[0]
    # Call the paged attention kernel.
@@ -249,7 +252,7 @@ def ref_multi_query_kv_attention(
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                               diagonal=1)
        attn_mask = attn_mask * torch.finfo(dtype).min
        attn_mask = attn_mask.to(dtype=dtype, device="cuda")
        attn_mask = attn_mask.to(dtype=dtype, device=query.device)
        ref_output = ref_masked_attention(
            query[start_idx:end_idx],
@@ -269,6 +272,7 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_multi_query_kv_attention(
    num_seqs: int,
@@ -276,11 +280,12 @@ def test_multi_query_kv_attention(
    head_size: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    gpu_id = f"cuda:{device}"
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use
    # a smaller MAX_SEQ_LEN here.
@@ -294,7 +299,7 @@ def test_multi_query_kv_attention(
                      num_query_heads + 2 * num_kv_heads,
                      head_size,
                      dtype=dtype,
                      device="cuda")
                      device=gpu_id)
    qkv.uniform_(-scale, scale)
    query, key, value = qkv.split(
        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
......