OpenDAS / text-generation-inference / Commits / e019635f

Commit e019635f authored Nov 01, 2024 by xuxzh1

update

parent 64def8e2

Changes: 171
Showing 20 changed files with 0 additions and 2192 deletions (+0, -2192)
server/vllm/requirements.txt  +0 -9
server/vllm/setup.py  +0 -292
server/vllm/tests/async_engine/api_server_async_engine.py  +0 -51
server/vllm/tests/async_engine/test_api_server.py  +0 -89
server/vllm/tests/async_engine/test_async_llm_engine.py  +0 -80
server/vllm/tests/async_engine/test_request_tracker.py  +0 -75
server/vllm/tests/conftest.py  +0 -212
server/vllm/tests/distributed/test_comm_ops.py  +0 -82
server/vllm/tests/engine/test_detokenize.py  +0 -63
server/vllm/tests/kernels/conftest.py  +0 -43
server/vllm/tests/kernels/test_activation.py  +0 -75
server/vllm/tests/kernels/test_attention.py  +0 -332
server/vllm/tests/kernels/test_cache.py  +0 -146
server/vllm/tests/kernels/test_layernorm.py  +0 -58
server/vllm/tests/kernels/test_pos_encoding.py  +0 -174
server/vllm/tests/models/test_models.py  +0 -46
server/vllm/tests/samplers/test_beam_search.py  +0 -46
server/vllm/tests/samplers/test_logprobs.py  +0 -55
server/vllm/tests/samplers/test_sampler.py  +0 -185
server/vllm/tests/test_patched_rotary.py  +0 -79
server/vllm/requirements.txt  deleted  100644 → 0  (view file @ 64def8e2)
ninja # For faster builds.
psutil
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
transformers==4.33.2 # match TGI
uvicorn[standard]
pydantic < 2 # Required for OpenAI server.
server/vllm/setup.py  deleted  100755 → 0  (view file @ 64def8e2)

import io
import os
import re
import subprocess
from typing import List, Set
import warnings

from packaging.version import parse, Version
import setuptools
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME

ROOT_DIR = os.path.dirname(__file__)

# Supported NVIDIA GPU architectures.
SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}

# Compiler flags.
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
# TODO(woosuk): Should we use -O3?
NVCC_FLAGS = ["-O2", "-std=c++17", "--gpu-max-threads-per-block=1024"]

ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

if torch.version.hip:
    if ROCM_HOME is not None:
        NVCC_FLAGS += [f"-DUSE_ROCM"]

if not torch.version.hip:
    if CUDA_HOME is None:
        raise RuntimeError(
            "Cannot find CUDA_HOME. CUDA must be available to build the package.")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
    """Get the CUDA version from nvcc.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
    output = nvcc_output.split()
    release_idx = output.index("release") + 1
    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
    return nvcc_cuda_version


def get_torch_arch_list() -> Set[str]:
    # TORCH_CUDA_ARCH_LIST can have one or more architectures,
    # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
    # compiler to additionally include PTX code that can be runtime-compiled
    # and executed on the 8.6 or newer architectures. While the PTX code will
    # not give the best performance on the newer architectures, it provides
    # forward compatibility.
    env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
    if env_arch_list is None:
        return set()

    # List are separated by ; or space.
    torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
    if not torch_arch_list:
        return set()

    # Filter out the invalid architectures and print a warning.
    valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in SUPPORTED_ARCHS})
    arch_list = torch_arch_list.intersection(valid_archs)
    # If none of the specified architectures are valid, raise an error.
    if not arch_list:
        raise RuntimeError(
            "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
            f"variable ({env_arch_list}) is supported. "
            f"Supported CUDA architectures are: {valid_archs}.")
    invalid_arch_list = torch_arch_list - valid_archs
    if invalid_arch_list:
        warnings.warn(
            f"Unsupported CUDA architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA architectures are: "
            f"{valid_archs}.")
    return arch_list


def get_cuda_compute_capabilities(nvcc_cuda_version):
    # First, check the TORCH_CUDA_ARCH_LIST environment variable.
    compute_capabilities = get_torch_arch_list()
    if not compute_capabilities:
        # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
        # GPUs on the current machine.
        device_count = torch.cuda.device_count()
        for i in range(device_count):
            major, minor = torch.cuda.get_device_capability(i)
            if major < 7:
                raise RuntimeError(
                    "GPUs with compute capability below 7.0 are not supported.")
            compute_capabilities.add(f"{major}.{minor}")

    if not compute_capabilities:
        # If no GPU is specified nor available, add all supported architectures
        # based on the NVCC CUDA version.
        compute_capabilities = SUPPORTED_ARCHS.copy()
        if nvcc_cuda_version < Version("11.1"):
            compute_capabilities.remove("8.6")
        if nvcc_cuda_version < Version("11.8"):
            compute_capabilities.remove("8.9")
            compute_capabilities.remove("9.0")
    return compute_capabilities


def validate_nvcc_cuda_version(nvcc_cuda_version, compute_capabilities):
    if nvcc_cuda_version < Version("11.0"):
        raise RuntimeError(
            "CUDA 11.0 or higher is required to build the package.")
    if nvcc_cuda_version < Version("11.1"):
        if any(cc.startswith("8.6") for cc in compute_capabilities):
            raise RuntimeError(
                "CUDA 11.1 or higher is required for compute capability 8.6.")
    if nvcc_cuda_version < Version("11.8"):
        if any(cc.startswith("8.9") for cc in compute_capabilities):
            # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
            # However, GPUs with compute capability 8.9 can also run the code generated by
            # the previous versions of CUDA 11 and targeting compute capability 8.0.
            # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
            # instead of 8.9.
            warnings.warn(
                "CUDA 11.8 or higher is required for compute capability 8.9. "
                "Targeting compute capability 8.0 instead.")
            compute_capabilities = set(cc for cc in compute_capabilities
                                       if not cc.startswith("8.9"))
            compute_capabilities.add("8.0+PTX")
        if any(cc.startswith("9.0") for cc in compute_capabilities):
            raise RuntimeError(
                "CUDA 11.8 or higher is required for compute capability 9.0.")


if not torch.version.hip:
    nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
    compute_capabilities = get_cuda_compute_capabilities(nvcc_cuda_version)
    validate_nvcc_cuda_version(nvcc_cuda_version, compute_capabilities)

    # Add target compute capabilities to NVCC flags.
    for capability in compute_capabilities:
        num = capability[0] + capability[2]
        NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
        if capability.endswith("+PTX"):
            NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]

    # Use NVCC threads to parallelize the build.
    if nvcc_cuda_version >= Version("11.2"):
        num_threads = min(os.cpu_count(), 8)
        NVCC_FLAGS += ["--threads", str(num_threads)]

ext_modules = []

# Cache operations.
cache_extension = CUDAExtension(
    name="vllm.cache_ops",
    sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(cache_extension)

# Attention kernels.
attention_extension = CUDAExtension(
    name="vllm.attention_ops",
    sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(attention_extension)

# Positional encoding kernels.
positional_encoding_extension = CUDAExtension(
    name="vllm.pos_encoding_ops",
    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(positional_encoding_extension)

# Layer normalization kernels.
layernorm_extension = CUDAExtension(
    name="vllm.layernorm_ops",
    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(layernorm_extension)

# Activation kernels.
activation_extension = CUDAExtension(
    name="vllm.activation_ops",
    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(activation_extension)

# Quantization kernels.
quantization_extension = CUDAExtension(
    name="vllm.quantization_ops",
    sources=[
        "csrc/quantization.cpp",
        "csrc/quantization/awq/gemm_kernels.cu",
    ],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(quantization_extension)
# ROCM also need this
# if not torch.version.hip:
#     ext_modules.append(quantization_extension)

# Misc. CUDA utils.
cuda_utils_extension = CUDAExtension(
    name="vllm.cuda_utils",
    sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
)
ext_modules.append(cuda_utils_extension)


def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str):
    """Extract version information from the given filepath.

    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
    """
    with open(filepath) as fp:
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                                  fp.read(), re.M)
        if version_match:
            return version_match.group(1)
        raise RuntimeError("Unable to find version string.")


def read_readme() -> str:
    """Read the README file."""
    return io.open(get_path("README.md"), "r", encoding="utf-8").read()


def get_requirements() -> List[str]:
    """Get Python package dependencies from requirements.txt."""
    with open(get_path("requirements.txt")) as f:
        requirements = f.read().strip().split("\n")
    return requirements


setuptools.setup(
    name="vllm",
    version=find_version(get_path("vllm", "__init__.py")),
    author="vLLM Team",
    license="Apache 2.0",
    description=("A high-throughput and memory-efficient inference and "
                 "serving engine for LLMs"),
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/vllm-project/vllm",
    project_urls={
        "Homepage": "https://github.com/vllm-project/vllm",
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "License :: OSI Approved :: Apache Software License",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=setuptools.find_packages(
        exclude=("benchmarks", "csrc", "docs", "examples", "tests")),
    python_requires=">=3.8",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    cmdclass={"build_ext": BuildExtension},
)
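As a quick illustration of how the architecture strings handled by the setup above turn into NVCC flags, here is a minimal standalone sketch (not part of the commit; values chosen arbitrarily for the example) that mirrors the "-gencode" construction in the deleted setup.py:

# Illustrative only: mirrors the flag construction loop in the deleted setup.py.
def gencode_flags(compute_capabilities):
    flags = []
    for capability in sorted(compute_capabilities):
        num = capability[0] + capability[2]  # "8.6" -> "86", "8.6+PTX" -> "86"
        flags += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
        if capability.endswith("+PTX"):
            # "+PTX" additionally embeds PTX for forward compatibility.
            flags += ["-gencode", f"arch=compute_{num},code=compute_{num}"]
    return flags

print(gencode_flags({"8.0", "8.6+PTX"}))
# ['-gencode', 'arch=compute_80,code=sm_80',
#  '-gencode', 'arch=compute_86,code=sm_86',
#  '-gencode', 'arch=compute_86,code=compute_86']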
server/vllm/tests/async_engine/api_server_async_engine.py  deleted  100644 → 0  (view file @ 64def8e2)

"""vllm.entrypoints.api_server with some extra logging for testing."""
import argparse
from typing import Any, Dict

import uvicorn
from fastapi.responses import JSONResponse, Response

import vllm.entrypoints.api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = vllm.entrypoints.api_server.app


class AsyncLLMEngineWithStats(AsyncLLMEngine):

    # pylint: disable=redefined-outer-name
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0

    async def abort(self, request_id: str) -> None:
        await super().abort(request_id)
        self._num_aborts += 1

    def testing_stats(self) -> Dict[str, Any]:
        return {"num_aborted_requests": self._num_aborts}


@app.get("/stats")
def stats() -> Response:
    """Get the statistics of the engine."""
    return JSONResponse(engine.testing_stats())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="debug",
        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
server/vllm/tests/async_engine/test_api_server.py  deleted  100644 → 0  (view file @ 64def8e2)

import subprocess
import sys
import time
from multiprocessing import Pool
from pathlib import Path

import pytest
import requests


def _query_server(prompt: str) -> dict:
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": 100,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
    response.raise_for_status()
    return response.json()


@pytest.fixture
def api_server():
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
    # pylint: disable=consider-using-with
    uvicorn_process = subprocess.Popen([
        sys.executable, "-u",
        str(script_path), "--model", "facebook/opt-125m"
    ])
    yield
    uvicorn_process.terminate()


# pylint: disable=redefined-outer-name, unused-argument
def test_api_server(api_server):
    """
    Run the API server and test it.

    We run both the server and requests in separate processes.

    We test that the server can handle incoming requests, including
    multiple requests at the same time, and that it can handle requests
    being cancelled without crashing.
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["Hello world"] * 1
        result = None
        while not result:
            # pylint: disable=bare-except
            try:
                for result in pool.map(_query_server, prompts):
                    break
            except:
                time.sleep(1)

        # Actual tests start here
        # Try with 1 prompt
        for result in pool.map(_query_server, prompts):
            assert result

        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests == 0

        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        for result in pool.map(_query_server, prompts):
            assert result

        # Cancel requests
        pool.map_async(_query_server, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()

        # check cancellation stats
        num_aborted_requests = requests.get(
            "http://localhost:8000/stats").json()["num_aborted_requests"]
        assert num_aborted_requests > 0

    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
server/vllm/tests/async_engine/test_async_llm_engine.py  deleted  100644 → 0  (view file @ 64def8e2)

import asyncio
from dataclasses import dataclass

import pytest

from vllm.engine.async_llm_engine import AsyncLLMEngine


@dataclass
class RequestOutput:
    request_id: int
    finished: bool = False


class MockEngine:

    def __init__(self):
        self.step_calls = 0
        self.add_request_calls = 0
        self.abort_request_calls = 0
        self.request_id = None

    async def step_async(self):
        self.step_calls += 1
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []

    def generate(self, request_id):
        self.request_id = request_id

    def stop_generating(self):
        self.request_id = None

    def add_request(self, **kwargs):
        del kwargs  # Unused
        self.add_request_calls += 1

    def abort_request(self, request_id):
        del request_id  # Unused
        self.abort_request_calls += 1


class MockAsyncLLMEngine(AsyncLLMEngine):

    def _init_engine(self, *args, **kwargs):
        return MockEngine()


@pytest.mark.asyncio
async def test_new_requests_event():
    engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0

    await engine.add_request("1", "", None)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 1
    assert engine.engine.step_calls == 1

    await engine.add_request("2", "", None)
    engine.engine.generate("2")
    await asyncio.sleep(0)
    assert engine.engine.add_request_calls == 2
    assert engine.engine.step_calls == 2
    await asyncio.sleep(0)
    assert engine.engine.step_calls == 3
    engine.engine.stop_generating()
    await asyncio.sleep(0)
    assert engine.engine.step_calls == 4
    await asyncio.sleep(0)
    assert engine.engine.step_calls == 4

    await engine.add_request("3", "", None)
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == 5
    await asyncio.sleep(0.01)
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == 5
server/vllm/tests/async_engine/test_request_tracker.py  deleted  100644 → 0  (view file @ 64def8e2)

import pytest

from vllm.engine.async_llm_engine import RequestTracker
from vllm.outputs import RequestOutput


class DummyEvent:

    def __init__(self):
        self.flag = False

    def set(self):
        self.flag = True

    def clear(self):
        self.flag = False


def test_request_tracker():
    tracker = RequestTracker()
    tracker.new_requests_event = DummyEvent()
    stream_1 = tracker.add_request("1")
    assert tracker.new_requests_event.flag
    new, finished = tracker.get_new_and_finished_requests()
    assert not tracker.new_requests_event.flag
    assert len(new) == 1
    assert new[0]["request_id"] == "1"
    assert not finished
    assert not stream_1.finished

    stream_2 = tracker.add_request("2")
    stream_3 = tracker.add_request("3")
    assert tracker.new_requests_event.flag
    new, finished = tracker.get_new_and_finished_requests()
    assert not tracker.new_requests_event.flag
    assert len(new) == 2
    assert new[0]["request_id"] == "2"
    assert new[1]["request_id"] == "3"
    assert not finished
    assert not stream_2.finished
    assert not stream_3.finished

    # request_ids must be unique
    with pytest.raises(KeyError):
        tracker.add_request("1")
    assert not tracker.new_requests_event.flag

    tracker.abort_request("1")
    new, finished = tracker.get_new_and_finished_requests()
    assert len(finished) == 1
    assert "1" in finished
    assert not new
    assert stream_1.finished

    stream_4 = tracker.add_request("4")
    tracker.abort_request("4")
    assert tracker.new_requests_event.flag
    new, finished = tracker.get_new_and_finished_requests()
    assert len(finished) == 1
    assert "4" in finished
    assert not new
    assert stream_4.finished

    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.flag
    tracker.process_request_output(
        RequestOutput("2", "output", [], [], [], finished=True))
    new, finished = tracker.get_new_and_finished_requests()
    assert not tracker.new_requests_event.flag
    assert len(finished) == 1
    assert "2" in finished
    assert len(new) == 1
    assert new[0]["request_id"] == "5"
    assert stream_2.finished
    assert not stream_5.finished
server/vllm/tests/conftest.py  deleted  100644 → 0  (view file @ 64def8e2)

from typing import List, Optional, Tuple

import pytest
import torch
from transformers import AutoModelForCausalLM

from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer

_TEST_PROMPTS = [
    # pylint: disable=line-too-long
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
    "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
    "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
    "Describe the basic components of a neural network and how it can be trained.",
    "Write a short story about a robot that dreams for the first time.",
    "Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.",
    "Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.",
    "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'",
]


@pytest.fixture
def example_prompts() -> List[str]:
    return _TEST_PROMPTS


_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.half,
    "bfloat16": torch.bfloat16,
    "float": torch.float,
}


class HfRunner:

    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        ).cuda()
        if tokenizer_name is None:
            tokenizer_name = model_name
        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)

    def generate(
        self,
        prompts: List[str],
        **kwargs,
    ) -> List[Tuple[List[int], str]]:
        outputs: List[Tuple[List[int], str]] = []
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output_ids = self.model.generate(
                input_ids.cuda(),
                use_cache=True,
                **kwargs,
            )
            output_str = self.tokenizer.batch_decode(
                output_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            output_ids = output_ids.cpu().tolist()
            outputs.append((output_ids, output_str))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            outputs[i] = (output_ids[0], output_str[0])
        return outputs

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            for j in range(len(output_ids)):
                output_ids[j] = [
                    x for x in output_ids[j]
                    if x != self.tokenizer.pad_token_id
                ]
            outputs[i] = (output_ids, output_str)
        return outputs

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[List[torch.Tensor]]:
        all_logprobs = []
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
                input_ids.cuda(),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
            )
            seq_logprobs = []
            for hidden_states in output.hidden_states:
                last_hidden_states = hidden_states[-1][0]
                logits = torch.matmul(
                    last_hidden_states,
                    self.model.get_output_embeddings().weight.t(),
                )
                if self.model.get_output_embeddings().bias is not None:
                    logits += self.model.get_output_embeddings(
                    ).bias.unsqueeze(0)
                logprobs = torch.nn.functional.log_softmax(logits,
                                                           dim=-1,
                                                           dtype=torch.float32)
                seq_logprobs.append(logprobs)
            all_logprobs.append(seq_logprobs)
        return all_logprobs


@pytest.fixture
def hf_runner():
    return HfRunner


class VllmRunner:

    def __init__(
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
    ) -> None:
        self.model = LLM(
            model=model_name,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=0,
        )

    def generate(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
    ) -> List[Tuple[List[int], str]]:
        req_outputs = self.model.generate(prompts,
                                          sampling_params=sampling_params)
        outputs = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids = []
            req_sample_output_strs = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = sample.token_ids
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append(prompt_str + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs


@pytest.fixture
def vllm_runner():
    return VllmRunner
server/vllm/tests/distributed/test_comm_ops.py  deleted  100644 → 0  (view file @ 64def8e2)

"""Test the communication operators.

Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process

import pytest
import torch

from vllm.config import ParallelConfig
from vllm.engine.ray_utils import get_open_port
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce,
    tensor_model_parallel_all_gather,
)
from vllm.worker.worker import _init_distributed_environment


def init_test_distributed_environment(pipeline_parallel_size: int,
                                      tensor_parallel_size: int, rank: int,
                                      distributed_init_port: str):
    parallel_config = ParallelConfig(pipeline_parallel_size,
                                     tensor_parallel_size,
                                     worker_use_ray=True)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    torch.cuda.set_device(rank)
    _init_distributed_environment(parallel_config, rank,
                                  distributed_init_method)


def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                      distributed_init_port)
    num_elements = 8
    all_tensors = [
        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
        (r + 1) for r in range(tensor_parallel_size)
    ]
    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
    t = all_tensors[rank]
    t = tensor_model_parallel_all_reduce(t)
    assert torch.allclose(t, expected)


def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                           distributed_init_port: str):
    init_test_distributed_environment(1, tensor_parallel_size, rank,
                                      distributed_init_port)
    num_dimensions = 3
    tensor_size = list(range(2, num_dimensions + 2))
    total_size = 1
    for s in tensor_size:
        total_size *= s
    for all_gather_dimension in range(num_dimensions):
        all_tensors = [
            torch.arange(total_size, dtype=torch.float32,
                         device="cuda").reshape(tensor_size) * (r + 1)
            for r in range(tensor_parallel_size)
        ]
        expected = torch.cat(all_tensors, dim=all_gather_dimension)
        t = all_tensors[rank]
        t = tensor_model_parallel_all_gather(t, all_gather_dimension)
        assert torch.allclose(t, expected)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
    distributed_init_port = get_open_port()
    processes = []
    for rank in range(tensor_parallel_size):
        p = Process(target=test_target,
                    args=(tensor_parallel_size, rank, distributed_init_port))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    assert all(p.exitcode == 0 for p in processes)
server/vllm/tests/engine/test_detokenize.py  deleted  100644 → 0  (view file @ 64def8e2)

import pytest
from transformers import AutoTokenizer

from vllm.transformers_utils.tokenizer import detokenize_incrementally

TRUTH = [
    # pylint: disable=line-too-long
    "Hello here, this is a simple test",
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",
    "我很感谢你的热情"
]
TOKENIZERS = [
    "facebook/opt-125m",
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/gpt-j-6b",
    "EleutherAI/pythia-70m",
    "bigscience/bloom-560m",
    "mosaicml/mpt-7b",
    "tiiuae/falcon-7b",
    "meta-llama/Llama-2-7b-hf",
    "codellama/CodeLlama-7b-hf",
]


def _run_incremental_decode(tokenizer, all_input_ids,
                            skip_special_tokens: bool):
    decoded_text = ""
    offset = 0
    token_offset = 0
    prev_tokens = None
    for i in range(len(all_input_ids)):
        new_tokens, text, offset, token_offset = detokenize_incrementally(
            tokenizer,
            all_input_ids[:i + 1],
            prev_tokens,
            offset,
            token_offset,
            skip_special_tokens=skip_special_tokens)
        decoded_text += text
        if prev_tokens is None:
            prev_tokens = new_tokens
        else:
            prev_tokens += new_tokens
    return decoded_text


@pytest.mark.parametrize("truth", TRUTH)
@pytest.mark.parametrize("tokenizer_id", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", (True, False))
def test_decode_streaming(tokenizer_id, truth, skip_special_tokens):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"]
    if skip_special_tokens:
        all_input_ids = ([tokenizer.bos_token_id]
                         if tokenizer.bos_token_id is not None else
                         []) + all_input_ids + [tokenizer.eos_token_id]

    decoded_text = _run_incremental_decode(
        tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens)

    assert decoded_text == truth
server/vllm/tests/kernels/conftest.py  deleted  100644 → 0  (view file @ 64def8e2)

from typing import List, Tuple

import pytest
import torch


def create_kv_caches(
    num_blocks: int,
    block_size: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
    seed: int,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    scale = head_size**-0.5
    x = 16 // torch.tensor([], dtype=dtype).element_size()
    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
    key_caches = []
    for _ in range(num_layers):
        key_cache = torch.empty(size=key_cache_shape,
                                dtype=dtype,
                                device='cuda')
        key_cache.uniform_(-scale, scale)
        key_caches.append(key_cache)

    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
    value_caches = []
    for _ in range(num_layers):
        value_cache = torch.empty(size=value_cache_shape,
                                  dtype=dtype,
                                  device='cuda')
        value_cache.uniform_(-scale, scale)
        value_caches.append(value_cache)
    return key_caches, value_caches


@pytest.fixture()
def kv_cache_factory():
    return create_kv_caches
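To make the cache layout above concrete, here is a small standalone sketch (not part of the commit; example sizes chosen arbitrarily, fp16 assumed) of the shapes create_kv_caches produces:

import torch

# Illustrative only: the paged KV-cache shapes built by create_kv_caches above.
num_blocks, block_size, num_heads, head_size = 1024, 16, 8, 128
dtype = torch.half
x = 16 // torch.tensor([], dtype=dtype).element_size()  # 16 bytes / 2-byte elements = 8
print((num_blocks, num_heads, head_size // x, block_size, x))  # key cache:   (1024, 8, 16, 16, 8)
print((num_blocks, num_heads, head_size, block_size))          # value cache: (1024, 8, 128, 16)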
server/vllm/tests/kernels/test_activation.py  deleted  100644 → 0  (view file @ 64def8e2)

import pytest
import torch
import torch.nn.functional as F
from transformers.activations import get_activation

from vllm import activation_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
SEEDS = [0]


def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(chunks=2, dim=1)
    return F.silu(x1) * x2


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_silu_and_mul(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
    activation_ops.silu_and_mul(out, x)
    ref_out = ref_silu_and_mul(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_gelu_new(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
    activation_ops.gelu_new(out, x)
    ref_out = get_activation("gelu_new")(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
def test_gelu_fast(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
    activation_ops.gelu_fast(out, x)
    ref_out = get_activation("gelu_fast")(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
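ref_silu_and_mul above is the usual SwiGLU-style gating reference that the fused kernel is checked against; a minimal standalone sketch of the same computation (not part of the commit; arbitrary example sizes):

import torch
import torch.nn.functional as F

# Illustrative only: what ref_silu_and_mul computes on a [num_tokens, 2*d] input.
x = torch.randn(2, 8)        # num_tokens=2, d=4
x1, x2 = x.chunk(2, dim=1)   # two halves of size d
out = F.silu(x1) * x2        # gated output, shape [num_tokens, d]
print(out.shape)             # torch.Size([2, 4])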
server/vllm/tests/kernels/test_attention.py  deleted  100644 → 0  (view file @ 64def8e2)

import random
from typing import List, Optional, Tuple

import pytest
import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

from vllm import attention_ops
from vllm.utils import get_max_shared_memory_bytes

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 128  # Arbitrary values for testing
PARTITION_SIZE = 512

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_GEN_SEQS = [7]  # Arbitrary values for testing
NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
SEEDS = [0]


def ref_masked_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    scale: float,
    attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
    if attn_mask is not None:
        attn_weights = attn_weights + attn_mask.float()
    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
    out = torch.einsum("hqk,khd->qhd", attn_weights, value)
    return out


def ref_single_query_cached_kv_attention(
    output: torch.Tensor,
    query: torch.Tensor,
    num_queries_per_kv: int,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    block_tables: torch.Tensor,
    context_lens: torch.Tensor,
    scale: float,
    alibi_slopes: Optional[torch.Tensor],
) -> None:
    num_query_heads = query.shape[1]
    num_kv_heads = value_cache.shape[1]
    head_size = value_cache.shape[2]
    block_size = value_cache.shape[3]
    num_seqs = query.shape[0]

    block_tables = block_tables.cpu().tolist()
    context_lens = context_lens.cpu().tolist()
    for i in range(num_seqs):
        q = query[i].unsqueeze(0)
        block_table = block_tables[i]
        context_len = int(context_lens[i])

        keys = []
        values = []
        for j in range(context_len):
            block_number = int(block_table[j // block_size])
            block_offset = j % block_size

            k = key_cache[block_number, :, :, block_offset, :]
            k = k.reshape(num_kv_heads, head_size)
            keys.append(k)

            v = value_cache[block_number, :, :, block_offset]
            values.append(v)
        keys = torch.stack(keys, dim=0)
        values = torch.stack(values, dim=0)
        if num_queries_per_kv > 1:
            # Handle MQA and GQA
            keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
            values = torch.repeat_interleave(values,
                                             num_queries_per_kv,
                                             dim=1)

        alibi_bias = None
        if alibi_slopes is not None:
            # Create the ALiBi bias used in the paged attention kernel.
            position_ids = torch.arange(context_len, device="cuda").int()
            alibi_bias = (position_ids - context_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                1, 1, -1)

        out = ref_masked_attention(q, keys, values, scale, alibi_bias)
        out = out.view(num_query_heads, head_size)
        output[i].copy_(out, non_blocking=True)


@pytest.mark.parametrize("version", ["v1", "v2"])
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
def test_paged_attention(
    kv_cache_factory,
    version: str,
    num_seqs: int,
    num_heads: Tuple[int, int],
    head_size: int,
    use_alibi: bool,
    block_size: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
    query = torch.empty(num_seqs,
                        num_query_heads,
                        head_size,
                        dtype=dtype,
                        device="cuda")
    query.uniform_(-scale, scale)

    assert num_query_heads % num_kv_heads == 0
    num_queries_per_kv = num_query_heads // num_kv_heads
    head_mapping = torch.repeat_interleave(
        torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
        num_queries_per_kv)
    alibi_slopes = None
    if use_alibi:
        alibi_slopes = torch.randn(num_query_heads,
                                   dtype=torch.float,
                                   device="cuda")

    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
    context_lens[-1] = MAX_SEQ_LEN
    max_context_len = max(context_lens)
    context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")

    # Create the block tables.
    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
    block_tables = []
    for _ in range(num_seqs):
        block_table = [
            random.randint(0, NUM_BLOCKS - 1)
            for _ in range(max_num_blocks_per_seq)
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
                                                num_kv_heads, head_size,
                                                dtype, seed)
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Call the paged attention kernel.
    output = torch.empty_like(query)
    if version == "v1":
        attention_ops.paged_attention_v1(
            output,
            query,
            key_cache,
            value_cache,
            head_mapping,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )
    elif version == "v2":
        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
                          PARTITION_SIZE)
        assert PARTITION_SIZE % block_size == 0
        num_seqs, num_heads, head_size = output.shape
        tmp_output = torch.empty(
            size=(num_seqs, num_heads, num_partitions, head_size),
            dtype=output.dtype,
            device=output.device,
        )
        exp_sums = torch.empty(
            size=(num_seqs, num_heads, num_partitions),
            dtype=torch.float32,
            device=output.device,
        )
        max_logits = torch.empty_like(exp_sums)
        attention_ops.paged_attention_v2(
            output,
            exp_sums,
            max_logits,
            tmp_output,
            query,
            key_cache,
            value_cache,
            head_mapping,
            scale,
            block_tables,
            context_lens,
            block_size,
            max_context_len,
            alibi_slopes,
        )
    else:
        assert False, f"Unknown version: {version}"

    # Run the reference implementation.
    ref_output = torch.empty_like(query)
    ref_single_query_cached_kv_attention(
        ref_output,
        query,
        num_queries_per_kv,
        key_cache,
        value_cache,
        block_tables,
        context_lens,
        scale,
        alibi_slopes,
    )

    # NOTE(woosuk): Due to the kernel-level differences in the two
    # implementations, there is a small numerical difference in the two
    # outputs. Thus, we use a relaxed tolerance for the test.
    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)


def ref_multi_query_kv_attention(
    cu_seq_lens: List[int],
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    scale: float,
    dtype: torch.dtype,
) -> torch.Tensor:
    num_seqs = len(cu_seq_lens) - 1
    ref_outputs = []
    for i in range(num_seqs):
        start_idx = cu_seq_lens[i]
        end_idx = cu_seq_lens[i + 1]
        seq_len = end_idx - start_idx

        # Create attention mask.
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                               diagonal=1)
        attn_mask = attn_mask * torch.finfo(dtype).min
        attn_mask = attn_mask.to(dtype=dtype, device="cuda")

        ref_output = ref_masked_attention(
            query[start_idx:end_idx],
            key[start_idx:end_idx],
            value[start_idx:end_idx],
            scale,
            attn_mask=attn_mask,
        )
        ref_outputs.append(ref_output)
    ref_output = torch.cat(ref_outputs, dim=0)
    return ref_output


# TODO(woosuk): Add tests for USE_ALIBI=True.
@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_multi_query_kv_attention(
    num_seqs: int,
    num_heads: Tuple[int, int],
    head_size: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use
    # a smaller MAX_SEQ_LEN here.
    max_len = min(MAX_SEQ_LEN, 4096)
    seq_lens = random.sample(range(1, max_len), num_seqs)
    num_tokens = sum(seq_lens)

    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
    qkv = torch.empty(num_tokens,
                      num_query_heads + 2 * num_kv_heads,
                      head_size,
                      dtype=dtype,
                      device="cuda")
    qkv.uniform_(-scale, scale)
    query, key, value = qkv.split(
        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)

    num_queries_per_kv = num_query_heads // num_kv_heads
    if num_queries_per_kv > 1:
        # Handle MQA and GQA
        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
    attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
    output = xops.memory_efficient_attention_forward(
        query.unsqueeze(0),
        key.unsqueeze(0),
        value.unsqueeze(0),
        attn_bias=attn_bias,
        p=0.0,
        scale=scale,
    )
    output = output.squeeze(0)

    cu_seq_lens = [0]
    for seq_len in seq_lens:
        cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
    ref_output = ref_multi_query_kv_attention(
        cu_seq_lens,
        query,
        key,
        value,
        scale,
        dtype,
    )
    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
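The MQA/GQA handling in the test above simply repeats each KV head so that it lines up with its group of query heads. A minimal standalone sketch of that expansion (not part of the commit; arbitrary example sizes):

import torch

# Illustrative only: expanding KV heads for GQA as the deleted test does.
num_query_heads, num_kv_heads = 8, 2
num_queries_per_kv = num_query_heads // num_kv_heads  # 4 query heads share each KV head
key = torch.randn(5, num_kv_heads, 64)                # [num_tokens, num_kv_heads, head_size]
expanded = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
print(expanded.shape)  # torch.Size([5, 8, 64]) -> one KV head per query head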
server/vllm/tests/kernels/test_cache.py  deleted  100644 → 0  (view file @ 64def8e2)

import random

import pytest
import torch

from vllm import cache_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
NUM_LAYERS = [5]  # Arbitrary values for testing
NUM_HEADS = [8]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024]  # Arbitrary values for testing
NUM_MAPPINGS = [32, 256]  # Arbitrary values for testing
SEEDS = [0]


@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_copy_blocks(
    kv_cache_factory,
    num_mappings: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Generate random block mappings where each source block is mapped to two
    # destination blocks.
    assert 2 * num_mappings <= num_blocks
    src_blocks = random.sample(range(num_blocks), num_mappings)
    remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
    dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
    block_mapping = {}
    for i in range(num_mappings):
        src = src_blocks[i]
        dst1 = dst_blocks[2 * i]
        dst2 = dst_blocks[2 * i + 1]
        block_mapping[src] = [dst1, dst2]

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
                                                num_layers, num_heads,
                                                head_size, dtype, seed)

    # Clone the KV caches.
    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
    cloned_value_caches = [
        value_cache.clone() for value_cache in value_caches
    ]

    # Call the copy blocks kernel.
    cache_ops.copy_blocks(key_caches, value_caches, block_mapping)

    # Run the reference implementation.
    for src, dsts in block_mapping.items():
        for dst in dsts:
            for cloned_key_cache in cloned_key_caches:
                cloned_key_cache[dst] = cloned_key_cache[src]
            for cloned_value_cache in cloned_value_caches:
                cloned_value_cache[dst] = cloned_value_cache[src]

    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
        assert torch.allclose(key_cache, cloned_key_cache)
    for value_cache, cloned_value_cache in zip(value_caches,
                                               cloned_value_caches):
        assert torch.allclose(value_cache, cloned_value_cache)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_reshape_and_cache(
    kv_cache_factory,
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Create a random slot mapping.
    num_slots = block_size * num_blocks
    slot_mapping = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device="cuda")

    qkv = torch.randn(num_tokens,
                      3,
                      num_heads,
                      head_size,
                      dtype=dtype,
                      device="cuda")
    _, key, value = qkv.unbind(dim=1)

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
                                                num_heads, head_size, dtype,
                                                seed)
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Clone the KV caches.
    cloned_key_cache = key_cache.clone()
    cloned_value_cache = value_cache.clone()

    # Call the reshape_and_cache kernel.
    cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
                                slot_mapping)

    # Run the reference implementation.
    reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
    block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
    block_indicies = block_indicies.cpu().tolist()
    block_offsets = slot_mapping % block_size
    block_offsets = block_offsets.cpu().tolist()
    for i in range(num_tokens):
        block_idx = block_indicies[i]
        block_offset = block_offsets[i]
        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]

    assert torch.allclose(key_cache, cloned_key_cache)
    assert torch.allclose(value_cache, cloned_value_cache)
server/vllm/tests/kernels/test_layernorm.py  deleted  100644 → 0  (view file @ 64def8e2)

import pytest
import torch
import torch.nn as nn

from vllm import layernorm_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
SEEDS = [0]


class RefRMSNorm(nn.Module):

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        weight = torch.empty(hidden_size)
        weight.normal_(mean=1.0, std=0.1)
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance +
                                                    self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    scale = float(hidden_size**-0.5)
    x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda")
    x.uniform_(-scale, scale)
    ref = RefRMSNorm(hidden_size).to(dtype).cuda()

    out = torch.empty_like(x)
    layernorm_ops.rms_norm(
        out,
        x,
        ref.weight.data,
        ref.variance_epsilon,
    )
    ref_out = ref(x)
    assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5)
server/vllm/tests/kernels/test_pos_encoding.py
deleted
100644 → 0
View file @
64def8e2
from
typing
import
Optional
,
Tuple
import
pytest
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
vllm
import
pos_encoding_ops
IS_NEOX_STYLE
=
[
True
,
False
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
256
]
ROTARY_DIMS
=
[
None
,
32
]
# None means rotary dim == head size
NUM_HEADS
=
[
7
,
12
,
40
,
52
]
# Arbitrary values for testing
NUM_TOKENS
=
[
11
,
83
,
2048
]
# Arbitrary values for testing
SEEDS
=
[
0
]
def
rotate_neox
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
x1
=
x
[...,
:
x
.
shape
[
-
1
]
//
2
]
x2
=
x
[...,
x
.
shape
[
-
1
]
//
2
:]
return
torch
.
cat
((
-
x2
,
x1
),
dim
=-
1
)
def
rotate_gptj
(
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
x1
=
x
[...,
::
2
]
x2
=
x
[...,
1
::
2
]
x
=
torch
.
stack
((
-
x2
,
x1
),
dim
=-
1
)
return
x
.
flatten
(
-
2
)
def
apply_rope
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
cos
:
torch
.
Tensor
,
sin
:
torch
.
Tensor
,
is_neox_style
:
bool
,
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
rotate_fn
=
rotate_neox
if
is_neox_style
else
rotate_gptj
q_embed
=
(
q
*
cos
)
+
(
rotate_fn
(
q
)
*
sin
)
k_embed
=
(
k
*
cos
)
+
(
rotate_fn
(
k
)
*
sin
)
return
q_embed
,
k_embed
class
RefRotaryEmbedding
(
nn
.
Module
):
"""Reference implementation of rotary embedding."""
def
__init__
(
self
,
dim
:
int
,
is_neox_style
:
bool
,
max_position_embeddings
:
int
=
8192
,
base
:
int
=
10000
,
)
->
None
:
super
().
__init__
()
self
.
rotary_dim
=
dim
self
.
is_neox_style
=
is_neox_style
self
.
max_position_embeddings
=
max_position_embeddings
# Create cos and sin embeddings.
inv_freq
=
1.0
/
(
base
**
(
torch
.
arange
(
0
,
dim
,
2
)
/
dim
))
t
=
torch
.
arange
(
max_position_embeddings
).
float
()
freqs
=
torch
.
einsum
(
"i,j->ij"
,
t
,
inv_freq
.
float
())
if
is_neox_style
:
emb
=
torch
.
cat
((
freqs
,
freqs
),
dim
=-
1
)
else
:
emb
=
torch
.
repeat_interleave
(
freqs
,
2
,
-
1
)
cos
=
emb
.
cos
().
to
(
dtype
=
inv_freq
.
dtype
)
sin
=
emb
.
sin
().
to
(
dtype
=
inv_freq
.
dtype
)
self
.
register_buffer
(
"cos_cached"
,
cos
,
persistent
=
False
)
self
.
register_buffer
(
"sin_cached"
,
sin
,
persistent
=
False
)
def
forward
(
self
,
positions
:
torch
.
Tensor
,
# [num_tokens]
query
:
torch
.
Tensor
,
# [num_tokens, num_heads, head_size]
key
:
torch
.
Tensor
,
# [num_tokens, num_heads, head_size]
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
query_rot
=
query
[...,
:
self
.
rotary_dim
]
query_pass
=
query
[...,
self
.
rotary_dim
:]
key_rot
=
key
[...,
:
self
.
rotary_dim
]
key_pass
=
key
[...,
self
.
rotary_dim
:]
query_rot
=
query_rot
.
transpose
(
0
,
1
)
key_rot
=
key_rot
.
transpose
(
0
,
1
)
cos
=
F
.
embedding
(
positions
,
self
.
cos_cached
)
sin
=
F
.
embedding
(
positions
,
self
.
sin_cached
)
query_rot
,
key_rot
=
apply_rope
(
query_rot
,
key_rot
,
cos
,
sin
,
self
.
is_neox_style
)
query_rot
=
query_rot
.
transpose
(
0
,
1
).
contiguous
()
key_rot
=
key_rot
.
transpose
(
0
,
1
).
contiguous
()
query
=
torch
.
cat
((
query_rot
,
query_pass
),
dim
=-
1
)
key
=
torch
.
cat
((
key_rot
,
key_pass
),
dim
=-
1
)
# Output query/key shape: [num_tokens, num_tokens, head_size]
        return query, key


@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_rotary_embedding(
    is_neox_style: bool,
    num_tokens: int,
    num_heads: int,
    head_size: int,
    rotary_dim: Optional[int],
    dtype: torch.dtype,
    seed: int,
    max_position: int = 8192,
    base: int = 10000,
) -> None:
    if rotary_dim is None:
        rotary_dim = head_size
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    positions = torch.randint(0, max_position, (num_tokens, ), device="cuda")
    query = torch.randn(num_tokens,
                        num_heads * head_size,
                        dtype=dtype,
                        device="cuda")
    key = torch.randn(num_tokens,
                      num_heads * head_size,
                      dtype=dtype,
                      device="cuda")

    # Create the rotary embedding.
    inv_freq = 1.0 / (base**(
        torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
    t = torch.arange(max_position).float()
    freqs = torch.einsum("i,j -> ij", t, inv_freq)
    cos = freqs.cos()
    sin = freqs.sin()
    cos_sin_cache = torch.cat((cos, sin), dim=-1)
    cos_sin_cache = cos_sin_cache.to(dtype=dtype, device="cuda")

    # Run the kernel. The kernel is in-place, so we need to clone the inputs.
    out_query = query.clone()
    out_key = key.clone()
    pos_encoding_ops.rotary_embedding(
        positions,
        out_query,
        out_key,
        head_size,
        cos_sin_cache,
        is_neox_style,
    )

    # Run the reference implementation.
    ref_rotary_embedding = RefRotaryEmbedding(
        dim=rotary_dim,
        is_neox_style=is_neox_style,
        max_position_embeddings=max_position,
        base=base,
    ).to(dtype=dtype, device="cuda")
    ref_query, ref_key = ref_rotary_embedding(
        positions,
        query.view(num_tokens, num_heads, head_size),
        key.view(num_tokens, num_heads, head_size),
    )
    ref_query = ref_query.view(num_tokens, num_heads * head_size)
    ref_key = ref_key.view(num_tokens, num_heads * head_size)

    # Compare the results.
    assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
    assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
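A side note on the two rotation helpers at the top of this file: rotate_neox rotates across the two halves of the head dimension, while rotate_gptj rotates adjacent even/odd lanes. A toy check (assuming the rotate_neox / rotate_gptj definitions above are in scope) makes the two layouts concrete:

import torch

x = torch.tensor([1., 2., 3., 4.])
print(rotate_neox(x))  # tensor([-3., -4.,  1.,  2.])  -- halves swapped, second half negated
print(rotate_gptj(x))  # tensor([-2.,  1., -4.,  3.])  -- each (even, odd) pair rotated in place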
server/vllm/tests/models/test_models.py deleted 100644 → 0 View file @ 64def8e2
"""Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py --forked`.
"""
import
pytest
MODELS
=
[
"facebook/opt-125m"
,
"meta-llama/Llama-2-7b-hf"
,
"mistralai/Mistral-7B-v0.1"
,
"tiiuae/falcon-7b"
,
"gpt2"
,
"bigcode/tiny_starcoder_py"
,
"EleutherAI/gpt-j-6b"
,
"EleutherAI/pythia-70m"
,
"bigscience/bloom-560m"
,
"mosaicml/mpt-7b"
,
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
vllm_output_ids
,
vllm_output_str
=
vllm_outputs
[
i
]
assert
hf_output_str
==
vllm_output_str
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_str
!
r
}
\n
vLLM:
{
vllm_output_str
!
r
}
"
)
assert
hf_output_ids
==
vllm_output_ids
,
(
f
"Test
{
i
}
:
\n
HF:
{
hf_output_ids
}
\n
vLLM:
{
vllm_output_ids
}
"
)
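The hf_runner, vllm_runner, and example_prompts arguments are pytest fixtures defined in tests/conftest.py (also deleted in this commit), so their internals are not visible here. As a rough standalone sketch of what the vLLM half of the comparison amounts to — assuming the public vllm.LLM entry point and one of the smaller models from MODELS — greedy decoding is simply sampling with temperature 0:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", dtype="half")
greedy = SamplingParams(temperature=0.0, max_tokens=128)  # temperature=0 -> greedy decoding
outputs = llm.generate(["Hello, my name is"], greedy)
print(outputs[0].outputs[0].text)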
server/vllm/tests/samplers/test_beam_search.py deleted 100644 → 0 View file @ 64def8e2
"""Compare the outputs of HF and vLLM when using beam search.
Run `pytest tests/samplers/test_beam_search.py --forked`.
"""
import
pytest
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
# 2. Increase beam_width to 8.
# 3. Use the model "huggyllama/llama-7b".
MAX_TOKENS
=
[
128
]
BEAM_WIDTHS
=
[
4
]
MODELS
=
[
"facebook/opt-125m"
]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
MAX_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"beam_width"
,
BEAM_WIDTHS
)
def
test_beam_search_single_input
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
beam_width
:
int
,
)
->
None
:
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
)
vllm_outputs
=
vllm_model
.
generate_beam_search
(
example_prompts
,
beam_width
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
_
=
hf_outputs
[
i
]
vllm_output_ids
,
_
=
vllm_outputs
[
i
]
assert
len
(
hf_output_ids
)
==
len
(
vllm_output_ids
)
for
j
in
range
(
len
(
hf_output_ids
)):
assert
hf_output_ids
[
j
]
==
vllm_output_ids
[
j
],
(
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"vLLM:
{
vllm_output_ids
}
"
)
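For reference, vLLM exposes beam search through SamplingParams rather than a separate API; a minimal sketch of what the runner fixture presumably requests under the hood (not taken from this file) would be:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", dtype="half")
# best_of doubles as the beam width; vLLM's beam search expects temperature=0.
beam = SamplingParams(use_beam_search=True, best_of=4, temperature=0.0, max_tokens=128)
outputs = llm.generate(["The capital of France is"], beam)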
server/vllm/tests/samplers/test_logprobs.py deleted 100644 → 0 View file @ 64def8e2
import pytest
import torch

from vllm import SamplingParams

MODELS = ["facebook/opt-125m"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_get_prompt_logprobs(
    hf_runner,
    vllm_runner,
    model,
    dtype,
    example_prompts,
):
    max_tokens = 5
    hf_model = hf_runner(model, dtype=dtype)
    hf_logprobs = hf_model.generate_greedy_logprobs(
        example_prompts,
        max_tokens=max_tokens,
    )
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype)
    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                          logprobs=5,
                                          prompt_logprobs=5,
                                          temperature=0.0)
    vllm_results = vllm_model.model.generate(
        example_prompts, sampling_params=vllm_sampling_params)

    # Test whether logprobs are included in the results.
    for result in vllm_results:
        assert result.prompt_logprobs is not None
        assert result.outputs[0].logprobs is not None

    # Test whether prompt logprobs are consistent with HF
    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
        # Check prompt logprobs
        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
            for token_id, logprob in vllm_prompt_logprob_dict.items():
                torch.testing.assert_close(logprob,
                                           hf_logprob[0][i][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
        # Check sample logprobs
        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
        for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs):
            for token_id, logprob in vllm_sample_logprob_dict.items():
                torch.testing.assert_close(logprob,
                                           hf_logprob[i][-1][token_id].item(),
                                           atol=1e-2,
                                           rtol=1e-2)
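Note the [1:] slice on vllm_result.prompt_logprobs: the first prompt token has no preceding context to condition on, so it carries no log-probability of its own and is skipped. A minimal sketch of how per-token prompt log-probs can be computed with plain Transformers/PyTorch (illustrative only; the generate_greedy_logprobs fixture itself is not shown in this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

ids = tok("vLLM is a fast inference engine", return_tensors="pt").input_ids
logprobs = torch.log_softmax(model(ids).logits, dim=-1)
# logprobs[0, i] scores the token at position i + 1, so the first prompt
# token never receives a log-prob of its own.
token_logprobs = logprobs[0, :-1].gather(-1, ids[0, 1:, None]).squeeze(-1)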
server/vllm/tests/samplers/test_sampler.py deleted 100644 → 0 View file @ 64def8e2
# pylint: disable=protected-access
import random
from typing import Tuple
from unittest.mock import patch

import pytest
import torch

from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.worker.worker import Worker


class MockLogitsSampler(Sampler):

    def __init__(self, vocab_size: int, fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size)
        self.fake_logits = fake_logits

    def forward(self, *args, **kwargs):
        with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
                   lambda x, y: x):
            with patch("vllm.model_executor.layers.sampler._get_logits",
                       lambda *args, **kwargs: self.fake_logits):
                return super().forward(*args, **kwargs)


def _prepare_test(
    batch_size: int
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, Worker]:
    vocab_size = 32000
    input_tensor = torch.rand((batch_size, 1024),
                              device="cuda",
                              dtype=torch.float16)
    fake_logits = torch.full((batch_size, vocab_size),
                             1e-2,
                             device=input_tensor.device,
                             dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(32000, fake_logits)
    worker = Worker(None, None, None)
    worker.block_size = 16
    return input_tensor, fake_logits, sampler, worker


RANDOM_SEEDS = list(range(128))


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_greedy(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)

    seq_group_metadata_list = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0, ),
                block_tables={0: [1]},
            ))

    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
                             input_metadata=input_metadata)
    expected = torch.argmax(fake_logits, dim=-1)
    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == expected[i].item()


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_random(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)

    for i in range(batch_size):
        fake_logits[i, i] = 1e2

    seq_group_metadata_list = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData([1, 2, 3])},
                sampling_params=SamplingParams(
                    temperature=1.0,
                    n=random.randint(1, 10),
                ),
                block_tables={0: [1]},
            ))

    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
                             input_metadata=input_metadata)
    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_beam(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
    input_tensor, _, sampler, worker = _prepare_test(batch_size)

    seq_group_metadata_list = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData([1, 2, 3])},
                sampling_params=SamplingParams(
                    temperature=0,
                    best_of=2,
                    use_beam_search=True,
                ),
                block_tables={0: [1]},
            ))

    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
    sampler(embedding=None,
            hidden_states=input_tensor,
            input_metadata=input_metadata)
    # no assertion here as I am not sure how to determine whether
    # the outputs are expected - in other words, this just tests
    # whether there are no exceptions in the sampler
    # when handling an all-beam search case.


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_mixed(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)

    seq_group_metadata_list = []
    expected_tokens = []
    for i in range(batch_size):
        n = 1
        sampling_type = random.randint(0, 2)
        if sampling_type == 0:
            sampling_params = SamplingParams(temperature=0)
        elif sampling_type == 1:
            n = random.randint(1, 10)
            sampling_params = SamplingParams(
                temperature=random.random() + 0.1,
                top_p=min(random.random() + 0.1, 1),
                top_k=random.randint(0, 10) or -1,
                n=n,
                presence_penalty=random.randint(0, 1),
            )
        else:
            sampling_params = SamplingParams(temperature=0,
                                             use_beam_search=True,
                                             best_of=2)
        for idx in range(n):
            fake_logits[i, i + idx] = 1e2
            expected_tokens.append(i + idx)
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData([1, 2, 3])},
                sampling_params=sampling_params,
                block_tables={0: [1]},
            ))

    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
                             input_metadata=input_metadata)
    for i, sequence_output in enumerate(sampler_output):
        if seq_group_metadata_list[i].sampling_params.use_beam_search:
            continue
        for nth_output in sequence_output.samples:
            assert nth_output.output_token in expected_tokens
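A small note on test_sampler_mixed: vLLM's SamplingParams uses top_k=-1 to mean "no top-k truncation" and does not accept 0, so the test maps a random draw of 0 to -1 with Python's falsy-or idiom. Nothing vLLM-specific is needed to see how that works:

for k in (0, 3):
    # `k or -1` keeps a positive k, but turns the falsy 0 into -1 ("disable top-k").
    print(k or -1)  # prints -1, then 3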
server/vllm/tests/test_patched_rotary.py deleted 100644 → 0 View file @ 64def8e2
import torch
import rotary_emb

from vllm import pos_encoding_ops


def apply_rotary_eager(query, key, cos, sin):

    def _apply_rot(x, cos, sin):
        rotary_dim = cos.shape[-1]
        dtype = x.dtype
        x_upcast = x.to(torch.float32)
        cos = cos.to(torch.float32)
        sin = sin.to(torch.float32)
        x1 = x_upcast[..., :rotary_dim]
        x2 = x_upcast[..., rotary_dim:2 * rotary_dim]
        # Flash Attention rotary_emb kernel casts everything to float, not sure why, so we do so here as well.
        x[..., :rotary_dim] = (x1 * cos - x2 * sin).to(dtype)
        x[..., rotary_dim:2 * rotary_dim] = (x1 * sin + x2 * cos).to(dtype)

    _apply_rot(query, cos, sin)
    _apply_rot(key, cos, sin)


def apply_rotary_flash(query, key, cos, sin):

    def _apply_rot(x, cos, sin):
        rotary_dim = cos.shape[-1]
        x1 = x[..., :rotary_dim]
        x2 = x[..., rotary_dim:2 * rotary_dim]
        rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)

    _apply_rot(query, cos, sin)
    _apply_rot(key, cos, sin)


def apply_rotary_vllm(query, key, cos, sin):
    head_size = query.shape[-1]
    # print("query", query.dtype)
    # print("key", key.dtype)
    # print("cos", cos.dtype)
    # print("sin", sin.dtype)
    # Inplace operation, updating query and key.
    pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)


seqlen = 8
cos = torch.rand(seqlen, 1, 64).to("cuda").to(torch.float16)
sin = torch.rand(seqlen, 1, 64).to("cuda").to(torch.float16)

head_dim = 128
num_heads = 32
query_eager = torch.rand(seqlen, num_heads, head_dim).to(torch.float16).to("cuda")
key_eager = torch.rand(seqlen, num_heads, head_dim).to(torch.float16).to("cuda")

query_vllm = query_eager.clone()
query_flash = query_eager.clone()
key_vllm = key_eager.clone()
key_flash = key_eager.clone()

apply_rotary_eager(query_eager, key_eager, cos.clone(), sin.clone())
apply_rotary_flash(query_flash, key_flash, cos.clone(), sin.clone())
apply_rotary_vllm(query_vllm, key_vllm, cos.clone().float(), sin.clone().float())


def check_diff(a, b, a_name, b_name):
    print(
        f"Allclose {a_name}, {b_name}: {torch.allclose(a, b)}; "
        f"Abs reldiff: {((a - b).abs() / (a.abs() + 1e-12)).mean()}"
    )


check_diff(query_eager, query_vllm, "query_eager", "query_vllm")
check_diff(query_eager, query_flash, "query_eager", "query_flash")
check_diff(key_eager, key_vllm, "key_eager", "key_vllm")
check_diff(key_eager, key_flash, "key_eager", "key_flash")
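The script above only prints allclose / mean relative-difference summaries for the eager, flash-attn, and vLLM rotary implementations rather than asserting anything. If a hard check were preferred, a tolerance-based variant of check_diff could look like the following sketch (the tolerances are illustrative, not taken from the repo; the query_*/key_* tensors are the ones defined above):

def assert_rotary_close(a, b, name, atol=1e-3, rtol=1e-3):
    # Fails with the tensor name if two rotary implementations diverge
    # beyond the (half-precision-friendly) tolerances chosen here.
    torch.testing.assert_close(a, b, atol=atol, rtol=rtol,
                               msg=f"rotary mismatch for {name}")


assert_rotary_close(query_eager, query_vllm, "query_vllm")
assert_rotary_close(query_eager, query_flash, "query_flash")
assert_rotary_close(key_eager, key_vllm, "key_vllm")
assert_rotary_close(key_eager, key_flash, "key_flash")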