norm / vllm · Commits

Commit e00b0a19, authored Mar 23, 2024 by zhuwenwen

    merge v0.3.3

Parents: ead94d93, 3f1166ab
Changes: 239
Showing 20 changed files with 1614 additions and 209 deletions (+1614, -209)
tests/async_engine/test_chat_template.py                     +17   -19
tests/async_engine/test_request_tracker.py                   +1    -1
tests/basic_correctness/test_basic_correctness.py            +38   -0
tests/conftest.py                                            +41   -5
tests/distributed/test_basic_distributed_correctness.py      +41   -0
tests/distributed/test_comm_ops.py                           +36   -29
tests/distributed/test_custom_all_reduce.py                  +85   -0
tests/entrypoints/test_guided_processors.py                  +75   -0
tests/entrypoints/test_openai_server.py                      +599  -0
tests/kernels/allclose_default.py                            +18   -0
tests/kernels/conftest.py                                    +2    -39
tests/kernels/test_activation.py                             +35   -40
tests/kernels/test_attention.py                              +72   -34
tests/kernels/test_cache.py                                  +96   -21
tests/kernels/test_layernorm.py                              +10   -7
tests/kernels/test_moe.py                                    +98   -0
tests/kernels/test_pos_encoding.py                           +20   -14
tests/kernels/test_prefix_prefill.py                         +175  -0
tests/lora/__init__.py                                       +0    -0
tests/lora/conftest.py                                       +155  -0
tests/async_engine/test_openai_server.py → tests/async_engine/test_chat_template.py  (+17, -19)

-from argparse import Namespace
 from dataclasses import dataclass
 import os
 import pathlib

 import pytest
-from fastapi.testclient import TestClient

-from vllm.entrypoints.openai.api_server import *
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest

 chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
     __file__))).parent.parent / "examples/template_chatml.jinja"
 ...
@@ -48,7 +48,6 @@ TEST_MESSAGES = [
         'content': 'What is the capital of'
     },
 ]
-client = TestClient(app)


 @dataclass
@@ -56,13 +55,17 @@ class MockTokenizer:
     chat_template = None


+@dataclass
+class MockServingChat:
+    tokenizer: MockTokenizer
+
+
 def test_load_chat_template():
     # Testing chatml template
-    mock_args = Namespace(chat_template=chatml_jinja_path)
     tokenizer = MockTokenizer()
+    mock_serving_chat = MockServingChat(tokenizer)

     # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=chatml_jinja_path)

     template_content = tokenizer.chat_template
 ...
@@ -76,11 +79,11 @@ def test_load_chat_template():
 def test_no_load_chat_template():
     # Testing chatml template
     template = "../../examples/does_not_exist"
-    mock_args = Namespace(chat_template=template)
     tokenizer = MockTokenizer()

     # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer=tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)
     template_content = tokenizer.chat_template

     # Test assertions
 ...
@@ -97,9 +100,9 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
                               expected_output):
     # Initialize the tokenizer
     tokenizer = get_tokenizer(tokenizer_name=model)
-    mock_args = Namespace(chat_template=template)
-    load_chat_template(mock_args, tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)

     # Create a mock request object using keyword arguments
     mock_request = ChatCompletionRequest(
 ...
@@ -115,8 +118,3 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
     # Test assertion
     assert result == expected_output, (
         f"The generated prompt does not match the expected output for "
         f"model {model} and template {template}")
-
-
-def test_health_endpoint():
-    response = client.get("/health")
-    assert response.status_code == 200
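Note (not part of the commit): the chat template exercised above is an ordinary Jinja string stored on the tokenizer, and the standard Hugging Face API can render it directly. The sketch below is a minimal illustration, assuming the examples/template_chatml.jinja file referenced by the test and a tokenizer that accepts an explicit chat_template override; the model name is only a placeholder.

# Minimal sketch: render a conversation with a chatml-style Jinja template
# through the transformers apply_chat_template API.
import pathlib
from transformers import AutoTokenizer

template = pathlib.Path("examples/template_chatml.jinja").read_text()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")  # placeholder model

messages = [{"role": "user", "content": "What is the capital of France?"}]
# chat_template overrides whatever template the tokenizer ships with;
# add_generation_prompt appends the assistant prefix so generation continues there.
prompt = tokenizer.apply_chat_template(messages,
                                       chat_template=template,
                                       tokenize=False,
                                       add_generation_prompt=True)
print(prompt)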
tests/async_engine/test_request_tracker.py  (+1, -1)

@@ -64,7 +64,7 @@ def test_request_tracker():
     stream_5 = tracker.add_request("5")
     assert tracker.new_requests_event.flag
     tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
     new, finished = tracker.get_new_and_finished_requests()
     assert not tracker.new_requests_event.flag
     assert len(finished) == 1
 ...
tests/basic_correctness/test_basic_correctness.py  (new file, mode 100644, +38)

"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
"""
import pytest

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test {i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test {i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
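Note (not part of the commit): the test above depends on the hf_runner/vllm_runner fixtures from tests/conftest.py. As a rough standalone picture of the same idea, greedy decoding in transformers and in vLLM for one prompt, here is a sketch; it assumes a CUDA device and uses facebook/opt-125m only because the test does, and it prints rather than asserts since whitespace handling around the decoded continuation can differ slightly between the two code paths.

# Standalone sketch: greedy continuation from transformers vs. vLLM.
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams

prompt = "Hello, my name is"
model_id = "facebook/opt-125m"

tok = AutoTokenizer.from_pretrained(model_id)
hf_model = AutoModelForCausalLM.from_pretrained(model_id).cuda()
input_ids = tok(prompt, return_tensors="pt").input_ids.cuda()
hf_ids = hf_model.generate(input_ids, do_sample=False, max_new_tokens=5)
hf_text = tok.decode(hf_ids[0][input_ids.shape[1]:])

vllm_model = LLM(model=model_id)
out = vllm_model.generate([prompt], SamplingParams(temperature=0.0, max_tokens=5))
vllm_text = out[0].outputs[0].text

# With temperature 0 / do_sample=False both frameworks decode greedily,
# so the two continuations should agree token for token.
print(repr(hf_text), repr(vllm_text))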
tests/conftest.py  (+41, -5)

@@ -13,12 +13,10 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]


-def _read_prompts(filename: str) -> str:
-    prompts = []
+def _read_prompts(filename: str) -> List[str]:
     with open(filename, "r") as f:
-        prompt = f.readline()
-        prompts.append(prompt)
+        prompts = f.readlines()
     return prompts


 @pytest.fixture
 ...
@@ -165,6 +163,9 @@ class VllmRunner:
         model_name: str,
         tokenizer_name: Optional[str] = None,
         dtype: str = "half",
+        disable_log_stats: bool = True,
+        tensor_parallel_size: int = 1,
+        **kwargs,
     ) -> None:
         self.model = LLM(
             model=model_name,
 ...
@@ -172,6 +173,9 @@ class VllmRunner:
             trust_remote_code=True,
             dtype=dtype,
             swap_space=0,
+            disable_log_stats=disable_log_stats,
+            tensor_parallel_size=tensor_parallel_size,
+            **kwargs,
         )

     def generate(
 ...
@@ -195,6 +199,24 @@ class VllmRunner:
             outputs.append((req_sample_output_ids, req_sample_output_strs))
         return outputs

+    def generate_w_logprobs(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+    ) -> List[Tuple[List[int], str]]:
+        assert sampling_params.logprobs is not None
+
+        req_outputs = self.model.generate(prompts,
+                                          sampling_params=sampling_params)
+        outputs = []
+        for req_output in req_outputs:
+            for sample in req_output.outputs:
+                output_str = sample.text
+                output_ids = sample.token_ids
+                output_logprobs = sample.logprobs
+            outputs.append((output_ids, output_str, output_logprobs))
+        return outputs
+
     def generate_greedy(
         self,
         prompts: List[str],
 ...
@@ -205,6 +227,20 @@ class VllmRunner:
         return [(output_ids[0], output_str[0])
                 for output_ids, output_str in outputs]

+    def generate_greedy_logprobs(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+        num_logprobs: int,
+    ) -> List[Tuple[List[int], str]]:
+        greedy_logprobs_params = SamplingParams(temperature=0.0,
+                                                max_tokens=max_tokens,
+                                                logprobs=num_logprobs)
+        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+
+        return [(output_ids, output_str, output_logprobs)
+                for output_ids, output_str, output_logprobs in outputs]
+
     def generate_beam_search(
         self,
         prompts: List[str],
 ...
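Note (not part of the commit): generate_w_logprobs above is a thin wrapper over vLLM's offline API. The same per-token logprobs are available directly from LLM.generate; a minimal sketch, assuming SamplingParams(logprobs=N) populates the logprobs field on each sample, as the fixture code relies on:

# Sketch: request top-N logprobs per generated token with the offline API.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.0, max_tokens=8, logprobs=5)
for req_output in llm.generate(["Hello, my name is"], params):
    sample = req_output.outputs[0]
    # token_ids, text and the per-token logprob entries mirror what
    # generate_w_logprobs collects in the fixture above.
    print(sample.token_ids, sample.text)
    print(sample.logprobs)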
tests/distributed/test_basic_distributed_correctness.py  (new file, mode 100644, +41)

"""Compare the outputs of HF and distributed vLLM when using greedy sampling.

Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
"""
import pytest
import torch

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    hf_model = hf_runner(model, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
        vllm_output_ids, vllm_output_str = vllm_outputs[i]
        assert hf_output_str == vllm_output_str, (
            f"Test {i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test {i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
tests/distributed/test_comm_ops.py  (+36, -29)

@@ -6,24 +6,13 @@ import pytest
 import torch
 import ray

-from vllm.config import ParallelConfig
-from vllm.utils import get_open_port
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce,
     tensor_model_parallel_all_gather,
+    broadcast_tensor_dict,
 )
-from vllm.worker.worker import _init_distributed_environment
-
-
-def init_test_distributed_environment(pipeline_parallel_size: int,
-                                      tensor_parallel_size: int, rank: int,
-                                      distributed_init_port: str):
-    parallel_config = ParallelConfig(pipeline_parallel_size,
-                                     tensor_parallel_size,
-                                     worker_use_ray=True)
-    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    _init_distributed_environment(parallel_config, rank,
-                                  distributed_init_method)
+from vllm.test_utils import (init_test_distributed_environment,
+                             multi_process_tensor_parallel)


 @ray.remote(num_gpus=1, max_calls=1)
 ...
@@ -64,22 +53,40 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
     assert torch.allclose(t, expected)


+@ray.remote(num_gpus=1, max_calls=1)
+def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
+                                      distributed_init_port: str):
+    init_test_distributed_environment(1, tensor_parallel_size, rank,
+                                      distributed_init_port)
+    test_dict = {
+        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
+        "b": torch.arange(16, dtype=torch.int8, device="cuda"),
+        "c": "test",
+        "d": [1, 2, 3],
+        "e": {
+            "a": 1,
+            "b": 2
+        },
+    }
+
+    if rank == 0:
+        broadcast_tensor_dict(test_dict, src=0)
+    else:
+        recv_dict = broadcast_tensor_dict(src=0)
+        assert len(recv_dict) == len(test_dict)
+        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        assert recv_dict["c"] == test_dict["c"]
+        assert recv_dict["d"] == test_dict["d"]
+        assert recv_dict["e"] == test_dict["e"]
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("tensor_parallel_size", [2])
 @pytest.mark.parametrize("test_target", [
-    all_reduce_test_worker, all_gather_test_worker])
+    all_reduce_test_worker, all_gather_test_worker,
+    broadcast_tensor_dict_test_worker
+])
 def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    # Using ray helps debugging the error when it failed
-    # as compared to multiprocessing.
-    ray.init()
-
-    distributed_init_port = get_open_port()
-    refs = []
-    for rank in range(tensor_parallel_size):
-        refs.append(
-            test_target.remote(tensor_parallel_size, rank,
-                               distributed_init_port))
-    ray.get(refs)
-
-    ray.shutdown()
+    multi_process_tensor_parallel(tensor_parallel_size, test_target)
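Note (not part of the commit): broadcast_tensor_dict sends a mixed dict of tensors and plain Python objects from one rank to the others. As a rough picture of what such a helper has to do (this is not vLLM's implementation), plain torch.distributed needs two passes: first ship the structure and tensor metadata as a pickled object, then broadcast each tensor payload. The sketch assumes a process group is already initialized and each process has set its CUDA device.

import torch
import torch.distributed as dist

def broadcast_tensor_dict_sketch(d=None, src=0):
    """Broadcast a dict of tensors and plain objects from `src` to all ranks."""
    if dist.get_rank() == src:
        # Pass 1: send keys, plain values, and tensor shape/dtype metadata.
        meta = {k: ("tensor", v.shape, v.dtype) if torch.is_tensor(v)
                else ("obj", v) for k, v in d.items()}
        dist.broadcast_object_list([meta], src=src)
        # Pass 2: send tensor payloads in the same key order.
        for v in d.values():
            if torch.is_tensor(v):
                dist.broadcast(v, src=src)
        return d
    holder = [None]
    dist.broadcast_object_list(holder, src=src)
    out = {}
    for k, m in holder[0].items():
        if m[0] == "tensor":
            # Receivers must pre-allocate a tensor of the advertised shape/dtype.
            t = torch.empty(m[1], dtype=m[2], device="cuda")
            dist.broadcast(t, src=src)
            out[k] = t
        else:
            out[k] = m[1]
    return out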
tests/distributed/test_custom_all_reduce.py  (new file, mode 100644, +85)

import random
import os

import pytest
import ray
import torch
import torch.distributed as dist

from vllm.model_executor.parallel_utils import custom_all_reduce as custom_ar
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
                             multi_process_tensor_parallel)

random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
for i, v in enumerate(test_sizes):
    test_sizes[i] -= v % 8


@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(world_size, rank, distributed_init_port):
    del os.environ["CUDA_VISIBLE_DEVICES"]
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(1, world_size, rank,
                                      distributed_init_port)

    custom_ar.init_custom_ar()
    for sz in test_sizes:
        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
            with custom_ar.capture():
                # use integers so result matches NCCL exactly
                inp1 = torch.randint(1, 16, (sz, ), dtype=dtype,
                                     device=torch.cuda.current_device())
                inp2 = torch.randint(1, 16, (sz, ), dtype=dtype,
                                     device=torch.cuda.current_device())
                torch.cuda.synchronize()
                graph = torch.cuda.CUDAGraph()
                with torch.cuda.graph(graph):
                    out1 = tensor_model_parallel_all_reduce(inp1)
                    # the input buffer is immediately modified to test
                    # synchronization
                    dist.all_reduce(inp1)
                    out2 = tensor_model_parallel_all_reduce(inp2)
                    dist.all_reduce(inp2)
            graph.replay()
            assert torch.allclose(out1, inp1)
            assert torch.allclose(out2, inp2)


@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(world_size, rank, distributed_init_port):
    del os.environ["CUDA_VISIBLE_DEVICES"]
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    init_test_distributed_environment(1, world_size, rank,
                                      distributed_init_port)

    sz = 1024
    custom_ar.init_custom_ar()
    fa = custom_ar.get_handle()
    inp = torch.ones(sz, dtype=torch.float32, device=device)
    out = fa.all_reduce_unreg(inp)
    assert torch.allclose(out, inp * world_size)

    inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
    out = fa.all_reduce_unreg(inp)
    assert torch.allclose(out, inp * world_size)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
    multi_process_tensor_parallel(tensor_parallel_size, test_target)


if __name__ == "__main__":
    multi_process_tensor_parallel(2, graph_allreduce)
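Note (not part of the commit): graph_allreduce checks that the custom all-reduce stays NCCL-identical when captured inside a CUDA graph. Independent of vLLM, the capture-and-replay pattern it relies on looks like the sketch below (single GPU, no collectives, purely illustrative; PyTorch's docs recommend warming up the captured work on a side stream first, omitted here for brevity).

import torch

# Static input/output buffers: a CUDA graph replays fixed device pointers.
x = torch.ones(1024, device="cuda")
torch.cuda.synchronize()

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    y = x * 2 + 1  # any CUDA work captured here is re-run on replay()

# Mutate the static input, then replay: the captured kernels re-read x.
x.fill_(3.0)
graph.replay()
torch.cuda.synchronize()
assert torch.allclose(y, x * 2 + 1)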
tests/entrypoints/test_guided_processors.py  (new file, mode 100644, +75)

# This unit test should be moved to a new
# tests/test_guided_decoding directory.

from transformers import AutoTokenizer
import torch

from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
                                                          JSONLogitsProcessor)

TEST_SCHEMA = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {"type": "string", "maxLength": 10},
            "minItems": 3
        },
        "work history": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "duration": {"type": "string"},
                    "position": {"type": "string"}
                },
                "required": ["company", "position"]
            }
        }
    },
    "required": ["name", "age", "skills", "work history"]
}

TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
             r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"


def test_guided_logits_processors():
    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
    regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
    json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer)

    regex_LP.init_state()
    token_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {TEST_REGEX}")
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
    regex_LP(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

    json_LP.init_state()
    token_ids = tokenizer.encode(
        f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
    json_LP(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)
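Note (not part of the commit): in serving code these processors are attached per request rather than called by hand. The sketch below is a loose illustration only, assuming vLLM's SamplingParams accepts a logits_processors list of callables with the same (token_ids, logits) signature the test exercises, and assuming a single request so the processor's internal state is not shared across sequences; it is not how the OpenAI server wires guided decoding internally.

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.model_executor.guided_logits_processors import RegexLogitsProcessor

MODEL = "HuggingFaceH4/zephyr-7b-beta"
IP_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

tokenizer = AutoTokenizer.from_pretrained(MODEL)
regex_lp = RegexLogitsProcessor(IP_REGEX, tokenizer)
regex_lp.init_state()

llm = LLM(model=MODEL)
# The processor masks logits so only tokens that keep the output
# matching IP_REGEX remain available at each step.
params = SamplingParams(max_tokens=20, logits_processors=[regex_lp])
out = llm.generate(["Give an example IPv4 address: "], params)
print(out[0].outputs[0].text)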
tests/entrypoints/test_openai_server.py  (new file, mode 100644, +599)

import os
import subprocess
import time
import sys
import pytest
import requests
import ray  # using Ray for overall ease of process management, parallel requests, and debugging.
import openai  # use the official client for correctness check
from huggingface_hub import snapshot_download  # downloading lora to test lora requests

# imports for guided decoding tests
import json
import jsonschema
import re

from vllm.transformers_utils.tokenizer import get_tokenizer

MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # any model with a chat template should work here
LORA_NAME = "typeof/zephyr-7b-beta-lora"  # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here

TEST_SCHEMA = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "skills": {
            "type": "array",
            "items": {"type": "string", "maxLength": 10},
            "minItems": 3
        },
        "work history": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {"type": "string"},
                    "duration": {"type": "string"},
                    "position": {"type": "string"}
                },
                "required": ["company", "position"]
            }
        }
    },
    "required": ["name", "age", "skills", "work history"]
}

TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
             r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"

TEST_CHOICE = [
    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
    "Swift", "Kotlin"
]

pytestmark = pytest.mark.asyncio


@ray.remote(num_gpus=1)
class ServerRunner:

    def __init__(self, args):
        env = os.environ.copy()
        env["PYTHONUNBUFFERED"] = "1"
        self.proc = subprocess.Popen(
            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        self._wait_for_server()

    def ready(self):
        return True

    def _wait_for_server(self):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(
                        "http://localhost:8000/health").status_code == 200:
                    break
            except Exception as err:
                if self.proc.poll() is not None:
                    raise RuntimeError("Server exited unexpectedly.") from err

                time.sleep(0.5)
                if time.time() - start > MAX_SERVER_START_WAIT_S:
                    raise RuntimeError(
                        "Server failed to start in time.") from err

    def __del__(self):
        if hasattr(self, "proc"):
            self.proc.terminate()


@pytest.fixture(scope="session")
def zephyr_lora_files():
    return snapshot_download(repo_id=LORA_NAME)


@pytest.fixture(scope="session")
def server(zephyr_lora_files):
    ray.init()
    server_runner = ServerRunner.remote([
        "--model",
        MODEL_NAME,
        "--dtype",
        "bfloat16",  # use half precision for speed and memory savings in CI environment
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ])
    ray.get(server_runner.ready.remote())
    yield server_runner
    ray.shutdown()


@pytest.fixture(scope="session")
def client():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
    yield client


async def test_check_models(server, client: openai.AsyncOpenAI):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
    lora_models = models[1:]
    assert served_model.id == MODEL_NAME
    assert all(model.root == MODEL_NAME for model in models)
    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"


@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5


@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
                                   model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "what is 1+1?"
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=10)
    assert chat_completion.id is not None
    assert chat_completion.choices is not None and len(
        chat_completion.choices) == 1
    assert chat_completion.choices[0].message is not None
    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.top_logprobs is not None
    assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                    model_name: str):
    prompt = "What is an LLM?"

    single_completion = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = single_completion.choices[0].text
    single_usage = single_completion.usage

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    chunks = []
    async for chunk in stream:
        chunks.append(chunk.choices[0].text)
    assert chunk.choices[0].finish_reason == "length"
    assert chunk.usage == single_usage
    assert "".join(chunks) == single_output


@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(server, client: openai.AsyncOpenAI,
                              model_name: str):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "what is 1+1?"
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks = []
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
    assert chunk.choices[0].finish_reason == stop_reason
    assert "".join(chunks) == output


@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    # test simple list
    batch = await client.completions.create(
        model=model_name,
        prompt=["Hello, my name is", "Hello, my name is"],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(batch.choices) == 2
    assert batch.choices[0].text == batch.choices[1].text

    # test n = 2
    batch = await client.completions.create(
        model=model_name,
        prompt=["Hello, my name is", "Hello, my name is"],
        n=2,
        max_tokens=5,
        temperature=0.0,
        extra_body=dict(
            # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client.
            use_beam_search=True),
    )
    assert len(batch.choices) == 4
    assert batch.choices[0].text != batch.choices[
        1].text, "beam search should be different"
    assert batch.choices[0].text == batch.choices[
        2].text, "two copies of the same prompt should be the same"
    assert batch.choices[1].text == batch.choices[
        3].text, "two copies of the same prompt should be the same"

    # test streaming
    batch = await client.completions.create(
        model=model_name,
        prompt=["Hello, my name is", "Hello, my name is"],
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    texts = [""] * 2
    async for chunk in batch:
        assert len(chunk.choices) == 1
        choice = chunk.choices[0]
        texts[choice.index] += choice.text
    assert texts[0] == texts[1]


async def test_logits_bias(server, client: openai.AsyncOpenAI):
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    token_id = 1000
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    assert completion.choices[0].text is not None and len(
        completion.choices[0].text) >= 5
    response_tokens = tokenizer(completion.choices[0].text,
                                add_special_tokens=False)["input_ids"]
    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
                                add_special_tokens=False)["input_ids"]
    assert all([
        response == expected
        for response, expected in zip(response_tokens, expected_tokens)
    ])

    # Test ban
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    response_tokens = tokenizer(completion.choices[0].text,
                                add_special_tokens=False)["input_ids"]
    first_response = completion.choices[0].text
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token): -100 for token in response_tokens},
    )
    assert first_response != completion.choices[0].text


async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_json=TEST_SCHEMA))

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 3
    for i in range(3):
        assert completion.choices[i].text is not None
        output_json = json.loads(completion.choices[i].text)
        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)


async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "Give an example JSON for an employee profile that " + \
                   f"fits this schema: {TEST_SCHEMA}"
    }]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=500,
        extra_body=dict(guided_json=TEST_SCHEMA))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json1 = json.loads(message.content)
    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)

    messages.append({"role": "assistant", "content": message.content})
    messages.append({
        "role": "user",
        "content": "Give me another one with a different name and age"
    })
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=500,
        extra_body=dict(guided_json=TEST_SCHEMA))
    message = chat_completion.choices[0].message
    assert message.content is not None
    json2 = json.loads(message.content)
    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)

    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]


async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=dict(guided_regex=TEST_REGEX))

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 3
    for i in range(3):
        assert completion.choices[i].text is not None
        assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None


async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": f"Give an example IP address with this regex: {TEST_REGEX}"
    }]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=20,
        extra_body=dict(guided_regex=TEST_REGEX))
    ip1 = chat_completion.choices[0].message.content
    assert ip1 is not None
    assert re.fullmatch(TEST_REGEX, ip1) is not None

    messages.append({"role": "assistant", "content": ip1})
    messages.append({"role": "user", "content": "Give me a different one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=20,
        extra_body=dict(guided_regex=TEST_REGEX))
    ip2 = chat_completion.choices[0].message.content
    assert ip2 is not None
    assert re.fullmatch(TEST_REGEX, ip2) is not None
    assert ip1 != ip2


async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=dict(guided_choice=TEST_CHOICE))

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 2
    for i in range(2):
        assert completion.choices[i].text in TEST_CHOICE


async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "The best language for type-safe systems programming is "
    }]
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
        extra_body=dict(guided_choice=TEST_CHOICE))
    choice1 = chat_completion.choices[0].message.content
    assert choice1 in TEST_CHOICE

    messages.append({"role": "assistant", "content": choice1})
    messages.append({"role": "user", "content": "I disagree, pick another one"})
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=10,
        extra_body=dict(guided_choice=TEST_CHOICE))
    choice2 = chat_completion.choices[0].message.content
    assert choice2 in TEST_CHOICE
    assert choice1 != choice2


async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI):
    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=dict(guided_json=42))

    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role": "user",
        "content": "The best language for type-safe systems programming is "
    }]
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            extra_body=dict(guided_regex={1: "Python", 2: "C++"}))

    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))


if __name__ == "__main__":
    pytest.main([__file__])
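Note (not part of the commit): the guided-decoding tests pass guided_json / guided_regex / guided_choice through the OpenAI client's extra_body, which simply merges those extra fields into the JSON request body. The same request can be made with plain HTTP; a sketch assuming the server started by the fixture above is listening on localhost:8000 with the zephyr model loaded:

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "HuggingFaceH4/zephyr-7b-beta",
        "prompt": "The best language for type-safe systems programming is ",
        "max_tokens": 10,
        "temperature": 0.0,
        # vLLM extension field, equivalent to extra_body=dict(guided_choice=...)
        "guided_choice": ["Python", "Java", "JavaScript", "C++", "C#", "PHP",
                          "TypeScript", "Ruby", "Swift", "Kotlin"],
    },
)
print(resp.json()["choices"][0]["text"])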
tests/kernels/allclose_default.py  (new file, mode 100644, +18)

import torch

# Reference default values of atol and rtol are from
# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
default_rtol = {
    torch.float16: 1e-3,
    torch.bfloat16: 1.6e-2,
    torch.float: 1.3e-6
}


def get_default_atol(output) -> float:
    return default_atol[output.dtype]


def get_default_rtol(output) -> float:
    return default_rtol[output.dtype]
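Note (not part of the commit): the kernel tests below use these lookups to pick dtype-appropriate tolerances for torch.allclose. Usage is just a dictionary lookup keyed on the output tensor's dtype; a small sketch, assuming it is run from tests/kernels so the module imports the same way the tests do:

import torch
from allclose_default import get_default_atol, get_default_rtol

out = torch.randn(8, dtype=torch.bfloat16)
ref = out + 1e-3  # stand-in reference result with small numerical noise

# bfloat16 gets the loosest rtol (1.6e-2) from the tables above.
assert torch.allclose(out, ref,
                      atol=get_default_atol(out),
                      rtol=get_default_rtol(out))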
tests/kernels/conftest.py  (+2, -39)

-from typing import List, Tuple
-
 import pytest
-import torch
+
+from vllm.utils import create_kv_caches_with_random
-
-
-def create_kv_caches(
-    num_blocks: int,
-    block_size: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-
-    scale = head_size**-0.5
-    x = 16 // torch.tensor([], dtype=dtype).element_size()
-    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
-    key_caches = []
-    for _ in range(num_layers):
-        key_cache = torch.empty(size=key_cache_shape,
-                                dtype=dtype,
-                                device=device)
-        key_cache.uniform_(-scale, scale)
-        key_caches.append(key_cache)
-
-    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_caches = []
-    for _ in range(num_layers):
-        value_cache = torch.empty(size=value_cache_shape,
-                                  dtype=dtype,
-                                  device=device)
-        value_cache.uniform_(-scale, scale)
-        value_caches.append(value_cache)
-    return key_caches, value_caches
-
-
 @pytest.fixture()
 def kv_cache_factory():
-    return create_kv_caches
+    return create_kv_caches_with_random
tests/kernels/test_activation.py  (+35, -40)

The previous per-activation tests (test_silu_and_mul, test_gelu_new, test_gelu_fast), which
seeded CUDA behind a torch.cuda.is_available() check, placed tensors on an explicit
gpu_id = f"cuda:{device}" drawn from an integer DEVICES list, and compared with
atol=1e-5, rtol=1e-5, are replaced by two tests parametrized over the activation class,
using string CUDA_DEVICES, torch.set_default_device, and the shared tolerance helpers:

from typing import Type

import pytest
import torch

from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
                                                   NewGELU, SiluAndMul)
from allclose_default import get_default_atol, get_default_rtol

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
SEEDS = [0]
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]


@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_act_and_mul(
    activation: Type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    layer = activation()
    out = layer(x)
    ref_out = layer._forward(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)


@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_activation(
    activation: Type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)
    layer = activation()
    out = layer(x)
    ref_out = layer._forward(x)
    assert torch.allclose(out,
                          ref_out,
                          atol=get_default_atol(out),
                          rtol=get_default_rtol(out))
tests/kernels/test_attention.py
View file @
e00b0a19
...
@@ -6,25 +6,38 @@ import torch
...
@@ -6,25 +6,38 @@ import torch
from
xformers
import
ops
as
xops
from
xformers
import
ops
as
xops
from
xformers.ops.fmha.attn_bias
import
BlockDiagonalCausalMask
from
xformers.ops.fmha.attn_bias
import
BlockDiagonalCausalMask
from
vllm._C
import
ops
from
vllm._C
import
ops
,
cache_ops
from
vllm.utils
import
get_max_shared_memory_bytes
from
vllm.utils
import
get_max_shared_memory_bytes
from
vllm.utils
import
is_hip
from
allclose_default
import
get_default_atol
,
get_default_rtol
FLOAT32_BYTES
=
torch
.
finfo
(
torch
.
float
).
bits
//
8
FLOAT32_BYTES
=
torch
.
finfo
(
torch
.
float
).
bits
//
8
# This will change depending on the compute capability.
# This will change depending on the compute capability.
# - 512 as a buffer
# - 512 as a buffer
MAX_SEQ_LEN
=
get_max_shared_memory_bytes
()
//
FLOAT32_BYTES
-
512
MAX_SEQ_LEN
=
get_max_shared_memory_bytes
()
//
FLOAT32_BYTES
-
512
NUM_BLOCKS
=
12000
# Arbitrary values for testing
# There may not be enough gpu memory due to large NUM_BLOCKS.
# Reduce NUM_BLOCKS when it happens.
NUM_BLOCKS
=
4321
# Arbitrary values for testing
PARTITION_SIZE
=
512
PARTITION_SIZE
=
512
# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
if
not
is_hip
()
else
[
torch
.
half
,
torch
.
bfloat16
]
NUM_GEN_SEQS
=
[
7
]
# Arbitrary values for testing
NUM_GEN_SEQS
=
[
7
]
# Arbitrary values for testing
NUM_PREFILL_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_PREFILL_SEQS
=
[
3
]
# Arbitrary values for testing
NUM_HEADS
=
[(
40
,
40
),
(
64
,
8
)]
# Arbitrary values for testing
NUM_HEADS
=
[(
40
,
40
),
(
64
,
8
)]
# Arbitrary values for testing
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
256
]
# FlashAttention forward only supports head dimension at most 128
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
HEAD_SIZES
=
[
64
,
80
,
96
,
112
,
128
,
256
]
if
not
is_hip
()
else
[
64
,
80
,
96
,
112
,
128
]
BLOCK_SIZES
=
[
16
,
32
]
BLOCK_SIZES
=
[
16
,
32
]
USE_ALIBI
=
[
False
,
True
]
USE_ALIBI
=
[
False
,
True
]
KV_CACHE_DTYPE
=
[
"auto"
,
"fp8_e5m2"
]
SEEDS
=
[
0
]
SEEDS
=
[
0
]
DEVICES
=
[
i
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)]
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
def
ref_masked_attention
(
def
ref_masked_attention
(
...
@@ -88,7 +101,7 @@ def ref_single_query_cached_kv_attention(
...
@@ -88,7 +101,7 @@ def ref_single_query_cached_kv_attention(
alibi_bias
=
None
alibi_bias
=
None
if
alibi_slopes
is
not
None
:
if
alibi_slopes
is
not
None
:
# Create the ALiBi bias used in the paged attention kernel.
# Create the ALiBi bias used in the paged attention kernel.
position_ids
=
torch
.
arange
(
context_len
,
device
=
query
.
device
).
int
()
position_ids
=
torch
.
arange
(
context_len
).
int
()
alibi_bias
=
(
position_ids
-
context_len
+
1
).
float
()
alibi_bias
=
(
position_ids
-
context_len
+
1
).
float
()
alibi_bias
=
alibi_slopes
.
view
(
-
1
,
1
,
1
)
*
alibi_bias
.
view
(
alibi_bias
=
alibi_slopes
.
view
(
-
1
,
1
,
1
)
*
alibi_bias
.
view
(
1
,
1
,
-
1
)
1
,
1
,
-
1
)
...
@@ -105,8 +118,9 @@ def ref_single_query_cached_kv_attention(
...
@@ -105,8 +118,9 @@ def ref_single_query_cached_kv_attention(
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
USE_ALIBI
)
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
USE_ALIBI
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
BLOCK_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
KV_CACHE_DTYPE
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_
DEVICES
)
def
test_paged_attention
(
def
test_paged_attention
(
kv_cache_factory
,
kv_cache_factory
,
version
:
str
,
version
:
str
,
...
@@ -116,34 +130,30 @@ def test_paged_attention(
...
@@ -116,34 +130,30 @@ def test_paged_attention(
use_alibi
:
bool
,
use_alibi
:
bool
,
block_size
:
int
,
block_size
:
int
,
dtype
:
torch
.
dtype
,
dtype
:
torch
.
dtype
,
kv_cache_dtype
:
str
,
seed
:
int
,
seed
:
int
,
device
:
int
,
device
:
str
,
)
->
None
:
)
->
None
:
random
.
seed
(
seed
)
random
.
seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
if
torch
.
cuda
.
is_available
():
gpu_id
=
f
"cuda:
{
device
}
"
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
set_default_device
(
device
)
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
scale
=
float
(
1.0
/
(
head_size
**
0.5
))
num_query_heads
,
num_kv_heads
=
num_heads
num_query_heads
,
num_kv_heads
=
num_heads
query
=
torch
.
empty
(
num_seqs
,
query
=
torch
.
empty
(
num_seqs
,
num_query_heads
,
head_size
,
dtype
=
dtype
)
num_query_heads
,
head_size
,
dtype
=
dtype
,
device
=
gpu_id
)
query
.
uniform_
(
-
scale
,
scale
)
query
.
uniform_
(
-
scale
,
scale
)
assert
num_query_heads
%
num_kv_heads
==
0
assert
num_query_heads
%
num_kv_heads
==
0
num_queries_per_kv
=
num_query_heads
//
num_kv_heads
num_queries_per_kv
=
num_query_heads
//
num_kv_heads
alibi_slopes
=
None
alibi_slopes
=
None
if
use_alibi
:
if
use_alibi
:
alibi_slopes
=
torch
.
randn
(
num_query_heads
,
alibi_slopes
=
torch
.
randn
(
num_query_heads
,
dtype
=
torch
.
float
)
dtype
=
torch
.
float
,
device
=
gpu_id
)
context_lens
=
[
random
.
randint
(
1
,
MAX_SEQ_LEN
)
for
_
in
range
(
num_seqs
)]
context_lens
=
[
random
.
randint
(
1
,
MAX_SEQ_LEN
)
for
_
in
range
(
num_seqs
)]
context_lens
[
-
1
]
=
MAX_SEQ_LEN
context_lens
[
-
1
]
=
MAX_SEQ_LEN
max_context_len
=
max
(
context_lens
)
max_context_len
=
max
(
context_lens
)
context_lens
=
torch
.
tensor
(
context_lens
,
dtype
=
torch
.
int
,
device
=
gpu_id
)
context_lens
=
torch
.
tensor
(
context_lens
,
dtype
=
torch
.
int
)
# Create the block tables.
# Create the block tables.
max_num_blocks_per_seq
=
(
max_context_len
+
block_size
-
1
)
//
block_size
max_num_blocks_per_seq
=
(
max_context_len
+
block_size
-
1
)
//
block_size
...
@@ -154,12 +164,13 @@ def test_paged_attention(
...
@@ -154,12 +164,13 @@ def test_paged_attention(
for
_
in
range
(
max_num_blocks_per_seq
)
for
_
in
range
(
max_num_blocks_per_seq
)
]
]
block_tables
.
append
(
block_table
)
block_tables
.
append
(
block_table
)
block_tables
=
torch
.
tensor
(
block_tables
,
dtype
=
torch
.
int
,
device
=
gpu_id
)
block_tables
=
torch
.
tensor
(
block_tables
,
dtype
=
torch
.
int
)
# Create the KV caches.
# Create the KV caches.
key_caches
,
value_caches
=
kv_cache_factory
(
NUM_BLOCKS
,
block_size
,
1
,
key_caches
,
value_caches
=
kv_cache_factory
(
NUM_BLOCKS
,
block_size
,
1
,
num_kv_heads
,
head_size
,
dtype
,
num_kv_heads
,
head_size
,
seed
,
gpu_id
)
kv_cache_dtype
,
dtype
,
seed
,
device
)
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
key_cache
,
value_cache
=
key_caches
[
0
],
value_caches
[
0
]
# Call the paged attention kernel.
# Call the paged attention kernel.
...
@@ -177,6 +188,7 @@ def test_paged_attention(
...
@@ -177,6 +188,7 @@ def test_paged_attention(
block_size
,
block_size
,
max_context_len
,
max_context_len
,
alibi_slopes
,
alibi_slopes
,
kv_cache_dtype
,
)
)
elif
version
==
"v2"
:
elif
version
==
"v2"
:
num_partitions
=
((
max_context_len
+
PARTITION_SIZE
-
1
)
//
num_partitions
=
((
max_context_len
+
PARTITION_SIZE
-
1
)
//
...
@@ -186,12 +198,10 @@ def test_paged_attention(
...
@@ -186,12 +198,10 @@ def test_paged_attention(
tmp_output
=
torch
.
empty
(
tmp_output
=
torch
.
empty
(
size
=
(
num_seqs
,
num_heads
,
num_partitions
,
head_size
),
size
=
(
num_seqs, num_heads, num_partitions, head_size),
             dtype=output.dtype,
             device=output.device,
         )
         exp_sums = torch.empty(
             size=(num_seqs, num_heads, num_partitions),
             dtype=torch.float32,
             device=output.device,
         )
         max_logits = torch.empty_like(exp_sums)
         ops.paged_attention_v2(
...
@@ -209,11 +219,30 @@ def test_paged_attention(
             block_size,
             max_context_len,
             alibi_slopes,
+            kv_cache_dtype,
         )
     else:
         raise AssertionError(f"Unknown version: {version}")
 
     # Run the reference implementation.
+    if kv_cache_dtype == "fp8_e5m2":
+        # Convert cache data back to dtype.
+        x = 16 // torch.tensor([], dtype=dtype).element_size()
+        key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
+                           block_size, x)
+        dequantized_key_cache = torch.empty(size=key_cache_shape,
+                                            dtype=dtype,
+                                            device=device)
+        cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
+        key_cache = dequantized_key_cache
+        value_cache_shape = value_cache.shape
+        dequantized_value_cache = torch.empty(size=value_cache_shape,
+                                              dtype=dtype,
+                                              device=device)
+        cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
+        value_cache = dequantized_value_cache
     ref_output = torch.empty_like(query)
     ref_single_query_cached_kv_attention(
         ref_output,
...
@@ -230,7 +259,14 @@ def test_paged_attention(
     # NOTE(woosuk): Due to the kernel-level differences in the two
     # implementations, there is a small numerical difference in the two
     # outputs. Thus, we use a relaxed tolerance for the test.
-    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
+    # so we use a relaxed tolerance for the test.
+    if kv_cache_dtype == "fp8_e5m2":
+        atol, rtol = 1e-2, 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
 
 
 def ref_multi_query_kv_attention(
...
@@ -252,7 +288,7 @@ def ref_multi_query_kv_attention(
         attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                                diagonal=1)
         attn_mask = attn_mask * torch.finfo(dtype).min
-        attn_mask = attn_mask.to(dtype=dtype, device=query.device)
+        attn_mask = attn_mask.to(dtype=dtype)
         ref_output = ref_masked_attention(
             query[start_idx:end_idx],
...
@@ -272,7 +308,7 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_multi_query_kv_attention(
     num_seqs: int,
...
@@ -280,12 +316,13 @@ def test_multi_query_kv_attention(
     head_size: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
     # As the xformers library is already tested with its own tests, we can use
     # a smaller MAX_SEQ_LEN here.
...
@@ -298,8 +335,7 @@ def test_multi_query_kv_attention(
     qkv = torch.empty(num_tokens,
                       num_query_heads + 2 * num_kv_heads,
                       head_size,
-                      dtype=dtype,
-                      device=gpu_id)
+                      dtype=dtype)
     qkv.uniform_(-scale, scale)
     query, key, value = qkv.split(
         [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
...
@@ -331,4 +367,6 @@ def test_multi_query_kv_attention(
         scale,
         dtype,
     )
-    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
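
The hunks above (and the matching change further down in tests/kernels/test_pos_encoding.py) replace hard-coded tolerances with get_default_atol / get_default_rtol helpers imported from a local allclose_default module. A minimal sketch of what such a helper could look like, keyed off the output dtype, is given below; the specific constants are assumptions, not necessarily the values the real helper uses.

# allclose_default.py -- illustrative sketch only; constants are assumed.
import torch

default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-2, torch.float: 1e-5}
default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}


def get_default_atol(output: torch.Tensor) -> float:
    # Looser absolute tolerance for lower-precision dtypes.
    return default_atol[output.dtype]


def get_default_rtol(output: torch.Tensor) -> float:
    # Looser relative tolerance for lower-precision dtypes.
    return default_rtol[output.dtype]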
tests/kernels/test_cache.py
...
@@ -3,18 +3,28 @@ import random
 import pytest
 import torch
+from typing import Tuple
 
 from vllm._C import cache_ops
+from vllm.utils import is_hip
 
+COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
+# reduce the size for ROCm test to avoid HIP OOM
+NUM_BLOCKS = [1024, 36000] if not is_hip else [1024, 10000]  # Arbitrary values for testing
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
 
 
 @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
...
@@ -25,7 +35,8 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
 @torch.inference_mode()
 def test_copy_blocks(
     kv_cache_factory,
...
@@ -37,12 +48,14 @@ def test_copy_blocks(
     num_blocks: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    kv_cache_dtype: str,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # Generate random block mappings where each source block is mapped to two
     # destination blocks.
     assert 2 * num_mappings <= num_blocks
...
@@ -59,7 +72,8 @@ def test_copy_blocks(
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
                                                 num_layers, num_heads,
-                                                head_size, dtype, seed, gpu_id)
+                                                head_size, kv_cache_dtype,
+                                                dtype, seed, device)
 
     # Clone the KV caches.
     cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
...
@@ -91,7 +105,7 @@ def test_copy_blocks(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_reshape_and_cache(
     kv_cache_factory,
...
@@ -102,29 +116,25 @@ def test_reshape_and_cache(
     num_blocks: int,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     random.seed(seed)
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     # Create a random slot mapping.
     num_slots = block_size * num_blocks
     slot_mapping = random.sample(range(num_slots), num_tokens)
-    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id)
+    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
 
     qkv = torch.randn(num_tokens,
-                      3, num_heads, head_size, dtype=dtype, device=gpu_id)
+                      3, num_heads, head_size, dtype=dtype)
     _, key, value = qkv.unbind(dim=1)
 
     # Create the KV caches.
     key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
                                                 num_heads, head_size, dtype,
-                                                seed, gpu_id)
+                                                None, seed, device)
     key_cache, value_cache = key_caches[0], value_caches[0]
 
     # Clone the KV caches.
...
@@ -133,7 +143,7 @@ def test_reshape_and_cache(
     # Call the reshape_and_cache kernel.
     cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
-                                slot_mapping)
+                                slot_mapping, "auto")
 
     # Run the reference implementation.
     reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
...
@@ -149,3 +159,68 @@ def test_reshape_and_cache(
     assert torch.allclose(key_cache, cloned_key_cache)
     assert torch.allclose(value_cache, cloned_value_cache)
+
+
+@pytest.mark.parametrize("direction", COPYING_DIRECTION)
+@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_swap_blocks(
+    kv_cache_factory,
+    direction: Tuple[str, str],
+    num_mappings: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+) -> None:
+    random.seed(seed)
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    src_device = device if direction[0] == "cuda" else 'cpu'
+    dst_device = device if direction[1] == "cuda" else 'cpu'
+
+    src_blocks = random.sample(range(num_blocks), num_mappings)
+    # For the same device, mapping must not overlap
+    if src_device == dst_device:
+        remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+        dst_blocks = random.sample(remaining_blocks, num_mappings)
+    else:
+        dst_blocks = random.sample(range(num_blocks), num_mappings)
+    block_mapping = dict(zip(src_blocks, dst_blocks))
+
+    # Create the KV caches on the first device.
+    src_key_caches, src_value_caches = kv_cache_factory(
+        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
+        src_device)
+
+    # Create the KV caches on the second device.
+    dist_key_caches, dist_value_caches = kv_cache_factory(
+        num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
+        dst_device)
+
+    src_key_caches_clone = src_key_caches[0].clone()
+    src_value_caches_clone = src_value_caches[0].clone()
+
+    # Call the swap_blocks kernel.
+    cache_ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping)
+    cache_ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
+                          block_mapping)
+
+    for src, dst in block_mapping.items():
+        assert torch.allclose(src_key_caches_clone[src].cpu(),
+                              dist_key_caches[0][dst].cpu())
+        assert torch.allclose(src_value_caches_clone[src].cpu(),
+                              dist_value_caches[0][dst].cpu())
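
The new test_swap_blocks above checks that, for every src → dst pair in block_mapping, the destination block ends up equal to the original source block, even when source and destination live on different devices. A pure-PyTorch stand-in for that semantics, useful as a mental model (it is not the cache_ops kernel), might look like:

import torch


def swap_blocks_reference(src_cache: torch.Tensor, dst_cache: torch.Tensor,
                          block_mapping: dict) -> None:
    # Copy whole blocks (the leading dimension) from src to dst, moving
    # across devices if necessary.
    for src, dst in block_mapping.items():
        dst_cache[dst].copy_(src_cache[src].to(dst_cache.device))


src = torch.randn(4, 2, 3)   # a tiny fake cache with 4 blocks
dst = torch.zeros_like(src)
swap_blocks_reference(src, dst, {0: 3, 1: 2})
assert torch.allclose(src[0], dst[3]) and torch.allclose(src[1], dst[2])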
tests/kernels/test_layernorm.py
...
@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
 HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
 ADD_RESIDUAL = [False, True]
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 
 
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
...
@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rms_norm(
     num_tokens: int,
...
@@ -24,15 +26,16 @@ def test_rms_norm(
     add_residual: bool,
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
 ) -> None:
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    layer = RMSNorm(hidden_size).to(dtype=dtype)
     layer.weight.data.normal_(mean=1.0, std=0.1)
     scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
     x *= scale
     residual = torch.randn_like(x) * scale if add_residual else None
...
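
For context on what test_rms_norm compares against: RMSNorm scales each hidden vector by the reciprocal root-mean-square of its elements and then by a learned weight. A compact reference of the standard formula (a sketch, not the vLLM kernel) is:

import torch


def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # y = x / sqrt(mean(x**2) + eps) * weight, computed per hidden vector.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight


x = torch.randn(4, 8)
print(rms_norm_reference(x, torch.ones(8)).shape)  # torch.Size([4, 8])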
tests/kernels/test_moe.py (new file)

"""Tests for the MOE layers.

Run `pytest tests/kernels/test_moe.py`.
"""
import pytest
import torch

from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.models.mixtral import MixtralMoE


def torch_moe(a, w1, w2, score, topk):
    B, D = a.shape
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
    score = torch.softmax(score, dim=-1, dtype=torch.float32)
    topk_weight, topk_ids = torch.topk(score, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)
    for i in range(w1.shape[0]):
        mask = topk_ids == i
        if mask.sum():
            out[mask] = SiluAndMul()(
                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
    return (out.view(B, -1, w2.shape[1]) *
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)


@pytest.mark.parametrize("m", [512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", [8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_fused_moe(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    dtype: torch.dtype,
):
    a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10

    score = torch.randn((m, e), device='cuda', dtype=dtype)
    triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
    torch_output = torch_moe(a, w1, w2, score, topk)
    assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)


@pytest.mark.parametrize("dtype",
                         [torch.float32, torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype):
    "Make sure our Mixtral MoE implementation agrees with the one from huggingface."

    # Instantiate our and huggingface's MoE blocks
    config = MixtralConfig()
    hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
    vllm_moe = MixtralMoE(
        num_experts=config.num_local_experts,
        top_k=config.num_experts_per_tok,
        hidden_size=config.hidden_size,
        intermediate_size=config.intermediate_size,
        params_dtype=dtype,
        tp_size=1,
    ).cuda()

    # Load the weights
    vllm_moe.gate.linear_weights["weight"][:] = hf_moe.gate.weight.data
    for i in range(config.num_local_experts):
        weights = (hf_moe.experts[i].w1.weight.data,
                   hf_moe.experts[i].w3.weight.data)
        vllm_moe.ws[i][:] = torch.cat(weights, dim=0)
        vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data

    # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
    inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")

    # Run forward passes for both MoE blocks
    hf_states, _ = hf_moe.forward(inputs)
    vllm_states = vllm_moe.forward(inputs)

    mixtral_moe_tol = {
        torch.float32: 1e-3,
        torch.float16: 1e-3,
        torch.bfloat16: 1e-2,
    }

    assert torch.allclose(hf_states,
                          vllm_states,
                          rtol=mixtral_moe_tol[dtype],
                          atol=mixtral_moe_tol[dtype])
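
The torch_moe reference above routes every token through its top-k experts and sums the expert outputs weighted by the softmax scores (no renormalization, matching renormalize=False in the fused path). A tiny worked example of just the routing step:

import torch

score = torch.tensor([[2.0, 0.5, 1.0]])   # one token, three experts
probs = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(probs, k=2)
print(topk_ids)     # tensor([[0, 2]]) -> experts 0 and 2 are selected
print(topk_weight)  # their softmax probabilities, used as mixture weights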
tests/kernels/test_pos_encoding.py
...
@@ -2,7 +2,7 @@ from typing import Optional
 import pytest
 import torch
+from allclose_default import get_default_atol, get_default_rtol
 
 from vllm.model_executor.layers.rotary_embedding import get_rope
 
 IS_NEOX_STYLE = [True, False]
...
@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17]  # Arbitrary values for testing
 BATCH_SIZES = [1, 5]  # Arbitrary values for testing
 SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
 
 
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
...
@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
 @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rotary_embedding(
     is_neox_style: bool,
...
@@ -35,28 +37,26 @@ def test_rotary_embedding(
     rotary_dim: Optional[int],
     dtype: torch.dtype,
     seed: int,
-    device: int,
+    device: str,
     max_position: int = 8192,
     base: int = 10000,
 ) -> None:
     if rotary_dim is None:
         rotary_dim = head_size
     torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
     if rotary_dim is None:
         rotary_dim = head_size
     rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
-    rope = rope.to(dtype=dtype, device=gpu_id)
+    rope = rope.to(dtype=dtype)
     positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device=gpu_id)
+                              max_position, (batch_size, seq_len))
     query = torch.randn(batch_size,
                         seq_len,
                         num_heads * head_size,
-                        dtype=dtype,
-                        device=gpu_id)
+                        dtype=dtype)
     key = torch.randn_like(query)
 
     # NOTE(woosuk): The reference implementation should be executed first
...
@@ -64,5 +64,11 @@ def test_rotary_embedding(
     ref_query, ref_key = rope._forward(positions, query, key)
     out_query, out_key = rope.forward(positions, query, key)
 
     # Compare the results.
-    assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
-    assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
+    assert torch.allclose(out_query,
+                          ref_query,
+                          atol=get_default_atol(out_query),
+                          rtol=get_default_rtol(out_query))
+    assert torch.allclose(out_key,
+                          ref_key,
+                          atol=get_default_atol(out_key),
+                          rtol=get_default_rtol(out_key))
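
A minimal usage sketch of the layer exercised above, assuming a CUDA device and the same positional-argument order the test uses (head_size, rotary_dim, max_position, base, is_neox_style); shapes follow the test's [batch, seq, num_heads * head_size] convention:

import torch
from vllm.model_executor.layers.rotary_embedding import get_rope

rope = get_rope(64, 64, 8192, 10000, True).to(dtype=torch.float16,
                                              device="cuda")
positions = torch.randint(0, 8192, (1, 16), device="cuda")
query = torch.randn(1, 16, 8 * 64, dtype=torch.float16, device="cuda")
key = torch.randn_like(query)
# As the NOTE in the test warns, the fused forward() may modify query/key
# in place, so run the reference _forward() first if you need both results.
out_query, out_key = rope.forward(positions, query, key)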
tests/kernels/test_prefix_prefill.py (new file)

import random
import pytest
import time
import torch

from vllm.model_executor.layers.triton_kernel.prefix_prefill import (
    context_attention_fwd)
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask

NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
HEAD_SIZES = [128]
DTYPES = [torch.float16]
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]


@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_queries_per_kv", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_contexted_kv_attention(
    num_heads: int,
    num_queries_per_kv: int,
    head_size: int,
    dtype: torch.dtype,
    device: str,
) -> None:
    random.seed(0)
    torch.manual_seed(0)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    torch.set_default_device(device)
    MAX_SEQ_LEN = 1024
    MAX_CTX_LEN = 1024
    BS = 10
    cache_size = 640
    block_size = 32
    max_block_per_request = 64
    subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
    ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
    seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
    num_kv_heads = num_heads // num_queries_per_kv

    num_tokens = sum(subquery_lens)
    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
    query.uniform_(-1e-3, 1e-3)
    output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)

    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
    kv.uniform_(-1e-3, 1e-3)
    key, value = kv.unbind(dim=1)

    k_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
                          dtype=dtype)
    v_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
                          dtype=dtype)
    k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
    v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
    values = torch.arange(0, cache_size, dtype=torch.long)
    values = values[torch.randperm(cache_size)]
    block_table = values[:BS * max_block_per_request].view(
        BS, max_block_per_request)
    b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
    b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
    b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
                                            dtype=torch.long),
                               dim=0)
    max_input_len = MAX_SEQ_LEN
    # copy kv to cache
    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
                                                dtype=torch.long),
                                   dim=0)
    for i in range(BS):
        for j in range(subquery_lens[i]):
            k[b_start_loc[i] + j].copy_(
                key[b_seq_start_loc[i] + b_ctx_len[i] + j])
            v[b_start_loc[i] + j].copy_(
                value[b_seq_start_loc[i] + b_ctx_len[i] + j])
        cur_ctx = 0
        block_id = 0
        while cur_ctx < b_ctx_len[i]:
            start_loc = b_seq_start_loc[i] + cur_ctx
            if cur_ctx + block_size > b_ctx_len[i]:
                end_loc = b_seq_start_loc[i] + b_ctx_len[i]
            else:
                end_loc = start_loc + block_size
            start_slot = block_table[i, block_id] * block_size
            end_slot = start_slot + end_loc - start_loc
            k_cache.view(-1, num_kv_heads,
                         head_size)[start_slot:end_slot].copy_(
                             key[start_loc:end_loc])
            v_cache.view(-1, num_kv_heads,
                         head_size)[start_slot:end_slot].copy_(
                             value[start_loc:end_loc])
            cur_ctx += block_size
            block_id += 1
    # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
    # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
    k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
                           8).permute(0, 2, 3, 1, 4).contiguous()
    # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
    # to V_cache[num_blocks, num_kv_heads, head_size, block_size]
    v_cache = v_cache.view(-1, block_size, num_kv_heads,
                           head_size).permute(0, 2, 3, 1).contiguous()

    # Warm up the Triton kernel by calling it once before actually measuring
    # generation time
    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
    torch.cuda.synchronize()
    start_time = time.time()
    context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                          b_start_loc, b_seq_len, b_ctx_len, max_input_len)
    torch.cuda.synchronize()
    end_time = time.time()
    print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms")

    scale = float(1.0 / (head_size**0.5))

    attn_op = xops.fmha.cutlass.FwOp()

    if num_kv_heads != num_heads:
        # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
        # project the key and value tensors to the desired number of
        # heads.
        #
        # see also: vllm/model_executor/layers/attention.py
        query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
                           query.shape[-1])
        key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
                                        num_queries_per_kv, key.shape[-1])
        value = value[:, :, None, :].expand(value.shape[0], num_kv_heads,
                                            num_queries_per_kv,
                                            value.shape[-1])
    query = query.unsqueeze(0)
    key = key.unsqueeze(0)
    value = value.unsqueeze(0)

    attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
        subquery_lens, seq_lens)
    output_ref = xops.memory_efficient_attention_forward(
        query,
        key,
        value,
        attn_bias=attn_bias,
        p=0.0,
        scale=scale,
        op=attn_op,
    )
    torch.cuda.synchronize()
    start_time = time.time()
    output_ref = xops.memory_efficient_attention_forward(
        query,
        key,
        value,
        attn_bias=attn_bias,
        p=0.0,
        scale=scale,
        op=attn_op,
    )
    torch.cuda.synchronize()
    end_time = time.time()
    print(f"xformers Time: {(end_time - start_time) * 1000:.2f} ms")
    output_ref = output_ref.squeeze(0, 2)
    assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
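
The two permutes in the test above pack the caches into the layout the Triton kernel expects (K grouped into chunks of 8 along head_size, V with block_size last). A small self-contained check that the K transform is a pure re-layout with no data loss (a sketch with made-up sizes):

import torch

num_blocks, block_size, heads, head_size = 2, 4, 3, 16
k = torch.arange(num_blocks * block_size * heads * head_size,
                 dtype=torch.float32).reshape(num_blocks, block_size, heads,
                                              head_size)
# Same permutation as the test:
# -> [num_blocks, heads, head_size // 8, block_size, 8]
k_kernel = k.view(-1, block_size, heads, head_size // 8,
                  8).permute(0, 2, 3, 1, 4).contiguous()
# Undo it and confirm nothing was lost.
k_back = k_kernel.permute(0, 3, 1, 2, 4).reshape(num_blocks, block_size,
                                                 heads, head_size)
assert torch.equal(k, k_back)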
tests/lora/__init__.py (new file, empty)
tests/lora/conftest.py (new file)

import contextlib
import gc
import tempfile
from collections import OrderedDict
from unittest.mock import patch, MagicMock

import pytest
import ray
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download

import vllm
from vllm.config import LoRAConfig
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parallel_utils.parallel_state import (
    destroy_model_parallel, initialize_model_parallel)


def cleanup():
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()


@pytest.fixture(autouse=True)
def cleanup_fixture():
    yield
    cleanup()


@pytest.fixture
def dist_init():
    if not torch.distributed.is_initialized():
        temp_file = tempfile.mkstemp()[1]
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=1,
            rank=0,
            init_method=f"file://{temp_file}",
        )
        torch.distributed.all_reduce(torch.zeros(1).cuda())
    initialize_model_parallel(1, 1)
    yield
    cleanup()


@pytest.fixture
def dist_init_torch_only():
    if torch.distributed.is_initialized():
        return
    temp_file = tempfile.mkstemp()[1]
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=1,
        rank=0,
        init_method=f"file://{temp_file}",
    )


@pytest.fixture
def dummy_model() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("output", ColumnParallelLinear(50, 10)),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("sampler", Sampler(512))
        ]))
    model.config = MagicMock()
    return model


@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
    model = nn.Sequential(
        OrderedDict([
            ("dense1", ColumnParallelLinear(764, 100)),
            ("dense2", RowParallelLinear(100, 50)),
            (
                "layer1",
                nn.Sequential(
                    OrderedDict([
                        ("dense1", ColumnParallelLinear(100, 10)),
                        ("dense2", RowParallelLinear(10, 50)),
                    ])),
            ),
            ("act2", nn.ReLU()),
            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
            ("outact", nn.Sigmoid()),
            # Special handling for lm_head & sampler
            ("lm_head", ParallelLMHead(512, 10)),
            ("sampler", Sampler(512))
        ]))
    model.config = MagicMock()
    return model


@pytest.fixture(scope="session")
def sql_lora_files():
    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")


@pytest.fixture(scope="session")
def mixtral_lora_files():
    return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")


@pytest.fixture(scope="session")
def gemma_lora_files():
    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")


@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
    cleanup()
    get_model_old = get_model

    def get_model_patched(model_config, device_config, **kwargs):
        return get_model_old(model_config,
                             device_config,
                             lora_config=LoRAConfig(max_loras=4,
                                                    max_lora_rank=8))

    with patch("vllm.worker.model_runner.get_model", get_model_patched):
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
    yield engine.llm_engine
    del engine
    cleanup()


@pytest.fixture
def llama_2_7b_model_extra_embeddings(
        llama_2_7b_engine_extra_embeddings) -> nn.Module:
    yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model
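
A hypothetical example of how a test module under tests/lora/ could consume these fixtures (pytest injects them by parameter name; the test name and assertion here are illustrative only, not part of the commit):

def test_sql_lora_adapter_downloaded(dist_init, sql_lora_files):
    # sql_lora_files is the local directory returned by snapshot_download().
    assert isinstance(sql_lora_files, str)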