Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1197 additions
and
176 deletions
+1197
-176
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+57
-1
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+10
-12
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+12
-14
tests/distributed/test_same_node.py
tests/distributed/test_same_node.py
+11
-0
tests/engine/test_stop_reason.py
tests/engine/test_stop_reason.py
+2
-3
tests/engine/test_stop_strings.py
tests/engine/test_stop_strings.py
+2
-1
tests/entrypoints/test_guided_processors.py
tests/entrypoints/test_guided_processors.py
+0
-2
tests/entrypoints/test_llm_generate_multiple_loras.py
tests/entrypoints/test_llm_generate_multiple_loras.py
+69
-0
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+415
-94
tests/entrypoints/test_openai_vision.py
tests/entrypoints/test_openai_vision.py
+286
-0
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+2
-2
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+10
-17
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+11
-4
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+43
-8
tests/kernels/test_layernorm.py
tests/kernels/test_layernorm.py
+1
-1
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+4
-3
tests/kernels/utils.py
tests/kernels/utils.py
+22
-0
tests/lora/conftest.py
tests/lora/conftest.py
+16
-2
tests/lora/test_layers.py
tests/lora/test_layers.py
+217
-2
tests/lora/test_llama.py
tests/lora/test_llama.py
+7
-10
No files found.
tests/core/block/test_block_manager_v2.py
View file @
f48954a4
...
...
@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
from
..utils
import
create_seq_group
,
create_seq_group_encoder_decoder
from
..utils
import
(
create_dummy_prompt
,
create_seq_group
,
create_seq_group_encoder_decoder
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
...
...
@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
assert
num_consumed_blocks
==
expected_consumed_blocks
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"num_cpu_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"num_gpu_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"num_lookahead_slots"
,
[
0
,
2
,
10
])
@
pytest
.
mark
.
parametrize
(
"enable_caching"
,
[
False
,
True
])
def
test_swap
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
num_lookahead_slots
,
enable_caching
):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager
=
BlockSpaceManagerV2
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
watermark
=
0
,
enable_caching
=
enable_caching
)
prompt
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
-
1
)
prompt
.
status
=
SequenceStatus
.
WAITING
block_manager
.
allocate
(
seq_group
)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id
=
0
prompt
.
status
=
SequenceStatus
.
RUNNING
prompt
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
# Swap seq group from GPU -> CPU.
gpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
assert
block_manager
.
can_swap_out
(
seq_group
)
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_out
(
seq_group
)
mapping_keys
=
[
key
for
key
,
_
in
mapping
]
assert
mapping_keys
==
gpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
==
after_cpu_blocks
+
len
(
gpu_blocks
)
assert
before_gpu_blocks
+
len
(
gpu_blocks
)
==
after_gpu_blocks
prompt
.
status
=
SequenceStatus
.
SWAPPED
# Swap seq group from CPU -> GPU.
assert
block_manager
.
can_swap_in
(
seq_group
,
num_lookahead_slots
)
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
cpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
mapping_keys
=
[
key
for
key
,
_
in
mapping
]
assert
mapping_keys
==
[
cpu_blocks
[
0
]]
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
,
16
])
@
pytest
.
mark
.
parametrize
(
"prompt_len"
,
[
10
,
300
,
1000
])
@
pytest
.
mark
.
parametrize
(
"num_slots_to_append"
,
[
50
])
...
...
tests/distributed/test_basic_distributed_correctness.py
View file @
f48954a4
...
...
@@ -42,18 +42,16 @@ def test_models(
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
enforce_eager
=
backend_by_env_var
==
"FLASHINFER"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
,
distributed_executor_backend
=
distributed_executor_backend
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
enforce_eager
=
enforce_eager
,
distributed_executor_backend
=
distributed_executor_backend
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/distributed/test_chunked_prefill_distributed.py
View file @
f48954a4
...
...
@@ -45,21 +45,19 @@ def test_models(
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_model
=
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/distributed/test_same_node.py
0 → 100644
View file @
f48954a4
import
os
import
torch
from
vllm.distributed.parallel_state
import
is_in_the_same_node
torch
.
distributed
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
is_in_the_same_node
(
torch
.
distributed
.
group
.
WORLD
)
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
tests/engine/test_stop_reason.py
View file @
f48954a4
...
...
@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
@
pytest
.
fixture
def
vllm_model
(
vllm_runner
):
vllm_model
=
vllm_runner
(
MODEL
)
yield
vllm_model
del
vllm_model
with
vllm_runner
(
MODEL
)
as
vllm_model
:
yield
vllm_model
def
test_stop_reason
(
vllm_model
,
example_prompts
):
...
...
tests/engine/test_stop_strings.py
View file @
f48954a4
...
...
@@ -10,7 +10,8 @@ MAX_TOKENS = 200
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_model
(
vllm_runner
):
return
vllm_runner
(
MODEL
)
with
vllm_runner
(
MODEL
)
as
vllm_model
:
yield
vllm_model
@
pytest
.
mark
.
skip_global_cleanup
...
...
tests/entrypoints/test_guided_processors.py
View file @
f48954a4
...
...
@@ -63,7 +63,6 @@ def test_guided_logits_processors():
tokenizer
,
whitespace_pattern
=
None
)
regex_LP
.
init_state
()
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
tensor
=
torch
.
rand
(
32000
)
...
...
@@ -72,7 +71,6 @@ def test_guided_logits_processors():
assert
tensor
.
shape
==
original_tensor
.
shape
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
json_LP
.
init_state
()
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
tensor
=
torch
.
rand
(
32000
)
...
...
tests/entrypoints/test_llm_generate_multiple_loras.py
0 → 100644
View file @
f48954a4
import
weakref
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
from
..conftest
import
cleanup
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
PROMPTS
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_num_seqs
=
128
,
enforce_eager
=
True
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
cleanup
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_lora_requests
(
llm
:
LLM
,
zephyr_lora_files
):
lora_request
=
[
LoRARequest
(
LORA_NAME
,
idx
+
1
,
zephyr_lora_files
)
for
idx
in
range
(
len
(
PROMPTS
))
]
# Multiple LoRARequests should be matched one-to-one with the prompts
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
# A ValueError is raised if the number of LoRA requests does not match the number of prompts
with
pytest
.
raises
(
ValueError
):
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
[:
1
])
# Single LoRARequest should be applied to every prompt
single_lora_request
=
lora_request
[
0
]
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
single_lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
tests/entrypoints/test_openai_server.py
View file @
f48954a4
...
...
@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
choice
=
completion
.
choices
[
0
]
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
...
...
@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
asyncio
...
...
@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
...
...
@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<
=
1
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
=
=
1
@
pytest
.
mark
.
asyncio
...
...
@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
assert
5
<=
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
@
pytest
.
mark
.
asyncio
...
...
@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
6
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
21
,
)
...
with
pytest
.
raises
(
...
...
@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
6
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
30
,
stream
=
True
,
)
async
for
chunk
in
stream
:
...
...
@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
temperature
=
0.0
,
)
completion
=
completion
.
choices
[
0
].
text
assert
completion
is
not
None
and
len
(
completion
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
chat_completion
.
choices
is
not
None
and
len
(
chat_completion
.
choices
)
==
1
assert
chat_completion
.
choices
[
0
].
message
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
is
not
None
assert
len
(
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
)
==
5
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
@
pytest
.
mark
.
asyncio
...
...
@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
<
=
1
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
=
=
0
@
pytest
.
mark
.
asyncio
...
...
@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
<
=
6
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
=
=
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
...
...
@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
37
,
total_tokens
=
47
)
message
=
choice
.
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
...
...
@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
)
single_output
=
single_completion
.
choices
[
0
].
text
single_usage
=
single_completion
.
usage
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
...
...
@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
chunk
.
usage
==
single_usage
assert
""
.
join
(
chunks
)
==
single_output
...
...
@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert
""
.
join
(
chunks
)
==
output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_chat_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"What is the capital of France?"
}]
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
...
...
@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
logit_bias
=
{
str
(
token_id
):
100
},
seed
=
42
,
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
response_tokens
=
tokenizer
(
completion
.
choices
[
0
].
text
,
add_special_tokens
=
False
)[
"input_ids"
]
expected_tokens
=
tokenizer
(
tokenizer
.
decode
([
token_id
]
*
5
),
...
...
@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
3
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
assert
completion
.
choices
[
i
].
text
is
not
None
output_json
=
json
.
loads
(
completion
.
choices
[
i
].
text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
TEST_SCHEMA
)
...
...
@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
3
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
assert
completion
.
choices
[
i
].
text
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
completion
.
choices
[
i
].
text
)
is
not
None
...
...
@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
2
assert
len
(
completion
.
choices
)
==
2
for
i
in
range
(
2
):
assert
completion
.
choices
[
i
].
text
in
TEST_CHOICE
...
...
@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
is
not
None
top_logprobs
=
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI
assert
all
(
isinstance
(
token
.
logprob
,
float
)
and
token
.
logprob
>=
-
9999.0
for
token
in
top_logprobs
)
for
item
in
top_logprobs
:
assert
item
.
logprob
>=
-
9999.0
,
f
"Failed (top_logprobs=
{
top_logprobs
}
)"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_named_tool_use
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
}]
# non-streaming
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
}
}],
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
}
})
message
=
chat_completion
.
choices
[
0
].
message
assert
len
(
message
.
content
)
==
0
json_string
=
message
.
tool_calls
[
0
].
function
.
arguments
json1
=
json
.
loads
(
json_string
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
TEST_SCHEMA
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
json_string
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"Give me another one with a different name and age"
})
# streaming
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
}
}],
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
}
},
stream
=
True
)
output
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
role
:
assert
delta
.
role
==
"assistant"
assert
delta
.
content
is
None
or
len
(
delta
.
content
)
==
0
if
delta
.
tool_calls
:
output
.
append
(
delta
.
tool_calls
[
0
].
function
.
arguments
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
json2
=
json
.
loads
(
""
.
join
(
output
))
jsonschema
.
validate
(
instance
=
json2
,
schema
=
TEST_SCHEMA
)
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_required_tool_use_not_yet_supported
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
}
}],
tool_choice
=
"required"
)
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
}
}],
tool_choice
=
"auto"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_inconsistent_tool_choice_and_tools
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
}
})
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
}
}],
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
"name"
:
"nondefined_function_name"
}
})
@
pytest
.
mark
.
asyncio
...
...
@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
response_format
=
{
"type"
:
"json_object"
})
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
is
not
None
loaded
=
json
.
loads
(
content
)
assert
loaded
==
{
"result"
:
2
},
loaded
...
...
@@ -1032,8 +1350,9 @@ number: "1" | "2"
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
,
logprobs_arg
:
int
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# test using text and token IDs
for
prompt
in
(
"Hello, my name is"
,
[
0
,
0
,
0
,
0
,
0
]):
...
...
@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
temperature
=
0.0
,
echo
=
True
,
logprobs
=
1
)
logprobs
=
logprobs_arg
)
prompt_text
=
tokenizer
.
decode
(
prompt
)
if
isinstance
(
prompt
,
list
)
else
prompt
assert
(
completion
.
choices
[
0
].
text
is
not
None
and
re
.
search
(
r
"^"
+
prompt_text
,
completion
.
choices
[
0
].
text
))
assert
re
.
search
(
r
"^"
+
prompt_text
,
completion
.
choices
[
0
].
text
)
logprobs
=
completion
.
choices
[
0
].
logprobs
assert
logprobs
is
not
None
assert
len
(
logprobs
.
text_offset
)
>
5
...
...
@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
and
logprobs
.
token_logprobs
[
0
]
is
None
)
assert
(
len
(
logprobs
.
top_logprobs
)
>
5
and
logprobs
.
top_logprobs
[
0
]
is
None
)
for
top_logprobs
in
logprobs
.
top_logprobs
[
1
:]:
assert
max
(
logprobs_arg
,
1
)
<=
len
(
top_logprobs
)
<=
logprobs_arg
+
1
assert
len
(
logprobs
.
tokens
)
>
5
...
...
@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
)
async
def
test_single_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input
=
[
input
_texts
=
[
"The chef prepared a delicious meal."
,
]
# test single embedding
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input
,
input
=
input
_texts
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
9
# test using token IDs
input
=
[
1
,
1
,
1
,
1
,
1
]
input
_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input
,
input
=
input
_tokens
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
5
...
...
@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
async
def
test_batch_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test List[str]
inputs
=
[
input
_text
s
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
inputs
,
input
=
input
_text
s
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
# test List[List[int]]
inputs
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
input
_token
s
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
inputs
,
input
=
input
_token
s
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
...
...
tests/entrypoints/test_openai_vision.py
0 → 100644
View file @
f48954a4
from
pathlib
import
Path
from
typing
import
Dict
import
openai
import
pytest
import
pytest_asyncio
import
ray
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
..utils
import
ServerRunner
# Model identifier passed to the server via ``--model`` and used as the
# ``model_name`` parameter of every test in this module.
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
# Chat template file, resolved relative to this test file: two directories
# above the one containing it, then ``examples/template_llava.jinja``.
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
                       "examples/template_llava.jinja")
# Checked at import time, so a missing template aborts collection of this
# module rather than surfacing later during server start-up.
assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

# Module-level marker: applies ``pytest.mark.openai`` to every test here.
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def server():
    """Start the LLaVA API server in a Ray actor for this test module.

    Ray is initialised, a ``ServerRunner`` actor is created with the
    command-line arguments below, and the fixture waits on the actor's
    ``ready`` call before handing the actor to the tests.

    Yields:
        The ``ServerRunner`` actor handle.
    """
    ray.init()
    # Shut Ray down on every exit path. Without the ``finally``, a failure
    # while creating the actor or waiting for it to become ready would leave
    # Ray initialised for the remainder of the pytest session.
    try:
        server_runner = ServerRunner.remote([
            "--model",
            MODEL_NAME,
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "4096",
            "--enforce-eager",
            "--image-input-type",
            "pixel_values",
            "--image-token-id",
            "32000",
            "--image-input-shape",
            "1,3,336,336",
            "--image-feature-size",
            "576",
            "--chat-template",
            str(LLAVA_CHAT_TEMPLATE),
        ])
        ray.get(server_runner.ready.remote())
        yield server_runner
    finally:
        ray.shutdown()
@pytest.fixture(scope="session")
def client():
    """Provide one ``openai.AsyncOpenAI`` client shared by the whole session.

    The client targets ``http://localhost:8000/v1`` with a fixed API key.
    """
    yield openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
@pytest_asyncio.fixture(scope="session")
async def base64_encoded_image() -> Dict[str, str]:
    """Fetch every test image once and return it base64-encoded.

    Returns:
        A mapping from each URL in ``TEST_IMAGE_URLS`` to the result of
        ``encode_image_base64`` applied to the fetched image.
    """
    encoded_by_url: Dict[str, str] = {}
    # Images are fetched one after another, in list order.
    for url in TEST_IMAGE_URLS:
        fetched = await ImageFetchAiohttp.fetch_image(url)
        encoded_by_url[url] = encode_image_base64(fetched)
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """Run a two-turn chat whose first user message references an image URL.

    The first turn checks the choice count, finish reason, token usage and
    the assistant message; the second turn appends a follow-up question and
    checks that a reply body is returned.
    """
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    text_part = {"type": "text", "text": "What's in this image?"}
    messages = [{
        "role": "user",
        "content": [image_part, text_part],
    }]

    # test single completion
    first_turn = await client.chat.completions.create(model=model_name,
                                                      messages=messages,
                                                      max_tokens=10,
                                                      logprobs=True,
                                                      top_logprobs=5)
    assert len(first_turn.choices) == 1

    choice = first_turn.choices[0]
    # max_tokens=10 is expected to be exhausted, hence "length".
    assert choice.finish_reason == "length"
    # NOTE(review): 596 prompt tokens presumably includes the image feature
    # tokens configured for the server -- confirm.
    expected_usage = openai.types.CompletionUsage(completion_tokens=10,
                                                  prompt_tokens=596,
                                                  total_tokens=606)
    assert first_turn.usage == expected_usage

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": message.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_turn = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    reply = second_turn.choices[0].message
    assert reply.content is not None and len(reply.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
        base64_encoded_image: Dict[str, str]):
    """Run a two-turn chat where the image is sent inline as a data URL.

    Same checks as the URL-based chat test, but the image payload is taken
    from ``base64_encoded_image`` and embedded in the request itself.
    """
    # NOTE(review): the data URL always declares image/jpeg, whatever the
    # source image's actual format -- confirm the server ignores the MIME type.
    data_url = f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's in this image?"}
    messages = [{
        "role": "user",
        "content": [image_part, text_part],
    }]

    # test single completion
    first_turn = await client.chat.completions.create(model=model_name,
                                                      messages=messages,
                                                      max_tokens=10,
                                                      logprobs=True,
                                                      top_logprobs=5)
    assert len(first_turn.choices) == 1

    choice = first_turn.choices[0]
    assert choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=10,
                                                  prompt_tokens=596,
                                                  total_tokens=606)
    assert first_turn.usage == expected_usage

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": message.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_turn = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    reply = second_turn.choices[0].message
    assert reply.content is not None and len(reply.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
                                    model_name: str, image_url: str):
    """Check that a streamed image chat matches the non-streamed result.

    The same request is issued twice at temperature 0.0, once normally and
    once with ``stream=True``. The concatenated streamed deltas must equal
    the non-streamed content, and exactly one chunk may carry a finish reason.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]
    # Arguments shared by the reference request and the streaming request.
    request_kwargs = dict(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )

    # test single completion
    reference = await client.chat.completions.create(**request_kwargs)
    reference_choice = reference.choices[0]
    output = reference_choice.message.content
    stop_reason = reference_choice.finish_reason

    # test streaming
    stream = await client.chat.completions.create(**request_kwargs,
                                                  stream=True)
    pieces = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            pieces.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # ``chunk`` and ``delta`` still refer to the last streamed chunk here.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(pieces) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
                                 model_name: str, image_url: str):
    """Check that two images in one message are rejected without breaking
    the server.

    The chat request must raise ``openai.BadRequestError``; a plain
    completion request sent afterwards must still return a result.
    """
    # Two separate, identical image parts followed by the question.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    question = {"type": "text", "text": "What's in this image?"}
    messages = [{
        "role": "user",
        "content": [*image_parts, question],
    }]

    with pytest.raises(openai.BadRequestError):  # test multi-image input
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    generated_text = follow_up.choices[0].text
    assert generated_text is not None and len(generated_text) >= 0
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # test failures to the calling shell/CI instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__]))
tests/kernels/test_activation.py
View file @
f48954a4
...
...
@@ -44,7 +44,7 @@ def test_act_and_mul(
elif
activation
==
"gelu_tanh"
:
layer
=
GeluAndMul
(
approximate
=
"tanh"
)
out
=
layer
(
x
)
ref_out
=
layer
.
_
forward
(
x
)
ref_out
=
layer
.
forward
_native
(
x
)
# The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison.
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
0.0
,
rtol
=
0.0
)
...
...
@@ -72,7 +72,7 @@ def test_activation(
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
layer
=
activation
()
out
=
layer
(
x
)
ref_out
=
layer
.
_
forward
(
x
)
ref_out
=
layer
.
forward
_native
(
x
)
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
get_default_atol
(
out
),
...
...
tests/kernels/test_attention_selector.py
View file @
f48954a4
import
os
from
unittest.mock
import
patch
import
pytest
import
torch
from
tests.kernels.utils
import
(
STR_FLASH_ATTN_VAL
,
STR_INVALID_VAL
,
override_backend_env_variable
)
from
vllm.attention.selector
import
which_attn_to_use
@
pytest
.
mark
.
parametrize
(
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
])
def
test_env
(
name
:
str
,
device
:
str
):
def
test_env
(
name
:
str
,
device
:
str
,
monkeypatch
):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
"""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
o
s
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name
o
verride_backend_env_variable
(
monkeypatch
,
name
)
if
device
==
"cpu"
:
with
patch
(
"vllm.attention.selector.is_cpu"
,
return_value
=
True
):
...
...
@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
torch
.
float16
,
16
)
assert
backend
.
name
==
name
if
name_backup
is
not
None
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
def
test_flash_attn
():
def
test_flash_attn
(
monkeypatch
):
"""Test FlashAttn validation."""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
o
s
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"
FLASH_ATTN
"
o
verride_backend_env_variable
(
monkeypatch
,
STR_
FLASH_ATTN
_VAL
)
# Unsupported CUDA arch
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
[
7
,
5
]):
...
...
@@ -71,14 +69,9 @@ def test_flash_attn():
backend
=
which_attn_to_use
(
8
,
17
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"FLASH_ATTN"
if
name_backup
is
not
None
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
def
test_invalid_env
():
def
test_invalid_env
(
monkeypatch
):
"""Throw an exception if the backend name is invalid."""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"INVALID"
override_backend_env_variable
(
monkeypatch
,
STR_INVALID_VAL
)
with
pytest
.
raises
(
ValueError
):
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
tests/kernels/test_cutlass.py
View file @
f48954a4
...
...
@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
100
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
...
...
@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
self
.
out_dtype
)
def
test_cutlass_cuda_graph
():
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
def
test_cutlass_cuda_graph
(
per_act_token
:
bool
,
per_out_ch
:
bool
):
m
,
n
,
k
=
512
,
512
,
512
a
=
to_int8
(
torch
.
randn
((
m
,
k
),
device
=
"cuda"
))
b
=
to_int8
(
torch
.
randn
((
n
,
k
),
device
=
"cuda"
).
t
())
scale_a
=
(
torch
.
randn
((
m
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
scale_b
=
(
torch
.
randn
((
1
,
n
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
m_a_scales
=
m
if
per_act_token
else
1
n_b_scales
=
n
if
per_out_ch
else
1
scale_a
=
(
torch
.
randn
(
(
m_a_scales
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
# Construct a trivial model with a single layer that calls a CUTLASS kernel
model
=
CutlassLayer
(
b
,
scale_a
,
scale_b
,
torch
.
bfloat16
)
...
...
tests/kernels/test_int8_quant.py
View file @
f48954a4
import
pytest
import
torch
from
vllm._C
import
ops
# ruff: noqa: F401
import
vllm._C
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
8192
]
# Arbitrary values for testing
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
8193
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
7
,
83
,
4096
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SCALE
=
[
0.1
,
0.5
,
0.8
,
1.2
,
2.1
]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                   dtype: torch.dtype, seed: int) -> None:
    """Compare the dynamic int8 quantization op against a PyTorch reference.

    The reference uses one scale per row, ``row_max / 127``, then divides,
    rounds, clamps to the int8 range and casts. The custom op must produce
    matching scales and quantized values that differ by at most 1.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    int8_traits = torch.iinfo(torch.int8)

    # Non-negative input: torch.rand output scaled by 1000.
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

    # Reference path: per-row maximum in float32, then one scale per row.
    row_max = x.max(dim=1).values.to(dtype=torch.float32)
    ref_scales = (row_max / float(127.0))[:, None].to(device="cuda",
                                                      dtype=torch.float32)
    rescaled = x / ref_scales
    ref_quantized = rescaled.round().clamp(int8_traits.min,
                                           int8_traits.max).to(torch.int8)

    # Kernel path: the op writes into the two preallocated output tensors.
    kernel_quantized = torch.empty_like(x, dtype=torch.int8, device="cuda")
    kernel_scales = torch.empty_like(ref_scales,
                                     dtype=torch.float32,
                                     device="cuda")
    torch.ops._C.dynamic_scaled_int8_quant(kernel_quantized, x, kernel_scales)

    assert torch.allclose(kernel_scales, ref_scales)
    # big atol to account for rounding errors
    assert torch.allclose(ref_quantized, kernel_quantized, atol=1)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"scale"
,
SCALE
)
@
torch
.
inference_mode
()
def
test_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
out1
=
(
x
/
scale
).
round
().
clamp
(
torch
.
iinfo
(
torch
.
int8
).
min
,
torch
.
iinfo
(
torch
.
int8
).
max
).
to
(
torch
.
int8
)
out1
=
(
x
/
scale
).
round
().
clamp
(
int8_traits
.
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
out2
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
)
ops
.
static_scaled_int8_quant
(
out2
,
x
,
scale
)
scale_argument
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
out2
,
x
,
scale_argument
)
assert
torch
.
allclose
(
out1
,
out2
,
atol
=
1
)
# big atol to account for rounding errors
tests/kernels/test_layernorm.py
View file @
f48954a4
...
...
@@ -42,7 +42,7 @@ def test_rms_norm(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_out
=
layer
.
_
forward
(
x
,
residual
)
ref_out
=
layer
.
forward
_native
(
x
,
residual
)
out
=
layer
(
x
,
residual
)
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# numerical errors than other operators because they involve reductions.
...
...
tests/kernels/test_pos_encoding.py
View file @
f48954a4
...
...
@@ -64,7 +64,7 @@ def test_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_
forward
(
positions
,
query
,
key
)
ref_query
,
ref_key
=
rope
.
forward
_native
(
positions
,
query
,
key
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
)
# Compare the results.
assert
torch
.
allclose
(
out_query
,
...
...
@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_
forward
(
positions
,
query
,
key
)
ref_query
,
ref_key
=
rope
.
forward
_native
(
positions
,
query
,
key
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
,
...
...
@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_forward
(
positions
,
query
,
key
,
query_offsets
)
ref_query
,
ref_key
=
rope
.
forward_native
(
positions
,
query
,
key
,
query_offsets
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
,
query_offsets
.
flatten
())
# Compare the results.
...
...
tests/kernels/utils.py
0 → 100644
View file @
f48954a4
"""Kernel test utils"""
import
pytest
STR_BACKEND_ENV_VAR
:
str
=
"VLLM_ATTENTION_BACKEND"
STR_FLASH_ATTN_VAL
:
str
=
"FLASH_ATTN"
STR_INVALID_VAL
:
str
=
"INVALID"
def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
                                  backend_name: str) -> None:
    """Set the attention-backend environment variable for the current test.

    The variable named by ``STR_BACKEND_ENV_VAR`` is set through pytest's
    monkeypatch, so the environment is reset once the test context exits.

    Arguments:

    * mpatch: pytest monkeypatch instance
    * backend_name: attention backend name to force
    """
    mpatch.setenv(name=STR_BACKEND_ENV_VAR, value=backend_name)
tests/lora/conftest.py
View file @
f48954a4
...
...
@@ -42,10 +42,24 @@ def cleanup():
ray
.
shutdown
()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell whether the global cleanup should run after the current test.

    Subdirectories can skip global cleanup by overriding this fixture.
    This can provide a ~10x speedup for non-GPU unit tests since they don't
    need to initialize torch.

    Returns:
        ``False`` when the requesting test carries a ``skip_global_cleanup``
        marker, ``True`` otherwise.
    """
    opt_out_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not opt_out_marker
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup_fixture
():
def
cleanup_fixture
(
should_do_global_cleanup_after_test
:
bool
):
yield
cleanup
()
if
should_do_global_cleanup_after_test
:
cleanup
()
@
pytest
.
fixture
...
...
tests/lora/test_layers.py
View file @
f48954a4
...
...
@@ -2,6 +2,7 @@ import random
from
copy
import
deepcopy
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
unittest.mock
import
patch
import
pytest
import
torch
...
...
@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
,
get_masked_input_and_mask
)
from
vllm.model_executor.utils
import
set_random_seed
from
.utils
import
DummyLoRAManager
...
...
@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
logits_processor
=
LogitsProcessor
(
vocab_size
+
lora_config
.
lora_extra_vocab_size
,
vocab_size
)
lora_logits_processor
=
LogitsProcessorWithLoRA
(
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
)
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
,
None
)
lora_logits_processor
.
create_lora_weights
(
max_loras
,
lora_config
)
return
linear
,
logits_processor
,
lora_logits_processor
...
...
@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
torch
.
allclose
(
ref_q
,
actual_q
)
torch
.
allclose
(
ref_k
,
actual_k
)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("seed", list(range(256)))
def test_vocab_parallel_embedding_indices(tp_size, seed):
    """Check the shard index ranges of ``VocabParallelEmbedding`` per rank.

    A random vocabulary (original plus added tokens) is drawn from ``seed``.
    One embedding is built per simulated tensor-parallel rank, and the test
    checks that the original and added ranges are contiguous across ranks,
    sum to the expected sizes, and do not overlap. It also checks that the
    sharded-to-full mapping restores the token ids in order, with padding
    slots (``-1``) at the end.
    """
    random.seed(seed)
    vocab_size = random.randint(4000, 64000)
    added_vocab_size = random.randint(0, 1024)
    org_vocab_size = vocab_size - added_vocab_size
    # Expected start of the next rank's ranges: original tokens start at 0,
    # added tokens start right after the original vocabulary.
    last_org_vocab_end_index = 0
    last_added_vocab_end_index = org_vocab_size
    # Running totals accumulated over all ranks.
    computed_vocab_size = 0
    computed_org_vocab_size = 0
    computed_added_vocab_size = 0
    vocab_size_padded = -1

    all_org_tokens = []
    all_added_tokens = []
    # Token ids in sharded layout; -1 marks a padding slot.
    token_ids = []

    for tp_rank in range(tp_size):
        # Simulate the given rank / world size without a distributed setup.
        with patch(
                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
                return_value=tp_rank
        ), patch(
                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
                return_value=tp_size):
            vocab_embedding = VocabParallelEmbedding(
                vocab_size, 1, org_num_embeddings=org_vocab_size)
        vocab_size_padded = vocab_embedding.num_embeddings_padded
        shard_indices = vocab_embedding.shard_indices
        # Assert that the ranges are contiguous
        assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
        assert (shard_indices.added_vocab_start_index ==
                last_added_vocab_end_index)

        # Ensure that we are not exceeding the vocab size
        computed_vocab_size += shard_indices.num_elements_padded
        computed_org_vocab_size += shard_indices.num_org_elements
        computed_added_vocab_size += shard_indices.num_added_elements

        # Ensure that the ranges are not overlapping
        all_org_tokens.extend(
            range(shard_indices.org_vocab_start_index,
                  shard_indices.org_vocab_end_index))
        all_added_tokens.extend(
            range(shard_indices.added_vocab_start_index,
                  shard_indices.added_vocab_end_index))

        # Per-rank layout appended here: original ids, original padding,
        # added ids, added padding.
        token_ids.extend(
            range(shard_indices.org_vocab_start_index,
                  shard_indices.org_vocab_end_index))
        token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
                                 shard_indices.num_org_elements))
        token_ids.extend(
            range(shard_indices.added_vocab_start_index,
                  shard_indices.added_vocab_end_index))
        token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
                                 shard_indices.num_added_elements))

        last_org_vocab_end_index = shard_indices.org_vocab_end_index
        last_added_vocab_end_index = shard_indices.added_vocab_end_index

    assert computed_vocab_size == vocab_size_padded
    assert computed_org_vocab_size == org_vocab_size
    assert computed_added_vocab_size == added_vocab_size

    # Ensure that the ranges are not overlapping
    assert len(all_org_tokens) == len(set(all_org_tokens))
    assert len(all_added_tokens) == len(set(all_added_tokens))
    assert not set(all_org_tokens).intersection(set(all_added_tokens))

    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
    # Uses the embedding built for the last rank in the loop above.
    reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
    # A missing mapping is only acceptable when there is a single rank.
    assert reindex_mapping is not None or tp_size == 1
    if reindex_mapping is not None:
        reindexed_token_ids = token_ids_tensor[reindex_mapping]
        expected = torch.tensor(list(range(0, vocab_size)))
        # Real tokens come first, in order; everything after is padding.
        assert reindexed_token_ids[:vocab_size].equal(expected)
        assert torch.all(reindexed_token_ids[vocab_size:] == -1)
def test_get_masked_input_and_mask():
    """Check ``get_masked_input_and_mask`` on a 12-token input.

    Token ids 0..11 are run through the function for simulated
    tensor-parallel sizes 1, 2 and 4, first with no original-vocab padding
    and then with a padding of 2. Only the first return value is compared
    with the expected per-rank tensor.
    """
    x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

    def masked_input(org_range, added_range, padding):
        """Return the first output for the given index ranges and padding."""
        output, _ = get_masked_input_and_mask(
            x,
            org_vocab_start_index=org_range[0],
            org_vocab_end_index=org_range[1],
            added_vocab_start_index=added_range[0],
            added_vocab_end_index=added_range[1],
            num_org_vocab_padding=padding)
        return output

    # One entry per scenario: (padding, shards). Each shard is
    # ((org_start, org_end), (added_start, added_end), expected_output).
    scenarios = [
        # base tp 1 case, no padding
        (0, [
            ((0, 8), (8, 12), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        ]),
        # tp 2 case, no padding
        (0, [
            ((0, 4), (8, 10), [0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]),
            ((4, 8), (10, 12), [0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]),
        ]),
        # tp 4 case, no padding
        (0, [
            ((0, 2), (8, 9), [0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]),
            ((2, 4), (9, 10), [0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]),
            ((4, 6), (10, 11), [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]),
            ((6, 8), (11, 12), [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]),
        ]),
        # base tp 1 case, with padding
        (2, [
            ((0, 8), (8, 12), [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]),
        ]),
        # tp 2 case, with padding
        (2, [
            ((0, 4), (8, 10), [0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]),
            ((4, 8), (10, 12), [0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]),
        ]),
        # tp 4 case, with padding
        (2, [
            ((0, 2), (8, 9), [0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]),
            ((2, 4), (9, 10), [0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]),
            ((4, 6), (10, 11), [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]),
            ((6, 8), (11, 12), [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]),
        ]),
    ]

    for padding, shards in scenarios:
        # Compute the outputs of every rank first, then check them in order.
        outputs = [
            masked_input(org_range, added_range, padding)
            for org_range, added_range, _ in shards
        ]
        for output, (_, _, expected) in zip(outputs, shards):
            assert torch.equal(output, torch.tensor(expected))
tests/lora/test_llama.py
View file @
f48954a4
...
...
@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
return
generated_texts
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
def
test_llama_lora
(
sql_lora_files
,
tp_size
):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
,
4
])
def
test_llama_lora
(
sql_lora_files
,
tp_size
,
num_gpus_available
):
if
num_gpus_available
<
tp_size
:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
llm
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
...
...
@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
print
(
"removing lora"
)
@
pytest
.
mark
.
skip
(
"Requires multiple GPUs"
)
def
test_llama_tensor_parallel_equality
(
sql_lora_files
):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
def
test_llama_tensor_parallel_equality
(
sql_lora_files
,
num_gpus_available
):
if
num_gpus_available
<
4
:
pytest
.
skip
(
"Not enough GPUs for tensor parallelism 4"
)
llm_tp1
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment