Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1197 additions
and
176 deletions
+1197
-176
tests/core/block/test_block_manager_v2.py
tests/core/block/test_block_manager_v2.py
+57
-1
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+10
-12
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+12
-14
tests/distributed/test_same_node.py
tests/distributed/test_same_node.py
+11
-0
tests/engine/test_stop_reason.py
tests/engine/test_stop_reason.py
+2
-3
tests/engine/test_stop_strings.py
tests/engine/test_stop_strings.py
+2
-1
tests/entrypoints/test_guided_processors.py
tests/entrypoints/test_guided_processors.py
+0
-2
tests/entrypoints/test_llm_generate_multiple_loras.py
tests/entrypoints/test_llm_generate_multiple_loras.py
+69
-0
tests/entrypoints/test_openai_server.py
tests/entrypoints/test_openai_server.py
+415
-94
tests/entrypoints/test_openai_vision.py
tests/entrypoints/test_openai_vision.py
+286
-0
tests/kernels/test_activation.py
tests/kernels/test_activation.py
+2
-2
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+10
-17
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+11
-4
tests/kernels/test_int8_quant.py
tests/kernels/test_int8_quant.py
+43
-8
tests/kernels/test_layernorm.py
tests/kernels/test_layernorm.py
+1
-1
tests/kernels/test_pos_encoding.py
tests/kernels/test_pos_encoding.py
+4
-3
tests/kernels/utils.py
tests/kernels/utils.py
+22
-0
tests/lora/conftest.py
tests/lora/conftest.py
+16
-2
tests/lora/test_layers.py
tests/lora/test_layers.py
+217
-2
tests/lora/test_llama.py
tests/lora/test_llama.py
+7
-10
No files found.
tests/core/block/test_block_manager_v2.py
View file @
f48954a4
...
@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
...
@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.sequence
import
Logprob
,
SequenceStatus
from
vllm.utils
import
chunk_list
from
vllm.utils
import
chunk_list
from
..utils
import
create_seq_group
,
create_seq_group_encoder_decoder
from
..utils
import
(
create_dummy_prompt
,
create_seq_group
,
create_seq_group_encoder_decoder
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
...
@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
...
@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
assert
num_consumed_blocks
==
expected_consumed_blocks
assert
num_consumed_blocks
==
expected_consumed_blocks
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"num_cpu_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"num_gpu_blocks"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"num_lookahead_slots"
,
[
0
,
2
,
10
])
@
pytest
.
mark
.
parametrize
(
"enable_caching"
,
[
False
,
True
])
def
test_swap
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
num_lookahead_slots
,
enable_caching
):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager
=
BlockSpaceManagerV2
(
block_size
,
num_cpu_blocks
,
num_gpu_blocks
,
watermark
=
0
,
enable_caching
=
enable_caching
)
prompt
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
-
1
)
prompt
.
status
=
SequenceStatus
.
WAITING
block_manager
.
allocate
(
seq_group
)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id
=
0
prompt
.
status
=
SequenceStatus
.
RUNNING
prompt
.
append_token_id
(
token_id
,
{
token_id
:
Logprob
(
0.0
)})
# Swap seq group from GPU -> CPU.
gpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
assert
block_manager
.
can_swap_out
(
seq_group
)
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_out
(
seq_group
)
mapping_keys
=
[
key
for
key
,
_
in
mapping
]
assert
mapping_keys
==
gpu_blocks
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_cpu_blocks
==
after_cpu_blocks
+
len
(
gpu_blocks
)
assert
before_gpu_blocks
+
len
(
gpu_blocks
)
==
after_gpu_blocks
prompt
.
status
=
SequenceStatus
.
SWAPPED
# Swap seq group from CPU -> GPU.
assert
block_manager
.
can_swap_in
(
seq_group
,
num_lookahead_slots
)
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
cpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
mapping_keys
=
[
key
for
key
,
_
in
mapping
]
assert
mapping_keys
==
[
cpu_blocks
[
0
]]
after_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
after_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
assert
before_gpu_blocks
==
after_gpu_blocks
+
len
(
cpu_blocks
)
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
,
16
])
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
8
,
16
])
@
pytest
.
mark
.
parametrize
(
"prompt_len"
,
[
10
,
300
,
1000
])
@
pytest
.
mark
.
parametrize
(
"prompt_len"
,
[
10
,
300
,
1000
])
@
pytest
.
mark
.
parametrize
(
"num_slots_to_append"
,
[
50
])
@
pytest
.
mark
.
parametrize
(
"num_slots_to_append"
,
[
50
])
...
...
tests/distributed/test_basic_distributed_correctness.py
View file @
f48954a4
...
@@ -42,18 +42,16 @@ def test_models(
...
@@ -42,18 +42,16 @@ def test_models(
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
backend_by_env_var
=
os
.
getenv
(
VLLM_ATTENTION_BACKEND
)
enforce_eager
=
backend_by_env_var
==
"FLASHINFER"
enforce_eager
=
backend_by_env_var
==
"FLASHINFER"
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
with
vllm_runner
(
model
,
vllm_model
=
vllm_runner
(
dtype
=
dtype
,
model
,
tensor_parallel_size
=
2
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
distributed_executor_backend
enforce_eager
=
enforce_eager
,
)
as
vllm_model
:
distributed_executor_backend
=
distributed_executor_backend
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/distributed/test_chunked_prefill_distributed.py
View file @
f48954a4
...
@@ -45,21 +45,19 @@ def test_models(
...
@@ -45,21 +45,19 @@ def test_models(
enable_chunked_prefill
=
True
enable_chunked_prefill
=
True
max_num_batched_tokens
=
chunked_prefill_token_size
max_num_batched_tokens
=
chunked_prefill_token_size
hf_model
=
hf_runner
(
model
,
dtype
=
dtype
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
hf_model
vllm_model
=
vllm_runner
(
with
vllm_runner
(
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
tensor_parallel_size
=
2
,
tensor_parallel_size
=
2
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
)
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
del
vllm_model
for
i
in
range
(
len
(
example_prompts
)):
for
i
in
range
(
len
(
example_prompts
)):
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
hf_output_ids
,
hf_output_str
=
hf_outputs
[
i
]
...
...
tests/distributed/test_same_node.py
0 → 100644
View file @
f48954a4
import
os
import
torch
from
vllm.distributed.parallel_state
import
is_in_the_same_node
torch
.
distributed
.
init_process_group
(
backend
=
"gloo"
)
test_result
=
is_in_the_same_node
(
torch
.
distributed
.
group
.
WORLD
)
expected
=
os
.
environ
.
get
(
"VLLM_TEST_SAME_HOST"
,
"1"
)
==
"1"
assert
test_result
==
expected
,
f
"Expected
{
expected
}
, got
{
test_result
}
"
tests/engine/test_stop_reason.py
View file @
f48954a4
...
@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
...
@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
@
pytest
.
fixture
@
pytest
.
fixture
def
vllm_model
(
vllm_runner
):
def
vllm_model
(
vllm_runner
):
vllm_model
=
vllm_runner
(
MODEL
)
with
vllm_runner
(
MODEL
)
as
vllm_model
:
yield
vllm_model
yield
vllm_model
del
vllm_model
def
test_stop_reason
(
vllm_model
,
example_prompts
):
def
test_stop_reason
(
vllm_model
,
example_prompts
):
...
...
tests/engine/test_stop_strings.py
View file @
f48954a4
...
@@ -10,7 +10,8 @@ MAX_TOKENS = 200
...
@@ -10,7 +10,8 @@ MAX_TOKENS = 200
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
vllm_model
(
vllm_runner
):
def
vllm_model
(
vllm_runner
):
return
vllm_runner
(
MODEL
)
with
vllm_runner
(
MODEL
)
as
vllm_model
:
yield
vllm_model
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
...
...
tests/entrypoints/test_guided_processors.py
View file @
f48954a4
...
@@ -63,7 +63,6 @@ def test_guided_logits_processors():
...
@@ -63,7 +63,6 @@ def test_guided_logits_processors():
tokenizer
,
tokenizer
,
whitespace_pattern
=
None
)
whitespace_pattern
=
None
)
regex_LP
.
init_state
()
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
tensor
=
torch
.
rand
(
32000
)
tensor
=
torch
.
rand
(
32000
)
...
@@ -72,7 +71,6 @@ def test_guided_logits_processors():
...
@@ -72,7 +71,6 @@ def test_guided_logits_processors():
assert
tensor
.
shape
==
original_tensor
.
shape
assert
tensor
.
shape
==
original_tensor
.
shape
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
json_LP
.
init_state
()
token_ids
=
tokenizer
.
encode
(
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
tensor
=
torch
.
rand
(
32000
)
tensor
=
torch
.
rand
(
32000
)
...
...
tests/entrypoints/test_llm_generate_multiple_loras.py
0 → 100644
View file @
f48954a4
import
weakref
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
from
..conftest
import
cleanup
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
PROMPTS
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
tensor_parallel_size
=
1
,
max_model_len
=
8192
,
enable_lora
=
True
,
max_loras
=
4
,
max_lora_rank
=
64
,
max_num_seqs
=
128
,
enforce_eager
=
True
)
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
cleanup
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_lora_requests
(
llm
:
LLM
,
zephyr_lora_files
):
lora_request
=
[
LoRARequest
(
LORA_NAME
,
idx
+
1
,
zephyr_lora_files
)
for
idx
in
range
(
len
(
PROMPTS
))
]
# Multiple SamplingParams should be matched with each prompt
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
# Exception raised, if the size of params does not match the size of prompts
with
pytest
.
raises
(
ValueError
):
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
lora_request
[:
1
])
# Single LoRARequest should be applied to every prompt
single_lora_request
=
lora_request
[
0
]
outputs
=
llm
.
generate
(
PROMPTS
,
lora_request
=
single_lora_request
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
tests/entrypoints/test_openai_server.py
View file @
f48954a4
...
@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
...
@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
choice
=
completion
.
choices
[
0
]
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
...
@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
...
@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
#
fir
st test
base model, then test loras
#
ju
st test
1 lora hereafter
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
...
@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<
=
1
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
=
=
1
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
assert
5
<=
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
logprobs
=
6
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
21
,
)
)
...
...
with
pytest
.
raises
(
with
pytest
.
raises
(
...
@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
logprobs
=
6
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
30
,
stream
=
True
,
stream
=
True
,
)
)
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
...
@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
completion
=
completion
.
choices
[
0
].
text
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
assert
completion
is
not
None
and
len
(
completion
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
chat_completion
.
choices
is
not
None
and
len
(
chat_completion
.
choices
)
==
1
assert
chat_completion
.
choices
[
0
].
message
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
is
not
None
assert
len
(
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
)
==
5
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
...
@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
<
=
1
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
=
=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
...
@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
choice
.
logprobs
.
content
is
not
None
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
<
=
6
assert
len
(
choice
.
logprobs
.
content
[
0
].
top_logprobs
)
=
=
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
messages
=
[{
messages
=
[{
...
@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
37
,
total_tokens
=
47
)
message
=
choice
.
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
[
MODEL_NAME
,
"zephyr-lora"
],
)
)
...
@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
...
@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
single_output
=
single_completion
.
choices
[
0
].
text
single_output
=
single_completion
.
choices
[
0
].
text
single_usage
=
single_completion
.
usage
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
prompt
=
prompt
,
max_tokens
=
5
,
max_tokens
=
5
,
...
@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
...
@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert
finish_reason_count
==
1
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
chunk
.
choices
[
0
].
text
assert
chunk
.
usage
==
single_usage
assert
""
.
join
(
chunks
)
==
single_output
assert
""
.
join
(
chunks
)
==
single_output
...
@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
...
@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert
""
.
join
(
chunks
)
==
output
assert
""
.
join
(
chunks
)
==
output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_chat_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"What is the capital of France?"
}]
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
# just test 1 lora hereafter
...
@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
...
@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
logit_bias
=
{
str
(
token_id
):
100
},
logit_bias
=
{
str
(
token_id
):
100
},
seed
=
42
,
seed
=
42
,
)
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
completion
.
choices
[
0
].
text
)
>=
5
response_tokens
=
tokenizer
(
completion
.
choices
[
0
].
text
,
response_tokens
=
tokenizer
(
completion
.
choices
[
0
].
text
,
add_special_tokens
=
False
)[
"input_ids"
]
add_special_tokens
=
False
)[
"input_ids"
]
expected_tokens
=
tokenizer
(
tokenizer
.
decode
([
token_id
]
*
5
),
expected_tokens
=
tokenizer
(
tokenizer
.
decode
([
token_id
]
*
5
),
...
@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
...
@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
3
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
for
i
in
range
(
3
):
assert
completion
.
choices
[
i
].
text
is
not
None
output_json
=
json
.
loads
(
completion
.
choices
[
i
].
text
)
output_json
=
json
.
loads
(
completion
.
choices
[
i
].
text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
TEST_SCHEMA
)
...
@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
...
@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
3
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
for
i
in
range
(
3
):
assert
completion
.
choices
[
i
].
text
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
completion
.
choices
[
i
].
text
)
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
completion
.
choices
[
i
].
text
)
is
not
None
...
@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
...
@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
2
assert
len
(
completion
.
choices
)
==
2
for
i
in
range
(
2
):
for
i
in
range
(
2
):
assert
completion
.
choices
[
i
].
text
in
TEST_CHOICE
assert
completion
.
choices
[
i
].
text
in
TEST_CHOICE
...
@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
...
@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
top_logprobs
=
5
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
is
not
None
top_logprobs
=
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
top_logprobs
=
chat_completion
.
choices
[
0
].
logprobs
.
content
[
0
].
top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI
# -9999.0 is the minimum logprob returned by OpenAI
assert
all
(
for
item
in
top_logprobs
:
isinstance
(
token
.
logprob
,
float
)
and
token
.
logprob
>=
-
9999.0
assert
item
.
logprob
>=
-
9999.0
,
f
"Failed (top_logprobs=
{
top_logprobs
}
)"
for
token
in
top_logprobs
)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI,
                              guided_decoding_backend: str):
    """Check that a named ``tool_choice`` yields schema-valid arguments.

    The same function tool is requested twice: once as a regular chat
    completion and once as a stream. In both cases the reply must carry
    no text content, and the tool-call arguments must parse as JSON that
    validates against ``TEST_SCHEMA``. The second request asks for a
    different name and age, and the test asserts the two results differ.

    Args:
        server: Server fixture; only requested so the server is running.
        client: Async OpenAI client pointed at that server.
        guided_decoding_backend: Parametrized backend name. It is not
            forwarded to the requests in this test.
    """
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role":
        "user",
        "content":
        f"Give an example JSON for an employee profile that "
        f"fits this schema: {TEST_SCHEMA}"
    }]

    # Both requests use exactly the same tool definition and selection,
    # so they are built once and shared.
    tools = [{
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": TEST_SCHEMA
        }
    }]
    tool_choice = {
        "type": "function",
        "function": {
            "name": "dummy_function_name"
        }
    }

    # non-streaming
    chat_completion = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1000,
        tools=tools,
        tool_choice=tool_choice)
    message = chat_completion.choices[0].message
    # Accept both an empty string and None, mirroring the streaming check
    # below; calling len() on None would raise TypeError instead of
    # producing a meaningful assertion failure.
    assert message.content is None or len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
    json1 = json.loads(json_string)
    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)

    messages.append({"role": "assistant", "content": json_string})
    messages.append({
        "role":
        "user",
        "content":
        "Give me another one with a different name and age"
    })

    # streaming
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=1000,
        tools=tools,
        tool_choice=tool_choice,
        stream=True)

    output = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        assert delta.content is None or len(delta.content) == 0
        if delta.tool_calls:
            # The arguments arrive as string fragments spread over chunks.
            output.append(delta.tool_calls[0].function.arguments)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    json2 = json.loads("".join(output))
    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
    assert json1["name"] != json2["name"]
    assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
    """Expect a 400 error for ``tool_choice`` values "required" and "auto".

    Each value is sent with an otherwise valid function-tool request and
    must raise ``openai.BadRequestError``.
    """
    conversation = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role":
        "user",
        "content":
        f"Give an example JSON for an employee profile that "
        f"fits this schema: {TEST_SCHEMA}"
    }]
    dummy_tools = [{
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": TEST_SCHEMA
        }
    }]

    # Same request twice; only the tool_choice string differs.
    for rejected_choice in ("required", "auto"):
        with pytest.raises(openai.BadRequestError):
            await client.chat.completions.create(
                model=MODEL_NAME,
                messages=conversation,
                max_tokens=1000,
                tools=dummy_tools,
                tool_choice=rejected_choice)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(
        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
    """Expect a 400 error when ``tool_choice`` and ``tools`` disagree.

    Two mismatches are exercised: a named ``tool_choice`` with no
    ``tools`` at all, and a ``tool_choice`` naming a function that is
    absent from the supplied ``tools``.
    """
    conversation = [{
        "role": "system",
        "content": "you are a helpful assistant"
    }, {
        "role":
        "user",
        "content":
        f"Give an example JSON for an employee profile that "
        f"fits this schema: {TEST_SCHEMA}"
    }]
    # Keyword arguments common to both failing requests.
    base_request = dict(model=MODEL_NAME,
                        messages=conversation,
                        max_tokens=1000)

    # Case 1: a function is selected but no tools are declared.
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            **base_request,
            tool_choice={
                "type": "function",
                "function": {
                    "name": "dummy_function_name"
                }
            })

    # Case 2: the selected function is not among the declared tools.
    declared_tools = [{
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": TEST_SCHEMA
        }
    }]
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            **base_request,
            tools=declared_tools,
            tool_choice={
                "type": "function",
                "function": {
                    "name": "nondefined_function_name"
                }
            })
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
...
@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
response_format
=
{
"type"
:
"json_object"
})
response_format
=
{
"type"
:
"json_object"
})
content
=
resp
.
choices
[
0
].
message
.
content
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
is
not
None
loaded
=
json
.
loads
(
content
)
loaded
=
json
.
loads
(
content
)
assert
loaded
==
{
"result"
:
2
},
loaded
assert
loaded
==
{
"result"
:
2
},
loaded
...
@@ -1032,8 +1350,9 @@ number: "1" | "2"
...
@@ -1032,8 +1350,9 @@ number: "1" | "2"
"model_name"
,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
)
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
,
logprobs_arg
:
int
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# test using text and token IDs
# test using text and token IDs
for
prompt
in
(
"Hello, my name is"
,
[
0
,
0
,
0
,
0
,
0
]):
for
prompt
in
(
"Hello, my name is"
,
[
0
,
0
,
0
,
0
,
0
]):
...
@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
...
@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
echo
=
True
,
echo
=
True
,
logprobs
=
1
)
logprobs
=
logprobs_arg
)
prompt_text
=
tokenizer
.
decode
(
prompt
)
if
isinstance
(
prompt
,
prompt_text
=
tokenizer
.
decode
(
prompt
)
if
isinstance
(
prompt
,
list
)
else
prompt
list
)
else
prompt
assert
(
completion
.
choices
[
0
].
text
is
not
None
assert
re
.
search
(
r
"^"
+
prompt_text
,
completion
.
choices
[
0
].
text
)
and
re
.
search
(
r
"^"
+
prompt_text
,
completion
.
choices
[
0
].
text
))
logprobs
=
completion
.
choices
[
0
].
logprobs
logprobs
=
completion
.
choices
[
0
].
logprobs
assert
logprobs
is
not
None
assert
logprobs
is
not
None
assert
len
(
logprobs
.
text_offset
)
>
5
assert
len
(
logprobs
.
text_offset
)
>
5
...
@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
...
@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
and
logprobs
.
token_logprobs
[
0
]
is
None
)
and
logprobs
.
token_logprobs
[
0
]
is
None
)
assert
(
len
(
logprobs
.
top_logprobs
)
>
5
assert
(
len
(
logprobs
.
top_logprobs
)
>
5
and
logprobs
.
top_logprobs
[
0
]
is
None
)
and
logprobs
.
top_logprobs
[
0
]
is
None
)
for
top_logprobs
in
logprobs
.
top_logprobs
[
1
:]:
assert
max
(
logprobs_arg
,
1
)
<=
len
(
top_logprobs
)
<=
logprobs_arg
+
1
assert
len
(
logprobs
.
tokens
)
>
5
assert
len
(
logprobs
.
tokens
)
>
5
...
@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
...
@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
)
)
async
def
test_single_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
input
=
[
input
_texts
=
[
"The chef prepared a delicious meal."
,
"The chef prepared a delicious meal."
,
]
]
# test single embedding
# test single embedding
embeddings
=
await
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input
,
input
=
input
_texts
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
)
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
9
# test using token IDs
# test using token IDs
input
=
[
1
,
1
,
1
,
1
,
1
]
input
_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embeddings
=
await
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
input
,
input
=
input
_tokens
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
)
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
5
assert
embeddings
.
usage
.
prompt_tokens
==
5
...
@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
...
@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
async
def
test_batch_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_batch_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
# test List[str]
# test List[str]
inputs
=
[
input
_text
s
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
"Stars twinkle brightly in the night sky."
]
]
embeddings
=
await
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
inputs
,
input
=
input
_text
s
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
)
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
# test List[List[int]]
# test List[List[int]]
inputs
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input
_token
s
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
[
25
,
32
,
64
,
77
]]
embeddings
=
await
client
.
embeddings
.
create
(
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
model
=
model_name
,
input
=
inputs
,
input
=
input
_token
s
,
encoding_format
=
"float"
,
encoding_format
=
"float"
,
)
)
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
embeddings
.
data
is
not
None
and
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
prompt_tokens
==
17
...
...
tests/entrypoints/test_openai_vision.py
0 → 100644
View file @
f48954a4
from
pathlib
import
Path
from
typing
import
Dict
import
openai
import
pytest
import
pytest_asyncio
import
ray
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
..utils
import
ServerRunner
# Model served by the `server` fixture and used by every test below.
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

# Chat template located three directory levels above this file, under
# examples/. The assert fails at import time if the template is missing.
LLAVA_CHAT_TEMPLATE = Path(__file__).parent.parent.parent.joinpath(
    "examples/template_llava.jinja")
assert LLAVA_CHAT_TEMPLATE.exists()

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

# Apply the `openai` marker to every test in this module.
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def server():
    """Start a ``ServerRunner`` Ray actor for the LLaVA model.

    Ray is initialised, the actor is created with the command-line
    arguments below, and the fixture blocks until the actor's ``ready``
    call returns before yielding the actor handle.

    Ray is shut down in a ``finally`` clause so that a failure while
    creating the actor or waiting for readiness does not leave the Ray
    runtime initialised for the rest of the test session.
    """
    ray.init()
    try:
        server_runner = ServerRunner.remote([
            "--model",
            MODEL_NAME,
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "4096",
            "--enforce-eager",
            "--image-input-type",
            "pixel_values",
            "--image-token-id",
            "32000",
            "--image-input-shape",
            "1,3,336,336",
            "--image-feature-size",
            "576",
            "--chat-template",
            str(LLAVA_CHAT_TEMPLATE),
        ])
        # Block until the remote actor reports that it is ready.
        ray.get(server_runner.ready.remote())
        yield server_runner
    finally:
        ray.shutdown()
@pytest.fixture(scope="session")
def client():
    """Yield an async OpenAI client aimed at ``localhost:8000/v1``."""
    yield openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123",
    )
@pytest_asyncio.fixture(scope="session")
async def base64_encoded_image() -> Dict[str, str]:
    """Fetch every test image and map its URL to its base64 encoding.

    Images are fetched one after another, in ``TEST_IMAGE_URLS`` order.
    """
    encoded_by_url: Dict[str, str] = {}
    for url in TEST_IMAGE_URLS:
        fetched = await ImageFetchAiohttp.fetch_image(url)
        encoded_by_url[url] = encode_image_base64(fetched)
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """Run a two-turn chat whose first user message references an image URL.

    The first turn checks the choice count, the ``"length"`` finish
    reason, the exact token usage and the assistant reply. The second
    turn appends a follow-up question and only checks that some content
    is returned.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    # max_tokens=10 is expected to cut generation short.
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=596, total_tokens=606)

    # The original re-read choices[0].message immediately after this
    # assignment; that duplicate has been dropped.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    # Only presence is checked; the former `len(...) >= 0` was always true.
    assert message.content is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
        base64_encoded_image: Dict[str, str]):
    """Run a two-turn chat whose image is passed inline as a data URL.

    Same checks as the URL-based variant, but the image is looked up in
    ``base64_encoded_image`` and embedded in a ``data:`` URL.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    # NOTE(review): the prefix says image/jpeg for every
                    # image although some test URLs are PNG - confirm that
                    # the server ignores the declared media type.
                    "url":
                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
                }
            },
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(model=model_name,
                                                           messages=messages,
                                                           max_tokens=10,
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    # max_tokens=10 is expected to cut generation short.
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=596, total_tokens=606)

    # The original re-read choices[0].message immediately after this
    # assignment; that duplicate has been dropped.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    # Only presence is checked; the former `len(...) >= 0` was always true.
    assert message.content is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
                                    model_name: str, image_url: str):
    """Compare a streamed image chat reply with its non-streamed twin.

    The same request is sent twice at temperature 0.0, first normally and
    then with ``stream=True``. The concatenated stream content and the
    final finish reason must match the non-streamed response, and exactly
    one chunk may carry a finish reason.
    """
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    text_part = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    # Arguments shared by the reference request and the streamed request.
    request_args = dict(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )

    # test single completion
    reference = await client.chat.completions.create(**request_args)
    reference_choice = reference.choices[0]
    expected_text = reference_choice.message.content
    expected_finish = reference_choice.finish_reason

    # test streaming
    stream = await client.chat.completions.create(**request_args,
                                                  stream=True)
    pieces = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            pieces.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk` and `delta` still refer to the last streamed chunk here.
    assert chunk.choices[0].finish_reason == expected_finish
    assert delta.content
    assert "".join(pieces) == expected_text
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
                                 model_name: str, image_url: str):
    """Expect a 400 error for two images in one message, then recovery.

    After the rejected request, a plain token-ID completion is issued to
    confirm the server still answers.
    """
    # The same image twice, followed by the text question.
    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for _ in range(2)]
    question = {"type": "text", "text": "What's in this image?"}
    messages = [{"role": "user", "content": [*image_parts, question]}]

    with pytest.raises(openai.BadRequestError):
        # test multi-image input
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    generated = follow_up.choices[0].text
    assert generated is not None and len(generated) >= 0
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file as a script
    # reports failure to the shell when any test fails.
    raise SystemExit(pytest.main([__file__]))
tests/kernels/test_activation.py
View file @
f48954a4
...
@@ -44,7 +44,7 @@ def test_act_and_mul(
...
@@ -44,7 +44,7 @@ def test_act_and_mul(
elif
activation
==
"gelu_tanh"
:
elif
activation
==
"gelu_tanh"
:
layer
=
GeluAndMul
(
approximate
=
"tanh"
)
layer
=
GeluAndMul
(
approximate
=
"tanh"
)
out
=
layer
(
x
)
out
=
layer
(
x
)
ref_out
=
layer
.
_
forward
(
x
)
ref_out
=
layer
.
forward
_native
(
x
)
# The SiLU and GELU implementations are equivalent to the native PyTorch
# The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison.
# implementations, so we can do exact comparison.
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
0.0
,
rtol
=
0.0
)
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
0.0
,
rtol
=
0.0
)
...
@@ -72,7 +72,7 @@ def test_activation(
...
@@ -72,7 +72,7 @@ def test_activation(
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
x
=
torch
.
randn
(
num_tokens
,
d
,
dtype
=
dtype
)
layer
=
activation
()
layer
=
activation
()
out
=
layer
(
x
)
out
=
layer
(
x
)
ref_out
=
layer
.
_
forward
(
x
)
ref_out
=
layer
.
forward
_native
(
x
)
assert
torch
.
allclose
(
out
,
assert
torch
.
allclose
(
out
,
ref_out
,
ref_out
,
atol
=
get_default_atol
(
out
),
atol
=
get_default_atol
(
out
),
...
...
tests/kernels/test_attention_selector.py
View file @
f48954a4
import
os
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
pytest
import
pytest
import
torch
import
torch
from
tests.kernels.utils
import
(
STR_FLASH_ATTN_VAL
,
STR_INVALID_VAL
,
override_backend_env_variable
)
from
vllm.attention.selector
import
which_attn_to_use
from
vllm.attention.selector
import
which_attn_to_use
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
])
def
test_env
(
name
:
str
,
device
:
str
):
def
test_env
(
name
:
str
,
device
:
str
,
monkeypatch
):
"""Test that the attention selector can be set via environment variable.
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
Note that we do not test FlashAttn because it is the default backend.
"""
"""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
o
s
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name
o
verride_backend_env_variable
(
monkeypatch
,
name
)
if
device
==
"cpu"
:
if
device
==
"cpu"
:
with
patch
(
"vllm.attention.selector.is_cpu"
,
return_value
=
True
):
with
patch
(
"vllm.attention.selector.is_cpu"
,
return_value
=
True
):
...
@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
...
@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
torch
.
float16
,
16
)
torch
.
float16
,
16
)
assert
backend
.
name
==
name
assert
backend
.
name
==
name
if
name_backup
is
not
None
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
def
test_flash_attn
():
def
test_flash_attn
(
monkeypatch
):
"""Test FlashAttn validation."""
"""Test FlashAttn validation."""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
o
s
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"
FLASH_ATTN
"
o
verride_backend_env_variable
(
monkeypatch
,
STR_
FLASH_ATTN
_VAL
)
# Unsupported CUDA arch
# Unsupported CUDA arch
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
[
7
,
5
]):
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
[
7
,
5
]):
...
@@ -71,14 +69,9 @@ def test_flash_attn():
...
@@ -71,14 +69,9 @@ def test_flash_attn():
backend
=
which_attn_to_use
(
8
,
17
,
8
,
None
,
torch
.
float16
,
None
,
16
)
backend
=
which_attn_to_use
(
8
,
17
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"FLASH_ATTN"
assert
backend
.
name
!=
"FLASH_ATTN"
if
name_backup
is
not
None
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
def
test_invalid_env
():
def
test_invalid_env
(
monkeypatch
):
"""Throw an exception if the backend name is invalid."""
"""Throw an exception if the backend name is invalid."""
name_backup
=
os
.
environ
.
get
(
"VLLM_ATTENTION_BACKEND"
,
None
)
override_backend_env_variable
(
monkeypatch
,
STR_INVALID_VAL
)
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"INVALID"
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
name_backup
tests/kernels/test_cutlass.py
View file @
f48954a4
...
@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
...
@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
100
,
33
,
1
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"n"
,
[
2048
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
...
@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
...
@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
self
.
out_dtype
)
self
.
out_dtype
)
def
test_cutlass_cuda_graph
():
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
def
test_cutlass_cuda_graph
(
per_act_token
:
bool
,
per_out_ch
:
bool
):
m
,
n
,
k
=
512
,
512
,
512
m
,
n
,
k
=
512
,
512
,
512
a
=
to_int8
(
torch
.
randn
((
m
,
k
),
device
=
"cuda"
))
a
=
to_int8
(
torch
.
randn
((
m
,
k
),
device
=
"cuda"
))
b
=
to_int8
(
torch
.
randn
((
n
,
k
),
device
=
"cuda"
).
t
())
b
=
to_int8
(
torch
.
randn
((
n
,
k
),
device
=
"cuda"
).
t
())
scale_a
=
(
torch
.
randn
((
m
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
m_a_scales
=
m
if
per_act_token
else
1
scale_b
=
(
torch
.
randn
((
1
,
n
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
n_b_scales
=
n
if
per_out_ch
else
1
scale_a
=
(
torch
.
randn
(
(
m_a_scales
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
)
# Construct a trivial model with a single layer that calls a CUTLASS kernel
# Construct a trivial model with a single layer that calls a CUTLASS kernel
model
=
CutlassLayer
(
b
,
scale_a
,
scale_b
,
torch
.
bfloat16
)
model
=
CutlassLayer
(
b
,
scale_a
,
scale_b
,
torch
.
bfloat16
)
...
...
tests/kernels/test_int8_quant.py
View file @
f48954a4
import
pytest
import
pytest
import
torch
import
torch
from
vllm._C
import
ops
# ruff: noqa: F401
import
vllm._C
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
DTYPES
=
[
torch
.
half
,
torch
.
bfloat16
,
torch
.
float
]
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
8192
]
# Arbitrary values for testing
HIDDEN_SIZES
=
[
16
,
67
,
768
,
2048
,
5120
,
5137
,
8192
,
8193
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
7
,
83
,
4096
]
# Arbitrary values for testing
NUM_TOKENS
=
[
1
,
7
,
83
,
4096
]
# Arbitrary values for testing
SEEDS
=
[
0
]
SEEDS
=
[
0
]
SCALE
=
[
0.1
,
0.5
,
0.8
,
1.2
,
2.1
]
SCALE
=
[
0.1
,
0.5
,
0.8
,
1.2
,
2.1
]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                   dtype: torch.dtype, seed: int) -> None:
    """Compare the dynamic int8 quantization op with a PyTorch reference.

    The reference derives one scale per row (row maximum divided by 127),
    then divides, rounds and clamps to the int8 range. The custom op must
    reproduce the scales and match the quantized values to within 1.
    """
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    int8_limits = torch.iinfo(torch.int8)

    # Non-negative inputs in [0, 1000), one row per token.
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

    # Reference: per-row scale as a float32 column vector.
    row_max = x.max(dim=1).values.to(dtype=torch.float32)
    scales = (row_max / 127.0).unsqueeze(1).to(device="cuda",
                                               dtype=torch.float32)
    scaled = (x / scales).round()
    torch_out = scaled.clamp(int8_limits.min,
                             int8_limits.max).to(torch.int8)

    # Output buffers handed to the custom op.
    ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
    scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
    torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)

    assert torch.allclose(scales_out, scales)
    # big atol to account for rounding errors
    assert torch.allclose(torch_out, ops_out, atol=1)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"scale"
,
SCALE
)
@
pytest
.
mark
.
parametrize
(
"scale"
,
SCALE
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
def
test_static_scaled_int8_quant
(
num_tokens
:
int
,
hidden_size
:
int
,
seed
:
int
,
scale
:
float
)
->
None
:
dtype
:
torch
.
dtype
,
seed
:
int
,
scale
:
float
)
->
None
:
torch
.
random
.
manual_seed
(
seed
)
torch
.
random
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
torch
.
cuda
.
manual_seed
(
seed
)
int8_traits
=
torch
.
iinfo
(
torch
.
int8
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
x
=
torch
.
rand
(
num_tokens
,
hidden_size
,
dtype
=
dtype
,
device
=
"cuda"
)
*
1000
out1
=
(
x
/
scale
).
round
().
clamp
(
out1
=
(
x
/
scale
).
round
().
clamp
(
int8_traits
.
min
,
torch
.
iinfo
(
torch
.
int8
).
min
,
int8_traits
.
max
).
to
(
torch
.
int8
)
torch
.
iinfo
(
torch
.
int8
).
max
).
to
(
torch
.
int8
)
out2
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
)
out2
=
torch
.
empty_like
(
x
,
dtype
=
torch
.
int8
)
ops
.
static_scaled_int8_quant
(
out2
,
x
,
scale
)
scale_argument
=
torch
.
tensor
([
scale
],
dtype
=
torch
.
float32
,
device
=
"cuda"
)
torch
.
ops
.
_C
.
static_scaled_int8_quant
(
out2
,
x
,
scale_argument
)
assert
torch
.
allclose
(
out1
,
out2
,
assert
torch
.
allclose
(
out1
,
out2
,
atol
=
1
)
# big atol to account for rounding errors
atol
=
1
)
# big atol to account for rounding errors
tests/kernels/test_layernorm.py
View file @
f48954a4
...
@@ -42,7 +42,7 @@ def test_rms_norm(
...
@@ -42,7 +42,7 @@ def test_rms_norm(
# NOTE(woosuk): The reference implementation should be executed first
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
# because the custom kernel is in-place.
ref_out
=
layer
.
_
forward
(
x
,
residual
)
ref_out
=
layer
.
forward
_native
(
x
,
residual
)
out
=
layer
(
x
,
residual
)
out
=
layer
(
x
,
residual
)
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# numerical errors than other operators because they involve reductions.
# numerical errors than other operators because they involve reductions.
...
...
tests/kernels/test_pos_encoding.py
View file @
f48954a4
...
@@ -64,7 +64,7 @@ def test_rotary_embedding(
...
@@ -64,7 +64,7 @@ def test_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_
forward
(
positions
,
query
,
key
)
ref_query
,
ref_key
=
rope
.
forward
_native
(
positions
,
query
,
key
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
)
# Compare the results.
# Compare the results.
assert
torch
.
allclose
(
out_query
,
assert
torch
.
allclose
(
out_query
,
...
@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
...
@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_
forward
(
positions
,
query
,
key
)
ref_query
,
ref_key
=
rope
.
forward
_native
(
positions
,
query
,
key
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
query
,
key
,
key
,
...
@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
...
@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
# NOTE(woosuk): The reference implementation should be executed first
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
# because the custom kernel is in-place.
ref_query
,
ref_key
=
rope
.
_forward
(
positions
,
query
,
key
,
query_offsets
)
ref_query
,
ref_key
=
rope
.
forward_native
(
positions
,
query
,
key
,
query_offsets
)
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
,
out_query
,
out_key
=
rope
.
forward
(
positions
,
query
,
key
,
query_offsets
.
flatten
())
query_offsets
.
flatten
())
# Compare the results.
# Compare the results.
...
...
tests/kernels/utils.py
0 → 100644
View file @
f48954a4
"""Kernel test utils"""
import
pytest
# Environment variable indicating the vLLM attention backend, followed by two
# values for it (presumably one valid and one deliberately invalid backend
# name -- confirm against the tests that use them).
STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"


def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
                                  backend_name: str) -> None:
    """Force the vLLM attention backend for the duration of one test.

    The value is written with pytest's monkeypatch rather than directly into
    the process environment, so the variable is reset once the test context
    exits.

    Arguments:

    * mpatch: pytest monkeypatch instance
    * backend_name: attention backend name to force
    """
    mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
tests/lora/conftest.py
View file @
f48954a4
...
@@ -42,10 +42,24 @@ def cleanup():
...
@@ -42,10 +42,24 @@ def cleanup():
ray
.
shutdown
()
ray
.
shutdown
()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Report whether global cleanup should run after the current test.

    Returns False when the requesting test node carries the
    ``skip_global_cleanup`` marker and True otherwise. Subdirectories can
    also override this fixture to skip global cleanup entirely, which can
    provide a ~10x speedup for non-GPU unit tests since they don't need to
    initialize torch.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    # Truthiness (not an ``is None`` check) mirrors how the marker lookup
    # result has always been interpreted here.
    return not skip_marker
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
def
cleanup_fixture
():
def
cleanup_fixture
(
should_do_global_cleanup_after_test
:
bool
):
yield
yield
cleanup
()
if
should_do_global_cleanup_after_test
:
cleanup
()
@
pytest
.
fixture
@
pytest
.
fixture
...
...
tests/lora/test_layers.py
View file @
f48954a4
...
@@ -2,6 +2,7 @@ import random
...
@@ -2,6 +2,7 @@ import random
from
copy
import
deepcopy
from
copy
import
deepcopy
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
unittest.mock
import
patch
import
pytest
import
pytest
import
torch
import
torch
...
@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...
@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
ParallelLMHead
,
VocabParallelEmbedding
,
get_masked_input_and_mask
)
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.model_executor.utils
import
set_random_seed
from
.utils
import
DummyLoRAManager
from
.utils
import
DummyLoRAManager
...
@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
...
@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
logits_processor
=
LogitsProcessor
(
logits_processor
=
LogitsProcessor
(
vocab_size
+
lora_config
.
lora_extra_vocab_size
,
vocab_size
)
vocab_size
+
lora_config
.
lora_extra_vocab_size
,
vocab_size
)
lora_logits_processor
=
LogitsProcessorWithLoRA
(
lora_logits_processor
=
LogitsProcessorWithLoRA
(
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
)
logits_processor
,
1024
,
linear
.
weight
.
dtype
,
linear
.
weight
.
device
,
None
)
lora_logits_processor
.
create_lora_weights
(
max_loras
,
lora_config
)
lora_logits_processor
.
create_lora_weights
(
max_loras
,
lora_config
)
return
linear
,
logits_processor
,
lora_logits_processor
return
linear
,
logits_processor
,
lora_logits_processor
...
@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
...
@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
torch
.
allclose
(
ref_q
,
actual_q
)
torch
.
allclose
(
ref_q
,
actual_q
)
torch
.
allclose
(
ref_k
,
actual_k
)
torch
.
allclose
(
ref_k
,
actual_k
)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("seed", list(range(256)))
def test_vocab_parallel_embedding_indices(tp_size, seed):
    """Check the shard index ranges VocabParallelEmbedding assigns per rank.

    A vocabulary size and an added-vocabulary size are drawn from the seeded
    ``random`` module. One embedding is then built per tensor-parallel rank,
    with the rank and world-size lookups patched, and the test verifies that:

    * each rank's original/added range starts where the previous rank's ended;
    * the per-rank element counts sum to the padded, original and added sizes;
    * no token id appears twice, nor in both the original and added ranges;
    * applying ``get_sharded_to_full_mapping()`` to the concatenated per-rank
      layout (padding slots marked -1) yields ``0..vocab_size-1`` followed
      only by -1 entries.
    """
    random.seed(seed)
    vocab_size = random.randint(4000, 64000)
    added_vocab_size = random.randint(0, 1024)
    org_vocab_size = vocab_size - added_vocab_size

    # End indices of the previous rank's ranges; rank 0 must start here.
    prev_org_end = 0
    prev_added_end = org_vocab_size

    # Running totals accumulated over all ranks.
    total_padded = 0
    total_org = 0
    total_added = 0
    vocab_size_padded = -1

    seen_org_tokens = []
    seen_added_tokens = []
    # Per-rank layout, concatenated: real token ids with -1 in padding slots.
    sharded_layout = []

    for tp_rank in range(tp_size):
        rank_patch = patch(
            "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
            return_value=tp_rank)
        world_size_patch = patch(
            "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
            return_value=tp_size)
        with rank_patch, world_size_patch:
            vocab_embedding = VocabParallelEmbedding(
                vocab_size, 1, org_num_embeddings=org_vocab_size)
        vocab_size_padded = vocab_embedding.num_embeddings_padded
        shard = vocab_embedding.shard_indices

        # Assert that the ranges are contiguous
        assert shard.org_vocab_start_index == prev_org_end
        assert shard.added_vocab_start_index == prev_added_end

        # Ensure that we are not exceeding the vocab size
        total_padded += shard.num_elements_padded
        total_org += shard.num_org_elements
        total_added += shard.num_added_elements

        org_ids = range(shard.org_vocab_start_index,
                        shard.org_vocab_end_index)
        added_ids = range(shard.added_vocab_start_index,
                          shard.added_vocab_end_index)
        org_pad = shard.num_org_elements_padded - shard.num_org_elements
        added_pad = (shard.num_added_elements_padded -
                     shard.num_added_elements)

        # Collected so overlap between ranks can be checked after the loop.
        seen_org_tokens.extend(org_ids)
        seen_added_tokens.extend(added_ids)

        # This rank's layout: original ids, padding, added ids, padding.
        sharded_layout.extend(org_ids)
        sharded_layout += [-1] * org_pad
        sharded_layout.extend(added_ids)
        sharded_layout += [-1] * added_pad

        prev_org_end = shard.org_vocab_end_index
        prev_added_end = shard.added_vocab_end_index

    assert total_padded == vocab_size_padded
    assert total_org == org_vocab_size
    assert total_added == added_vocab_size

    # Ensure that the ranges are not overlapping
    assert len(seen_org_tokens) == len(set(seen_org_tokens))
    assert len(seen_added_tokens) == len(set(seen_added_tokens))
    assert set(seen_org_tokens).isdisjoint(seen_added_tokens)

    layout_tensor = torch.tensor(sharded_layout, dtype=torch.long)
    # Uses the embedding built for the last rank in the loop above.
    reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
    assert reindex_mapping is not None or tp_size == 1
    if reindex_mapping is not None:
        reindexed = layout_tensor[reindex_mapping]
        expected = torch.arange(vocab_size)
        assert torch.equal(reindexed[:vocab_size], expected)
        assert torch.all(reindexed[vocab_size:] == -1)
def test_get_masked_input_and_mask():
    """Pin get_masked_input_and_mask outputs for tp sizes 1, 2 and 4.

    The token ids 0..11 are passed through the function once per simulated
    rank, first with ``num_org_vocab_padding=0`` and then with 2. Only the
    first return value is checked; the returned mask is ignored. In the
    expected tensors, ids outside the rank's two ranges become 0, ids in the
    original range are rebased to start at 0, and ids in the added range
    continue after the original shard plus its padding.
    """
    x = torch.arange(12)

    def masked(org_start, org_end, added_start, added_end, org_padding):
        # Thin keyword-argument wrapper; drops the mask from the result.
        masked_ids, _ = get_masked_input_and_mask(
            x,
            org_vocab_start_index=org_start,
            org_vocab_end_index=org_end,
            added_vocab_start_index=added_start,
            added_vocab_end_index=added_end,
            num_org_vocab_padding=org_padding)
        return masked_ids

    # base tp 1 case, no padding
    assert torch.equal(x, masked(0, 8, 8, 12, 0))

    # tp 2 case, no padding
    assert torch.equal(masked(0, 4, 8, 10, 0),
                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
    assert torch.equal(masked(4, 8, 10, 12, 0),
                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))

    # tp 4 case, no padding
    assert torch.equal(masked(0, 2, 8, 9, 0),
                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
    assert torch.equal(masked(2, 4, 9, 10, 0),
                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
    assert torch.equal(masked(4, 6, 10, 11, 0),
                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
    assert torch.equal(masked(6, 8, 11, 12, 0),
                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))

    # base tp 1 case, with padding
    assert torch.equal(masked(0, 8, 8, 12, 2),
                       torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))

    # tp 2 case, with padding
    assert torch.equal(masked(0, 4, 8, 10, 2),
                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
    assert torch.equal(masked(4, 8, 10, 12, 2),
                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))

    # tp 4 case, with padding
    assert torch.equal(masked(0, 2, 8, 9, 2),
                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
    assert torch.equal(masked(2, 4, 9, 10, 2),
                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
    assert torch.equal(masked(4, 6, 10, 11, 2),
                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
    assert torch.equal(masked(6, 8, 11, 12, 2),
                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
tests/lora/test_llama.py
View file @
f48954a4
...
@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
...
@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
return
generated_texts
return
generated_texts
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
1
,
2
,
4
])
def
test_llama_lora
(
sql_lora_files
,
tp_size
):
def
test_llama_lora
(
sql_lora_files
,
tp_size
,
num_gpus_available
):
# Cannot use as it will initialize torch.cuda too early...
if
num_gpus_available
<
tp_size
:
# if torch.cuda.device_count() < tp_size:
pytest
.
skip
(
f
"Not enough GPUs for tensor parallelism
{
tp_size
}
"
)
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm
=
vllm
.
LLM
(
MODEL_PATH
,
llm
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
enable_lora
=
True
,
...
@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
...
@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
print
(
"removing lora"
)
print
(
"removing lora"
)
@
pytest
.
mark
.
skip
(
"Requires multiple GPUs"
)
def
test_llama_tensor_parallel_equality
(
sql_lora_files
,
num_gpus_available
):
def
test_llama_tensor_parallel_equality
(
sql_lora_files
):
if
num_gpus_available
<
4
:
# Cannot use as it will initialize torch.cuda too early...
pytest
.
skip
(
"Not enough GPUs for tensor parallelism 4"
)
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
llm_tp1
=
vllm
.
LLM
(
MODEL_PATH
,
llm_tp1
=
vllm
.
LLM
(
MODEL_PATH
,
enable_lora
=
True
,
enable_lora
=
True
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment