Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
705f6a35
Commit
705f6a35
authored
Jul 16, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1
parents
af837396
4cf256ae
Changes
439
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3164 additions
and
218 deletions
+3164
-218
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+2
-4
tests/entrypoints/openai/__init__.py
tests/entrypoints/openai/__init__.py
+0
-0
tests/entrypoints/openai/conftest.py
tests/entrypoints/openai/conftest.py
+69
-0
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+823
-0
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+664
-0
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+137
-0
tests/entrypoints/openai/test_guided_processors.py
tests/entrypoints/openai/test_guided_processors.py
+13
-54
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+60
-0
tests/entrypoints/openai/test_oot_registration.py
tests/entrypoints/openai/test_oot_registration.py
+0
-3
tests/entrypoints/openai/test_run_batch.py
tests/entrypoints/openai/test_run_batch.py
+2
-1
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+0
-4
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+27
-50
tests/kernels/test_attention.py
tests/kernels/test_attention.py
+17
-16
tests/kernels/test_attention_selector.py
tests/kernels/test_attention_selector.py
+14
-9
tests/kernels/test_blocksparse_attention.py
tests/kernels/test_blocksparse_attention.py
+11
-11
tests/kernels/test_cache.py
tests/kernels/test_cache.py
+17
-15
tests/kernels/test_cutlass.py
tests/kernels/test_cutlass.py
+105
-49
tests/kernels/test_encoder_decoder_attn.py
tests/kernels/test_encoder_decoder_attn.py
+953
-0
tests/kernels/test_flash_attn.py
tests/kernels/test_flash_attn.py
+2
-2
tests/kernels/test_flashinfer.py
tests/kernels/test_flashinfer.py
+248
-0
No files found.
Too many changes to show.
To preserve performance only
439 of 439+
files are displayed.
Plain diff
Email patch
tests/entrypoints/test
_llm
_generate_multiple_loras.py
→
tests/entrypoints/
llm/
test_generate_multiple_loras.py
View file @
705f6a35
...
...
@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
from
vllm
import
LLM
from
vllm.lora.request
import
LoRARequest
from
..conftest
import
cleanup
from
..
.
conftest
import
cleanup
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
...
@@ -20,8 +20,6 @@ PROMPTS = [
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
pytestmark
=
pytest
.
mark
.
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
...
...
@@ -44,7 +42,7 @@ def llm():
cleanup
()
@
pytest
.
fixture
(
scope
=
"
session
"
)
@
pytest
.
fixture
(
scope
=
"
module
"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
...
...
tests/entrypoints/openai/__init__.py
0 → 100644
View file @
705f6a35
tests/entrypoints/openai/conftest.py
0 → 100644
View file @
705f6a35
import
pytest
@
pytest
.
fixture
def
sample_regex
():
return
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
@
pytest
.
fixture
def
sample_json_schema
():
return
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work_history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"number"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work_history"
]
}
@
pytest
.
fixture
def
sample_guided_choice
():
return
[
"Python"
,
"Java"
,
"JavaScript"
,
"C++"
,
"C#"
,
"PHP"
,
"TypeScript"
,
"Ruby"
,
"Swift"
,
"Kotlin"
]
@
pytest
.
fixture
def
sample_sql_statements
():
return
(
"""
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
)
\ No newline at end of file
tests/entrypoints/
test_
openai
_server
.py
→
tests/entrypoints/openai
/test_chat
.py
View file @
705f6a35
# imports for guided decoding tests
import
json
import
re
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import
ray
import
torch
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
..utils
import
ServerRunner
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
TEST_SCHEMA
=
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"string"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work history"
]
}
TEST_REGEX
=
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
TEST_CHOICE
=
[
"Python"
,
"Java"
,
"JavaScript"
,
"C++"
,
"C#"
,
"PHP"
,
"TypeScript"
,
"Ruby"
,
"Swift"
,
"Kotlin"
]
pytestmark
=
pytest
.
mark
.
openai
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
zephyr_lora_files
):
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
with
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
# lora config below
"--enable-lora"
,
"--lora-modules"
,
f
"zephyr-lora=
{
zephyr_lora_files
}
"
,
f
"zephyr-lora2=
{
zephyr_lora_files
}
"
,
"--max-lora-rank"
,
"64"
,
"--max-cpu-loras"
,
"2"
,
"--max-num-seqs"
,
"128"
,
])
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_server
(
zephyr_lora_files
):
ray
.
shutdown
()
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
EMBEDDING_MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--enforce-eager"
,
"--gpu-memory-utilization"
,
"0.75"
,
"--max-model-len"
,
"8192"
,
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
@
pytest
.
mark
.
asyncio
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
models
=
await
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
lora_models
=
models
[
1
:]
assert
served_model
.
id
==
MODEL_NAME
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
assert
lora_models
[
0
].
id
==
"zephyr-lora"
assert
lora_models
[
1
].
id
==
"zephyr-lora2"
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
temperature
=
0.0
)
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
"length"
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_no_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
None
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
0
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
==
1
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_some_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
5
,
)
choice
=
completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
.
token_logprobs
is
not
None
assert
choice
.
logprobs
.
top_logprobs
is
not
None
assert
5
<=
len
(
choice
.
logprobs
.
top_logprobs
[
0
])
<=
6
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_completion_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
21
,
)
...
with
pytest
.
raises
(
(
openai
.
BadRequestError
,
openai
.
APIError
)):
# test using token IDs
stream
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs
=
30
,
stream
=
True
,
)
async
for
chunk
in
stream
:
...
# the server should still work afterwards
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest
.
mark
.
asyncio
...
...
@@ -300,8 +62,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
async
def
test_no_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_no_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -326,8 +87,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_zero_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_zero_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -354,8 +114,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_some_logprobs_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_some_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -382,7 +141,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_too_many_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_too_many_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -425,7 +184,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_single_chat_session
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -465,48 +224,13 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_completion_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is an LLM?"
single_completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
)
single_output
=
single_completion
.
choices
[
0
].
text
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
)
chunks
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
"length"
assert
chunk
.
choices
[
0
].
text
assert
""
.
join
(
chunks
)
==
single_output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_chat_streaming
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_chat_streaming
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -533,7 +257,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
stream
=
True
,
)
chunks
=
[]
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
...
...
@@ -555,8 +279,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_chat_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_completion_stream_options
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -621,195 +344,56 @@ async def test_chat_completion_stream_options(server,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
"HuggingFaceH4/zephyr-7b-beta"
,
"zephyr-lora"
],
)
async
def
test_completion_stream_options
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
prompt
=
"What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
async
for
chunk
in
stream
:
assert
chunk
.
usage
is
None
# Test stream=True, stream_options={"include_usage": True}
stream
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
})
async
for
chunk
in
stream
:
if
chunk
.
choices
[
0
].
finish_reason
is
None
:
assert
chunk
.
usage
is
None
else
:
assert
chunk
.
usage
is
None
final_chunk
=
await
stream
.
__anext__
()
assert
final_chunk
.
usage
is
not
None
assert
final_chunk
.
usage
.
prompt_tokens
>
0
assert
final_chunk
.
usage
.
completion_tokens
>
0
assert
final_chunk
.
usage
.
total_tokens
==
(
final_chunk
.
usage
.
prompt_tokens
+
final_chunk
.
usage
.
completion_tokens
)
assert
final_chunk
.
choices
==
[]
# Test stream=False, stream_options={"include_usage": None}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
# Test stream=False, stream_options={"include_usage": True}
with
pytest
.
raises
(
BadRequestError
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
],
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
)
assert
len
(
batch
.
choices
)
==
2
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
1
].
text
# test n = 2
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
n
=
2
,
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
assert
batch
.
choices
[
0
].
text
!=
batch
.
choices
[
1
].
text
,
"beam search should be different"
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
2
].
text
,
"two copies of the same prompt should be the same"
assert
batch
.
choices
[
1
].
text
==
batch
.
choices
[
3
].
text
,
"two copies of the same prompt should be the same"
# test streaming
batch
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
)
texts
=
[
""
]
*
2
async
for
chunk
in
batch
:
assert
len
(
chunk
.
choices
)
==
1
choice
=
chunk
.
choices
[
0
]
texts
[
choice
.
index
]
+=
choice
.
text
assert
texts
[
0
]
==
texts
[
1
]
@
pytest
.
mark
.
asyncio
async
def
test_logits_bias
(
server
,
client
:
openai
.
AsyncOpenAI
):
prompt
=
"Hello, my name is"
max_tokens
=
5
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# Test exclusive selection
token_id
=
1000
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
prompt
,
max_tokens
=
max_tokens
,
temperature
=
0.0
,
logit_bias
=
{
str
(
token_id
):
100
},
seed
=
42
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
5
response_tokens
=
tokenizer
(
completion
.
choices
[
0
].
text
,
add_special_tokens
=
False
)[
"input_ids"
]
expected_tokens
=
tokenizer
(
tokenizer
.
decode
([
token_id
]
*
5
),
add_special_tokens
=
False
)[
"input_ids"
]
assert
all
([
response
==
expected
for
response
,
expected
in
zip
(
response_tokens
,
expected_tokens
)
])
# Test ban
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
prompt
,
max_tokens
=
max_tokens
,
temperature
=
0.0
,
)
response_tokens
=
tokenizer
(
completion
.
choices
[
0
].
text
,
add_special_tokens
=
False
)[
"input_ids"
]
first_response
=
completion
.
choices
[
0
].
text
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
prompt
,
max_tokens
=
max_tokens
,
temperature
=
0.0
,
logit_bias
=
{
str
(
token
):
-
100
for
token
in
response_tokens
},
)
assert
first_response
!=
completion
.
choices
[
0
].
text
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_guided_choice
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"The best language for type-safe systems programming is "
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
TEST_SCHEMA
}
"
,
n
=
3
,
temperature
=
1.0
,
max_tokens
=
500
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice1
in
sample_guided_choice
assert
completion
.
id
is
not
None
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
output_json
=
json
.
loads
(
completion
.
choices
[
i
].
text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
TEST_SCHEMA
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
choice1
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"I disagree, pick another one"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice2
in
sample_guided_choice
assert
choice1
!=
choice2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_json_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -818,18 +402,18 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
json1
=
json
.
loads
(
message
.
content
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
sample_json_schema
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
messages
.
append
({
...
...
@@ -842,12 +426,12 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
TEST_SCHEMA
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
json2
=
json
.
loads
(
message
.
content
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
sample_json_schema
)
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
...
...
@@ -855,28 +439,8 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
,
n
=
3
,
temperature
=
1.0
,
max_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
len
(
completion
.
choices
)
==
3
for
i
in
range
(
3
):
assert
re
.
fullmatch
(
TEST_REGEX
,
completion
.
choices
[
i
].
text
)
is
not
None
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_regex_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_regex
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -884,17 +448,17 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
"role"
:
"user"
,
"content"
:
f
"Give an example IP address with this regex:
{
TEST_REGEX
}
"
f
"Give an example IP address with this regex:
{
sample_regex
}
"
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip1
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
ip1
)
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip1
)
is
not
None
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
ip1
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"Give me a different one"
})
...
...
@@ -902,39 +466,16 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip2
is
not
None
assert
re
.
fullmatch
(
TEST_REGEX
,
ip2
)
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip2
)
is
not
None
assert
ip1
!=
ip2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"The best language for type-safe systems programming is "
,
n
=
2
,
temperature
=
1.0
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
assert
completion
.
id
is
not
None
assert
len
(
completion
.
choices
)
==
2
for
i
in
range
(
2
):
assert
completion
.
choices
[
i
].
text
in
TEST_CHOICE
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_guided_decoding_type_error
(
client
:
openai
.
AsyncOpenAI
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -944,52 +485,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
"content"
:
"The best language for type-safe systems programming is "
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice1
in
TEST_CHOICE
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
choice1
})
messages
.
append
({
"role"
:
"user"
,
"content"
:
"I disagree, pick another one"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice2
in
TEST_CHOICE
assert
choice1
!=
choice2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_decoding_type_error
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Give an example JSON that fits this schema: 42"
,
extra_body
=
dict
(
guided_json
=
42
,
guided_decoding_backend
=
guided_decoding_backend
))
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"The best language for type-safe systems programming is "
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
...
...
@@ -998,18 +494,13 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
2
:
"C++"
}))
with
pytest
.
raises
(
openai
.
BadRequestError
):
_
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
"Give an example string that fits this regex"
,
extra_body
=
dict
(
guided_regex
=
TEST_REGEX
,
guided_json
=
TEST_SCHEMA
))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_choice_chat_logprobs
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_guided_choice
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1025,7 +516,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
TEST_CHOICE
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
...
...
@@ -1040,8 +531,9 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_named_tool_use
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1050,7 +542,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
# non-streaming
...
...
@@ -1064,7 +556,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
{
...
...
@@ -1077,7 +569,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
assert
len
(
message
.
content
)
==
0
json_string
=
message
.
tool_calls
[
0
].
function
.
arguments
json1
=
json
.
loads
(
json_string
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json1
,
schema
=
sample_json_schema
)
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
json_string
})
messages
.
append
({
...
...
@@ -1098,7 +590,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
{
...
...
@@ -1123,7 +615,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
# finish reason should only return in last block
assert
finish_reason_count
==
1
json2
=
json
.
loads
(
""
.
join
(
output
))
jsonschema
.
validate
(
instance
=
json2
,
schema
=
TEST_SCHEMA
)
jsonschema
.
validate
(
instance
=
json2
,
schema
=
sample_json_schema
)
assert
json1
[
"name"
]
!=
json2
[
"name"
]
assert
json1
[
"age"
]
!=
json2
[
"age"
]
...
...
@@ -1131,7 +623,8 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_required_tool_use_not_yet_supported
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1140,7 +633,7 @@ async def test_required_tool_use_not_yet_supported(
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
...
...
@@ -1153,7 +646,7 @@ async def test_required_tool_use_not_yet_supported(
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
"required"
)
...
...
@@ -1168,7 +661,7 @@ async def test_required_tool_use_not_yet_supported(
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
"auto"
)
...
...
@@ -1176,8 +669,9 @@ async def test_required_tool_use_not_yet_supported(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
[
"outlines"
])
async
def
test_inconsistent_tool_choice_and_tools
(
server
,
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
):
async
def
test_inconsistent_tool_choice_and_tools
(
client
:
openai
.
AsyncOpenAI
,
guided_decoding_backend
:
str
,
sample_json_schema
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -1186,7 +680,7 @@ async def test_inconsistent_tool_choice_and_tools(
"user"
,
"content"
:
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
TEST_SCHEMA
}
"
f
"fits this schema:
{
sample_json_schema
}
"
}]
with
pytest
.
raises
(
openai
.
BadRequestError
):
...
...
@@ -1211,7 +705,7 @@ async def test_inconsistent_tool_choice_and_tools(
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
TEST_SCHEMA
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
{
...
...
@@ -1223,7 +717,7 @@ async def test_inconsistent_tool_choice_and_tools(
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_object
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_response_format_json_object
(
client
:
openai
.
AsyncOpenAI
):
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -1243,7 +737,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
...
...
@@ -1259,7 +753,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_complex_message_content
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_complex_message_content
(
client
:
openai
.
AsyncOpenAI
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
...
...
@@ -1279,7 +773,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_custom_role
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_custom_role
(
client
:
openai
.
AsyncOpenAI
):
# Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way
...
...
@@ -1310,77 +804,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_guided_grammar
(
server
,
client
:
openai
.
AsyncOpenAI
):
simple_sql_grammar
=
"""
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
temperature
=
1.0
,
max_tokens
=
500
,
extra_body
=
dict
(
guided_grammar
=
simple_sql_grammar
))
content
=
completion
.
choices
[
0
].
text
# use Lark to parse the output, and make sure it's a valid parse tree
from
lark
import
Lark
parser
=
Lark
(
simple_sql_grammar
)
parser
.
parse
(
content
)
# remove spaces for comparison b/c we removed them in the grammar
ground_truth
=
"SELECT col_1 from table_1 where col_1 = 1"
.
replace
(
" "
,
""
)
assert
content
.
strip
()
==
ground_truth
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
# first test base model, then test loras
"model_name"
,
[
MODEL_NAME
,
"zephyr-lora"
,
"zephyr-lora2"
],
)
@
pytest
.
mark
.
parametrize
(
"logprobs_arg"
,
[
1
,
0
])
async
def
test_echo_logprob_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
logprobs_arg
:
int
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
# test using text and token IDs
for
prompt
in
(
"Hello, my name is"
,
[
0
,
0
,
0
,
0
,
0
]):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
,
echo
=
True
,
logprobs
=
logprobs_arg
)
prompt_text
=
tokenizer
.
decode
(
prompt
)
if
isinstance
(
prompt
,
list
)
else
prompt
assert
re
.
search
(
r
"^"
+
prompt_text
,
completion
.
choices
[
0
].
text
)
logprobs
=
completion
.
choices
[
0
].
logprobs
assert
logprobs
is
not
None
assert
len
(
logprobs
.
text_offset
)
>
5
assert
(
len
(
logprobs
.
token_logprobs
)
>
5
and
logprobs
.
token_logprobs
[
0
]
is
None
)
assert
(
len
(
logprobs
.
top_logprobs
)
>
5
and
logprobs
.
top_logprobs
[
0
]
is
None
)
for
top_logprobs
in
logprobs
.
top_logprobs
[
1
:]:
assert
max
(
logprobs_arg
,
1
)
<=
len
(
top_logprobs
)
<=
logprobs_arg
+
1
assert
len
(
logprobs
.
tokens
)
>
5
@
pytest
.
mark
.
asyncio
async
def
test_long_seed
(
server
,
client
:
openai
.
AsyncOpenAI
):
async
def
test_long_seed
(
client
:
openai
.
AsyncOpenAI
):
for
seed
in
[
torch
.
iinfo
(
torch
.
long
).
min
-
1
,
torch
.
iinfo
(
torch
.
long
).
max
+
1
...
...
@@ -1397,83 +821,3 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_single_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
input_texts
=
[
"The chef prepared a delicious meal."
,
]
# test single embedding
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
9
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
5
assert
embeddings
.
usage
.
total_tokens
==
5
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
EMBEDDING_MODEL_NAME
],
)
async
def
test_batch_embedding
(
embedding_server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test List[str]
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_texts
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
# test List[List[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
embeddings
=
await
client
.
embeddings
.
create
(
model
=
model_name
,
input
=
input_tokens
,
encoding_format
=
"float"
,
)
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
tests/entrypoints/openai/test_completion.py
0 → 100644
View file @
705f6a35
# imports for guided decoding tests
import
json
import
re
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter repo named by LORA_NAME, once per module."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    """Start one OpenAI-compatible server per module with two LoRA aliases.

    Both ``zephyr-lora`` and ``zephyr-lora2`` point at the same downloaded
    adapter files; the server is yielded from inside its context manager.
    """
    model_args = [
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(model_args + lora_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client handed out by the module-scoped server."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request a single greedy completion from a text and a token-ID prompt.

    Both requests target the parametrized ``model_name`` so that the LoRA
    variants exercise the token-ID path as well as the text path.
    """
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)

    assert completion.id is not None
    assert completion.choices is not None and len(completion.choices) == 1

    choice = completion.choices[0]
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)

    # test using token IDs
    completion = await client.completions.create(
        # was MODEL_NAME, which silently skipped the LoRA models here
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the returned choice must carry no logprobs."""
    # test using token IDs
    completion = await client.completions.create(
        # use the parametrized model; MODEL_NAME made every case identical
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    choice = completion.choices[0]
    assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` logprobs are returned with one entry per position."""
    # test using token IDs
    completion = await client.completions.create(
        # use the parametrized model; MODEL_NAME made every case identical
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` each position reports five or six top logprobs."""
    # test using token IDs
    completion = await client.completions.create(
        # use the parametrized model; MODEL_NAME made every case identical
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    # the test accepts one entry beyond the five requested
    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Over-limit ``logprobs`` requests must fail without breaking the server.

    Covers the non-streaming and streaming paths, then issues a normal
    request to confirm the server still answers afterwards. Every request
    targets the parametrized ``model_name``.
    """
    with pytest.raises((openai.BadRequestError, openai.APIError)):
        # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )

    with pytest.raises((openai.BadRequestError, openai.APIError)):
        # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # drain the stream so an error raised mid-stream is also caught
        async for chunk in stream:
            pass

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """Streamed chunks must concatenate to the non-streamed greedy output."""
    prompt = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    single_output = reference.choices[0].text

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    pieces: List[str] = []
    finished_chunks = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        pieces.append(streamed_choice.text)
        if streamed_choice.finish_reason is not None:
            finished_chunks += 1

    # finish reason should only return in last block
    assert finished_chunks == 1
    last_choice = chunk.choices[0]
    assert last_choice.finish_reason == "length"
    assert last_choice.text
    assert "".join(pieces) == single_output
def _assert_usage_is_consistent(usage):
    """Check that a usage record is populated and that its totals add up."""
    assert usage is not None
    assert usage.prompt_tokens > 0
    assert usage.completion_tokens > 0
    assert usage.total_tokens == (usage.prompt_tokens +
                                  usage.completion_tokens)


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    # MODEL_NAME replaces a duplicated copy of the same literal
    [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise every ``stream_options`` combination on the completions API.

    Streaming requests check when ``usage`` is attached to chunks;
    non-streaming requests that pass ``stream_options`` must be rejected
    with ``BadRequestError``.
    """
    prompt = "What is the capital of France?"
    request = dict(model=model_name,
                   prompt=prompt,
                   max_tokens=5,
                   temperature=0.0)

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    stream = await client.completions.create(**request,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": False,
                                                 "continuous_usage_stats":
                                                 False,
                                             })
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": True}
    stream = await client.completions.create(**request,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": False,
                                                 "continuous_usage_stats":
                                                 True,
                                             })
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await client.completions.create(**request,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": True,
                                                 "continuous_usage_stats":
                                                 False,
                                             })
    async for chunk in stream:
        # content chunks never carry usage in this mode
        assert chunk.usage is None
        if chunk.choices[0].finish_reason is not None:
            # usage arrives in one extra chunk that has no choices
            final_chunk = await stream.__anext__()
            _assert_usage_is_consistent(final_chunk.usage)
            assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await client.completions.create(**request,
                                             stream=True,
                                             stream_options={
                                                 "include_usage": True,
                                                 "continuous_usage_stats":
                                                 True,
                                             })
    async for chunk in stream:
        _assert_usage_is_consistent(chunk.usage)
        if chunk.choices[0].finish_reason is not None:
            final_chunk = await stream.__anext__()
            _assert_usage_is_consistent(final_chunk.usage)
            assert final_chunk.choices == []

    # Test stream=False with each of these stream_options, in order:
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    rejected_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for stream_options in rejected_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(**request,
                                            stream=False,
                                            stream_options=stream_options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Send two identical prompts per request: plain, beam n=2, and streamed."""
    text_prompts = ["Hello, my name is"] * 2
    token_prompts = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_prompts, token_prompts):
        # test simple list
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(batch.choices) == 2
        assert batch.choices[0].text == batch.choices[1].text

        # test n = 2
        beam_options = dict(
            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
            # for official client.
            use_beam_search=True)
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            extra_body=beam_options,
        )
        assert len(batch.choices) == 4
        beam_texts = [candidate.text for candidate in batch.choices]
        assert beam_texts[0] != beam_texts[1], (
            "beam search should be different")
        assert beam_texts[0] == beam_texts[2], (
            "two copies of the same prompt should be the same")
        assert beam_texts[1] == beam_texts[3], (
            "two copies of the same prompt should be the same")

        # test streaming
        batch = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        streamed = ["", ""]
        async for chunk in batch:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            # choice.index identifies which prompt this chunk belongs to
            streamed[piece.index] += piece.text
        assert streamed[0] == streamed[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check ``logit_bias`` can both force a token and ban tokens."""
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # token IDs of ``text`` without any special tokens added
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = encode(forced_text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    assert all(got == want
               for got, want in zip(response_tokens, expected_tokens))

    # Test ban
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = unbiased.choices[0].text
    response_tokens = encode(first_response)
    banned = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token): -100
                    for token in response_tokens},
    )
    assert first_response != banned.choices[0].text
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Three sampled completions under ``guided_json`` must fit the schema."""
    guided_options = dict(guided_json=sample_json_schema,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    for candidate in completion.choices:
        # json.loads raises on malformed output; validate raises on mismatch
        parsed = json.loads(candidate.text)
        jsonschema.validate(instance=parsed, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Three sampled completions under ``guided_regex`` must fully match."""
    guided_options = dict(guided_regex=sample_regex,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    for candidate in completion.choices:
        matched = re.fullmatch(sample_regex, candidate.text)
        assert matched is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Two sampled completions must each be one of the allowed choices."""
    guided_options = dict(guided_choice=sample_guided_choice,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 2
    for candidate in completion.choices:
        assert candidate.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Output under ``guided_grammar`` must parse and equal the target SQL."""
    question = ("Generate a sql state that select col_1 from "
                "table_1 where it is equals to 1")
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=question,
        temperature=1.0,
        max_tokens=500,
        extra_body=dict(guided_grammar=sample_sql_statements))

    content = completion.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark
    Lark(sample_sql_statements).parse(content)

    # remove spaces for comparison b/c we removed them in the grammar
    spaced_sql = "SELECT col_1 from table_1 where col_1 = 1"
    ground_truth = spaced_sql.replace(" ", "")

    assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """With ``echo=True`` the prompt is echoed and covered by the logprobs.

    The returned text must start with the prompt, the logprob arrays must
    span more than the five generated tokens, and the first entry of
    ``token_logprobs`` and ``top_logprobs`` must be ``None``.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                             list) else prompt
        # Literal prefix check: the previous ``re.search("^" + prompt_text)``
        # would treat any regex metacharacter in the prompt as a pattern.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        for top_logprobs in logprobs.top_logprobs[1:]:
            # at least one entry, at most one beyond the requested count
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Invalid guided-decoding arguments must yield ``BadRequestError``."""
    # a bare integer is not a usable JSON schema
    wrong_type_options = dict(guided_json=42,
                              guided_decoding_backend=guided_decoding_backend)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options)

    # two different guided-decoding constraints in the same request
    conflicting_options = dict(guided_regex=sample_regex,
                               guided_json=sample_json_schema)
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
    """``/tokenize`` must agree with the local tokenizer, with and without
    special tokens."""
    # drop the trailing version segment of the client URL, then any slashes
    client_url = str(client.base_url)
    base_url = client_url[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
    prompt = "This is a test prompt."

    for add_special in (False, True):
        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
        payload = {
            "add_special_tokens": add_special,
            "model": model_name,
            "prompt": prompt
        }

        response = requests.post(base_url + "/tokenize", json=payload)
        response.raise_for_status()

        expected = {
            "tokens": tokens,
            "count": len(tokens),
            "max_model_len": 8192
        }
        assert response.json() == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
    """``/detokenize`` must turn locally encoded tokens back into the prompt."""
    # Same derivation as test_tokenize: cut the trailing version segment and
    # strip slashes, so the result is correct whether or not the client URL
    # ends in "/". The old form relied on a trailing slash being present.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")

    prompt = "This is a test prompt."
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()
    assert response.json() == {"prompt": prompt}
tests/entrypoints/openai/test_embedding.py
0 → 100644
View file @
705f6a35
import
base64
import
numpy
as
np
import
openai
import
pytest
from
...utils
import
RemoteOpenAIServer
EMBEDDING_MODEL_NAME
=
"intfloat/e5-mistral-7b-instruct"
@pytest.fixture(scope="module")
def embedding_server():
    """Start one OpenAI-compatible server per module for the embedding model.

    The server is yielded from inside its context manager.
    """
    with RemoteOpenAIServer([
            "--model",
            EMBEDDING_MODEL_NAME,
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "bfloat16",
            # passed once; the flag was previously listed twice
            "--enforce-eager",
            "--max-model-len",
            "8192",
    ]) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
    """Return an async OpenAI client for the module-scoped embedding server.

    This fixture is a plain synchronous function, so it carries no
    ``pytest.mark.asyncio``: marks applied to fixtures have no effect and
    pytest deprecates the practice.
    """
    return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
    """Request one embedding, first from text and then from raw token ids.

    Each request must yield a single 4096-element vector, zero completion
    tokens, and the expected prompt/total token counts.
    """
    text_input = [
        "The chef prepared a delicious meal.",
    ]
    token_id_input = [1, 1, 1, 1, 1]

    # (request input, token count the server is expected to report)
    cases = (
        (text_input, 9),  # single text embedding
        (token_id_input, 5),  # the same endpoint driven by token IDs
    )

    for request_input, expected_token_count in cases:
        result = await embedding_client.embeddings.create(
            model=model_name,
            input=request_input,
            encoding_format="float",
        )

        assert result.id is not None
        assert len(result.data) == 1
        assert len(result.data[0].embedding) == 4096

        usage = result.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_token_count
        assert usage.total_tokens == expected_token_count
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of id lists.

    One vector must come back per batch element, and the first vector must
    have 4096 elements. Token usage is only checked for the id-list batch.
    """
    # Batch given as List[str].
    sentences = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    text_result = await embedding_client.embeddings.create(
        model=model_name,
        input=sentences,
        encoding_format="float",
    )
    assert text_result.id is not None
    assert len(text_result.data) == 3
    assert len(text_result.data[0].embedding) == 4096

    # Batch given as List[List[int]].
    token_batches = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                     [25, 32, 64, 77]]
    token_result = await embedding_client.embeddings.create(
        model=model_name,
        input=token_batches,
        encoding_format="float",
    )
    assert token_result.id is not None
    assert len(token_result.data) == 4
    assert len(token_result.data[0].embedding) == 4096

    # 17 is the total number of ids across the four lists (5 + 3 + 5 + 4).
    usage = token_result.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare "float" and "base64" encodings of the same embedding batch.

    The same two texts are embedded once per encoding format; each base64
    payload, once decoded, must equal the corresponding float list exactly.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    float_response = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float")
    base64_response = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64")

    # NumPy reads dtype="float" as float64, so this assumes the server
    # serialized the vectors as float64 bytes -- TODO confirm server-side.
    decoded_vectors = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float").tolist()
        for item in base64_response.data
    ]

    assert float_response.data[0].embedding == decoded_vectors[0]
    assert float_response.data[1].embedding == decoded_vectors[1]
tests/entrypoints/test_guided_processors.py
→
tests/entrypoints/
openai/
test_guided_processors.py
View file @
705f6a35
...
...
@@ -10,61 +10,17 @@ from vllm.model_executor.guided_decoding import (
from
vllm.model_executor.guided_decoding.outlines_logits_processors
import
(
JSONLogitsProcessor
,
RegexLogitsProcessor
)
TEST_SCHEMA
=
{
"type"
:
"object"
,
"properties"
:
{
"name"
:
{
"type"
:
"string"
},
"age"
:
{
"type"
:
"integer"
},
"skills"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"string"
,
"maxLength"
:
10
},
"minItems"
:
3
},
"work history"
:
{
"type"
:
"array"
,
"items"
:
{
"type"
:
"object"
,
"properties"
:
{
"company"
:
{
"type"
:
"string"
},
"duration"
:
{
"type"
:
"string"
},
"position"
:
{
"type"
:
"string"
}
},
"required"
:
[
"company"
,
"position"
]
}
}
},
"required"
:
[
"name"
,
"age"
,
"skills"
,
"work history"
]
}
TEST_REGEX
=
(
r
"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r
"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
)
pytestmark
=
pytest
.
mark
.
openai
def
test_guided_logits_processors
():
def
test_guided_logits_processors
(
sample_regex
,
sample_json_schema
):
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
regex_LP
=
RegexLogitsProcessor
(
TEST_REGEX
,
tokenizer
)
json_LP
=
JSONLogitsProcessor
(
TEST_SCHEMA
,
regex_LP
=
RegexLogitsProcessor
(
sample_regex
,
tokenizer
)
json_LP
=
JSONLogitsProcessor
(
sample_json_schema
,
tokenizer
,
whitespace_pattern
=
None
)
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
)
tensor
=
torch
.
rand
(
32000
)
original_tensor
=
torch
.
clone
(
tensor
)
regex_LP
(
token_ids
,
tensor
)
...
...
@@ -72,7 +28,8 @@ def test_guided_logits_processors():
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
f
"Give an employee profile that fits this schema:
{
sample_json_schema
}
"
)
tensor
=
torch
.
rand
(
32000
)
original_tensor
=
torch
.
clone
(
tensor
)
json_LP
(
token_ids
,
tensor
)
...
...
@@ -82,13 +39,14 @@ def test_guided_logits_processors():
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"outlines"
,
"lm-format-enforcer"
])
async
def
test_guided_logits_processor_black_box
(
backend
:
str
):
async
def
test_guided_logits_processor_black_box
(
backend
:
str
,
sample_regex
,
sample_json_schema
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'HuggingFaceH4/zephyr-7b-beta'
)
token_ids
=
tokenizer
.
encode
(
f
"Give an example IPv4 address with this regex:
{
TEST_REGEX
}
"
)
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
)
regex_request
=
CompletionRequest
(
model
=
'test'
,
prompt
=
token_ids
,
guided_regex
=
TEST_REGEX
)
guided_regex
=
sample_regex
)
regex_lp
=
await
get_guided_decoding_logits_processor
(
backend
,
regex_request
,
tokenizer
)
assert
regex_lp
is
not
None
...
...
@@ -99,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
assert
not
torch
.
allclose
(
tensor
,
original_tensor
)
token_ids
=
tokenizer
.
encode
(
f
"Give an employee profile that fits this schema:
{
TEST_SCHEMA
}
"
)
f
"Give an employee profile that fits this schema:
{
sample_json_schema
}
"
)
json_request
=
CompletionRequest
(
model
=
'test'
,
prompt
=
token_ids
,
guided_json
=
TEST_SCHEMA
)
guided_json
=
sample_json_schema
)
json_lp
=
await
get_guided_decoding_logits_processor
(
backend
,
json_request
,
tokenizer
)
assert
json_lp
is
not
None
...
...
tests/entrypoints/openai/test_models.py
0 → 100644
View file @
705f6a35
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download the LoRA adapter repo once per module and return its path."""
    local_snapshot_dir = snapshot_download(repo_id=LORA_NAME)
    return local_snapshot_dir
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    """Start one RemoteOpenAIServer per module with two LoRA modules.

    Both LoRA names ("zephyr-lora" and "zephyr-lora2") point at the same
    downloaded adapter directory. The server is yielded from inside a
    ``with`` block, so the context manager shuts it down after the module.
    """
    model_args = [
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(model_args + lora_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return an async OpenAI client obtained from the module's server."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """Check the model listing: base model first, then the two LoRA modules.

    Every listed entry, LoRA entries included, must report the base model
    as its ``root``.
    """
    listing = await client.models.list()
    entries = listing.data

    base_entry = entries[0]
    lora_entries = entries[1:]

    assert base_entry.id == MODEL_NAME
    assert all(entry.root == MODEL_NAME for entry in entries)
    assert lora_entries[0].id == "zephyr-lora"
    assert lora_entries[1].id == "zephyr-lora2"
tests/entrypoints/
test_server
_oot_registration.py
→
tests/entrypoints/
openai/test
_oot_registration.py
View file @
705f6a35
import
sys
import
time
import
pytest
import
torch
from
openai
import
OpenAI
,
OpenAIError
...
...
@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.utils
import
get_open_port
pytestmark
=
pytest
.
mark
.
openai
class
MyOPTForCausalLM
(
OPTForCausalLM
):
...
...
tests/entrypoints/
test_
openai_run_batch.py
→
tests/entrypoints/openai
/test
_run_batch.py
View file @
705f6a35
...
...
@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501
INPUT_BATCH
=
"""{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH
=
"""{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
705f6a35
import
asyncio
from
dataclasses
import
dataclass
import
pytest
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
MODEL_NAME
=
"openai-community/gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
pytestmark
=
pytest
.
mark
.
openai
@
dataclass
class
MockModelConfig
:
...
...
tests/entrypoints/
test_
openai_vision.py
→
tests/entrypoints/openai
/test
_vision.py
View file @
705f6a35
from
pathlib
import
Path
from
typing
import
Dict
from
typing
import
Dict
,
List
import
openai
import
pytest
import
pytest_asyncio
import
ray
from
vllm.multimodal.utils
import
ImageFetchAiohttp
,
encode_image_base64
from
..utils
import
ServerRunn
er
from
..
.
utils
import
VLLM_PATH
,
RemoteOpenAIServ
er
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE
=
(
Path
(
__file__
).
parent
.
parent
.
parent
/
"examples/template_llava.jinja"
)
LLAVA_CHAT_TEMPLATE
=
VLLM_PATH
/
"examples/template_llava.jinja"
assert
LLAVA_CHAT_TEMPLATE
.
exists
()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
...
...
@@ -22,43 +20,26 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
]
pytestmark
=
pytest
.
mark
.
openai
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
ray
.
init
()
server_runner
=
ServerRunner
.
remote
([
"--model"
,
MODEL_NAME
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"4096"
,
"--enforce-eager"
,
"--image-input-type"
,
"pixel_values"
,
"--image-token-id"
,
"32000"
,
"--image-input-shape"
,
"1,3,336,336"
,
"--image-feature-size"
,
"576"
,
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
])
ray
.
get
(
server_runner
.
ready
.
remote
())
yield
server_runner
ray
.
shutdown
()
@
pytest
.
fixture
(
scope
=
"session"
)
def
client
():
client
=
openai
.
AsyncOpenAI
(
base_url
=
"http://localhost:8000/v1"
,
api_key
=
"token-abc123"
,
)
yield
client
with
RemoteOpenAIServer
([
"--model"
,
MODEL_NAME
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"4096"
,
"--enforce-eager"
,
"--chat-template"
,
str
(
LLAVA_CHAT_TEMPLATE
),
])
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest_asyncio
.
fixture
(
scope
=
"session"
)
...
...
@@ -73,7 +54,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_single_chat_session_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
@@ -126,7 +107,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_single_chat_session_image_base64encoded
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
,
base64_encoded_image
:
Dict
[
str
,
str
]):
messages
=
[{
...
...
@@ -180,7 +161,7 @@ async def test_single_chat_session_image_base64encoded(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_chat_streaming_image
(
server
,
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_streaming_image
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
@@ -217,7 +198,7 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
temperature
=
0.0
,
stream
=
True
,
)
chunks
=
[]
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
...
...
@@ -237,8 +218,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_url"
,
TEST_IMAGE_URLS
)
async
def
test_multi_image_input
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
async
def
test_multi_image_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_url
:
str
):
messages
=
[{
"role"
:
...
...
@@ -280,7 +261,3 @@ async def test_multi_image_input(server, client: openai.AsyncOpenAI,
)
completion
=
completion
.
choices
[
0
].
text
assert
completion
is
not
None
and
len
(
completion
)
>=
0
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
tests/kernels/test_attention.py
View file @
705f6a35
...
...
@@ -73,27 +73,27 @@ def ref_single_query_cached_kv_attention(
block_size
=
value_cache
.
shape
[
3
]
num_seqs
=
query
.
shape
[
0
]
block_tables
=
block_tables
.
cpu
().
tolist
()
seq_lens
=
seq_lens
.
cpu
().
tolist
()
block_tables
_lst
=
block_tables
.
cpu
().
tolist
()
seq_lens
_lst
=
seq_lens
.
cpu
().
tolist
()
for
i
in
range
(
num_seqs
):
q
=
query
[
i
].
unsqueeze
(
0
)
block_table
=
block_tables
[
i
]
seq_len
=
int
(
seq_lens
[
i
])
block_table
=
block_tables
_lst
[
i
]
seq_len
=
int
(
seq_lens
_lst
[
i
])
keys
=
[]
values
=
[]
keys
_lst
:
List
[
torch
.
Tensor
]
=
[]
values
_lst
:
List
[
torch
.
Tensor
]
=
[]
for
j
in
range
(
seq_len
):
block_number
=
int
(
block_table
[
j
//
block_size
])
block_offset
=
j
%
block_size
k
=
key_cache
[
block_number
,
:,
:,
block_offset
,
:]
k
=
k
.
reshape
(
num_kv_heads
,
head_size
)
keys
.
append
(
k
)
keys
_lst
.
append
(
k
)
v
=
value_cache
[
block_number
,
:,
:,
block_offset
]
values
.
append
(
v
)
keys
=
torch
.
stack
(
keys
,
dim
=
0
)
values
=
torch
.
stack
(
values
,
dim
=
0
)
values
_lst
.
append
(
v
)
keys
=
torch
.
stack
(
keys
_lst
,
dim
=
0
)
values
=
torch
.
stack
(
values
_lst
,
dim
=
0
)
if
num_queries_per_kv
>
1
:
# Handle MQA and GQA
keys
=
torch
.
repeat_interleave
(
keys
,
num_queries_per_kv
,
dim
=
1
)
...
...
@@ -158,14 +158,15 @@ def test_paged_attention(
# Create the block tables.
max_num_blocks_per_seq
=
(
max_seq_len
+
block_size
-
1
)
//
block_size
block_tables
=
[]
block_tables
_lst
:
List
[
List
[
int
]]
=
[]
for
_
in
range
(
num_seqs
):
block_table
=
[
random
.
randint
(
0
,
NUM_BLOCKS
-
1
)
for
_
in
range
(
max_num_blocks_per_seq
)
]
block_tables
.
append
(
block_table
)
block_tables
=
torch
.
tensor
(
block_tables
,
dtype
=
torch
.
int
)
block_tables_lst
.
append
(
block_table
)
block_tables
=
torch
.
tensor
(
block_tables_lst
,
dtype
=
torch
.
int
)
# Create the KV caches.
key_caches
,
value_caches
=
kv_cache_factory
(
NUM_BLOCKS
,
block_size
,
1
,
...
...
@@ -284,7 +285,7 @@ def ref_multi_query_kv_attention(
dtype
:
torch
.
dtype
,
)
->
torch
.
Tensor
:
num_seqs
=
len
(
cu_seq_lens
)
-
1
ref_outputs
=
[]
ref_outputs
:
List
[
torch
.
Tensor
]
=
[]
for
i
in
range
(
num_seqs
):
start_idx
=
cu_seq_lens
[
i
]
end_idx
=
cu_seq_lens
[
i
+
1
]
...
...
@@ -304,8 +305,8 @@ def ref_multi_query_kv_attention(
attn_mask
=
attn_mask
,
)
ref_outputs
.
append
(
ref_output
)
ref_output
=
torch
.
cat
(
ref_outputs
,
dim
=
0
)
return
ref_output
return
torch
.
cat
(
ref_output
s
,
dim
=
0
)
# TODO(woosuk): Add tests for USE_ALIBI=True.
...
...
tests/kernels/test_attention_selector.py
View file @
705f6a35
...
...
@@ -9,8 +9,8 @@ from vllm.attention.selector import which_attn_to_use
@
pytest
.
mark
.
parametrize
(
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"
hip
"
])
"name"
,
[
"TORCH_SDPA"
,
"ROCM_FLASH"
,
"XFORMERS"
,
"FLASHINFER"
,
"OPENVINO"
])
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"
openvino"
,
"hip"
,
"cuda
"
])
def
test_env
(
name
:
str
,
device
:
str
,
monkeypatch
):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
...
...
@@ -28,6 +28,11 @@ def test_env(name: str, device: str, monkeypatch):
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
torch
.
float16
,
16
)
assert
backend
.
name
==
"ROCM_FLASH"
elif
device
==
"openvino"
:
with
patch
(
"vllm.attention.selector.is_openvino"
,
return_value
=
True
):
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
torch
.
float16
,
16
)
assert
backend
.
name
==
"OPENVINO"
else
:
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
torch
.
float16
,
16
)
...
...
@@ -42,32 +47,32 @@ def test_flash_attn(monkeypatch):
# Unsupported CUDA arch
with
patch
(
"torch.cuda.get_device_capability"
,
return_value
=
[
7
,
5
]):
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# Unsupported data type
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float8_e4m3fn
,
None
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# Unsupported kv cache data type
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
"fp8"
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# Unsupported block size
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
8
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# Unsupported sliding window
backend
=
which_attn_to_use
(
8
,
16
,
8
,
1
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# flash-attn is not installed
with
patch
.
dict
(
'sys.modules'
,
{
'vllm_flash_attn'
:
None
}):
backend
=
which_attn_to_use
(
8
,
16
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
# Unsupported head size
backend
=
which_attn_to_use
(
8
,
17
,
8
,
None
,
torch
.
float16
,
None
,
16
)
assert
backend
.
name
!=
"
FLASH_ATTN
"
assert
backend
.
name
!=
STR_
FLASH_ATTN
_VAL
def
test_invalid_env
(
monkeypatch
):
...
...
tests/kernels/test_blocksparse_attention.py
View file @
705f6a35
...
...
@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention(
block_size
=
value_cache
.
shape
[
3
]
num_seqs
=
query
.
shape
[
0
]
block_tables
=
block_tables
.
cpu
().
tolist
()
seq_lens
=
seq_lens
.
cpu
().
tolist
()
block_tables
_lst
=
block_tables
.
cpu
().
tolist
()
seq_lens
_lst
=
seq_lens
.
cpu
().
tolist
()
for
i
in
range
(
num_seqs
):
q
=
query
[
i
].
unsqueeze
(
0
)
block_table
=
block_tables
[
i
]
seq_len
=
int
(
seq_lens
[
i
])
block_table
=
block_tables
_lst
[
i
]
seq_len
=
int
(
seq_lens
_lst
[
i
])
keys
=
[]
values
=
[]
keys
_lst
:
List
[
torch
.
Tensor
]
=
[]
values
_lst
:
List
[
torch
.
Tensor
]
=
[]
for
j
in
range
(
seq_len
):
block_number
=
int
(
block_table
[
j
//
block_size
])
block_offset
=
j
%
block_size
k
=
key_cache
[
block_number
,
:,
:,
block_offset
,
:]
k
=
k
.
reshape
(
num_kv_heads
,
head_size
)
keys
.
append
(
k
)
keys
_lst
.
append
(
k
)
v
=
value_cache
[
block_number
,
:,
:,
block_offset
]
values
.
append
(
v
)
keys
=
torch
.
stack
(
keys
,
dim
=
0
)
values
=
torch
.
stack
(
values
,
dim
=
0
)
values
_lst
.
append
(
v
)
keys
=
torch
.
stack
(
keys
_lst
,
dim
=
0
)
values
=
torch
.
stack
(
values
_lst
,
dim
=
0
)
if
num_queries_per_kv
>
1
:
# Handle MQA and GQA
keys
=
torch
.
repeat_interleave
(
keys
,
num_queries_per_kv
,
dim
=
1
)
...
...
@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill(
value
=
torch
.
repeat_interleave
(
value
,
num_queries_per_kv
,
dim
=
1
)
ref_output
=
ref_multi_query_kv_attention
(
cu_seq_lens
,
cu_seq_lens
.
tolist
()
,
query
,
key
,
value
,
...
...
tests/kernels/test_cache.py
View file @
705f6a35
import
random
from
typing
import
Tuple
from
typing
import
List
,
Tuple
import
pytest
import
torch
...
...
@@ -64,7 +64,7 @@ def test_copy_blocks(
src_blocks
=
random
.
sample
(
range
(
num_blocks
),
num_mappings
)
remainig_blocks
=
list
(
set
(
range
(
num_blocks
))
-
set
(
src_blocks
))
dst_blocks
=
random
.
sample
(
remainig_blocks
,
2
*
num_mappings
)
block_mapping
=
[]
block_mapping
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
num_mappings
):
src
=
src_blocks
[
i
]
dst1
=
dst_blocks
[
2
*
i
]
...
...
@@ -132,8 +132,8 @@ def test_reshape_and_cache(
torch
.
set_default_device
(
device
)
# Create a random slot mapping.
num_slots
=
block_size
*
num_blocks
slot_mapping
=
random
.
sample
(
range
(
num_slots
),
num_tokens
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
dtype
=
torch
.
long
)
slot_mapping
_lst
=
random
.
sample
(
range
(
num_slots
),
num_tokens
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
_lst
,
dtype
=
torch
.
long
)
qkv
=
torch
.
randn
(
num_tokens
,
3
,
num_heads
,
head_size
,
dtype
=
dtype
)
_
,
key
,
value
=
qkv
.
unbind
(
dim
=
1
)
...
...
@@ -171,12 +171,12 @@ def test_reshape_and_cache(
# Run the reference implementation.
reshaped_key
=
key
.
reshape
(
num_tokens
,
*
key_cache
[
0
,
:,
:,
0
,
:].
shape
)
block_indicies
=
torch
.
div
(
slot_mapping
,
block_size
,
rounding_mode
=
"floor"
)
block_indicies
=
block_indicies
.
cpu
().
tolist
()
block_indicies
_lst
=
block_indicies
.
cpu
().
tolist
()
block_offsets
=
slot_mapping
%
block_size
block_offsets
=
block_offsets
.
cpu
().
tolist
()
block_offsets
_lst
=
block_offsets
.
cpu
().
tolist
()
for
i
in
range
(
num_tokens
):
block_idx
=
block_indicies
[
i
]
block_offset
=
block_offsets
[
i
]
block_idx
=
block_indicies
_lst
[
i
]
block_offset
=
block_offsets
_lst
[
i
]
cloned_key_cache
[
block_idx
,
:,
:,
block_offset
,
:]
=
reshaped_key
[
i
]
cloned_value_cache
[
block_idx
,
:,
:,
block_offset
]
=
value
[
i
]
...
...
@@ -225,8 +225,10 @@ def test_reshape_and_cache_flash(
# Create a random slot mapping.
num_slots
=
block_size
*
num_blocks
slot_mapping
=
random
.
sample
(
range
(
num_slots
),
num_tokens
)
slot_mapping
=
torch
.
tensor
(
slot_mapping
,
dtype
=
torch
.
long
,
device
=
device
)
slot_mapping_lst
=
random
.
sample
(
range
(
num_slots
),
num_tokens
)
slot_mapping
=
torch
.
tensor
(
slot_mapping_lst
,
dtype
=
torch
.
long
,
device
=
device
)
qkv
=
torch
.
randn
(
num_tokens
,
3
,
...
...
@@ -258,13 +260,13 @@ def test_reshape_and_cache_flash(
slot_mapping
,
kv_cache_dtype
)
# Run the reference implementation.
block_indicies
=
torch
.
div
(
slot_mapping
,
block_size
,
rounding_mode
=
'
floor
'
)
block_indicies
=
block_indicies
.
cpu
().
tolist
()
block_indicies
=
torch
.
div
(
slot_mapping
,
block_size
,
rounding_mode
=
"
floor
"
)
block_indicies
_lst
=
block_indicies
.
cpu
().
tolist
()
block_offsets
=
slot_mapping
%
block_size
block_offsets
=
block_offsets
.
cpu
().
tolist
()
block_offsets
_lst
=
block_offsets
.
cpu
().
tolist
()
for
i
in
range
(
num_tokens
):
block_idx
=
block_indicies
[
i
]
block_offset
=
block_offsets
[
i
]
block_idx
=
block_indicies
_lst
[
i
]
block_offset
=
block_offsets
_lst
[
i
]
cloned_key_cache
[
block_idx
,
block_offset
,
:,
:]
=
key
[
i
]
cloned_value_cache
[
block_idx
,
block_offset
,
:,
:]
=
value
[
i
]
...
...
tests/kernels/test_cutlass.py
View file @
705f6a35
...
...
@@ -2,36 +2,53 @@
Run `pytest tests/kernels/test_cutlass.py`.
"""
from
typing
import
Type
from
typing
import
Optional
,
Type
import
pytest
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.platforms
import
current_platform
CUDA_DEVICES
=
[
f
"cuda:
{
i
}
"
for
i
in
range
(
1
if
torch
.
cuda
.
device_count
()
==
1
else
2
)
]
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
current_platform
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
def
to_fp8
(
tensor
:
torch
.
t
ensor
):
def
to_fp8
(
tensor
:
torch
.
T
ensor
):
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
return
torch
.
round
(
tensor
.
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)).
to
(
dtype
=
torch
.
float8_e4m3fn
)
def
to_int8
(
tensor
:
torch
.
t
ensor
):
def
to_int8
(
tensor
:
torch
.
T
ensor
):
return
torch
.
round
(
tensor
.
clamp
(
min
=-
128
,
max
=
127
)).
to
(
dtype
=
torch
.
int8
)
def baseline_scaled_mm(a: torch.Tensor,
                       b: torch.Tensor,
                       scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: torch.dtype,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Reference scaled matrix multiply used to check the CUTLASS kernels.

    Computes ``scale_a * (scale_b * (a @ b))`` with the product taken in
    float32, casts the result to ``out_dtype``, then adds ``bias`` if given.

    Args:
        a: Left operand; converted to float32 before the multiply.
        b: Right operand; converted to float32 before the multiply.
        scale_a: Scale multiplied into the product by broadcasting (the
            helpers in this file build it with shape ``(m, 1)`` or
            ``(1, 1)``).
        scale_b: Scale multiplied into the product by broadcasting (the
            helpers in this file build it with shape ``(1, n)`` or
            ``(1, 1)``).
        out_dtype: Dtype of the scaled product. Annotated as ``torch.dtype``
            because callers pass dtype instances such as ``torch.bfloat16``,
            not a class.
        bias: Optional tensor added after the cast to ``out_dtype``.

    Returns:
        The scaled product, plus ``bias`` when one is supplied.
    """
    # Multiply in float32 so the reference does not depend on the
    # quantized input dtype.
    product = torch.mm(a.to(dtype=torch.float32), b.to(dtype=torch.float32))
    output = (scale_a * (scale_b * product)).to(out_dtype)

    # The bias is added after the cast, so it is applied in the output
    # precision rather than in float32.
    if bias is not None:
        output = output + bias

    return output
def
cutlass_fp8_gemm_helper
(
m
:
int
,
n
:
int
,
k
:
int
,
per_token_act_quant
:
bool
,
per_out_channel_weight_quant
:
bool
,
use_bias
:
bool
,
out_dtype
:
Type
[
torch
.
dtype
]
=
torch
.
bfloat16
,
device
:
str
=
"cuda"
):
# Test for a cutlass kernel with per-token activation quantization
...
...
@@ -42,16 +59,19 @@ def cutlass_fp8_gemm_helper(m: int,
m_a_scales
=
m
if
per_token_act_quant
else
1
n_b_scales
=
n
if
per_out_channel_weight_quant
else
1
scale_a
=
(
torch
.
randn
(
(
m_a_scales
,
1
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
scale_a
=
(
torch
.
randn
((
m_a_scales
,
1
),
device
=
device
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
((
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
))
if
use_bias
:
bias
=
torch
.
rand
((
n
,
),
device
=
device
,
dtype
=
out_dtype
)
*
10
else
:
bias
=
None
out
=
ops
.
cutlass_scaled_mm_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
out_dtype
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
baseline
=
baseline_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-2
,
atol
=
1
e-
1
)
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-2
,
atol
=
5
e-
2
)
def
cutlass_int8_gemm_helper
(
m
:
int
,
...
...
@@ -59,6 +79,7 @@ def cutlass_int8_gemm_helper(m: int,
k
:
int
,
per_token_act_quant
:
bool
,
per_out_channel_weight_quant
:
bool
,
use_bias
:
bool
,
out_dtype
:
Type
[
torch
.
dtype
]
=
torch
.
bfloat16
,
device
:
str
=
"cuda"
):
# Test for a cutlass kernel with per-token activation quantization
...
...
@@ -69,15 +90,18 @@ def cutlass_int8_gemm_helper(m: int,
m_a_scales
=
m
if
per_token_act_quant
else
1
n_b_scales
=
n
if
per_out_channel_weight_quant
else
1
scale_a
=
(
torch
.
randn
(
(
m_a_scales
,
1
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
)
/
10
)
scale_a
=
(
torch
.
randn
(
(
m_a_scales
,
1
),
device
=
device
,
dtype
=
torch
.
float32
))
scale_b
=
(
torch
.
randn
(
(
1
,
n_b_scales
),
device
=
device
,
dtype
=
torch
.
float32
))
out
=
ops
.
cutlass_scaled_mm_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
out_dtype
)
if
use_bias
:
bias
=
torch
.
rand
((
n
,
),
device
=
device
,
dtype
=
out_dtype
)
*
10
else
:
bias
=
None
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
baseline
=
baseline_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
)
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
...
...
@@ -87,11 +111,12 @@ def cutlass_int8_gemm_helper(m: int,
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm
(
m
:
int
,
n
:
int
,
k
:
int
,
per_act_token
:
bool
,
per_out_ch
:
bool
):
cutlass_fp8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
)
per_out_ch
:
bool
,
use_bias
:
bool
):
cutlass_fp8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
,
use_bias
)
@
pytest
.
mark
.
parametrize
(
"m"
,
[
512
,
222
,
33
,
1
])
...
...
@@ -99,49 +124,72 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
@
pytest
.
mark
.
parametrize
(
"k"
,
[
128
,
496
,
1024
])
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
def
test_cutlass_int8_gemm
(
m
:
int
,
n
:
int
,
k
:
int
,
per_act_token
:
bool
,
per_out_ch
:
bool
):
cutlass_int8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
)
per_out_ch
:
bool
,
use_bias
:
bool
):
cutlass_int8_gemm_helper
(
m
,
n
,
k
,
per_act_token
,
per_out_ch
,
use_bias
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"out_dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
def
test_cutlass_int8_gemm_output_dtype
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
out_dtype
:
Type
[
torch
.
dtype
]):
cutlass_int8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
out_dtype
)
out_dtype
:
Type
[
torch
.
dtype
],
use_bias
:
bool
):
cutlass_int8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
use_bias
,
out_dtype
=
out_dtype
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"out_dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_output_dtype
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
out_dtype
:
Type
[
torch
.
dtype
]):
cutlass_fp8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
out_dtype
)
out_dtype
:
Type
[
torch
.
dtype
],
use_bias
:
bool
):
cutlass_fp8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
use_bias
,
out_dtype
=
out_dtype
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
skipif
(
capability
<
89
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_devices
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
device
:
str
):
cutlass_fp8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
use_bias
:
bool
,
device
:
str
):
cutlass_fp8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
use_bias
,
torch
.
bfloat16
,
device
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_cutlass_int8_gemm_devices
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
device
:
str
):
cutlass_int8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
torch
.
bfloat16
,
device
)
use_bias
:
bool
,
device
:
str
):
cutlass_int8_gemm_helper
(
512
,
512
,
512
,
per_act_token
,
per_out_ch
,
use_bias
,
out_dtype
=
torch
.
bfloat16
,
device
=
device
)
# For the following two tests:
...
...
@@ -151,20 +199,26 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
# kernel must handle any M thrown at it.
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
capability
<
89
,
reason
=
"FP8 is not supported on this GPU type."
)
def
test_cutlass_fp8_gemm_m_sweep
(
per_act_token
:
bool
,
per_out_ch
:
bool
):
def
test_cutlass_fp8_gemm_m_sweep
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
use_bias
:
bool
):
for
nk
in
range
(
32
,
128
,
32
):
for
m
in
range
(
1
,
128
):
cutlass_fp8_gemm_helper
(
m
,
nk
,
nk
,
per_act_token
,
per_out_ch
)
cutlass_fp8_gemm_helper
(
m
,
nk
,
nk
,
per_act_token
,
per_out_ch
,
use_bias
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"per_out_ch"
,
[
True
,
False
])
def
test_cutlass_int8_gemm_m_sweep
(
per_act_token
:
bool
,
per_out_ch
:
bool
):
@
pytest
.
mark
.
parametrize
(
"use_bias"
,
[
True
,
False
])
def
test_cutlass_int8_gemm_m_sweep
(
per_act_token
:
bool
,
per_out_ch
:
bool
,
use_bias
:
bool
):
for
nk
in
range
(
32
,
128
,
32
):
for
m
in
range
(
1
,
128
):
cutlass_int8_gemm_helper
(
m
,
nk
,
nk
,
per_act_token
,
per_out_ch
)
cutlass_int8_gemm_helper
(
m
,
nk
,
nk
,
per_act_token
,
per_out_ch
,
use_bias
)
# Test working with a subset of A and B
...
...
@@ -180,14 +234,16 @@ def test_cutlass_subset():
scale_a
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
scale_b
=
torch
.
randn
((
1
,
1
),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
/
10
out
=
ops
.
cutlass_scaled_mm_dq
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
baseline
=
torch
.
mm
(
scale_a
*
a
.
to
(
dtype
=
torch
.
float32
),
scale_b
*
b
.
to
(
dtype
=
torch
.
float32
)).
to
(
dtype
=
torch
.
bfloat16
)
out
=
ops
.
cutlass_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
baseline
=
baseline_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
=
torch
.
bfloat16
)
assert
torch
.
allclose
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
...
...
@@ -203,8 +259,8 @@ class CutlassLayer(torch.nn.Module):
self
.
out_dtype
=
out_dtype
def
forward
(
self
,
a
):
return
ops
.
cutlass_scaled_mm
_dq
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
self
.
out_dtype
)
return
ops
.
cutlass_scaled_mm
(
a
,
self
.
b
,
self
.
scale_a
,
self
.
scale_b
,
self
.
out_dtype
)
@
pytest
.
mark
.
parametrize
(
"per_act_token"
,
[
True
,
False
])
...
...
tests/kernels/test_encoder_decoder_attn.py
0 → 100644
View file @
705f6a35
"""
Tests:
* E2E test of Encoder attention + Decoder self-attention +
Encoder/decoder cross-attention (collectively
"encoder/decoder attention")
* Confirm enc/dec models will fail for chunked prefill
* Confirm enc/dec models will fail for prefix caching
"""
from
typing
import
NamedTuple
,
Optional
import
pytest
import
torch
from
tests.kernels.utils
import
*
from
tests.kernels.utils
import
make_causal_mask
,
maybe_make_long_tensor
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.attention.backends.abstract
import
AttentionBackend
,
AttentionType
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from
vllm.utils
import
is_hip
# Parametrization axes for the encoder/decoder attention tests
HEAD_SIZES = [64, 256]
NUM_HEADS = [1, 16]
BATCH_SIZES = [1, 16]
BLOCK_SIZES = [16]
BACKEND_NAMES = [STR_XFORMERS_ATTN_VAL]

# Device on which all test tensors are allocated
CUDA_DEVICE = "cuda:0"

MAX_DEC_SEQ_LENS = [128]
MAX_ENC_SEQ_LENS = [128]

# Narrow test-cases for unsupported-scenario
# tests
HEAD_SIZES_FOR_UNSUPP = HEAD_SIZES[:1]
class TestPoint(NamedTuple):
    """
    Parameters which define a single invocation of an
    encoder/decoder attention test.

    The field order is significant: the setup helpers in this
    file unpack a TestPoint positionally.

    Attributes:
        num_heads: Number of attention heads.
        head_size: Dimension of each attention head.
        backend_name: Name of the attention backend under test.
        batch_size: Number of sequences per batch.
        block_size: KV cache block size.
        max_dec_seq_len: Maximum decoder sequence length.
        max_enc_seq_len: Maximum encoder sequence length.
        num_blocks: Number of blocks in the KV cache.
    """

    num_heads: int
    head_size: int
    backend_name: str
    batch_size: int
    block_size: int
    max_dec_seq_len: int
    max_enc_seq_len: int
    num_blocks: int
class TestResources(NamedTuple):
    '''
    Encapsulates key components for performing an
    encoder/decoder attention test

    Note that
    (1) attn automatically selects an attention backend
        based on platform info & a set of canned
        heuristics
    (2) attn_backend is thus *not the same backend
        instance* used by attn, but rather it is
        intended to be a
        *different instance* of the *same backend class*;
        it is assumed that the user of TestResources
        will leverage attn_backend for the purpose of
        constructing backend-compatible attention
        metadata instances

    Attributes:

    * scale: 1/sqrt(d) scale factor for attn
    * attn_backend: implementation of abstraction
                    attention interface using
                    a particular kernel library
                    i.e. XFormers
    * attn: Attention layer instance
    * kv_cache: shared key/value cache for all attention;
                None when the caller does not require a
                KV cache
    '''

    scale: float
    attn_backend: AttentionBackend
    attn: Attention
    # Optional because _make_test_resources() stores None here
    # when no KV cache is needed
    kv_cache: Optional[torch.Tensor]
def _make_test_resources(test_pt: TestPoint, ) -> TestResources:
    '''
    Construct the shared components of an encoder/decoder attention test.

    Note that

    (1) The Attention instance built here picks its attention backend
        class automatically from platform info & a set of canned
        heuristics, so

    (2) The backend instance built here is *not the same backend
        instance* used by the Attention layer; it is intended to be a
        *different instance* of the *same backend class*; therefore,

    (3) test_pt.backend_name must match the backend class which Attention
        selects automatically when it is constructed.

    Arguments:

    * test_pt: TestPoint data structure; this function relies on the
               following fields: num_heads, head_size, num_blocks,
               block_size, backend_name

    Returns:

    * TestResources data structure; its kv_cache field is None when
      test_pt.num_blocks or test_pt.num_heads is None.
    '''

    # 1/sqrt(head_size) attention scale factor
    attn_scale = float(1.0 / (test_pt.head_size**0.5))
    backend = make_backend(test_pt.backend_name)
    attn_layer = Attention(
        test_pt.num_heads,
        test_pt.head_size,
        scale=attn_scale,
    )

    # A KV cache is only built when its dimensions are fully specified
    wants_kv_cache = not (test_pt.num_blocks is None
                          or test_pt.num_heads is None)
    cache = (make_kv_cache(test_pt.num_blocks,
                           test_pt.num_heads,
                           test_pt.head_size,
                           test_pt.block_size,
                           device=CUDA_DEVICE) if wants_kv_cache else None)

    return TestResources(attn_scale, backend, attn_layer, cache)
def _encoder_attn_setup(
    test_pt: TestPoint,
    test_rsrcs: TestResources,
) -> PhaseTestParameters:
    '''
    Build test vectors & data structures for the encoder attention test.

    Synthetic query/key/value tensors are generated. Because this is
    encoder attention, the key & value sequences are requested with the
    same maximum length as the queries.

    The query/key/value tensors are fed to a reference attention
    implementation (without a causal mask) to obtain the ideal output.

    Encoder inference does not populate the KV cache, therefore
    no KV cache memory mapping is constructed.

    Arguments:

    * test_pt: TestPoint data structure; this function relies on the
               following fields: batch_size, num_heads, head_size,
               max_enc_seq_len
    * test_rsrcs: TestResources data structure; this function relies on the
                  scale field

    Returns:

    * PhaseTestParameters data structure comprising (1) packed
      query/key/value tensors, (2) the ideal attention output computed
      using a naive implementation, and (3) KV cache field set to None
    '''

    # Encoder attention: Q and K/V share the same max sequence length
    enc_seq_len = test_pt.max_enc_seq_len

    # Make test tensors
    encoder_qkv, _, _ = make_qkv(test_pt.batch_size,
                                 enc_seq_len,
                                 enc_seq_len,
                                 test_pt.num_heads,
                                 test_pt.head_size,
                                 attn_type=AttentionType.ENCODER,
                                 device=CUDA_DEVICE)

    # Reference answer from the naive non-causal attention implementation
    reference_out = ref_masked_attention(
        encoder_qkv.query,
        encoder_qkv.key,
        encoder_qkv.value,
        scale=test_rsrcs.scale,
        q_seq_lens=encoder_qkv.q_seq_lens,
        kv_seq_lens=encoder_qkv.kv_seq_lens)

    packed_reference_out, _ = pack_tensor(reference_out,
                                          encoder_qkv.q_seq_lens,
                                          device=CUDA_DEVICE)
    packed_encoder_qkv = pack_qkv(encoder_qkv, device=CUDA_DEVICE)

    return PhaseTestParameters(
        PackedQKVO(packed_encoder_qkv, packed_reference_out),
        None  # No KV cache
    )
def _decoder_attn_setup(
    test_pt: TestPoint,
    test_rsrcs: TestResources,
    block_base_addr: int = 0,
) -> Tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]:
    '''
    Build test vectors & data structures for the decoder self-attention
    test.

    "Baseline" synthetic query/key/value tensors are generated. Because
    this is self-attention, the key & value sequences are requested with
    the same maximum length as the queries.

    "Prefill" query/key/value tensors are derived by masking out the last
    value in each baseline query/key/value. They are used to test prefill
    & to populate the KV cache for a subsequent decode test.

    "Decode" query/key/value tensors are derived by extracting *only* the
    last value from each baseline query/key/value (i.e. the complement of
    the prefill tensors.) They are used to test decode, conditional on the
    KV cache having been populated during the prefill test.

    The baseline tensors are fed to a reference self-attention
    implementation with a causal mask to obtain a "baseline" ideal output.
    That output is split into a "prefill" ideal output (all but the last
    element of each output sequence) and a "decode" ideal output (*only*
    the last element of each output sequence), which validate the prefill
    and decode test results respectively.

    This function also constructs the self-attention KV cache memory
    mapping (slot mapping and block table), with the block table starting
    at block_base_addr.

    Arguments:

    * test_pt: TestPoint data structure; this function relies on the
               following fields: batch_size, num_heads, head_size,
               block_size, max_dec_seq_len
    * test_rsrcs: TestResources data structure; this function relies on the
                  scale field
    * block_base_addr: decoder self-attention block-table base address

    Returns:

    * qkv: Unpacked (batch_size x padded_seq_len x num_heads x
           head_size) query/key/value tensors
    * Prefill-phase decoder self-attention PhaseTestParameters data
      structure, including (1) packed (number_of_tokens x num_heads x
      head_size) query/key/value tensors along with (2) ideal attention
      output computed using a naive implementation, and (3) memory-mapping
      data structures appropriate for prefill phase.
    * Decode-phase decoder self-attention PhaseTestParameters data
      structure, including (1) packed (number_of_tokens x num_heads x
      head_size) query/key/value tensors along with (2) ideal attention
      output computed using a naive implementation, and (3) memory-mapping
      data structures appropriate for decode phase.
    * max_block_idx: max physical address in decoder self-attention
                     block-table (intended to be used as the base address
                     for the encoder/decoder cross-attention block-table,
                     which is not constructed in this function)
    '''

    batch_size = test_pt.batch_size
    # Self-attention: Q and K/V share the same max sequence length
    dec_seq_len = test_pt.max_dec_seq_len

    # Build test tensors
    baseline_qkv, prefill_qkv, decode_qkv = make_qkv(
        batch_size,
        dec_seq_len,
        dec_seq_len,
        test_pt.num_heads,
        test_pt.head_size,
        attn_type=AttentionType.DECODER,
        device=CUDA_DEVICE)

    # Reference answer from the naive attention implementation
    # with a causal attention mask
    mask = make_causal_mask(dec_seq_len, dec_seq_len).to(CUDA_DEVICE)
    reference_out = ref_masked_attention(
        baseline_qkv.query,
        baseline_qkv.key,
        baseline_qkv.value,
        scale=test_rsrcs.scale,
        custom_mask=mask,
        q_seq_lens=baseline_qkv.q_seq_lens,
        kv_seq_lens=baseline_qkv.kv_seq_lens)

    # Split the reference answer into prefill- & decode-phase parts:
    # prefill keeps the first num_prefill_tokens positions of each
    # sequence, decode keeps the single position right after them
    prefill_reference = torch.zeros_like(reference_out)
    decode_reference = torch.zeros_like(reference_out[:, 0:1])
    for seq_idx, num_prefill_tokens in enumerate(prefill_qkv.q_seq_lens):
        prefill_reference[seq_idx, :num_prefill_tokens] = (
            reference_out[seq_idx, :num_prefill_tokens])
        decode_reference[seq_idx, :] = (
            reference_out[seq_idx,
                          num_prefill_tokens:(num_prefill_tokens + 1)])

    packed_prefill_reference, _ = pack_tensor(prefill_reference,
                                              prefill_qkv.q_seq_lens,
                                              device=CUDA_DEVICE)
    # Exactly one decoded token per sequence
    packed_decode_reference, _ = pack_tensor(decode_reference,
                                             [1] * batch_size,
                                             device=CUDA_DEVICE)

    # Build prefill- & decode-phase data structures
    # for decoder self-attention. Block tables and
    # slot mapping must be in a format compatible
    # with KV caching & attention kernels
    #
    # Prefill-phase:
    #
    # * Empty block-tables tensor
    # * Slot-mapping with entries for prompt tokens
    #
    # Decode-phase:
    # * Block-tables tensor with minimum number of blocks
    #   required by total num. tokens in the entirety of all sequences
    #   (including both prefill & decode)
    # * Slot-mapping with entries for tokens that will be decoded in the
    #   current decode iteration
    #
    # Note: the format described above is simply mirroring what ModelRunner
    # produces

    empty_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)

    full_block_tables, all_slots, max_block_idx = (
        make_block_tables_slot_mapping(block_size=test_pt.block_size,
                                       seq_lens=baseline_qkv.q_seq_lens,
                                       device=CUDA_DEVICE,
                                       block_base_addr=block_base_addr)
        if False else make_block_tables_slot_mapping(
            test_pt.block_size,
            baseline_qkv.q_seq_lens,
            device=CUDA_DEVICE,
            block_base_addr=block_base_addr))

    prefill_slots, decode_slots = split_slot_mapping(
        all_slots, baseline_qkv.q_seq_lens, device=CUDA_DEVICE)

    packed_prefill_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE)
    packed_decode_qkv = pack_qkv(decode_qkv, device=CUDA_DEVICE)

    # Prefill test params
    prefill_params = PhaseTestParameters(
        PackedQKVO(packed_prefill_qkv, packed_prefill_reference),
        KVMemoryMap(empty_block_tables, prefill_slots))
    # Decode test params
    decode_params = PhaseTestParameters(
        PackedQKVO(packed_decode_qkv, packed_decode_reference),
        KVMemoryMap(full_block_tables, decode_slots))

    return (baseline_qkv, prefill_params, decode_params, max_block_idx)
def _enc_dec_cross_attn_setup_reuses_query(
    decoder_qkv: QKVInputs,
    encoder_test_params: PhaseTestParameters,
    prefill_decoder_phase_test_params: PhaseTestParameters,
    test_pt: TestPoint,
    test_rsrcs: TestResources,
    block_base_addr: int = 0,
) -> Tuple[PhaseTestParameters, PhaseTestParameters]:
    '''
    Build test vectors & data structures for the cross-attention test.

    A pair of synthetic cross-attention key/value tensors is generated
    ("baseline" key/value). Because this is cross-attention, the query
    tensors are assumed to have been synthesized already for a prior
    self-attention test and are reused here. The key & value sequences
    generated here may have a different length than the corresponding
    queries (as is often the case for cross-attention between decoder and
    encoder sequences.)

    Cross-attention key & value tensors do not grow during autoregressive
    inference; thus this function obtains a single key/value pair suitable
    for both prefill and decode.

    The "baseline" query tensor is received as an argument. The "baseline"
    query/key/value tensors are fed to a reference cross-attention
    implementation to obtain a "baseline" ideal output. That output is
    split into a "prefill" ideal output (all but the last element of each
    output sequence) and a "decode" ideal output (*only* the last element
    of each output sequence), which validate the prefill and decode test
    results respectively.

    This function also constructs the cross-attention KV cache memory
    mapping (slot mapping and block table), with the block table starting
    at block_base_addr.

    Arguments:

    * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x
                   num_heads x head_size) decoder self-attention inputs;
                   this function relies on the query and q_seq_lens
                   fields
    * encoder_test_params: PhaseTestParameters data structure which was
                           used for encoder inference; KV cache field
                           is not used by this function
    * prefill_decoder_phase_test_params: PhaseTestParameters data structure
                                         used for prefill-phase decoder
                                         self-attention; this function
                                         reads the q_seq_lens of its
                                         packed query/key/value
    * test_pt: TestPoint data structure; this function relies on the
               following fields: batch_size, num_heads, head_size,
               block_size, max_dec_seq_len, max_enc_seq_len
    * test_rsrcs: TestResources data structure; this function relies on the
                  scale field
    * block_base_addr: encoder/decoder cross-attention block-table base
                       address

    Returns:

    * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data
      structure, including (1) packed
      (number_of_tokens x num_heads x head_size) query/key/value tensors
      along with (2) ideal attention output computed using a
      naive implementation, and (3) memory-mapping data structures
      appropriate for prefill phase.
    * Decode-phase encoder/decoder cross-attention PhaseTestParameters data
      structure, including (1) no packed query/key/value (None), (2) ideal
      attention output computed using a naive implementation, and
      (3) memory-mapping data structures appropriate for decode phase.
    '''

    encoder_packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
    assert encoder_packed_qkv is not None
    prefill_packed_qkv = (
        prefill_decoder_phase_test_params.packed_qkvo.packed_qkv)
    assert prefill_packed_qkv is not None

    batch_size = test_pt.batch_size

    prefill_q_seq_lens = prefill_packed_qkv.q_seq_lens
    assert prefill_q_seq_lens is not None

    # Cross-attention K/V lengths are pinned to the encoder sequence lengths
    cross_kv, _, _ = make_qkv(batch_size,
                              test_pt.max_dec_seq_len,
                              test_pt.max_enc_seq_len,
                              test_pt.num_heads,
                              test_pt.head_size,
                              force_kv_seq_lens=encoder_packed_qkv.q_seq_lens,
                              attn_type=AttentionType.ENCODER_DECODER,
                              device=CUDA_DEVICE)

    # Reference answer: decoder queries attending to cross K/V
    reference_out = ref_masked_attention(decoder_qkv.query,
                                         cross_kv.key,
                                         cross_kv.value,
                                         scale=test_rsrcs.scale,
                                         q_seq_lens=decoder_qkv.q_seq_lens,
                                         kv_seq_lens=cross_kv.kv_seq_lens)

    # Split the reference answer into prefill- & decode-phase parts:
    # prefill keeps the first num_prefill_tokens positions of each
    # sequence, decode keeps the single position right after them
    prefill_reference = torch.zeros_like(reference_out)
    decode_reference = torch.zeros_like(reference_out[:, 0:1])
    for seq_idx, num_prefill_tokens in enumerate(prefill_q_seq_lens):
        prefill_reference[seq_idx, :num_prefill_tokens] = (
            reference_out[seq_idx, :num_prefill_tokens])
        decode_reference[seq_idx, :] = (
            reference_out[seq_idx,
                          num_prefill_tokens:(num_prefill_tokens + 1)])

    packed_prefill_reference, _ = pack_tensor(prefill_reference,
                                              prefill_q_seq_lens,
                                              device=CUDA_DEVICE)
    # Exactly one decoded token per sequence
    packed_decode_reference, _ = pack_tensor(decode_reference,
                                             [1] * batch_size,
                                             device=CUDA_DEVICE)

    # Build prefill- & decode-phase data structures
    # for encoder/decoder cross-attention. Block tables and
    # slot mapping must be in a format compatible
    # with KV caching & attention kernels
    #
    # Whereas decoder self-attention extracts relationships between
    # equal-length Q/K/V sequences, which mutually grow in length
    # with each decoded token, cross-attention relates the Q sequence
    # - which grows with each new decoded token - to fixed-length
    # K and V sequences derived from the encoder hidden states.
    #
    # Prefill-phase:
    #
    # * Empty block-tables tensor
    # * Slot-mapping with as many entries as there are tokens in the encoder
    #   prompt.
    #
    # Decode-phase:
    # * Block-tables tensor with minimum number of blocks to
    #   accommodate K & V tensors which are equal in length
    #   to the encoder prompt length
    # * Empty slot-mapping tensor (since K & V are fixed in size,
    #   new decoded tokens are not KV-cached and require no slot-
    #   mapping)
    #
    # Note: the format above is simply an extension of what ModelRunner
    # produces for decoder-only models

    empty_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)
    empty_slot_mapping = make_empty_slot_mapping_tensor(device=CUDA_DEVICE)

    cross_block_tables, cross_slot_list, _ = make_block_tables_slot_mapping(
        test_pt.block_size,
        cross_kv.kv_seq_lens,
        block_base_addr=block_base_addr,
        device=CUDA_DEVICE)

    cross_slot_mapping = maybe_make_long_tensor(cross_slot_list,
                                                device=CUDA_DEVICE)

    # Packed key/value (query is already provided)
    packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE)

    # Prefill-phase test params
    prefill_params = PhaseTestParameters(
        PackedQKVO(packed_cross_kv, packed_prefill_reference),
        KVMemoryMap(empty_block_tables, cross_slot_mapping))
    # Decode-phase test params
    decode_params = PhaseTestParameters(
        PackedQKVO(None, packed_decode_reference),
        KVMemoryMap(cross_block_tables, empty_slot_mapping))

    return (prefill_params, decode_params)
def _run_encoder_attention_test(
    attn: Attention,
    encoder_test_params: PhaseTestParameters,
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention layer for encoder attention.

    attn.forward() is passed attn_type=AttentionType.ENCODER in order
    to configure the kernel invocation for encoder attention, and is
    given no KV cache (None).

    Requires attn_metadata.num_decode_tokens == 0
    (There is no encoder execution in the decode-phase)

    Arguments:

    * attn: Attention wrapper instance
    * encoder_test_params: encoder PhaseTestParameters data structure;
                           this function relies on the packed
                           (number_of_tokens x num_heads x head_size)
                           query/key/value fields
    * attn_metadata: attention metadata for encoder/decoder-self attention

    Returns:

    * Attention.forward() applied to packed {query,key,value}
      & attn_metadata
    '''

    assert attn_metadata.num_decode_tokens == 0

    encoder_packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
    assert encoder_packed_qkv is not None

    return attn.forward(
        encoder_packed_qkv.query,
        encoder_packed_qkv.key,
        encoder_packed_qkv.value,
        None,  # encoder attention runs without a KV cache
        attn_metadata,
        attn_type=AttentionType.ENCODER)
def _run_decoder_self_attention_test(
    test_rsrcs: TestResources,
    decoder_test_params: PhaseTestParameters,
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention layer for decoder self-attention.

    attn.forward() is passed attn_type=AttentionType.DECODER
    in order to configure the kernel invocation for decoder self-attention.

    Arguments:

    * test_rsrcs: TestResources instance; this function relies on the
                  kv_cache and attn (Attention wrapper instance) fields
    * decoder_test_params: decoder PhaseTestParameters data structure;
                           this function relies on the packed
                           (number_of_tokens x num_heads x head_size)
                           query/key/value fields
    * attn_metadata: attention metadata for decoder-self attention
                     (contains KV cache memory-mapping)

    Returns:

    * Attention.forward() applied to packed_{query,key,value}, kv_cache
      & attn_metadata
    '''

    decoder_packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
    assert decoder_packed_qkv is not None

    return test_rsrcs.attn.forward(decoder_packed_qkv.query,
                                   decoder_packed_qkv.key,
                                   decoder_packed_qkv.value,
                                   test_rsrcs.kv_cache,
                                   attn_metadata,
                                   attn_type=AttentionType.DECODER)
def _run_encoder_decoder_cross_attention_test(
    test_rsrcs: TestResources,
    decoder_test_params: PhaseTestParameters,
    cross_test_params: Optional[PhaseTestParameters],
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention layer for encoder/decoder cross-attention.

    Via PhaseTestParameters data structures, consumes the same query
    utilized for decoder self-attention, plus a key/value specific to
    cross-attention.

    If cross_test_params is None or cross_test_params.packed_qkvo.packed_qkv
    is None, key and value are passed to attn.forward() as None; this
    reflects that in decode-phase cross-attention there is no growth in
    the key and value tensors.

    attn.forward() is passed attn_type=AttentionType.ENCODER_DECODER
    in order to configure the kernel invocation for encoder/decoder cross-
    attention.

    Arguments:

    * test_rsrcs: TestResources instance; this function relies on the
                  kv_cache and attn (Attention wrapper instance) fields
    * decoder_test_params: decoder PhaseTestParameters data structure;
                           this function relies on the packed
                           (number_of_tokens x num_heads x head_size)
                           query field
    * cross_test_params: encoder/decoder PhaseTestParameters data structure;
                         this function relies on the packed
                         (number_of_tokens x num_heads x head_size)
                         key/value fields
    * attn_metadata: attention metadata for encoder/decoder-self attention

    Returns:

    * Attention.forward() applied to packed_{query,key,value}, kv_cache
      & attn_metadata
    '''

    decoder_packed_qkv = decoder_test_params.packed_qkvo.packed_qkv
    assert decoder_packed_qkv is not None

    # Cross K/V are only supplied when there is something new to cache
    cross_packed_qkv = (None if cross_test_params is None else
                        cross_test_params.packed_qkvo.packed_qkv)
    if cross_packed_qkv is None:
        cross_key, cross_value = None, None
    else:
        cross_key = cross_packed_qkv.key
        cross_value = cross_packed_qkv.value

    return test_rsrcs.attn.forward(decoder_packed_qkv.query,
                                   cross_key,
                                   cross_value,
                                   test_rsrcs.kv_cache,
                                   attn_metadata,
                                   attn_type=AttentionType.ENCODER_DECODER)
@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
                      batch_size: int, block_size: int, max_dec_seq_len: int,
                      max_enc_seq_len: int, monkeypatch):
    '''
    Encoder-only attention test:

    * Construct fake test vectors for encoder attention
    * Construct a prefill-phase attention metadata structure carrying
      only the encoder test parameters
    * Run encoder attention and validate the output against the ideal
      reference attention output

    The attention backend is forced via the PyTest monkeypatch fixture.
    The test is skipped if is_hip().
    '''

    # Force Attention wrapper backend
    override_backend_env_variable(monkeypatch, backend_name)

    # Note: KV cache size of 4096 is arbitrary & chosen intentionally
    # to be more than necessary, since exceeding the kv cache size
    # is not part of this test
    point = TestPoint(num_heads, head_size, backend_name, batch_size,
                      block_size, max_dec_seq_len, max_enc_seq_len, 4096)

    # Attention scale factor, attention backend instance, attention wrapper
    # instance, KV cache init
    resources = _make_test_resources(point)

    # Construct encoder attention test params (only used
    # during prefill)
    encoder_params = _encoder_attn_setup(point, resources)

    # Shared prefill metadata structure
    prefill_metadata: AttentionMetadata = make_test_metadata(
        resources.attn_backend,
        True,
        None,
        decoder_test_params=None,
        encoder_test_params=encoder_params,
        cross_test_params=None,
        device=CUDA_DEVICE)

    # PREFILL: encoder attention
    actual_encoder_out: torch.Tensor = _run_encoder_attention_test(
        resources.attn, encoder_params, prefill_metadata)

    # - Is encoder attention result correct?
    assert_actual_matches_ideal(encoder_params, actual_encoder_out)
@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
def test_e2e_enc_dec_attn(
    num_heads: int,
    head_size: int,
    backend_name: str,
    batch_size: int,
    block_size: int,
    max_dec_seq_len: int,
    max_enc_seq_len: int,
    monkeypatch,
) -> None:
    '''
    End-to-end encoder/decoder attention test.

    Setup:

    * Fake test vectors are built for (1) encoder attention, (2) decoder
      self-attention and (3) encoder/decoder cross-attention.
    * Two attention metadata structures are built: one carrying self- and
      cross-attention attributes for the prefill phase, and an analogous
      one for the decode phase.

    Attention steps are exercised in this order, each validated against the
    ideal reference attention implementation:

    1. Encoder attention
    2. Prefill self-attention
    3. Prefill cross-attention
    4. Decode self-attention
    5. Decode cross-attention

    This order mirrors realistic use and would also exacerbate any accidental
    overlap between the self- and cross-attention block tables, which one
    hopes to avoid.

    Memory mapping: block tables are built so that the cross-attention KV
    cache lives in a higher, non-intersecting address-space than the
    self-attention KV cache.

    Tensors: self- and cross-attention share the query tensor but not the
    K/V tensors. Self-attention K/Vs must match the seq len of Q, whereas
    cross-attention K/Vs may have a different seq len, as is often the case
    for cross-attention.

    Backend selection: PyTest monkey patching forces the attention backend
    through an environment variable.

    ROCm/HIP: encoder/decoder models are currently unsupported on AMD GPUs,
    so the test is skipped when is_hip().

    Metadata: a single metadata structure is shared by all prefill-phase
    attention operations (encoder, decoder, enc/dec cross), and a single one
    by all decode-phase operations (decoder & enc/dec cross.) This reflects
    ModelRunner, which constructs one attention metadata structure per
    prefill or decode run; a realistic scenario relies on the attention
    backend to pick the appropriate metadata fields according to
    attn_metadata.attention_type. The test therefore confirms that the
    backend-under-test copes with a shared prefill metadata structure & a
    shared decode metadata structure.
    '''

    # Select the Attention wrapper backend under test
    override_backend_env_variable(monkeypatch, backend_name)

    # The KV cache size (4096) is an arbitrary, deliberately generous value:
    # overflowing the KV cache is out of scope for this test
    kv_cache_size = 4096
    test_point = TestPoint(num_heads, head_size, backend_name, batch_size,
                           block_size, max_dec_seq_len, max_enc_seq_len,
                           kv_cache_size)

    # Bundles the attention scale factor, attention backend instance,
    # attention wrapper instance and the initialized KV cache
    resources = _make_test_resources(test_point)

    # Encoder attention test params (only relevant to prefill)
    encoder_params = _encoder_attn_setup(test_point, resources)

    # Decoder self-attention params for both phases, including the
    # query/key/value tensors and the self-attention memory-mapping.
    # The last element is the uppermost address of the decoder
    # self-attention block-table, i.e. the base address which the
    # cross-attention block-table may build downward toward.
    decoder_setup = _decoder_attn_setup(test_point, resources)
    (decoder_qkv, prefill_decoder_params, decode_decoder_params,
     cross_base_addr) = decoder_setup

    # Cross-attention params for both phases, including the key/value
    # tensors and the cross-attention memory-mapping
    cross_setup = _enc_dec_cross_attn_setup_reuses_query(
        decoder_qkv,
        encoder_params,
        prefill_decoder_params,
        test_point,
        resources,
        block_base_addr=cross_base_addr)
    prefill_cross_params, decode_cross_params = cross_setup

    # Metadata shared by every prefill-phase attention operation
    prefill_packed_qkv = prefill_decoder_params.packed_qkvo.packed_qkv
    assert prefill_packed_qkv is not None
    prefill_metadata: AttentionMetadata = make_test_metadata(
        resources.attn_backend,
        True,
        prefill_packed_qkv.q_seq_lens,
        decoder_test_params=prefill_decoder_params,
        encoder_test_params=encoder_params,
        cross_test_params=prefill_cross_params,
        device=CUDA_DEVICE)

    # PREFILL: encoder attention, checked against the ideal output
    assert_actual_matches_ideal(
        encoder_params,
        _run_encoder_attention_test(resources.attn, encoder_params,
                                    prefill_metadata))

    # PREFILL: decoder self-attention, checked against the ideal output
    assert_actual_matches_ideal(
        prefill_decoder_params,
        _run_decoder_self_attention_test(resources, prefill_decoder_params,
                                         prefill_metadata))

    # PREFILL: encoder/decoder cross-attention, checked against the ideal
    # output
    assert_actual_matches_ideal(
        prefill_cross_params,
        _run_encoder_decoder_cross_attention_test(resources,
                                                  prefill_decoder_params,
                                                  prefill_cross_params,
                                                  prefill_metadata))

    # DECODE: metadata shared by every decode-phase attention operation
    decode_metadata: AttentionMetadata = make_test_metadata(
        resources.attn_backend,
        False,
        decoder_qkv.q_seq_lens,
        decoder_test_params=decode_decoder_params,
        encoder_test_params=encoder_params,
        cross_test_params=decode_cross_params,
        device=CUDA_DEVICE)

    # DECODE: decoder self-attention, checked against the ideal output
    assert_actual_matches_ideal(
        decode_decoder_params,
        _run_decoder_self_attention_test(resources, decode_decoder_params,
                                         decode_metadata))

    # DECODE: encoder/decoder cross-attention, checked against the ideal
    # output. No cross-attention test params are passed to the runner here
    # (None), exactly as in the decode-phase call this test has always made.
    assert_actual_matches_ideal(
        decode_cross_params,
        _run_encoder_decoder_cross_attention_test(resources,
                                                  decode_decoder_params, None,
                                                  decode_metadata))
tests/kernels/test_flash_attn.py
View file @
705f6a35
...
...
@@ -25,7 +25,7 @@ def ref_paged_attn(
block_tables
=
block_tables
.
cpu
().
numpy
()
_
,
block_size
,
num_kv_heads
,
head_size
=
key_cache
.
shape
outputs
=
[]
outputs
:
List
[
torch
.
Tensor
]
=
[]
start_idx
=
0
for
i
in
range
(
num_seqs
):
query_len
=
query_lens
[
i
]
...
...
@@ -70,7 +70,7 @@ def ref_paged_attn(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPES
)
@
torch
.
inference_mode
def
test_flash_attn_with_paged_kv
(
kv_lens
:
List
[
Tuple
[
int
,
int
]
],
kv_lens
:
List
[
int
],
num_heads
:
Tuple
[
int
,
int
],
head_size
:
int
,
dtype
:
torch
.
dtype
,
...
...
tests/kernels/test_flashinfer.py
0 → 100644
View file @
705f6a35
from
typing
import
List
,
Optional
,
Tuple
import
flashinfer
import
pytest
import
torch
# Parametrization grids for the FlashInfer kernel tests in this file.
# Each NUM_HEADS entry is a (num_query_heads, num_kv_heads) pair; the tests
# read index 0 as the query-head count and index 1 as the KV-head count.
NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
# Per-head embedding sizes to test.
HEAD_SIZES = [128, 256]
# Number of token slots per KV-cache block (page).
BLOCK_SIZES = [16, 32]
# Element dtypes used for the query and the KV cache.
DTYPES = [torch.float16, torch.bfloat16]
# Total number of blocks allocated in the paged KV cache.
NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
def ref_paged_attn(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    query_lens: List[int],
    kv_lens: List[int],
    block_tables: torch.Tensor,
    scale: float,
    sliding_window: Optional[int] = None,
    soft_cap: Optional[float] = None,
) -> torch.Tensor:
    """Reference paged attention, computed one sequence at a time in PyTorch.

    Args:
        query: Packed queries of all sequences, indexed as
            (total_query_tokens, num_query_heads, head_size).
        key_cache: Paged key cache of shape
            (num_blocks, block_size, num_kv_heads, head_size).
        value_cache: Paged value cache, indexed like ``key_cache``.
        query_lens: Number of query tokens of each sequence.
        kv_lens: Number of cached key/value tokens of each sequence.
        block_tables: Per-sequence block indices into the caches; row ``i``
            lists the blocks of sequence ``i`` in order.
        scale: Multiplier applied to the queries before the QK product.
        sliding_window: If given, additionally masks out keys that lie
            further back than this window.
        soft_cap: If given, scores are squashed with
            ``soft_cap * tanh(scores / soft_cap)`` before masking.

    Returns:
        The attention outputs of all sequences concatenated along dim 0, in
        the same packed order as ``query``.

    The input tensors are not modified.
    """
    num_seqs = len(query_lens)
    block_tables = block_tables.cpu().numpy()
    _, block_size, num_kv_heads, head_size = key_cache.shape

    outputs: List[torch.Tensor] = []
    start_idx = 0
    for i in range(num_seqs):
        query_len = query_lens[i]
        kv_len = kv_lens[i]

        # Scale out-of-place: the slice is a view of `query`, so an in-place
        # `*=` would silently rescale the caller's tensor.
        q = query[start_idx:start_idx + query_len] * scale

        # Gather this sequence's blocks and flatten them into a token axis;
        # the last block may be only partially filled, hence the trim.
        num_kv_blocks = (kv_len + block_size - 1) // block_size
        block_indices = block_tables[i, :num_kv_blocks]

        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
        k = k[:kv_len]
        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
        v = v[:kv_len]

        # Grouped-query attention: replicate each KV head so that the head
        # counts of Q and K/V match.
        if q.shape[1] != k.shape[1]:
            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)

        attn = torch.einsum("qhd,khd->hqk", q, k).float()

        # Causal mask (True == masked). The queries are aligned with the
        # *last* `query_len` key positions, hence the diagonal offset.
        # Built on the scores' device so it does not depend on the global
        # default device.
        empty_mask = torch.ones(query_len, kv_len, device=attn.device)
        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
        if sliding_window is not None:
            sliding_window_mask = torch.triu(
                empty_mask,
                diagonal=kv_len - (query_len + sliding_window) +
                1).bool().logical_not()
            mask |= sliding_window_mask

        # Soft-capping is applied before masking so the -inf entries are
        # never fed through tanh.
        if soft_cap is not None:
            attn = soft_cap * torch.tanh(attn / soft_cap)
        attn.masked_fill_(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1).to(v.dtype)
        out = torch.einsum("hqk,khd->qhd", attn, v)

        outputs.append(out)
        start_idx += query_len

    return torch.cat(outputs, dim=0)
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@torch.inference_mode
def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
                                         num_heads: Tuple[int, int],
                                         head_size: int, dtype: torch.dtype,
                                         block_size: int,
                                         soft_cap: Optional[float]) -> None:
    """Check FlashInfer batched decode against the PyTorch reference.

    One query token per sequence attends over a randomly populated paged KV
    cache with a random block table. The output of
    ``flashinfer.BatchDecodeWithPagedKVCacheWrapper`` must match
    ``ref_paged_attn`` within ``atol = rtol = 1e-2``.

    Args:
        kv_lens: Number of cached KV tokens of each sequence in the batch.
        num_heads: ``(num_query_heads, num_kv_heads)``.
        head_size: Per-head embedding size.
        dtype: Element dtype of the query and the KV cache.
        block_size: Number of token slots per KV-cache block.
        soft_cap: Optional logits soft cap forwarded to both implementations.
    """
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
    num_seqs = len(kv_lens)
    num_query_heads, num_kv_heads = num_heads
    assert num_query_heads % num_kv_heads == 0
    max_kv_len = max(kv_lens)
    scale = head_size**-0.5

    # Decode phase: exactly one query token per sequence.
    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
    key_value_cache = torch.randn(NUM_BLOCKS,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # Index 0 / 1 along dim 1 selects the key / value half. Integer indexing
    # already drops that dim, so no squeeze is needed (a squeeze(1) here
    # would wrongly remove the block dim whenever block_size == 1).
    key_cache = key_value_cache[:, 0]
    value_cache = key_value_cache[:, 1]

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Flatten the per-sequence block tables into the CSR-style page table
    # (indptr / indices / last-page length) handed to the wrapper.
    indptr: List[int] = [0]
    indices: List[int] = []
    last_page_lens: List[int] = []
    for i, seq_len in enumerate(kv_lens):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        # One bulk device-to-host copy instead of collecting 0-d CUDA
        # tensors element by element.
        indices.extend(block_tables[i, :num_blocks].tolist())
        indptr.append(indptr[-1] + num_blocks)
        # A completely filled last page has length block_size, not 0.
        last_page_lens.append(seq_len % block_size or block_size)

    kv_indptr = torch.tensor(indptr, dtype=torch.int32)
    kv_indices = torch.tensor(indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer, "NHD")
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
                          kv_last_page_lens,
                          num_query_heads,
                          num_kv_heads,
                          head_size,
                          block_size,
                          "NONE",
                          data_type=dtype)

    output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap)

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache,
                                value_cache=value_cache,
                                query_lens=[1] * num_seqs,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@torch.inference_mode
def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                          num_heads: Tuple[int, int],
                                          head_size: int, dtype: torch.dtype,
                                          block_size: int,
                                          soft_cap: Optional[float]) -> None:
    """Check FlashInfer batched prefill against the PyTorch reference.

    Each sequence contributes a variable number of query tokens that attend
    over a randomly populated paged KV cache with a random block table. The
    output of ``flashinfer.BatchPrefillWithPagedKVCacheWrapper`` must match
    ``ref_paged_attn`` within ``atol = rtol = 1e-2``.

    Args:
        seq_lens: One ``(query_len, kv_len)`` pair per sequence in the batch.
        num_heads: ``(num_query_heads, num_kv_heads)``.
        head_size: Per-head embedding size.
        dtype: Element dtype of the query and the KV cache.
        block_size: Number of token slots per KV-cache block.
        soft_cap: Optional logits soft cap forwarded to both implementations.
    """
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)
    num_seqs = len(seq_lens)
    query_lens = [x[0] for x in seq_lens]
    kv_lens = [x[1] for x in seq_lens]
    num_query_heads, num_kv_heads = num_heads
    assert num_query_heads % num_kv_heads == 0
    max_kv_len = max(kv_lens)
    scale = head_size**-0.5

    # Queries of all sequences are packed along dim 0.
    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    key_value_cache = torch.randn(NUM_BLOCKS,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # Index 0 / 1 along dim 1 selects the key / value half. Integer indexing
    # already drops that dim, so no squeeze is needed (a squeeze(1) here
    # would wrongly remove the block dim whenever block_size == 1).
    key_cache = key_value_cache[:, 0]
    value_cache = key_value_cache[:, 1]

    # Normalize the scale of the key and value caches to mitigate
    # numerical instability. Both are views, so the in-place division also
    # updates `key_value_cache`, which is what the kernel reads.
    key_cache /= head_size**0.5
    value_cache /= head_size**0.5

    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Flatten the per-sequence block tables into the CSR-style page table
    # (indptr / indices / last-page length) and build the matching query
    # offsets handed to the wrapper.
    q_offsets: List[int] = [0]
    indptr: List[int] = [0]
    indices: List[int] = []
    last_page_lens: List[int] = []
    for i, (query_len, seq_len) in enumerate(zip(query_lens, kv_lens)):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        # One bulk device-to-host copy instead of collecting 0-d CUDA
        # tensors element by element.
        indices.extend(block_tables[i, :num_blocks].tolist())
        indptr.append(indptr[-1] + num_blocks)
        # A completely filled last page has length block_size, not 0.
        last_page_lens.append(seq_len % block_size or block_size)
        q_offsets.append(q_offsets[-1] + query_len)

    qo_indptr = torch.tensor(q_offsets, dtype=torch.int32)
    kv_indptr = torch.tensor(indptr, dtype=torch.int32)
    kv_indices = torch.tensor(indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, "NHD")
    wrapper.begin_forward(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_last_page_lens,
        num_query_heads,
        num_kv_heads,
        head_size,
        block_size,
    )

    output = wrapper.forward(
        query,
        key_value_cache,
        logits_soft_cap=soft_cap,
    )

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache,
                                value_cache=value_cache,
                                query_lens=query_lens,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
Prev
1
…
7
8
9
10
11
12
13
14
15
…
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment