Commit 705f6a35 authored by zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
...@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download ...@@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download
from vllm import LLM from vllm import LLM
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ..conftest import cleanup from ...conftest import cleanup
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...@@ -20,8 +20,6 @@ PROMPTS = [ ...@@ -20,8 +20,6 @@ PROMPTS = [
LORA_NAME = "typeof/zephyr-7b-beta-lora" LORA_NAME = "typeof/zephyr-7b-beta-lora"
pytestmark = pytest.mark.llm
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
...@@ -44,7 +42,7 @@ def llm(): ...@@ -44,7 +42,7 @@ def llm():
cleanup() cleanup()
@pytest.fixture(scope="session") @pytest.fixture(scope="module")
def zephyr_lora_files(): def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME) return snapshot_download(repo_id=LORA_NAME)
......
import pytest
@pytest.fixture
def sample_regex():
    """Provide a regex for a dotted-quad address with octets in 0-255."""
    # One octet; the same sub-pattern is repeated for all four positions.
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    # Three "octet." groups followed by a final bare octet.
    return rf"({octet}\.){{3}}{octet}"
@pytest.fixture
def sample_json_schema():
    """Provide a JSON schema describing an employee-profile object.

    The schema requires ``name``, ``age``, ``skills`` (at least three short
    strings) and ``work_history`` (a list of job entries).
    """
    text_field = {"type": "string"}

    # Each skill is a string of at most 10 characters.
    skills = {
        "type": "array",
        "items": {
            "type": "string",
            "maxLength": 10
        },
        "minItems": 3
    }

    # A single job entry; ``duration`` is optional.
    job_entry = {
        "type": "object",
        "properties": {
            "company": dict(text_field),
            "duration": {
                "type": "number"
            },
            "position": dict(text_field)
        },
        "required": ["company", "position"]
    }

    return {
        "type": "object",
        "properties": {
            "name": dict(text_field),
            "age": {
                "type": "integer"
            },
            "skills": skills,
            "work_history": {
                "type": "array",
                "items": job_entry
            }
        },
        "required": ["name", "age", "skills", "work_history"]
    }
@pytest.fixture
def sample_guided_choice():
    """Provide the list of programming-language names used as choices."""
    languages = (
        "Python",
        "Java",
        "JavaScript",
        "C++",
        "C#",
        "PHP",
        "TypeScript",
        "Ruby",
        "Swift",
        "Kotlin",
    )
    # Return a fresh list on every call so tests cannot share mutations.
    return list(languages)
@pytest.fixture
def sample_sql_statements():
    """Provide a small grammar for a restricted SELECT statement."""
    rules = [
        'start: select_statement',
        'select_statement: "SELECT" column "from" table "where" condition',
        'column: "col_1" | "col_2"',
        'table: "table_1" | "table_2"',
        'condition: column "=" number',
        'number: "1" | "2"',
    ]
    # The empty first and last items reproduce the leading and trailing
    # newline of the grammar text.
    return "\n".join(["", *rules, ""])
\ No newline at end of file
# imports for guided decoding tests # imports for guided decoding tests
import json import json
import re import re
from typing import List
import jsonschema import jsonschema
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
# using Ray for overall ease of process management, parallel requests,
# and debugging.
import ray
import torch import torch
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from openai import BadRequestError from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer
from ..utils import ServerRunner
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here # generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora" LORA_NAME = "typeof/zephyr-7b-beta-lora"
# JSON schema for an employee-profile object. It is passed as `guided_json`
# and used with `jsonschema.validate` in the guided-decoding tests below.
TEST_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
# at least three skills, each a string of at most 10 characters
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
# NOTE: this key contains a space ("work history")
"work history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "string"
},
"position": {
"type": "string"
}
},
# "duration" is optional for a work-history entry
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work history"]
}
# Regex for a dotted-quad address (four octets in 0-255); passed as
# `guided_regex` and checked with `re.fullmatch` in the tests below.
TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
# Allowed answers passed as `guided_choice` in the tests below.
TEST_CHOICE = [
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
"Swift", "Kotlin"
]
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
@pytest.fixture(scope="session")
def zephyr_lora_files(): def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME) return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(zephyr_lora_files): def server(zephyr_lora_files):
ray.init() with RemoteOpenAIServer([
server_runner = ServerRunner.remote([ "--model",
"--model", MODEL_NAME,
MODEL_NAME, # use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment "--dtype",
"--dtype", "bfloat16",
"bfloat16", "--max-model-len",
"--max-model-len", "8192",
"8192", "--enforce-eager",
"--enforce-eager", # lora config below
"--gpu-memory-utilization", "--enable-lora",
"0.75", "--lora-modules",
# lora config below f"zephyr-lora={zephyr_lora_files}",
"--enable-lora", f"zephyr-lora2={zephyr_lora_files}",
"--lora-modules", "--max-lora-rank",
f"zephyr-lora={zephyr_lora_files}", "64",
f"zephyr-lora2={zephyr_lora_files}", "--max-cpu-loras",
"--max-lora-rank", "2",
"64", "--max-num-seqs",
"--max-cpu-loras", "128",
"2", ]) as remote_server:
"--max-num-seqs", yield remote_server
"128",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def embedding_server(zephyr_lora_files): def client(server):
ray.shutdown() return server.get_async_client()
ray.init()
server_runner = ServerRunner.remote([
"--model",
EMBEDDING_MODEL_NAME,
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--enforce-eager",
"--gpu-memory-utilization",
"0.75",
"--max-model-len",
"8192",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="module")
def client():
    """Yield one module-scoped async OpenAI client for the local server."""
    yield openai.AsyncOpenAI(
        api_key="token-abc123",
        base_url="http://localhost:8000/v1",
    )
@pytest.mark.asyncio
async def test_check_models(server, client: openai.AsyncOpenAI):
    """Check the model listing: base model first, then both LoRA adapters."""
    entries = (await client.models.list()).data
    base_entry = entries[0]
    adapter_entries = entries[1:]

    assert base_entry.id == MODEL_NAME
    # Every listed model, adapters included, reports the base model as root.
    assert all(entry.root == MODEL_NAME for entry in entries)
    assert adapter_entries[0].id == "zephyr-lora"
    assert adapter_entries[1].id == "zephyr-lora2"
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    """Request one greedy completion and check its shape and token usage."""
    text_result = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0,
    )

    assert text_result.id is not None
    assert text_result.choices is not None
    assert len(text_result.choices) == 1

    only_choice = text_result.choices[0]
    assert len(only_choice.text) >= 5
    # Generation stops because the 5-token budget is exhausted.
    assert only_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)
    assert text_result.usage == expected_usage

    # test using token IDs
    token_id_result = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_id_result.choices[0].text) >= 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(server, client: openai.AsyncOpenAI,
                           model_name: str):
    """Check that `logprobs=None` yields a choice without logprobs.

    The request targets the parametrized `model_name`, so the base model and
    each LoRA adapter are exercised rather than the base model three times.
    """
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    choice = completion.choices[0]
    assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
                             model_name: str):
    """Check that `logprobs=0` still returns the sampled token's logprob.

    The request targets the parametrized `model_name`, so the LoRA case
    actually runs against the LoRA adapter.
    """
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    # Exactly one entry is expected at the first position.
    assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(server, client: openai.AsyncOpenAI,
                             model_name: str):
    """Check that `logprobs=5` returns five or six entries per position.

    The request targets the parametrized `model_name`, so the LoRA case
    actually runs against the LoRA adapter.
    """
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    # The check accepts 5 or 6 entries for the first position.
    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
                                            model_name: str):
    """Check that oversized `logprobs` requests are rejected.

    Both the plain and the streaming request must fail, and the server must
    keep serving afterwards. Every request targets the parametrized
    `model_name` so the LoRA case is really exercised.
    """
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )

    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # Drain the stream so a deferred error is raised inside the context.
        async for _ in stream:
            pass

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    # Liveness check only: any text, including an empty string, is accepted.
    assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -300,8 +62,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, ...@@ -300,8 +62,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -326,8 +87,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -326,8 +87,7 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -354,8 +114,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -354,8 +114,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -382,7 +141,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -382,7 +141,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -425,7 +184,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -425,7 +184,7 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_single_chat_session(server, client: openai.AsyncOpenAI, async def test_single_chat_session(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -465,48 +224,13 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, ...@@ -465,48 +224,13 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
assert message.content is not None and len(message.content) >= 0 assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
                                    model_name: str):
    """Check that streamed chunks join up to the non-streamed completion."""
    prompt = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    expected_text = reference.choices[0].text

    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )

    pieces = []
    finish_events = 0
    async for event in stream:
        head = event.choices[0]
        pieces.append(head.text)
        if head.finish_reason is not None:
            finish_events += 1

    # finish reason should only return in last block
    assert finish_events == 1
    # `event` still refers to the final chunk of the stream here.
    assert event.choices[0].finish_reason == "length"
    assert event.choices[0].text
    assert "".join(pieces) == expected_text
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_chat_streaming(server, client: openai.AsyncOpenAI, async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -533,7 +257,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ...@@ -533,7 +257,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks = [] chunks: List[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -555,8 +279,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ...@@ -555,8 +279,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
"model_name", "model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
) )
async def test_chat_completion_stream_options(server, async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -621,195 +344,56 @@ async def test_chat_completion_stream_options(server, ...@@ -621,195 +344,56 @@ async def test_chat_completion_stream_options(server,
stream_options={"include_usage": True}) stream_options={"include_usage": True})
@pytest.mark.asyncio # NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
@pytest.mark.parametrize( # (i.e. using the same ordering as in the Completions API tests), the test
"model_name", # will fail on the second `guided_decoding_backend` even when I swap their order
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"], # (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
)
async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
                                         model_name: str):
    """Check `stream_options.include_usage` handling on the completions API.

    Covers streaming with usage disabled, streaming with usage enabled (the
    usage arrives in an extra chunk with no choices), and rejection of
    `stream_options` on non-streaming requests.
    """
    prompt = "What is the capital of France?"
    # Arguments shared by every request in this test.
    shared = dict(model=model_name,
                  prompt=prompt,
                  max_tokens=5,
                  temperature=0.0)

    # Test stream=True, stream_options={"include_usage": False}
    stream = await client.completions.create(
        stream=True, stream_options={"include_usage": False}, **shared)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options={"include_usage": True}
    stream = await client.completions.create(
        stream=True, stream_options={"include_usage": True}, **shared)
    async for chunk in stream:
        is_last_content_chunk = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if not is_last_content_chunk:
            continue
        # The chunk after the finishing one carries only the usage totals.
        final_chunk = await stream.__anext__()
        usage = final_chunk.usage
        assert usage is not None
        assert usage.prompt_tokens > 0
        assert usage.completion_tokens > 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)
        assert final_chunk.choices == []

    # Test stream=False, stream_options={"include_usage": None}
    # Test stream=False, stream_options={"include_usage": True}
    for include_usage in (None, True):
        with pytest.raises(BadRequestError):
            await client.completions.create(
                stream=False,
                stream_options={"include_usage": include_usage},
                **shared)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
                                 model_name: str):
    """Check batched prompts: plain list, `n=2` beam search, and streaming."""
    duplicated = ("Hello, my name is", "Hello, my name is")

    # test simple list
    plain = await client.completions.create(
        model=model_name,
        prompt=list(duplicated),
        max_tokens=5,
        temperature=0.0,
    )
    assert len(plain.choices) == 2
    assert plain.choices[0].text == plain.choices[1].text

    # test n = 2
    beamed = await client.completions.create(
        model=model_name,
        prompt=list(duplicated),
        n=2,
        max_tokens=5,
        temperature=0.0,
        # NOTE: this has to be true for n > 1 in vLLM, but not necessary
        # for official client.
        extra_body={"use_beam_search": True},
    )
    assert len(beamed.choices) == 4
    first, second, third, fourth = (item.text for item in beamed.choices)
    assert first != second, "beam search should be different"
    assert first == third, "two copies of the same prompt should be the same"
    assert second == fourth, "two copies of the same prompt should be the same"

    # test streaming
    streamed = await client.completions.create(
        model=model_name,
        prompt=list(duplicated),
        max_tokens=5,
        temperature=0.0,
        stream=True,
    )
    # Accumulate text per prompt, keyed by the choice index of each chunk.
    texts = ["", ""]
    async for chunk in streamed:
        assert len(chunk.choices) == 1
        piece = chunk.choices[0]
        texts[piece.index] += piece.text
    assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(server, client: openai.AsyncOpenAI):
    """Check that `logit_bias` can both force and ban specific tokens."""
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Token IDs for `text`, without special tokens.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    forced_text = forced.choices[0].text
    assert len(forced_text) >= 5
    response_tokens = encode(forced_text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    assert all(got == want
               for got, want in zip(response_tokens, expected_tokens))

    # Test ban
    baseline = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = baseline.choices[0].text
    # Penalise every token of the unbiased answer in the next request.
    banned = {str(token): -100 for token in encode(first_response)}
    rerun = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=banned,
    )
    assert first_response != rerun.choices[0].text
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str,
completion = await client.completions.create( sample_guided_choice):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
prompt=f"Give an example JSON for an employee profile " messages=messages,
f"that fits this schema: {TEST_SCHEMA}", max_tokens=10,
n=3, extra_body=dict(guided_choice=sample_guided_choice,
temperature=1.0,
max_tokens=500,
extra_body=dict(guided_json=TEST_SCHEMA,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
assert choice1 in sample_guided_choice
assert completion.id is not None messages.append({"role": "assistant", "content": choice1})
assert len(completion.choices) == 3 messages.append({
for i in range(3): "role": "user",
output_json = json.loads(completion.choices[i].text) "content": "I disagree, pick another one"
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) })
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
assert choice2 in sample_guided_choice
assert choice1 != choice2
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(server, client: openai.AsyncOpenAI, async def test_guided_json_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str,
sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -818,18 +402,18 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, ...@@ -818,18 +402,18 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
"user", "user",
"content": "content":
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}" f"fits this schema: {sample_json_schema}"
}] }]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_tokens=1000,
extra_body=dict(guided_json=TEST_SCHEMA, extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None assert message.content is not None
json1 = json.loads(message.content) json1 = json.loads(message.content)
jsonschema.validate(instance=json1, schema=TEST_SCHEMA) jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": message.content}) messages.append({"role": "assistant", "content": message.content})
messages.append({ messages.append({
...@@ -842,12 +426,12 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, ...@@ -842,12 +426,12 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=1000, max_tokens=1000,
extra_body=dict(guided_json=TEST_SCHEMA, extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None assert message.content is not None
json2 = json.loads(message.content) json2 = json.loads(message.content)
jsonschema.validate(instance=json2, schema=TEST_SCHEMA) jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"] assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"] assert json1["age"] != json2["age"]
...@@ -855,28 +439,8 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI, ...@@ -855,28 +439,8 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, async def test_guided_regex_chat(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str, sample_regex):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
n=3,
temperature=1.0,
max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 3
for i in range(3):
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -884,17 +448,17 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, ...@@ -884,17 +448,17 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
"role": "role":
"user", "user",
"content": "content":
f"Give an example IP address with this regex: {TEST_REGEX}" f"Give an example IP address with this regex: {sample_regex}"
}] }]
chat_completion = await client.chat.completions.create( chat_completion = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=20, max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX, extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content ip1 = chat_completion.choices[0].message.content
assert ip1 is not None assert ip1 is not None
assert re.fullmatch(TEST_REGEX, ip1) is not None assert re.fullmatch(sample_regex, ip1) is not None
messages.append({"role": "assistant", "content": ip1}) messages.append({"role": "assistant", "content": ip1})
messages.append({"role": "user", "content": "Give me a different one"}) messages.append({"role": "user", "content": "Give me a different one"})
...@@ -902,39 +466,16 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI, ...@@ -902,39 +466,16 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_tokens=20, max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX, extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content ip2 = chat_completion.choices[0].message.content
assert ip2 is not None assert ip2 is not None
assert re.fullmatch(TEST_REGEX, ip2) is not None assert re.fullmatch(sample_regex, ip2) is not None
assert ip1 != ip2 assert ip1 != ip2
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
prompt="The best language for type-safe systems programming is ",
n=2,
temperature=1.0,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 2
for i in range(2):
assert completion.choices[i].text in TEST_CHOICE
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -944,52 +485,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI, ...@@ -944,52 +485,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
"content": "content":
"The best language for type-safe systems programming is " "The best language for type-safe systems programming is "
}] }]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
assert choice1 in TEST_CHOICE
messages.append({"role": "assistant", "content": choice1})
messages.append({
"role": "user",
"content": "I disagree, pick another one"
})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
assert choice2 in TEST_CHOICE
assert choice1 != choice2
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example JSON that fits this schema: 42",
extra_body=dict(guided_json=42,
guided_decoding_backend=guided_decoding_backend))
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=MODEL_NAME, _ = await client.chat.completions.create(model=MODEL_NAME,
messages=messages, messages=messages,
...@@ -998,18 +494,13 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI, ...@@ -998,18 +494,13 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
2: "C++" 2: "C++"
})) }))
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str,
sample_guided_choice):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1025,7 +516,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -1025,7 +516,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
max_tokens=10, max_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=5, top_logprobs=5,
extra_body=dict(guided_choice=TEST_CHOICE, extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs is not None
...@@ -1040,8 +531,9 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -1040,8 +531,9 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"]) ["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI, async def test_named_tool_use(client: openai.AsyncOpenAI,
guided_decoding_backend: str): guided_decoding_backend: str,
sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1050,7 +542,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1050,7 +542,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"user", "user",
"content": "content":
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}" f"fits this schema: {sample_json_schema}"
}] }]
# non-streaming # non-streaming
...@@ -1064,7 +556,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1064,7 +556,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": TEST_SCHEMA "parameters": sample_json_schema
} }
}], }],
tool_choice={ tool_choice={
...@@ -1077,7 +569,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1077,7 +569,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
assert len(message.content) == 0 assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments json_string = message.tool_calls[0].function.arguments
json1 = json.loads(json_string) json1 = json.loads(json_string)
jsonschema.validate(instance=json1, schema=TEST_SCHEMA) jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": json_string}) messages.append({"role": "assistant", "content": json_string})
messages.append({ messages.append({
...@@ -1098,7 +590,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1098,7 +590,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": TEST_SCHEMA "parameters": sample_json_schema
} }
}], }],
tool_choice={ tool_choice={
...@@ -1123,7 +615,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1123,7 +615,7 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
# finish reason should only return in last block # finish reason should only return in last block
assert finish_reason_count == 1 assert finish_reason_count == 1
json2 = json.loads("".join(output)) json2 = json.loads("".join(output))
jsonschema.validate(instance=json2, schema=TEST_SCHEMA) jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"] assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"] assert json1["age"] != json2["age"]
...@@ -1131,7 +623,8 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI, ...@@ -1131,7 +623,8 @@ async def test_named_tool_use(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported( async def test_required_tool_use_not_yet_supported(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str): client: openai.AsyncOpenAI, guided_decoding_backend: str,
sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1140,7 +633,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -1140,7 +633,7 @@ async def test_required_tool_use_not_yet_supported(
"user", "user",
"content": "content":
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}" f"fits this schema: {sample_json_schema}"
}] }]
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
...@@ -1153,7 +646,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -1153,7 +646,7 @@ async def test_required_tool_use_not_yet_supported(
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": TEST_SCHEMA "parameters": sample_json_schema
} }
}], }],
tool_choice="required") tool_choice="required")
...@@ -1168,7 +661,7 @@ async def test_required_tool_use_not_yet_supported( ...@@ -1168,7 +661,7 @@ async def test_required_tool_use_not_yet_supported(
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": TEST_SCHEMA "parameters": sample_json_schema
} }
}], }],
tool_choice="auto") tool_choice="auto")
...@@ -1176,8 +669,9 @@ async def test_required_tool_use_not_yet_supported( ...@@ -1176,8 +669,9 @@ async def test_required_tool_use_not_yet_supported(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"]) @pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools( async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
server, client: openai.AsyncOpenAI, guided_decoding_backend: str): guided_decoding_backend: str,
sample_json_schema):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -1186,7 +680,7 @@ async def test_inconsistent_tool_choice_and_tools( ...@@ -1186,7 +680,7 @@ async def test_inconsistent_tool_choice_and_tools(
"user", "user",
"content": "content":
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}" f"fits this schema: {sample_json_schema}"
}] }]
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
...@@ -1211,7 +705,7 @@ async def test_inconsistent_tool_choice_and_tools( ...@@ -1211,7 +705,7 @@ async def test_inconsistent_tool_choice_and_tools(
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": TEST_SCHEMA "parameters": sample_json_schema
} }
}], }],
tool_choice={ tool_choice={
...@@ -1223,7 +717,7 @@ async def test_inconsistent_tool_choice_and_tools( ...@@ -1223,7 +717,7 @@ async def test_inconsistent_tool_choice_and_tools(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_response_format_json_object(server, client: openai.AsyncOpenAI): async def test_response_format_json_object(client: openai.AsyncOpenAI):
for _ in range(2): for _ in range(2):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -1243,7 +737,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): ...@@ -1243,7 +737,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_extra_fields(server, client: openai.AsyncOpenAI): async def test_extra_fields(client: openai.AsyncOpenAI):
with pytest.raises(BadRequestError) as exc_info: with pytest.raises(BadRequestError) as exc_info:
await client.chat.completions.create( await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -1259,7 +753,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI): ...@@ -1259,7 +753,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_complex_message_content(server, client: openai.AsyncOpenAI): async def test_complex_message_content(client: openai.AsyncOpenAI):
resp = await client.chat.completions.create( resp = await client.chat.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
messages=[{ messages=[{
...@@ -1279,7 +773,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI): ...@@ -1279,7 +773,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_custom_role(server, client: openai.AsyncOpenAI): async def test_custom_role(client: openai.AsyncOpenAI):
# Not sure how the model handles custom roles so we just check that # Not sure how the model handles custom roles so we just check that
# both string and complex message content are handled in the same way # both string and complex message content are handled in the same way
...@@ -1310,77 +804,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI): ...@@ -1310,77 +804,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_grammar(server, client: openai.AsyncOpenAI): async def test_long_seed(client: openai.AsyncOpenAI):
simple_sql_grammar = """
start: select_statement
select_statement: "SELECT" column "from" table "where" condition
column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
completion = await client.completions.create(
model=MODEL_NAME,
prompt=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
temperature=1.0,
max_tokens=500,
extra_body=dict(guided_grammar=simple_sql_grammar))
content = completion.choices[0].text
# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
parser = Lark(simple_sql_grammar)
parser.parse(content)
# remove spaces for comparison b/c we removed them in the grammar
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
assert content.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check that ``echo=True`` returns the prompt text plus prompt logprobs.

    The request is issued once with a text prompt and once with token IDs.
    For each response the test asserts that the completion text begins with
    the prompt, that more than five logprob entries are returned, and that
    the very first entry of ``token_logprobs`` / ``top_logprobs`` is ``None``.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        # Token-ID prompts are decoded so they can be compared as text.
        prompt_text = (tokenizer.decode(prompt)
                       if isinstance(prompt, list) else prompt)
        # The prompt is data, not a pattern: compare it as a literal prefix
        # so regex metacharacters in a prompt cannot change the outcome.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # Each later position holds between max(logprobs_arg, 1) and
        # logprobs_arg + 1 candidates.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
async def test_long_seed(server, client: openai.AsyncOpenAI):
for seed in [ for seed in [
torch.iinfo(torch.long).min - 1, torch.iinfo(torch.long).min - 1,
torch.iinfo(torch.long).max + 1 torch.iinfo(torch.long).max + 1
...@@ -1397,83 +821,3 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): ...@@ -1397,83 +821,3 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
assert ("greater_than_equal" in exc_info.value.message assert ("greater_than_equal" in exc_info.value.message
or "less_than_equal" in exc_info.value.message) or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
                                model_name: str):
    """Request a single embedding from text, then from token IDs.

    Both responses must contain exactly one 4096-element embedding, report
    zero completion tokens, and report the expected prompt/total token count.
    """
    # (request input, expected prompt and total token count)
    scenarios = (
        # test single embedding
        (["The chef prepared a delicious meal."], 9),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    )
    for request_input, expected_token_count in scenarios:
        response = await client.embeddings.create(
            model=model_name,
            input=request_input,
            encoding_format="float",
        )

        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        assert response.usage.completion_tokens == 0
        assert response.usage.prompt_tokens == expected_token_count
        assert response.usage.total_tokens == expected_token_count
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
                               model_name: str):
    """Request embeddings for a batch of strings and a batch of token lists.

    One embedding must come back per input item, each 4096 elements long;
    usage counters are additionally checked for the token-ID batch.
    """
    # test List[str]
    sentences = [
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
    text_response = await client.embeddings.create(
        model=model_name,
        input=sentences,
        encoding_format="float",
    )
    assert text_response.id is not None
    assert len(text_response.data) == 3
    assert len(text_response.data[0].embedding) == 4096

    # test List[List[int]]
    token_batches = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                     [25, 32, 64, 77]]
    token_response = await client.embeddings.create(
        model=model_name,
        input=token_batches,
        encoding_format="float",
    )
    assert token_response.id is not None
    assert len(token_response.data) == 4
    assert len(token_response.data[0].embedding) == 4096
    # 5 + 3 + 5 + 4 token IDs were submitted, none were generated.
    assert token_response.usage.completion_tokens == 0
    assert token_response.usage.prompt_tokens == 17
    assert token_response.usage.total_tokens == 17
# Allow running this test module directly as a script by handing the file
# to pytest.
if __name__ == "__main__":
    pytest.main([__file__])
# imports for guided decoding tests
import json
import re
from typing import List
import jsonschema
import openai # use the official client for correctness check
import pytest
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
# Base model launched by the `server` fixture; any model with a chat template
# should work here.
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# LoRA adapter repository fetched by the `zephyr_lora_files` fixture.
# Technically this adapter needs Mistral-7B-v0.1 as its base, but generation
# quality is not what these tests check.
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the ``LORA_NAME`` repository once per module.

    Returns whatever ``snapshot_download`` returns for that repository; the
    `server` fixture interpolates it into its ``--lora-modules`` arguments.
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    """Start one ``RemoteOpenAIServer`` per module and yield it.

    The server is launched for ``MODEL_NAME`` with LoRA enabled; the same
    adapter files are registered under two names, ``zephyr-lora`` and
    ``zephyr-lora2``. The context manager is exited when the module's tests
    are done.
    """
    base_args = [
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    scheduler_args = ["--max-num-seqs", "128"]

    launch_args = [*base_args, *lora_args, *scheduler_args]
    with RemoteOpenAIServer(launch_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Return the async client obtained from the module's `server` fixture."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Run one greedy 5-token completion and check its shape and usage.

    A text prompt is sent to ``model_name`` and the single returned choice
    must stop with ``finish_reason == "length"`` and report 6 prompt tokens
    plus 5 completion tokens. A second request then uses raw token IDs.
    """
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)

    assert text_completion.id is not None
    returned_choices = text_completion.choices
    assert returned_choices is not None and len(returned_choices) == 1

    only_choice = returned_choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=6,
                                                  total_tokens=11)
    assert text_completion.usage == expected_usage

    # test using token IDs
    # NOTE(review): this request always targets MODEL_NAME, not the
    # parametrized model_name - confirm whether that is intentional.
    token_completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_completion.choices[0].text) >= 5
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``logprobs=None`` yields a choice without logprobs.

    The request targets the parametrized ``model_name`` so that the base
    model and each LoRA adapter are actually exercised.
    """
    # test using token IDs
    completion = await client.completions.create(
        # Use the parametrized model; hard-coding MODEL_NAME here would make
        # every LoRA case a repeat of the base-model case.
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    choice = completion.choices[0]
    assert choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``logprobs=0`` still returns a logprobs object.

    The first position of ``top_logprobs`` must hold exactly one entry. The
    request targets the parametrized ``model_name`` so the LoRA case is
    actually exercised.
    """
    # test using token IDs
    completion = await client.completions.create(
        # Use the parametrized model; hard-coding MODEL_NAME here would make
        # the LoRA case a repeat of the base-model case.
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``logprobs=5`` returns five or six top entries per token.

    The request targets the parametrized ``model_name`` so the LoRA case is
    actually exercised.
    """
    # test using token IDs
    completion = await client.completions.create(
        # Use the parametrized model; hard-coding MODEL_NAME here would make
        # the LoRA case a repeat of the base-model case.
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    choice = completion.choices[0]
    assert choice.logprobs is not None
    assert choice.logprobs.token_logprobs is not None
    assert choice.logprobs.top_logprobs is not None
    # One extra entry is tolerated on top of the five requested.
    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Check that an excessive ``logprobs`` value is rejected.

    Both a non-streaming and a streaming request must raise, and a normal
    request sent afterwards must still succeed. All three requests target
    the parametrized ``model_name`` so the LoRA case is actually exercised.
    """
    rejection_errors = (openai.BadRequestError, openai.APIError)

    # test using token IDs
    with pytest.raises(rejection_errors):
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )

    with pytest.raises(rejection_errors):
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # Drain the stream in case the error only surfaces while iterating.
        async for _ in stream:
            pass

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    # Any text length is acceptable; the point is that the request succeeds.
    assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """Check that a streamed completion matches the non-streamed one.

    The same greedy request is sent twice; the concatenated stream chunks
    must equal the single-shot text, and exactly one chunk (the last) may
    carry a finish reason.
    """
    prompt = "What is an LLM?"
    shared_request = dict(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )

    reference = await client.completions.create(**shared_request)
    single_output = reference.choices[0].text

    stream = await client.completions.create(**shared_request, stream=True)

    pieces: List[str] = []
    chunks_with_finish_reason = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        pieces.append(streamed_choice.text)
        if streamed_choice.finish_reason is not None:
            chunks_with_finish_reason += 1

    # finish reason should only return in last block
    assert chunks_with_finish_reason == 1
    # streamed_choice still refers to the final chunk's choice here.
    assert streamed_choice.finish_reason == "length"
    assert streamed_choice.text
    assert "".join(pieces) == single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Exercise every ``stream_options`` combination of the Completions API.

    Streaming requests are checked for where usage statistics appear;
    non-streaming requests that carry ``stream_options`` must be rejected.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage: bool, continuous_usage_stats: bool):
        # All streaming scenarios share one request and differ only in
        # their stream_options.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            })

    async def check_trailing_usage_chunk(active_stream):
        # The chunk after the finishing one carries only usage, no choices.
        final_chunk = await active_stream.__anext__()
        assert final_chunk.usage is not None
        assert final_chunk.usage.prompt_tokens > 0
        assert final_chunk.usage.completion_tokens > 0
        assert final_chunk.usage.total_tokens == (
            final_chunk.usage.prompt_tokens +
            final_chunk.usage.completion_tokens)
        assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    # and then
    #     {"include_usage": False, "continuous_usage_stats": True}
    for continuous in (False, True):
        stream = await open_stream(False, continuous)
        async for chunk in stream:
            assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        is_finishing_chunk = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if is_finishing_chunk:
            await check_trailing_usage_chunk(stream)

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            await check_trailing_usage_chunk(stream)

    # Test stream=False with each of these stream_options, in order:
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    rejected_options = (
        {"include_usage": None},
        {"include_usage": True},
        {"continuous_usage_stats": None},
        {"continuous_usage_stats": True},
    )
    for stream_options in rejected_options:
        with pytest.raises(BadRequestError):
            await client.completions.create(model=model_name,
                                            prompt=prompt,
                                            max_tokens=5,
                                            temperature=0.0,
                                            stream=False,
                                            stream_options=stream_options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Check batched prompts in plain, beam-search and streaming modes.

    Each batch holds two copies of the same prompt (as text, then as token
    IDs), so the outputs for the two copies must match.
    """
    text_batch = ["Hello, my name is"] * 2
    token_id_batch = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_batch, token_id_batch):
        # test simple list
        plain = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain.choices) == 2
        assert plain.choices[0].text == plain.choices[1].text

        # test n = 2
        beamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
            # for official client.
            extra_body={"use_beam_search": True},
        )
        assert len(beamed.choices) == 4
        beam_texts = [candidate.text for candidate in beamed.choices]
        assert beam_texts[0] != beam_texts[
            1], "beam search should be different"
        assert beam_texts[0] == beam_texts[
            2], "two copies of the same prompt should be the same"
        assert beam_texts[1] == beam_texts[
            3], "two copies of the same prompt should be the same"

        # test streaming
        streamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        # Accumulate each prompt's text by the choice index on every chunk.
        texts = ["", ""]
        async for chunk in streamed:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            texts[piece.index] += piece.text
        assert texts[0] == texts[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can force a token and can ban tokens.

    A +100 bias on one token ID must make the output re-tokenize to that
    token; a -100 bias on every token of an unbiased answer must change
    the answer.
    """
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def encode(text):
        # Token IDs of `text` without any special tokens added.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    token_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token_id): 100},
        seed=42,
    )
    assert len(forced.choices[0].text) >= 5
    response_tokens = encode(forced.choices[0].text)
    expected_tokens = encode(tokenizer.decode([token_id] * 5))
    # zip stops at the shorter sequence, so only the shared prefix is compared.
    assert all(actual == wanted
               for actual, wanted in zip(response_tokens, expected_tokens))

    # Test ban
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    first_response = unbiased.choices[0].text
    response_tokens = encode(first_response)

    ban_bias = {str(token): -100 for token in response_tokens}
    banned = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias=ban_bias,
    )
    assert first_response != banned.choices[0].text
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Check that ``guided_json`` output parses and validates.

    Three samples are requested; each must be valid JSON that satisfies
    ``sample_json_schema``.
    """
    guided_options = dict(guided_json=sample_json_schema,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    for sample in completion.choices:
        output_json = json.loads(sample.text)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Check that ``guided_regex`` output fully matches the pattern.

    Three samples are requested; each complete text must match
    ``sample_regex``.
    """
    guided_options = dict(guided_regex=sample_regex,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    for sample in completion.choices:
        whole_match = re.fullmatch(sample_regex, sample.text)
        assert whole_match is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Check that ``guided_choice`` output is one of the allowed options.

    Two samples are requested; each text must be a member of
    ``sample_guided_choice``.
    """
    guided_options = dict(guided_choice=sample_guided_choice,
                          guided_decoding_backend=guided_decoding_backend)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 2
    for sample in completion.choices:
        assert sample.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Check that ``guided_grammar`` output obeys the supplied grammar.

    The generated text must parse with a Lark parser built from
    ``sample_sql_statements`` and equal the expected statement with all
    spaces removed.
    """
    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark

    guided_options = dict(guided_grammar=sample_sql_statements)
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=("Generate a sql state that select col_1 from "
                "table_1 where it is equals to 1"),
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)
    generated = completion.choices[0].text

    # Parsing raises if the text is not valid under the grammar.
    Lark(sample_sql_statements).parse(generated)

    # remove spaces for comparison b/c we removed them in the grammar
    expected_statement = "SELECT col_1 from table_1 where col_1 = 1"
    ground_truth = expected_statement.replace(" ", "")
    assert generated.strip() == ground_truth
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check that ``echo=True`` returns the prompt text plus prompt logprobs.

    The request is issued once with a text prompt and once with token IDs.
    For each response the test asserts that the completion text begins with
    the prompt, that more than five logprob entries are returned, and that
    the very first entry of ``token_logprobs`` / ``top_logprobs`` is ``None``.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)

        # Token-ID prompts are decoded so they can be compared as text.
        prompt_text = (tokenizer.decode(prompt)
                       if isinstance(prompt, list) else prompt)
        # The prompt is data, not a pattern: compare it as a literal prefix
        # so regex metacharacters in a prompt cannot change the outcome.
        assert completion.choices[0].text.startswith(prompt_text)

        logprobs = completion.choices[0].logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5
        assert (len(logprobs.token_logprobs) > 5
                and logprobs.token_logprobs[0] is None)
        assert (len(logprobs.top_logprobs) > 5
                and logprobs.top_logprobs[0] is None)
        # Each later position holds between max(logprobs_arg, 1) and
        # logprobs_arg + 1 candidates.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Check that malformed guided-decoding requests are rejected.

    Two requests must raise ``openai.BadRequestError``: one passing the
    integer 42 as ``guided_json``, and one passing ``guided_regex`` and
    ``guided_json`` together.
    """
    # guided_json given as a bare integer
    wrong_type_options = dict(guided_json=42,
                              guided_decoding_backend=guided_decoding_backend)
    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options)

    # guided_regex and guided_json supplied in the same request
    conflicting_options = dict(guided_regex=sample_regex,
                               guided_json=sample_json_schema)
    with pytest.raises(openai.BadRequestError):
        _ = await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
    """Compare the ``/tokenize`` endpoint with the local fast tokenizer.

    The request is issued once without and once with special tokens; each
    response must contain the locally computed token IDs, their count and
    a ``max_model_len`` of 8192.
    """
    # Drop the last three characters of the client's base URL (presumably
    # the "v1/" suffix; confirm against the client) and any stray slashes.
    server_root = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
    prompt = "This is a test prompt."

    for add_special in (False, True):
        expected_ids = tokenizer.encode(prompt, add_special_tokens=add_special)

        payload = {
            "add_special_tokens": add_special,
            "model": model_name,
            "prompt": prompt,
        }
        response = requests.post(server_root + "/tokenize", json=payload)
        response.raise_for_status()

        expected_body = {
            "tokens": expected_ids,
            "count": len(expected_ids),
            "max_model_len": 8192,
        }
        assert response.json() == expected_body
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
    """Check that ``/detokenize`` turns locally encoded IDs back into text.

    The prompt is encoded with the local fast tokenizer (no special tokens)
    and the endpoint must return exactly the original prompt.
    """
    # Normalise the server root the same way test_tokenize does, so the
    # endpoint path is joined with an explicit "/" instead of relying on the
    # sliced base URL happening to end with one.
    base_url = str(client.base_url)[:-3].strip("/")
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")

    prompt = "This is a test prompt."
    tokens = tokenizer.encode(prompt, add_special_tokens=False)

    response = requests.post(base_url + "/detokenize",
                             json={
                                 "model": model_name,
                                 "tokens": tokens
                             })
    response.raise_for_status()
    assert response.json() == {"prompt": prompt}
import base64
import numpy as np
import openai
import pytest
from ...utils import RemoteOpenAIServer
# Model launched by the ``embedding_server`` fixture and requested by the
# embedding tests in this module.
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@pytest.fixture(scope="module")
def embedding_server():
    """Start one ``RemoteOpenAIServer`` for the embedding model per module.

    Yields the running server object; the ``with`` block ends when the
    module's tests are finished.
    """
    with RemoteOpenAIServer([
            "--model",
            EMBEDDING_MODEL_NAME,
            # use half precision for speed and memory savings in CI environment
            "--dtype",
            "bfloat16",
            # passed once; the flag was previously listed twice
            "--enforce-eager",
            "--max-model-len",
            "8192",
    ]) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def embedding_client(embedding_server):
    """Return the async client obtained from the module's embedding server."""
    # No asyncio mark here: this is a plain synchronous fixture, and pytest
    # marks have no effect when applied to fixture functions.
    return embedding_server.get_async_client()
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
    """Embed a single input, first given as text and then as token IDs.

    Each response must contain one 4096-dimensional embedding, report zero
    completion tokens, and report the expected prompt/total token count.
    """
    # (request input, expected prompt token count)
    cases = [
        # test single embedding
        (["The chef prepared a delicious meal."], 9),
        # test using token IDs
        ([1, 1, 1, 1, 1], 5),
    ]

    for request_input, expected_tokens in cases:
        result = await embedding_client.embeddings.create(
            model=model_name,
            input=request_input,
            encoding_format="float",
        )

        assert result.id is not None
        assert len(result.data) == 1
        assert len(result.data[0].embedding) == 4096

        usage = result.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == expected_tokens
        assert usage.total_tokens == expected_tokens
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                               model_name: str):
    """Embed a batch of inputs, given as strings and as token-ID lists.

    Each batch must yield one embedding per input, with the first embedding
    being 4096-dimensional. Usage counters are checked for the token batch.
    """
    # test List[str]
    sentence_batch = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_result = await embedding_client.embeddings.create(
        model=model_name,
        input=sentence_batch,
        encoding_format="float",
    )
    assert text_result.id is not None
    assert len(text_result.data) == 3
    assert len(text_result.data[0].embedding) == 4096

    # test List[List[int]]
    token_batch = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_result = await embedding_client.embeddings.create(
        model=model_name,
        input=token_batch,
        encoding_format="float",
    )
    assert token_result.id is not None
    assert len(token_result.data) == 4
    assert len(token_result.data[0].embedding) == 4096

    # 17 is the total number of token IDs sent above: 5 + 3 + 5 + 4.
    usage = token_result.usage
    assert usage.completion_tokens == 0
    assert usage.prompt_tokens == 17
    assert usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check that base64-encoded embeddings decode to the float embeddings.

    The same two texts are embedded with ``encoding_format="float"`` and
    ``"base64"``; after decoding, the first two embeddings must be equal.
    """
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models"
    ]

    as_float = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float")
    as_base64 = await embedding_client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64")

    # Decode each base64 payload into a list of numbers, reading the raw
    # bytes with NumPy's "float" dtype.
    decoded = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float").tolist() for item in as_base64.data
    ]

    assert as_float.data[0].embedding == decoded[0]
    assert as_float.data[1].embedding == decoded[1]
...@@ -10,61 +10,17 @@ from vllm.model_executor.guided_decoding import ( ...@@ -10,61 +10,17 @@ from vllm.model_executor.guided_decoding import (
from vllm.model_executor.guided_decoding.outlines_logits_processors import ( from vllm.model_executor.guided_decoding.outlines_logits_processors import (
JSONLogitsProcessor, RegexLogitsProcessor) JSONLogitsProcessor, RegexLogitsProcessor)
TEST_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "string"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work history"]
}
TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" def test_guided_logits_processors(sample_regex, sample_json_schema):
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
pytestmark = pytest.mark.openai
def test_guided_logits_processors():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
json_LP = JSONLogitsProcessor(TEST_SCHEMA, json_LP = JSONLogitsProcessor(sample_json_schema,
tokenizer, tokenizer,
whitespace_pattern=None) whitespace_pattern=None)
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {TEST_REGEX}") f"Give an example IPv4 address with this regex: {sample_regex}")
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
regex_LP(token_ids, tensor) regex_LP(token_ids, tensor)
...@@ -72,7 +28,8 @@ def test_guided_logits_processors(): ...@@ -72,7 +28,8 @@ def test_guided_logits_processors():
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {TEST_SCHEMA}") f"Give an employee profile that fits this schema: {sample_json_schema}"
)
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
json_LP(token_ids, tensor) json_LP(token_ids, tensor)
...@@ -82,13 +39,14 @@ def test_guided_logits_processors(): ...@@ -82,13 +39,14 @@ def test_guided_logits_processors():
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) @pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
async def test_guided_logits_processor_black_box(backend: str): async def test_guided_logits_processor_black_box(backend: str, sample_regex,
sample_json_schema):
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {TEST_REGEX}") f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = CompletionRequest(model='test', regex_request = CompletionRequest(model='test',
prompt=token_ids, prompt=token_ids,
guided_regex=TEST_REGEX) guided_regex=sample_regex)
regex_lp = await get_guided_decoding_logits_processor( regex_lp = await get_guided_decoding_logits_processor(
backend, regex_request, tokenizer) backend, regex_request, tokenizer)
assert regex_lp is not None assert regex_lp is not None
...@@ -99,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str): ...@@ -99,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {TEST_SCHEMA}") f"Give an employee profile that fits this schema: {sample_json_schema}"
)
json_request = CompletionRequest(model='test', json_request = CompletionRequest(model='test',
prompt=token_ids, prompt=token_ids,
guided_json=TEST_SCHEMA) guided_json=sample_json_schema)
json_lp = await get_guided_decoding_logits_processor( json_lp = await get_guided_decoding_logits_processor(
backend, json_request, tokenizer) backend, json_request, tokenizer)
assert json_lp is not None assert json_lp is not None
......
import openai # use the official client for correctness check
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the ``LORA_NAME`` repository snapshot once per module.

    Returns the value produced by ``snapshot_download``, which the ``server``
    fixture interpolates into its ``--lora-modules`` arguments.
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server(zephyr_lora_files):
    """Start one LoRA-enabled ``RemoteOpenAIServer`` per module and yield it.

    The same downloaded adapter is registered under two names,
    ``zephyr-lora`` and ``zephyr-lora2``.
    """
    base_args = [
        "--model",
        MODEL_NAME,
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "128",
    ]

    with RemoteOpenAIServer(base_args + lora_args) as remote_server:
        yield remote_server
@pytest.fixture(scope="module")
def client(server):
    """Provide the async client obtained from the module's running server."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI):
    """Check the model listing: base model first, then the two LoRA names.

    Every listed entry must report ``MODEL_NAME`` as its ``root``.
    """
    listing = await client.models.list()
    available = listing.data

    # The first entry is expected to be the base model; the rest are adapters.
    base_model, adapters = available[0], available[1:]

    assert base_model.id == MODEL_NAME
    assert all(entry.root == MODEL_NAME for entry in available)
    assert adapters[0].id == "zephyr-lora"
    assert adapters[1].id == "zephyr-lora2"
import sys import sys
import time import time
import pytest
import torch import torch
from openai import OpenAI, OpenAIError from openai import OpenAI, OpenAIError
...@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM ...@@ -10,8 +9,6 @@ from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.utils import get_open_port from vllm.utils import get_open_port
pytestmark = pytest.mark.openai
class MyOPTForCausalLM(OPTForCausalLM): class MyOPTForCausalLM(OPTForCausalLM):
......
...@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput ...@@ -6,7 +6,8 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501 # ruff: noqa: E501
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
......
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
import pytest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
pytestmark = pytest.mark.openai
@dataclass @dataclass
class MockModelConfig: class MockModelConfig:
......
from pathlib import Path from typing import Dict, List
from typing import Dict
import openai import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64 from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
from ..utils import ServerRunner from ...utils import VLLM_PATH, RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja"
"examples/template_llava.jinja")
assert LLAVA_CHAT_TEMPLATE.exists() assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
...@@ -22,43 +20,26 @@ TEST_IMAGE_URLS = [ ...@@ -22,43 +20,26 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
] ]
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
ray.init() with RemoteOpenAIServer([
server_runner = ServerRunner.remote([ "--model",
"--model", MODEL_NAME,
MODEL_NAME, "--dtype",
"--dtype", "bfloat16",
"bfloat16", "--max-model-len",
"--max-model-len", "4096",
"4096", "--enforce-eager",
"--enforce-eager", "--chat-template",
"--image-input-type", str(LLAVA_CHAT_TEMPLATE),
"pixel_values", ]) as remote_server:
"--image-token-id", yield remote_server
"32000",
"--image-input-shape",
"1,3,336,336", @pytest.fixture(scope="module")
"--image-feature-size", def client(server):
"576", return server.get_async_client()
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest_asyncio.fixture(scope="session") @pytest_asyncio.fixture(scope="session")
...@@ -73,7 +54,7 @@ async def base64_encoded_image() -> Dict[str, str]: ...@@ -73,7 +54,7 @@ async def base64_encoded_image() -> Dict[str, str]:
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, async def test_single_chat_session_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str): model_name: str, image_url: str):
messages = [{ messages = [{
"role": "role":
...@@ -126,7 +107,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, ...@@ -126,7 +107,7 @@ async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded( async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]): base64_encoded_image: Dict[str, str]):
messages = [{ messages = [{
...@@ -180,7 +161,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -180,7 +161,7 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, async def test_chat_streaming_image(client: openai.AsyncOpenAI,
model_name: str, image_url: str): model_name: str, image_url: str):
messages = [{ messages = [{
"role": "role":
...@@ -217,7 +198,7 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, ...@@ -217,7 +198,7 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks = [] chunks: List[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -237,8 +218,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, ...@@ -237,8 +218,8 @@ async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI, async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
model_name: str, image_url: str): image_url: str):
messages = [{ messages = [{
"role": "role":
...@@ -280,7 +261,3 @@ async def test_multi_image_input(server, client: openai.AsyncOpenAI, ...@@ -280,7 +261,3 @@ async def test_multi_image_input(server, client: openai.AsyncOpenAI,
) )
completion = completion.choices[0].text completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0 assert completion is not None and len(completion) >= 0
if __name__ == "__main__":
pytest.main([__file__])
...@@ -73,27 +73,27 @@ def ref_single_query_cached_kv_attention( ...@@ -73,27 +73,27 @@ def ref_single_query_cached_kv_attention(
block_size = value_cache.shape[3] block_size = value_cache.shape[3]
num_seqs = query.shape[0] num_seqs = query.shape[0]
block_tables = block_tables.cpu().tolist() block_tables_lst = block_tables.cpu().tolist()
seq_lens = seq_lens.cpu().tolist() seq_lens_lst = seq_lens.cpu().tolist()
for i in range(num_seqs): for i in range(num_seqs):
q = query[i].unsqueeze(0) q = query[i].unsqueeze(0)
block_table = block_tables[i] block_table = block_tables_lst[i]
seq_len = int(seq_lens[i]) seq_len = int(seq_lens_lst[i])
keys = [] keys_lst: List[torch.Tensor] = []
values = [] values_lst: List[torch.Tensor] = []
for j in range(seq_len): for j in range(seq_len):
block_number = int(block_table[j // block_size]) block_number = int(block_table[j // block_size])
block_offset = j % block_size block_offset = j % block_size
k = key_cache[block_number, :, :, block_offset, :] k = key_cache[block_number, :, :, block_offset, :]
k = k.reshape(num_kv_heads, head_size) k = k.reshape(num_kv_heads, head_size)
keys.append(k) keys_lst.append(k)
v = value_cache[block_number, :, :, block_offset] v = value_cache[block_number, :, :, block_offset]
values.append(v) values_lst.append(v)
keys = torch.stack(keys, dim=0) keys = torch.stack(keys_lst, dim=0)
values = torch.stack(values, dim=0) values = torch.stack(values_lst, dim=0)
if num_queries_per_kv > 1: if num_queries_per_kv > 1:
# Handle MQA and GQA # Handle MQA and GQA
keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
...@@ -158,14 +158,15 @@ def test_paged_attention( ...@@ -158,14 +158,15 @@ def test_paged_attention(
# Create the block tables. # Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = [] block_tables_lst: List[List[int]] = []
for _ in range(num_seqs): for _ in range(num_seqs):
block_table = [ block_table = [
random.randint(0, NUM_BLOCKS - 1) random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq) for _ in range(max_num_blocks_per_seq)
] ]
block_tables.append(block_table) block_tables_lst.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int)
block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
# Create the KV caches. # Create the KV caches.
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
...@@ -284,7 +285,7 @@ def ref_multi_query_kv_attention( ...@@ -284,7 +285,7 @@ def ref_multi_query_kv_attention(
dtype: torch.dtype, dtype: torch.dtype,
) -> torch.Tensor: ) -> torch.Tensor:
num_seqs = len(cu_seq_lens) - 1 num_seqs = len(cu_seq_lens) - 1
ref_outputs = [] ref_outputs: List[torch.Tensor] = []
for i in range(num_seqs): for i in range(num_seqs):
start_idx = cu_seq_lens[i] start_idx = cu_seq_lens[i]
end_idx = cu_seq_lens[i + 1] end_idx = cu_seq_lens[i + 1]
...@@ -304,8 +305,8 @@ def ref_multi_query_kv_attention( ...@@ -304,8 +305,8 @@ def ref_multi_query_kv_attention(
attn_mask=attn_mask, attn_mask=attn_mask,
) )
ref_outputs.append(ref_output) ref_outputs.append(ref_output)
ref_output = torch.cat(ref_outputs, dim=0)
return ref_output return torch.cat(ref_outputs, dim=0)
# TODO(woosuk): Add tests for USE_ALIBI=True. # TODO(woosuk): Add tests for USE_ALIBI=True.
......
...@@ -9,8 +9,8 @@ from vllm.attention.selector import which_attn_to_use ...@@ -9,8 +9,8 @@ from vllm.attention.selector import which_attn_to_use
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("device", ["cpu", "hip"]) @pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, device: str, monkeypatch): def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
...@@ -28,6 +28,11 @@ def test_env(name: str, device: str, monkeypatch): ...@@ -28,6 +28,11 @@ def test_env(name: str, device: str, monkeypatch):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16) torch.float16, 16)
assert backend.name == "ROCM_FLASH" assert backend.name == "ROCM_FLASH"
elif device == "openvino":
with patch("vllm.attention.selector.is_openvino", return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)
assert backend.name == "OPENVINO"
else: else:
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16) torch.float16, 16)
...@@ -42,32 +47,32 @@ def test_flash_attn(monkeypatch): ...@@ -42,32 +47,32 @@ def test_flash_attn(monkeypatch):
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]): with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported data type # Unsupported data type
backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16) backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported kv cache data type # Unsupported kv cache data type
backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16) backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported block size # Unsupported block size
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8) backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported sliding window # Unsupported sliding window
backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16) backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# flash-attn is not installed # flash-attn is not installed
with patch.dict('sys.modules', {'vllm_flash_attn': None}): with patch.dict('sys.modules', {'vllm_flash_attn': None}):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported head size # Unsupported head size
backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != STR_FLASH_ATTN_VAL
def test_invalid_env(monkeypatch): def test_invalid_env(monkeypatch):
......
...@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention( ...@@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention(
block_size = value_cache.shape[3] block_size = value_cache.shape[3]
num_seqs = query.shape[0] num_seqs = query.shape[0]
block_tables = block_tables.cpu().tolist() block_tables_lst = block_tables.cpu().tolist()
seq_lens = seq_lens.cpu().tolist() seq_lens_lst = seq_lens.cpu().tolist()
for i in range(num_seqs): for i in range(num_seqs):
q = query[i].unsqueeze(0) q = query[i].unsqueeze(0)
block_table = block_tables[i] block_table = block_tables_lst[i]
seq_len = int(seq_lens[i]) seq_len = int(seq_lens_lst[i])
keys = [] keys_lst: List[torch.Tensor] = []
values = [] values_lst: List[torch.Tensor] = []
for j in range(seq_len): for j in range(seq_len):
block_number = int(block_table[j // block_size]) block_number = int(block_table[j // block_size])
block_offset = j % block_size block_offset = j % block_size
k = key_cache[block_number, :, :, block_offset, :] k = key_cache[block_number, :, :, block_offset, :]
k = k.reshape(num_kv_heads, head_size) k = k.reshape(num_kv_heads, head_size)
keys.append(k) keys_lst.append(k)
v = value_cache[block_number, :, :, block_offset] v = value_cache[block_number, :, :, block_offset]
values.append(v) values_lst.append(v)
keys = torch.stack(keys, dim=0) keys = torch.stack(keys_lst, dim=0)
values = torch.stack(values, dim=0) values = torch.stack(values_lst, dim=0)
if num_queries_per_kv > 1: if num_queries_per_kv > 1:
# Handle MQA and GQA # Handle MQA and GQA
keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1)
...@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill( ...@@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill(
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
ref_output = ref_multi_query_kv_attention( ref_output = ref_multi_query_kv_attention(
cu_seq_lens, cu_seq_lens.tolist(),
query, query,
key, key,
value, value,
......
import random import random
from typing import Tuple from typing import List, Tuple
import pytest import pytest
import torch import torch
...@@ -64,7 +64,7 @@ def test_copy_blocks( ...@@ -64,7 +64,7 @@ def test_copy_blocks(
src_blocks = random.sample(range(num_blocks), num_mappings) src_blocks = random.sample(range(num_blocks), num_mappings)
remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
block_mapping = [] block_mapping: List[Tuple[int, int]] = []
for i in range(num_mappings): for i in range(num_mappings):
src = src_blocks[i] src = src_blocks[i]
dst1 = dst_blocks[2 * i] dst1 = dst_blocks[2 * i]
...@@ -132,8 +132,8 @@ def test_reshape_and_cache( ...@@ -132,8 +132,8 @@ def test_reshape_and_cache(
torch.set_default_device(device) torch.set_default_device(device)
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens) slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long)
qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
_, key, value = qkv.unbind(dim=1) _, key, value = qkv.unbind(dim=1)
...@@ -171,12 +171,12 @@ def test_reshape_and_cache( ...@@ -171,12 +171,12 @@ def test_reshape_and_cache(
# Run the reference implementation. # Run the reference implementation.
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indicies = block_indicies.cpu().tolist() block_indicies_lst = block_indicies.cpu().tolist()
block_offsets = slot_mapping % block_size block_offsets = slot_mapping % block_size
block_offsets = block_offsets.cpu().tolist() block_offsets_lst = block_offsets.cpu().tolist()
for i in range(num_tokens): for i in range(num_tokens):
block_idx = block_indicies[i] block_idx = block_indicies_lst[i]
block_offset = block_offsets[i] block_offset = block_offsets_lst[i]
cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
cloned_value_cache[block_idx, :, :, block_offset] = value[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i]
...@@ -225,8 +225,10 @@ def test_reshape_and_cache_flash( ...@@ -225,8 +225,10 @@ def test_reshape_and_cache_flash(
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens) slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) slot_mapping = torch.tensor(slot_mapping_lst,
dtype=torch.long,
device=device)
qkv = torch.randn(num_tokens, qkv = torch.randn(num_tokens,
3, 3,
...@@ -258,13 +260,13 @@ def test_reshape_and_cache_flash( ...@@ -258,13 +260,13 @@ def test_reshape_and_cache_flash(
slot_mapping, kv_cache_dtype) slot_mapping, kv_cache_dtype)
# Run the reference implementation. # Run the reference implementation.
block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_indicies = block_indicies.cpu().tolist() block_indicies_lst = block_indicies.cpu().tolist()
block_offsets = slot_mapping % block_size block_offsets = slot_mapping % block_size
block_offsets = block_offsets.cpu().tolist() block_offsets_lst = block_offsets.cpu().tolist()
for i in range(num_tokens): for i in range(num_tokens):
block_idx = block_indicies[i] block_idx = block_indicies_lst[i]
block_offset = block_offsets[i] block_offset = block_offsets_lst[i]
cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i]
cloned_value_cache[block_idx, block_offset, :, :] = value[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i]
......
...@@ -2,36 +2,53 @@ ...@@ -2,36 +2,53 @@
Run `pytest tests/kernels/test_cutlass.py`. Run `pytest tests/kernels/test_cutlass.py`.
""" """
from typing import Type from typing import Optional, Type
import pytest import pytest
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ]
capability = torch.cuda.get_device_capability() capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1] capability = capability[0] * 10 + capability[1]
def to_fp8(tensor: torch.tensor): def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp( return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.tensor): def to_int8(tensor: torch.Tensor):
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def baseline_scaled_mm(a: torch.Tensor,
                       b: torch.Tensor,
                       scale_a: torch.Tensor,
                       scale_b: torch.Tensor,
                       out_dtype: Type[torch.dtype],
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Reference (non-CUTLASS) scaled matrix multiply.

    Computes ``scale_a * (scale_b * (a @ b))`` with both operands upcast
    to float32, casts the result to ``out_dtype`` and, when ``bias`` is
    given, adds it *after* the cast.

    Args:
        a: Left operand; converted to float32 before the matmul.
        b: Right operand; converted to float32 before the matmul.
        scale_a: Multiplier applied to the product via broadcasting.
        scale_b: Multiplier applied to the product via broadcasting.
        out_dtype: dtype the scaled product is cast to.
        bias: Optional tensor added to the cast result.

    Returns:
        The scaled (and optionally biased) product.
    """
    # Accumulate in float32 regardless of the (quantized) input dtype.
    product = torch.mm(a.to(dtype=torch.float32), b.to(dtype=torch.float32))
    scaled = scale_a * (scale_b * product)
    result = scaled.to(out_dtype)
    if bias is None:
        return result
    return result + bias
def cutlass_fp8_gemm_helper(m: int, def cutlass_fp8_gemm_helper(m: int,
n: int, n: int,
k: int, k: int,
per_token_act_quant: bool, per_token_act_quant: bool,
per_out_channel_weight_quant: bool, per_out_channel_weight_quant: bool,
use_bias: bool,
out_dtype: Type[torch.dtype] = torch.bfloat16, out_dtype: Type[torch.dtype] = torch.bfloat16,
device: str = "cuda"): device: str = "cuda"):
# Test for a cutlass kernel with per-token activation quantization # Test for a cutlass kernel with per-token activation quantization
...@@ -42,16 +59,19 @@ def cutlass_fp8_gemm_helper(m: int, ...@@ -42,16 +59,19 @@ def cutlass_fp8_gemm_helper(m: int,
m_a_scales = m if per_token_act_quant else 1 m_a_scales = m if per_token_act_quant else 1
n_b_scales = n if per_out_channel_weight_quant else 1 n_b_scales = n if per_out_channel_weight_quant else 1
scale_a = (torch.randn( scale_a = (torch.randn((m_a_scales, 1), device=device,
(m_a_scales, 1), device=device, dtype=torch.float32) / 10) dtype=torch.float32))
scale_b = (torch.randn( scale_b = (torch.randn((1, n_b_scales), device=device,
(1, n_b_scales), device=device, dtype=torch.float32) / 10) dtype=torch.float32))
if use_bias:
bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
else:
bias = None
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
scale_b * b.to(dtype=torch.float32)).to(out_dtype)
assert torch.allclose(out, baseline, rtol=1e-2, atol=1e-1) assert torch.allclose(out, baseline, rtol=1e-2, atol=5e-2)
def cutlass_int8_gemm_helper(m: int, def cutlass_int8_gemm_helper(m: int,
...@@ -59,6 +79,7 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -59,6 +79,7 @@ def cutlass_int8_gemm_helper(m: int,
k: int, k: int,
per_token_act_quant: bool, per_token_act_quant: bool,
per_out_channel_weight_quant: bool, per_out_channel_weight_quant: bool,
use_bias: bool,
out_dtype: Type[torch.dtype] = torch.bfloat16, out_dtype: Type[torch.dtype] = torch.bfloat16,
device: str = "cuda"): device: str = "cuda"):
# Test for a cutlass kernel with per-token activation quantization # Test for a cutlass kernel with per-token activation quantization
...@@ -69,15 +90,18 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -69,15 +90,18 @@ def cutlass_int8_gemm_helper(m: int,
m_a_scales = m if per_token_act_quant else 1 m_a_scales = m if per_token_act_quant else 1
n_b_scales = n if per_out_channel_weight_quant else 1 n_b_scales = n if per_out_channel_weight_quant else 1
scale_a = (torch.randn( scale_a = (torch.randn((m_a_scales, 1), device=device,
(m_a_scales, 1), device=device, dtype=torch.float32) / 10) dtype=torch.float32))
scale_b = (torch.randn( scale_b = (torch.randn((1, n_b_scales), device=device,
(1, n_b_scales), device=device, dtype=torch.float32) / 10) dtype=torch.float32))
out = ops.cutlass_scaled_mm_dq(a, b, scale_a, scale_b, out_dtype) if use_bias:
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
scale_b * else:
b.to(dtype=torch.float32)).to(dtype=out_dtype) bias = None
out = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
baseline = baseline_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
...@@ -87,11 +111,12 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -87,11 +111,12 @@ def cutlass_int8_gemm_helper(m: int,
@pytest.mark.parametrize("k", [128, 496, 1024]) @pytest.mark.parametrize("k", [128, 496, 1024])
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(capability < 89,
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
per_out_ch: bool): per_out_ch: bool, use_bias: bool):
cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch) cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
@pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("m", [512, 222, 33, 1])
...@@ -99,49 +124,72 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, ...@@ -99,49 +124,72 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool,
@pytest.mark.parametrize("k", [128, 496, 1024]) @pytest.mark.parametrize("k", [128, 496, 1024])
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool, def test_cutlass_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
per_out_ch: bool): per_out_ch: bool, use_bias: bool):
cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch) cutlass_int8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("use_bias", [True, False])
def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, def test_cutlass_int8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
out_dtype: Type[torch.dtype]): out_dtype: Type[torch.dtype],
cutlass_int8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias: bool):
out_dtype) cutlass_int8_gemm_helper(512,
512,
512,
per_act_token,
per_out_ch,
use_bias,
out_dtype=out_dtype)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16]) @pytest.mark.parametrize("out_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(capability < 89,
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool, def test_cutlass_fp8_gemm_output_dtype(per_act_token: bool, per_out_ch: bool,
out_dtype: Type[torch.dtype]): out_dtype: Type[torch.dtype],
cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias: bool):
out_dtype) cutlass_fp8_gemm_helper(512,
512,
512,
per_act_token,
per_out_ch,
use_bias,
out_dtype=out_dtype)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(capability < 89,
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool, def test_cutlass_fp8_gemm_devices(per_act_token: bool, per_out_ch: bool,
device: str): use_bias: bool, device: str):
cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, cutlass_fp8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, use_bias,
torch.bfloat16, device) torch.bfloat16, device)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool, def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
device: str): use_bias: bool, device: str):
cutlass_int8_gemm_helper(512, 512, 512, per_act_token, per_out_ch, cutlass_int8_gemm_helper(512,
torch.bfloat16, device) 512,
512,
per_act_token,
per_out_ch,
use_bias,
out_dtype=torch.bfloat16,
device=device)
# For the following two tests: # For the following two tests:
...@@ -151,20 +199,26 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool, ...@@ -151,20 +199,26 @@ def test_cutlass_int8_gemm_devices(per_act_token: bool, per_out_ch: bool,
# kernel must handle any M thrown at it. # kernel must handle any M thrown at it.
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.skipif(capability < 89, @pytest.mark.skipif(capability < 89,
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool): def test_cutlass_fp8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
use_bias: bool):
for nk in range(32, 128, 32): for nk in range(32, 128, 32):
for m in range(1, 128): for m in range(1, 128):
cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch) cutlass_fp8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
use_bias)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False])
def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool): @pytest.mark.parametrize("use_bias", [True, False])
def test_cutlass_int8_gemm_m_sweep(per_act_token: bool, per_out_ch: bool,
use_bias: bool):
for nk in range(32, 128, 32): for nk in range(32, 128, 32):
for m in range(1, 128): for m in range(1, 128):
cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch) cutlass_int8_gemm_helper(m, nk, nk, per_act_token, per_out_ch,
use_bias)
# Test working with a subset of A and B # Test working with a subset of A and B
...@@ -180,14 +234,16 @@ def test_cutlass_subset(): ...@@ -180,14 +234,16 @@ def test_cutlass_subset():
scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
out = ops.cutlass_scaled_mm_dq(a, out = ops.cutlass_scaled_mm(a,
b, b,
scale_a, scale_a,
scale_b, scale_b,
out_dtype=torch.bfloat16) out_dtype=torch.bfloat16)
baseline = torch.mm(scale_a * a.to(dtype=torch.float32), baseline = baseline_scaled_mm(a,
scale_b * b,
b.to(dtype=torch.float32)).to(dtype=torch.bfloat16) scale_a,
scale_b,
out_dtype=torch.bfloat16)
assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
...@@ -203,8 +259,8 @@ class CutlassLayer(torch.nn.Module): ...@@ -203,8 +259,8 @@ class CutlassLayer(torch.nn.Module):
self.out_dtype = out_dtype self.out_dtype = out_dtype
def forward(self, a): def forward(self, a):
return ops.cutlass_scaled_mm_dq(a, self.b, self.scale_a, self.scale_b, return ops.cutlass_scaled_mm(a, self.b, self.scale_a, self.scale_b,
self.out_dtype) self.out_dtype)
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
......
"""
Tests:
* E2E test of Encoder attention + Decoder self-attention +
Encoder/decoder cross-attention (collectively
"encoder/decoder attention")
* Confirm enc/dec models will fail for chunked prefill
* Confirm enc/dec models will fail for prefix caching
"""
from typing import NamedTuple, Optional
import pytest
import torch
from tests.kernels.utils import *
from tests.kernels.utils import make_causal_mask, maybe_make_long_tensor
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from vllm.utils import is_hip
# Candidate parameter values for the encoder/decoder attention tests.
# NOTE(review): presumably consumed by pytest parametrization further down
# the file - confirm against the test functions.
HEAD_SIZES = [64, 256]
NUM_HEADS = [1, 16]
BATCH_SIZES = [1, 16]
BLOCK_SIZES = [16]
# Backend names to test; presumably the xFormers backend identifier
# (imported via the tests.kernels.utils star-import) - TODO confirm.
BACKEND_NAMES = [STR_XFORMERS_ATTN_VAL]
# Device passed to the tensor / KV-cache construction helpers in this file.
CUDA_DEVICE = "cuda:0"
MAX_DEC_SEQ_LENS = [128]
MAX_ENC_SEQ_LENS = [128]
# Narrow test-cases for unsupported-scenario
# tests: only the first head size is exercised.
HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]
class TestPoint(NamedTuple):
    """
    One parameter combination for a single invocation of the
    test_e2e_enc_dec_attn() test.

    The field order is significant: the setup helpers in this file
    unpack a TestPoint positionally.

    Attributes:
        num_heads: Number of attention heads.
        head_size: Per-head dimension.
        backend_name: Name of the attention backend being tested.
        batch_size: Number of sequences per batch.
        block_size: Block size handed to the KV-cache and block-table
                    helpers.
        max_dec_seq_len: Maximum decoder sequence length.
        max_enc_seq_len: Maximum encoder sequence length.
        num_blocks: Number of blocks requested when the KV cache is built.
    """

    num_heads: int
    head_size: int
    backend_name: str
    batch_size: int
    block_size: int
    max_dec_seq_len: int
    max_enc_seq_len: int
    num_blocks: int
class TestResources(NamedTuple):
    """
    Bundle of the key objects needed to run an encoder/decoder
    attention test.

    Notes:

    (1) attn picks its attention backend automatically, based on
        platform info & a set of canned heuristics.
    (2) attn_backend is therefore *not the same backend instance* that
        attn uses internally; it is meant to be a *different instance*
        of the *same backend class*. Users of TestResources are expected
        to use attn_backend to build backend-compatible attention
        metadata instances.

    Attributes:

    * scale: 1/sqrt(d) scale factor for attn
    * attn_backend: implementation of the abstract attention interface
                    on top of a particular kernel library, i.e. XFormers
    * attn: Attention layer instance
    * kv_cache: shared key/value cache for all attention; the builder in
                this file stores None here when no KV cache is required
    """

    scale: float
    attn_backend: AttentionBackend
    attn: Attention
    kv_cache: torch.Tensor
def _make_test_resources(test_pt: TestPoint, ) -> TestResources:
    """
    Construct the scale factor, backend, Attention layer and (optionally)
    KV cache used by an encoder/decoder attention test.

    Notes:

    (1) The Attention instance built here chooses its attention backend
        class automatically from platform info & canned heuristics, so
    (2) the backend instance built here is *not the same instance* that
        attn uses, but a *different instance* of the *same backend
        class*; therefore
    (3) test_pt.backend_name must match the backend class that Attention
        selects on its own.

    Arguments:

    * test_pt: TestPoint data structure; the fields read here are
               num_heads, head_size, num_blocks, block_size and
               backend_name

    Returns:

    * TestResources data structure; its kv_cache field is None when
      test_pt.num_blocks or test_pt.num_heads is None.
    """
    # 1/sqrt(head_size) attention scale.
    scale = float(1.0 / (test_pt.head_size**0.5))
    attn_backend = make_backend(test_pt.backend_name)
    attn = Attention(test_pt.num_heads, test_pt.head_size, scale=scale)

    # A KV cache is only built when both of its sizing inputs are present.
    wants_kv_cache = not (test_pt.num_blocks is None
                          or test_pt.num_heads is None)
    kv_cache = (make_kv_cache(test_pt.num_blocks,
                              test_pt.num_heads,
                              test_pt.head_size,
                              test_pt.block_size,
                              device=CUDA_DEVICE) if wants_kv_cache else None)
    return TestResources(scale, attn_backend, attn, kv_cache)
def _encoder_attn_setup(
    test_pt: TestPoint,
    test_rsrcs: TestResources,
) -> PhaseTestParameters:
    """
    Build the inputs and expected output for the encoder attention test.

    Synthetic query/key/value tensors are generated; because this is
    encoder attention, keys & values get the same maximum length as the
    queries. The tensors are run through a naive reference attention
    implementation to obtain the ideal output.

    Encoder inference does not populate the KV cache, so no KV cache
    memory mapping is produced.

    Arguments:

    * test_pt: TestPoint data structure; the fields consumed here are
               num_heads, head_size, batch_size and max_enc_seq_len
               (used locally as the max query length)
    * test_rsrcs: TestResources data structure; only the scale field
                  is read

    Returns:

    * PhaseTestParameters data structure comprising (1) packed
      query/key/value tensors, (2) the packed ideal attention output, and
      (3) a KV-cache field of None
    """
    # Positional unpack follows TestPoint's field order; slot 6 is
    # max_enc_seq_len.
    num_heads, head_size, _, batch_size, _, _, max_q_seq_len, _ = test_pt

    # Encoder attention: K/V share the query's maximum length.
    qkv_in, _, _ = make_qkv(batch_size,
                            max_q_seq_len,
                            max_q_seq_len,
                            num_heads,
                            head_size,
                            attn_type=AttentionType.ENCODER,
                            device=CUDA_DEVICE)

    # Reference answer from the naive implementation; no custom mask is
    # supplied here.
    ideal_output = ref_masked_attention(qkv_in.query,
                                        qkv_in.key,
                                        qkv_in.value,
                                        scale=test_rsrcs.scale,
                                        q_seq_lens=qkv_in.q_seq_lens,
                                        kv_seq_lens=qkv_in.kv_seq_lens)

    packed_ideal_output, _ = pack_tensor(ideal_output,
                                         qkv_in.q_seq_lens,
                                         device=CUDA_DEVICE)
    packed_qkv = pack_qkv(qkv_in, device=CUDA_DEVICE)

    # Second argument is the KV-cache mapping: none for the encoder.
    return PhaseTestParameters(PackedQKVO(packed_qkv, packed_ideal_output),
                               None)
def _decoder_attn_setup(
    test_pt: TestPoint,
    test_rsrcs: TestResources,
    block_base_addr: int = 0,
) -> Tuple[QKVInputs, PhaseTestParameters, PhaseTestParameters, int]:
    """
    Build the inputs, expected outputs and KV-cache memory mapping for
    the decoder self-attention test.

    "Baseline" query/key/value tensors are synthesized; since this is
    self-attention, keys & values get the same maximum length as the
    queries.

    * "Prefill" q/k/v are the baseline tensors with the last value of
      each sequence masked out. They exercise prefill and populate the
      KV cache for the following decode test.
    * "Decode" q/k/v hold *only* the last value of each baseline sequence
      (the complement of the prefill tensors). They exercise decode,
      assuming the KV cache was filled by the prefill test.

    The baseline tensors are passed through a naive reference
    implementation with a causal mask, yielding a baseline ideal output.
    That output is split into a prefill part (all but the last element of
    each sequence) and a decode part (only the last element), which are
    used to validate the prefill and decode results respectively.

    The self-attention slot mapping and block table are also built, with
    the block table starting at block_base_addr.

    Arguments:

    * test_pt: TestPoint data structure; the fields consumed here are
               num_heads, head_size, batch_size, block_size and
               max_dec_seq_len (used locally as the max query length)
    * test_rsrcs: TestResources data structure; only the scale field
                  is read
    * block_base_addr: decoder self-attention block-table base address

    Returns:

    * qkv: Unpacked (batch_size x padded_seq_len x num_heads x
           head_size) query/key/value tensors
    * Prefill-phase PhaseTestParameters: (1) packed (number_of_tokens x
      num_heads x head_size) query/key/value tensors, (2) packed ideal
      output, (3) prefill-phase memory-mapping structures
    * Decode-phase PhaseTestParameters: same layout as above, for the
      decode phase
    * max_block_idx: max physical address in the decoder self-attention
                     block-table (intended as the base address for the
                     encoder/decoder cross-attention block-table, which
                     is not built here)
    """
    # Positional unpack follows TestPoint's field order; slot 5 is
    # max_dec_seq_len.
    (num_heads, head_size, _, batch_size, block_size, max_q_seq_len, _,
     _) = test_pt

    # Self-attention: K/V share the query's maximum length.
    max_kv_seq_len = max_q_seq_len

    # Baseline, prefill-only and decode-only q/k/v.
    qkv, prefill_qkv, decode_qkv = make_qkv(batch_size,
                                            max_q_seq_len,
                                            max_kv_seq_len,
                                            num_heads,
                                            head_size,
                                            attn_type=AttentionType.DECODER,
                                            device=CUDA_DEVICE)

    # Reference answer from the naive implementation with a causal mask.
    causal_mask = make_causal_mask(max_q_seq_len,
                                   max_kv_seq_len).to(CUDA_DEVICE)
    ideal_output = ref_masked_attention(qkv.query,
                                        qkv.key,
                                        qkv.value,
                                        scale=test_rsrcs.scale,
                                        custom_mask=causal_mask,
                                        q_seq_lens=qkv.q_seq_lens,
                                        kv_seq_lens=qkv.kv_seq_lens)

    # Split the reference answer per sequence: the first n_prefill
    # positions go to the prefill output, the single position after them
    # goes to the decode output.
    prefill_ideal_output = torch.zeros_like(ideal_output)
    decode_ideal_output = torch.zeros_like(ideal_output[:, 0:1])
    for seq_idx, n_prefill in enumerate(prefill_qkv.q_seq_lens):
        seq_ideal = ideal_output[seq_idx]
        prefill_ideal_output[seq_idx, :n_prefill] = seq_ideal[:n_prefill]
        decode_ideal_output[seq_idx, :] = seq_ideal[n_prefill:n_prefill + 1]

    prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output,
                                                 prefill_qkv.q_seq_lens,
                                                 device=CUDA_DEVICE)
    # Decode contributes exactly one token per sequence.
    decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output,
                                                [1] * batch_size,
                                                device=CUDA_DEVICE)

    # Memory-mapping structures for decoder self-attention, in the format
    # expected by KV caching & the attention kernels (mirrors what
    # ModelRunner produces):
    #
    # Prefill-phase:
    #   * Empty block-tables tensor
    #   * Slot-mapping with entries for prompt tokens
    #
    # Decode-phase:
    #   * Block-tables tensor with the minimum number of blocks required
    #     by the total number of tokens across all sequences (prefill and
    #     decode together)
    #   * Slot-mapping with entries for the tokens decoded in the current
    #     decode iteration
    prefill_block_tables = make_empty_block_tables_tensor(device=CUDA_DEVICE)

    decode_block_tables, slot_mapping_list, max_block_idx = (
        make_block_tables_slot_mapping(block_size,
                                       qkv.q_seq_lens,
                                       device=CUDA_DEVICE,
                                       block_base_addr=block_base_addr))

    prefill_slot_mapping, decode_slot_mapping = split_slot_mapping(
        slot_mapping_list, qkv.q_seq_lens, device=CUDA_DEVICE)

    prefill_pckd_qkv = pack_qkv(prefill_qkv, device=CUDA_DEVICE)
    decode_pckd_qkv = pack_qkv(decode_qkv, device=CUDA_DEVICE)

    prefill_params = PhaseTestParameters(
        PackedQKVO(prefill_pckd_qkv, prefill_packed_ideal_output),
        KVMemoryMap(prefill_block_tables, prefill_slot_mapping))
    decode_params = PhaseTestParameters(
        PackedQKVO(decode_pckd_qkv, decode_packed_ideal_output),
        KVMemoryMap(decode_block_tables, decode_slot_mapping))

    return (qkv, prefill_params, decode_params, max_block_idx)
def _enc_dec_cross_attn_setup_reuses_query(
    decoder_qkv: QKVInputs,
    encoder_test_params: PhaseTestParameters,
    prefill_decoder_phase_test_params: PhaseTestParameters,
    test_pt: TestPoint,
    test_rsrcs: TestResources,
    block_base_addr: int = 0,
) -> Tuple[PhaseTestParameters, PhaseTestParameters]:
    '''
    Build test vectors & KV-cache memory maps for a cross-attention test,
    reusing a decoder query that was synthesized earlier.

    Only the cross-attention key/value tensors are generated here (via
    make_qkv() with attn_type=AttentionType.ENCODER_DECODER); their
    sequence lengths are forced to the encoder sequence lengths, so they
    may differ from the decoder query lengths. A single key/value pair is
    produced and serves both the prefill and the decode phase.

    The decoder query plus the new key/value are fed to
    ref_masked_attention() to obtain a "baseline" ideal output. For each
    sequence with prefill length P, that output is split into

    * the "Prefill" ideal output: positions [0, P)
    * the "Decode" ideal output: the single position P

    and both pieces are packed with pack_tensor().

    Finally the cross-attention block table and slot mapping are built
    with make_block_tables_slot_mapping(), using block_base_addr as the
    block-table base address.

    Arguments:

    * decoder_qkv: unpacked decoder self-attention inputs; only the
                   query and q_seq_lens fields are read
    * encoder_test_params: PhaseTestParameters used for encoder inference;
                           only packed_qkvo.packed_qkv.q_seq_lens is read
    * prefill_decoder_phase_test_params: PhaseTestParameters used for
                                         prefill-phase decoder
                                         self-attention; only
                                         packed_qkvo.packed_qkv.q_seq_lens
                                         is read
    * test_pt: TestPoint tuple; unpacked positionally, the values used
               here are num_heads, head_size, batch_size, block_size and
               the max decoder / max encoder sequence lengths
    * test_rsrcs: TestResources data structure; only the scale field
                  is read
    * block_base_addr: base address handed to
                       make_block_tables_slot_mapping() for the
                       cross-attention block table

    Returns:

    * Prefill-phase cross-attention PhaseTestParameters: packed cross
      key/value, packed prefill ideal output, an empty block-tables
      tensor and the slot mapping for the cross key/value tokens.
    * Decode-phase cross-attention PhaseTestParameters: no packed
      q/k/v (None), packed decode ideal output, the cross-attention
      block tables and an empty slot mapping.
    '''

    enc_packed_qkv = encoder_test_params.packed_qkvo.packed_qkv
    assert enc_packed_qkv is not None
    prefill_dec_packed_qkv = (
        prefill_decoder_phase_test_params.packed_qkvo.packed_qkv)
    assert prefill_dec_packed_qkv is not None

    (num_heads, head_size, _, batch_size, block_size, max_decoder_seq_len,
     max_encoder_seq_len, _) = test_pt

    prefill_q_seq_lens = prefill_dec_packed_qkv.q_seq_lens
    assert prefill_q_seq_lens is not None

    # Synthesize cross-attention K/V only; the other two return values
    # of make_qkv() are not needed here.
    cross_kv, _, _ = make_qkv(batch_size,
                              max_decoder_seq_len,
                              max_encoder_seq_len,
                              num_heads,
                              head_size,
                              force_kv_seq_lens=enc_packed_qkv.q_seq_lens,
                              attn_type=AttentionType.ENCODER_DECODER,
                              device=CUDA_DEVICE)

    # Reference result over the full decoder query.
    ideal_output = ref_masked_attention(decoder_qkv.query,
                                        cross_kv.key,
                                        cross_kv.value,
                                        scale=test_rsrcs.scale,
                                        q_seq_lens=decoder_qkv.q_seq_lens,
                                        kv_seq_lens=cross_kv.kv_seq_lens)

    # Split the reference result: first `num_prefill` positions belong to
    # prefill, the position right after them is the decoded token.
    prefill_ideal_output = torch.zeros_like(ideal_output)
    decode_ideal_output = torch.zeros_like(ideal_output[:, :1])
    for seq_idx, num_prefill in enumerate(prefill_q_seq_lens):
        prefill_span = slice(None, num_prefill)
        decode_span = slice(num_prefill, num_prefill + 1)
        prefill_ideal_output[seq_idx, prefill_span] = ideal_output[
            seq_idx, prefill_span]
        decode_ideal_output[seq_idx, :] = ideal_output[seq_idx, decode_span]

    prefill_packed_ideal_output, _ = pack_tensor(prefill_ideal_output,
                                                 prefill_q_seq_lens,
                                                 device=CUDA_DEVICE)
    decode_packed_ideal_output, _ = pack_tensor(decode_ideal_output,
                                                [1] * batch_size,
                                                device=CUDA_DEVICE)

    # Memory-mapping layout for cross-attention.
    #
    # Unlike decoder self-attention, where Q/K/V all grow by one token per
    # decode step, cross-attention pairs a growing Q sequence with K and V
    # sequences of fixed length derived from the encoder hidden states.
    #
    # Prefill-phase:
    #
    # * Empty block-tables tensor
    # * Slot mapping with one entry per encoder-prompt token
    #
    # Decode-phase:
    #
    # * Block-tables tensor with the minimum number of blocks needed for
    #   K & V tensors equal in length to the encoder prompt
    # * Empty slot-mapping tensor (K & V do not grow, so newly decoded
    #   tokens are not KV-cached and need no slots)
    #
    # Note: this layout is an extension of what ModelRunner produces for
    # decoder-only models.
    (
        decode_block_tables,
        prefill_slot_mapping_list,
        _,
    ) = make_block_tables_slot_mapping(block_size,
                                       cross_kv.kv_seq_lens,
                                       block_base_addr=block_base_addr,
                                       device=CUDA_DEVICE)
    prefill_kv_mmap = KVMemoryMap(
        make_empty_block_tables_tensor(device=CUDA_DEVICE),
        maybe_make_long_tensor(prefill_slot_mapping_list,
                               device=CUDA_DEVICE))
    decode_kv_mmap = KVMemoryMap(
        decode_block_tables,
        make_empty_slot_mapping_tensor(device=CUDA_DEVICE))

    # Only key/value need packing; the query is supplied by the caller.
    packed_cross_kv = pack_qkv(cross_kv, device=CUDA_DEVICE)

    prefill_phase_params = PhaseTestParameters(
        PackedQKVO(packed_cross_kv, prefill_packed_ideal_output),
        prefill_kv_mmap)
    decode_phase_params = PhaseTestParameters(
        PackedQKVO(None, decode_packed_ideal_output), decode_kv_mmap)
    return prefill_phase_params, decode_phase_params
def _run_encoder_attention_test(
    attn: Attention,
    encoder_test_params: PhaseTestParameters,
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention wrapper in encoder mode.

    attn.forward() receives attn_type=AttentionType.ENCODER and None in
    the KV-cache argument position.

    The metadata must describe a pure prefill run, i.e.
    attn_metadata.num_decode_tokens == 0 (the encoder is never executed
    during the decode phase).

    Arguments:

    * attn: Attention wrapper instance
    * encoder_test_params: encoder PhaseTestParameters; the packed
                           query/key/value fields are consumed
    * attn_metadata: attention metadata passed through to attn.forward()

    Returns:

    * Result of attn.forward() on the packed encoder query/key/value
    '''

    assert attn_metadata.num_decode_tokens == 0
    enc_qkv = encoder_test_params.packed_qkvo.packed_qkv
    assert enc_qkv is not None
    return attn.forward(enc_qkv.query,
                        enc_qkv.key,
                        enc_qkv.value,
                        None,
                        attn_metadata,
                        attn_type=AttentionType.ENCODER)
def _run_decoder_self_attention_test(
    test_rsrcs: TestResources,
    decoder_test_params: PhaseTestParameters,
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention wrapper in decoder self-attention mode.

    attn.forward() receives attn_type=AttentionType.DECODER together with
    the KV cache held by test_rsrcs.

    Arguments:

    * test_rsrcs: TestResources instance; the attn (Attention wrapper)
                  and kv_cache fields are used
    * decoder_test_params: decoder PhaseTestParameters; the packed
                           query/key/value fields are consumed
    * attn_metadata: attention metadata passed through to attn.forward()

    Returns:

    * Result of attn.forward() on the packed decoder query/key/value,
      the KV cache & attn_metadata
    '''

    dec_qkv = decoder_test_params.packed_qkvo.packed_qkv
    assert dec_qkv is not None
    return test_rsrcs.attn.forward(dec_qkv.query,
                                   dec_qkv.key,
                                   dec_qkv.value,
                                   test_rsrcs.kv_cache,
                                   attn_metadata,
                                   attn_type=AttentionType.DECODER)
def _run_encoder_decoder_cross_attention_test(
    test_rsrcs: TestResources,
    decoder_test_params: PhaseTestParameters,
    cross_test_params: Optional[PhaseTestParameters],
    attn_metadata: AttentionMetadata,
) -> torch.Tensor:
    '''
    Invoke the attention wrapper in encoder/decoder cross-attention mode.

    The query comes from the decoder PhaseTestParameters (the same query
    used for decoder self-attention); key and value come from the
    cross-attention PhaseTestParameters.

    When cross_test_params is None, or its packed_qkvo.packed_qkv is
    None, key and value are both passed as None. This is the decode-phase
    case, in which the cross-attention key/value tensors do not grow.

    attn.forward() receives attn_type=AttentionType.ENCODER_DECODER.

    Arguments:

    * test_rsrcs: TestResources instance; the attn (Attention wrapper)
                  and kv_cache fields are used
    * decoder_test_params: decoder PhaseTestParameters; only the packed
                           query field is consumed
    * cross_test_params: cross-attention PhaseTestParameters (or None);
                         only the packed key/value fields are consumed
    * attn_metadata: attention metadata passed through to attn.forward()

    Returns:

    * Result of attn.forward() on the decoder query, the cross key/value
      (or None/None), the KV cache & attn_metadata
    '''

    dec_qkv = decoder_test_params.packed_qkvo.packed_qkv
    assert dec_qkv is not None

    cross_qkv = (None if cross_test_params is None else
                 cross_test_params.packed_qkvo.packed_qkv)
    if cross_qkv is None:
        key, value = None, None
    else:
        key, value = cross_qkv.key, cross_qkv.value

    return test_rsrcs.attn.forward(dec_qkv.query,
                                   key,
                                   value,
                                   test_rsrcs.kv_cache,
                                   attn_metadata,
                                   attn_type=AttentionType.ENCODER_DECODER)
@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
def test_encoder_only(num_heads: int, head_size: int, backend_name: str,
                      batch_size: int, block_size: int, max_dec_seq_len: int,
                      max_enc_seq_len: int, monkeypatch):
    '''
    Encoder-attention-only test: run encoder attention once with
    prefill-phase metadata and compare against the ideal output.

    The attention backend is forced through an environment variable via
    PyTest monkey patching. The test is skipped when is_hip() is true.
    '''

    # Pin the backend used by the Attention wrapper
    override_backend_env_variable(monkeypatch, backend_name)

    # The final TestPoint value (4096) is the KV cache size; it is an
    # arbitrary, deliberately oversized choice because running out of
    # KV cache is outside the scope of this test.
    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)

    # Scale factor, backend instance, Attention wrapper & KV cache
    test_rsrcs = _make_test_resources(test_pt)

    # Encoder test vectors (encoder attention only runs during prefill)
    encoder_params = _encoder_attn_setup(test_pt, test_rsrcs)

    # Prefill metadata carrying encoder parameters only
    prefill_metadata: AttentionMetadata = make_test_metadata(
        test_rsrcs.attn_backend,
        True,
        None,
        decoder_test_params=None,
        encoder_test_params=encoder_params,
        cross_test_params=None,
        device=CUDA_DEVICE)

    # PREFILL: run encoder attention and validate it against the ideal
    actual_encoder_out: torch.Tensor = _run_encoder_attention_test(
        test_rsrcs.attn, encoder_params, prefill_metadata)
    assert_actual_matches_ideal(encoder_params, actual_encoder_out)
@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("backend_name", BACKEND_NAMES)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("max_dec_seq_len", MAX_DEC_SEQ_LENS)
@pytest.mark.parametrize("max_enc_seq_len", MAX_ENC_SEQ_LENS)
def test_e2e_enc_dec_attn(
    num_heads: int,
    head_size: int,
    backend_name: str,
    batch_size: int,
    block_size: int,
    max_dec_seq_len: int,
    max_enc_seq_len: int,
    monkeypatch,
) -> None:
    '''
    End-to-end encoder/decoder attention test.

    Steps:

    * Build synthetic test vectors for (1) encoder attention, (2) decoder
      self-attention and (3) encoder/decoder cross-attention
    * Build one attention metadata structure for the prefill phase and an
      analogous one for the decode phase, each carrying both self- and
      cross-attention attributes
    * Run the attention operations in this order, checking every result
      against the ideal reference output:

        1. Encoder attention
        2. Prefill self-attention
        3. Prefill cross-attention
        4. Decode self-attention
        5. Decode cross-attention

      This order mirrors realistic usage and would also aggravate any
      accidental overlap between the self- and cross-attention block
      tables.

    The cross-attention block table is built from a base address derived
    from the decoder self-attention block table, with the intent that the
    two KV-cache regions do not intersect.

    Self- and cross-attention share the query tensor but not K/V.
    Self-attention K/V have the same sequence length as Q, whereas
    cross-attention K/V may have a different length.

    The attention backend is forced through an environment variable via
    PyTest monkey patching.

    Note on ROCm/HIP: the test is skipped if is_hip(), since
    encoder/decoder models are not currently supported on AMD GPUs.

    Note on metadata: a single metadata structure is shared by all
    prefill-phase operations (encoder, decoder, cross) and another by all
    decode-phase operations (decoder, cross). This is meant to mirror
    ModelRunner, which builds one metadata structure per prefill or
    decode run, so the test confirms the backend under test copes with
    such shared structures.
    '''

    # Pin the backend used by the Attention wrapper
    override_backend_env_variable(monkeypatch, backend_name)

    # The final TestPoint value (4096) is the KV cache size; it is an
    # arbitrary, deliberately oversized choice because running out of
    # KV cache is outside the scope of this test.
    test_pt = TestPoint(num_heads, head_size, backend_name, batch_size,
                        block_size, max_dec_seq_len, max_enc_seq_len, 4096)

    # Scale factor, backend instance, Attention wrapper & KV cache
    test_rsrcs = _make_test_resources(test_pt)

    # Encoder test vectors (encoder attention only runs during prefill)
    encoder_params = _encoder_attn_setup(test_pt, test_rsrcs)

    # Decoder self-attention test vectors and memory mapping for both
    # phases. The fourth return value is used below as the base address
    # of the cross-attention block table.
    (
        dec_qkv,
        prefill_dec_params,
        decode_dec_params,
        cross_block_base_addr,
    ) = _decoder_attn_setup(test_pt, test_rsrcs)

    # Cross-attention key/value test vectors and memory mapping for both
    # phases (the decoder query is reused)
    (
        prefill_cross_params,
        decode_cross_params,
    ) = _enc_dec_cross_attn_setup_reuses_query(
        dec_qkv,
        encoder_params,
        prefill_dec_params,
        test_pt,
        test_rsrcs,
        block_base_addr=cross_block_base_addr)

    # Metadata shared by every prefill-phase operation
    assert prefill_dec_params.packed_qkvo.packed_qkv is not None
    prefill_metadata: AttentionMetadata = make_test_metadata(
        test_rsrcs.attn_backend,
        True,
        prefill_dec_params.packed_qkvo.packed_qkv.q_seq_lens,
        decoder_test_params=prefill_dec_params,
        encoder_test_params=encoder_params,
        cross_test_params=prefill_cross_params,
        device=CUDA_DEVICE)

    # PREFILL 1/3: encoder attention
    actual_encoder_out = _run_encoder_attention_test(test_rsrcs.attn,
                                                     encoder_params,
                                                     prefill_metadata)
    assert_actual_matches_ideal(encoder_params, actual_encoder_out)

    # PREFILL 2/3: decoder self-attention
    actual_prefill_self_out = _run_decoder_self_attention_test(
        test_rsrcs, prefill_dec_params, prefill_metadata)
    assert_actual_matches_ideal(prefill_dec_params, actual_prefill_self_out)

    # PREFILL 3/3: encoder/decoder cross-attention
    actual_prefill_cross_out = _run_encoder_decoder_cross_attention_test(
        test_rsrcs, prefill_dec_params, prefill_cross_params,
        prefill_metadata)
    assert_actual_matches_ideal(prefill_cross_params,
                                actual_prefill_cross_out)

    # Metadata shared by every decode-phase operation
    decode_metadata: AttentionMetadata = make_test_metadata(
        test_rsrcs.attn_backend,
        False,
        dec_qkv.q_seq_lens,
        decoder_test_params=decode_dec_params,
        encoder_test_params=encoder_params,
        cross_test_params=decode_cross_params,
        device=CUDA_DEVICE)

    # DECODE 1/2: decoder self-attention
    actual_decode_self_out = _run_decoder_self_attention_test(
        test_rsrcs, decode_dec_params, decode_metadata)
    assert_actual_matches_ideal(decode_dec_params, actual_decode_self_out)

    # DECODE 2/2: encoder/decoder cross-attention. No cross parameters
    # are passed, so key and value are None for this call.
    actual_decode_cross_out = _run_encoder_decoder_cross_attention_test(
        test_rsrcs, decode_dec_params, None, decode_metadata)
    assert_actual_matches_ideal(decode_cross_params, actual_decode_cross_out)
...@@ -25,7 +25,7 @@ def ref_paged_attn( ...@@ -25,7 +25,7 @@ def ref_paged_attn(
block_tables = block_tables.cpu().numpy() block_tables = block_tables.cpu().numpy()
_, block_size, num_kv_heads, head_size = key_cache.shape _, block_size, num_kv_heads, head_size = key_cache.shape
outputs = [] outputs: List[torch.Tensor] = []
start_idx = 0 start_idx = 0
for i in range(num_seqs): for i in range(num_seqs):
query_len = query_lens[i] query_len = query_lens[i]
...@@ -70,7 +70,7 @@ def ref_paged_attn( ...@@ -70,7 +70,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode @torch.inference_mode
def test_flash_attn_with_paged_kv( def test_flash_attn_with_paged_kv(
kv_lens: List[Tuple[int, int]], kv_lens: List[int],
num_heads: Tuple[int, int], num_heads: Tuple[int, int],
head_size: int, head_size: int,
dtype: torch.dtype, dtype: torch.dtype,
......
from typing import List, Optional, Tuple
import flashinfer
import pytest
import torch
# (num_query_heads, num_kv_heads) pairs; the tests below assert that the
# first value is a multiple of the second.
NUM_HEADS = [(16, 16), (32, 8), (64, 8)]
# Per-head sizes the tests are parametrized over.
HEAD_SIZES = [128, 256]
# Number of token slots per KV-cache block.
BLOCK_SIZES = [16, 32]
# Element types used for the query and the KV cache.
DTYPES = [torch.float16, torch.bfloat16]
# Number of blocks allocated in the paged KV cache.
NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
def ref_paged_attn(
    query: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    query_lens: List[int],
    kv_lens: List[int],
    block_tables: torch.Tensor,
    scale: float,
    sliding_window: Optional[int] = None,
    soft_cap: Optional[float] = None,
) -> torch.Tensor:
    '''
    Naive reference implementation of causal attention over a paged KV
    cache, processed one sequence at a time.

    The input tensors are left untouched: scaling is applied to a copy of
    each query slice rather than in place.

    Arguments:

    * query: (total_query_tokens x num_query_heads x head_size) queries
             of all sequences, concatenated along dim 0
    * key_cache: (num_blocks x block_size x num_kv_heads x head_size)
    * value_cache: same layout as key_cache
    * query_lens: number of query tokens of each sequence
    * kv_lens: number of cached key/value tokens of each sequence
    * block_tables: (num_seqs x max_blocks_per_seq) block indices into
                    the caches, one row per sequence
    * scale: factor the queries are multiplied by before Q.K^T
    * sliding_window: if given, keys further back than this many
                      positions are masked out as well
    * soft_cap: if given, logits become soft_cap * tanh(logits / soft_cap)
                before masking

    Returns:

    * (total_query_tokens x num_query_heads x head_size) attention output
      in the dtype of the value cache
    '''
    block_tables = block_tables.cpu().numpy()
    _, block_size, num_kv_heads, head_size = key_cache.shape

    outputs: List[torch.Tensor] = []
    start_idx = 0
    for i, query_len in enumerate(query_lens):
        kv_len = kv_lens[i]

        # `query[a:b]` is a view, so an in-place `*=` here would scale the
        # caller's tensor as a side effect; multiply out-of-place instead.
        q = query[start_idx:start_idx + query_len] * scale

        # Gather this sequence's blocks and flatten them into a
        # (tokens x heads x head_size) tensor, trimmed to kv_len.
        num_kv_blocks = (kv_len + block_size - 1) // block_size
        block_indices = block_tables[i, :num_kv_blocks]
        k = key_cache[block_indices].view(-1, num_kv_heads, head_size)
        k = k[:kv_len]
        v = value_cache[block_indices].view(-1, num_kv_heads, head_size)
        v = v[:kv_len]

        # Grouped-query attention: replicate each KV head so that K/V have
        # as many heads as Q.
        if q.shape[1] != k.shape[1]:
            k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
            v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)

        attn = torch.einsum("qhd,khd->hqk", q, k).float()

        # Causal mask with the last query token aligned to the last key.
        # Built on the logits' device so masked_fill_ never mixes devices.
        empty_mask = torch.ones(query_len, kv_len, device=attn.device)
        mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
        if sliding_window is not None:
            # Additionally mask keys that fall out of the window.
            sliding_window_mask = torch.triu(
                empty_mask,
                diagonal=kv_len - (query_len + sliding_window) +
                1).bool().logical_not()
            mask |= sliding_window_mask
        if soft_cap is not None:
            attn = soft_cap * torch.tanh(attn / soft_cap)
        attn.masked_fill_(mask, float("-inf"))
        attn = torch.softmax(attn, dim=-1).to(v.dtype)
        out = torch.einsum("hqk,khd->qhd", attn, v)

        outputs.append(out)
        start_idx += query_len

    return torch.cat(outputs, dim=0)
@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@torch.inference_mode
def test_flashinfer_decode_with_paged_kv(kv_lens: List[int],
                                         num_heads: Tuple[int, int],
                                         head_size: int,
                                         dtype: torch.dtype, block_size: int,
                                         soft_cap: Optional[float]) -> None:
    '''
    Compare flashinfer's batched paged-KV decode (one query token per
    sequence) against ref_paged_attn().
    '''
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)

    num_query_heads, num_kv_heads = num_heads
    assert num_query_heads % num_kv_heads == 0
    num_seqs = len(kv_lens)
    scale = head_size**-0.5

    # Random inputs. The draw order (query, cache, block tables) is kept
    # fixed so the seeded values stay the same.
    query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype)
    key_value_cache = torch.randn(NUM_BLOCKS,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # Index 0 / 1 along dim 1 select the key / value halves of the cache.
    key_cache = key_value_cache[:, 0].squeeze(1)
    value_cache = key_value_cache[:, 1].squeeze(1)

    max_num_blocks_per_seq = (max(kv_lens) + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Flatten the per-sequence block tables into the indptr / indices /
    # last-page-length lists handed to begin_forward().
    indptr_list = [0]
    page_indices = []
    last_page_lens = []
    for seq_idx, seq_len in enumerate(kv_lens):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        page_indices.extend(block_tables[seq_idx, :num_blocks])
        indptr_list.append(indptr_list[-1] + num_blocks)
        # A sequence that exactly fills its last block has a full page.
        last_page_lens.append(seq_len % block_size or block_size)

    kv_indptr = torch.tensor(indptr_list, dtype=torch.int32)
    kv_indices = torch.tensor(page_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer, "NHD")
    wrapper.begin_forward(kv_indptr,
                          kv_indices,
                          kv_last_page_lens,
                          num_query_heads,
                          num_kv_heads,
                          head_size,
                          block_size,
                          "NONE",
                          data_type=dtype)

    output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap)

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache,
                                value_cache=value_cache,
                                query_lens=[1] * num_seqs,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]])
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
@torch.inference_mode
def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]],
                                          num_heads: Tuple[int, int],
                                          head_size: int, dtype: torch.dtype,
                                          block_size: int,
                                          soft_cap: Optional[float]) -> None:
    '''
    Compare flashinfer's batched paged-KV prefill against
    ref_paged_attn(). Each seq_lens entry is a (query_len, kv_len) pair.
    '''
    torch.set_default_device("cuda")
    torch.cuda.manual_seed_all(0)

    query_lens = [pair[0] for pair in seq_lens]
    kv_lens = [pair[1] for pair in seq_lens]
    num_seqs = len(seq_lens)
    num_query_heads, num_kv_heads = num_heads
    assert num_query_heads % num_kv_heads == 0
    scale = head_size**-0.5

    # Random inputs. The draw order (query, cache, block tables) is kept
    # fixed so the seeded values stay the same.
    query = torch.randn(sum(query_lens),
                        num_query_heads,
                        head_size,
                        dtype=dtype)
    key_value_cache = torch.randn(NUM_BLOCKS,
                                  2,
                                  block_size,
                                  num_kv_heads,
                                  head_size,
                                  dtype=dtype)
    # Index 0 / 1 along dim 1 select the key / value halves of the cache.
    key_cache = key_value_cache[:, 0].squeeze(1)
    value_cache = key_value_cache[:, 1].squeeze(1)

    # Shrink the magnitude of the cached keys and values in place (this
    # writes through to key_value_cache as well) to mitigate numerical
    # instability.
    cache_norm = head_size**0.5
    key_cache /= cache_norm
    value_cache /= cache_norm

    max_num_blocks_per_seq = (max(kv_lens) + block_size - 1) // block_size
    block_tables = torch.randint(0,
                                 NUM_BLOCKS,
                                 (num_seqs, max_num_blocks_per_seq),
                                 dtype=torch.int32)

    # Flatten the per-sequence query lengths and block tables into the
    # indptr / indices / last-page-length lists handed to begin_forward().
    qo_indptr_list = [0]
    kv_indptr_list = [0]
    page_indices = []
    last_page_lens = []
    for seq_idx, (query_len, seq_len) in enumerate(zip(query_lens, kv_lens)):
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        page_indices.extend(block_tables[seq_idx, :num_blocks])
        kv_indptr_list.append(kv_indptr_list[-1] + num_blocks)
        # A sequence that exactly fills its last block has a full page.
        last_page_lens.append(seq_len % block_size or block_size)
        qo_indptr_list.append(qo_indptr_list[-1] + query_len)

    qo_indptr = torch.tensor(qo_indptr_list, dtype=torch.int32)
    kv_indptr = torch.tensor(kv_indptr_list, dtype=torch.int32)
    kv_indices = torch.tensor(page_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(last_page_lens, dtype=torch.int32)

    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8)
    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, "NHD")
    wrapper.begin_forward(
        qo_indptr,
        kv_indptr,
        kv_indices,
        kv_last_page_lens,
        num_query_heads,
        num_kv_heads,
        head_size,
        block_size,
    )

    output = wrapper.forward(
        query,
        key_value_cache,
        logits_soft_cap=soft_cap,
    )

    ref_output = ref_paged_attn(query=query,
                                key_cache=key_cache,
                                value_cache=value_cache,
                                query_lens=query_lens,
                                kv_lens=kv_lens,
                                block_tables=block_tables,
                                scale=scale,
                                soft_cap=soft_cap)
    assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
        f"{torch.max(torch.abs(output - ref_output))}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment