".github/workflows/container-validation-dynamo.yml" did not exist on "ecf53ce2b38971dc9b5dde6f70d74cfc5b870c35"
Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
import subprocess
import sys
import tempfile
import time
from http import HTTPStatus
import openai
import pytest
import pytest_asyncio
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer
from ...utils import RemoteOpenAIServer
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@pytest.fixture(scope="module")
def default_server_args():
return [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"1024",
"--enforce-eager",
"--max-num-seqs",
"128",
]
@pytest.fixture(scope="module",
params=[
"",
"--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
])
def server(default_server_args, request):
if request.param:
default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as cl:
yield cl
_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10
# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
"vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
"vllm:time_per_output_token_seconds":
[("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
"vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
"vllm:request_prompt_tokens":
[("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_generation_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens":
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
@pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME,
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
response = requests.get(base_url + "/metrics")
print(response.text)
assert response.status_code == HTTPStatus.OK
# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
found_metric = False
# Check to see if the metric_family is found in the prom endpoint.
for family in text_string_to_metric_families(response.text):
if family.name == metric_family:
found_metric = True
# Check that each suffix is found in the prom endpoint.
for suffix, expected_value in suffix_values_list:
metric_name_w_suffix = f"{metric_family}{suffix}"
found_suffix = False
for sample in family.samples:
if sample.name == metric_name_w_suffix:
found_suffix = True
# For each suffix, value sure the value matches
# what we expect.
assert sample.value == expected_value, (
f"{metric_name_w_suffix} expected value of "
f"{expected_value} did not match found value "
f"{sample.value}")
break
assert found_suffix, (
f"Did not find {metric_name_w_suffix} in prom endpoint"
)
break
assert found_metric, (f"Did not find {metric_family} in prom endpoint")
EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_swapped",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:cpu_cache_usage_perc",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
"cache_dtype",
"cpu_offload_gb",
"enable_prefix_caching",
"gpu_memory_utilization",
"num_cpu_blocks",
"num_gpu_blocks",
"num_gpu_blocks_override",
"sliding_window",
"swap_space_bytes",
]
@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
response = requests.get(base_url + "/metrics")
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS:
assert metric in response.text
def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501
base_url = "0.0.0.0"
port = "8001"
server_url = f"http://{base_url}:{port}"
with tempfile.NamedTemporaryFile(
"w") as input_file, tempfile.NamedTemporaryFile(
"r") as output_file:
input_file.write(input_batch)
input_file.flush()
proc = subprocess.Popen([
sys.executable,
"-m",
"vllm.entrypoints.openai.run_batch",
"-i",
input_file.name,
"-o",
output_file.name,
"--model",
"intfloat/e5-mistral-7b-instruct",
"--enable-metrics",
"--url",
base_url,
"--port",
port,
], )
def is_server_up(url):
try:
response = requests.get(url)
return response.status_code == 200
except requests.ConnectionError:
return False
while not is_server_up(server_url):
time.sleep(1)
response = requests.get(server_url + "/metrics")
assert response.status_code == HTTPStatus.OK
proc.wait()
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -43,18 +44,21 @@ def server(zephyr_lora_files): ...@@ -43,18 +44,21 @@ def server(zephyr_lora_files):
yield remote_server yield remote_server
@pytest.fixture(scope="module") @pytest_asyncio.fixture
def client(server): async def client(server):
return server.get_async_client() async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_models(client: openai.AsyncOpenAI): async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
models = await client.models.list() models = await client.models.list()
models = models.data models = models.data
served_model = models[0] served_model = models[0]
lora_models = models[1:] lora_models = models[1:]
assert served_model.id == MODEL_NAME assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models) assert served_model.root == MODEL_NAME
assert all(lora_model.root == zephyr_lora_files
for lora_model in lora_models)
assert lora_models[0].id == "zephyr-lora" assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2" assert lora_models[1].id == "zephyr-lora2"
import sys from ...utils import VLLM_PATH, RemoteOpenAIServer
import time
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
import torch assert chatml_jinja_path.exists()
from openai import OpenAI, OpenAIError
from vllm import ModelRegistry def run_and_test_dummy_opt_api_server(model, tp=1):
from vllm.model_executor.models.opt import OPTForCausalLM # the model is registered through the plugin
from vllm.model_executor.sampling_metadata import SamplingMetadata server_args = [
from vllm.utils import get_open_port "--gpu-memory-utilization",
"0.10",
"--dtype",
class MyOPTForCausalLM(OPTForCausalLM): "float32",
"--chat-template",
def compute_logits(self, hidden_states: torch.Tensor, str(chatml_jinja_path),
sampling_metadata: SamplingMetadata) -> torch.Tensor: "--load-format",
# this dummy model always predicts the first token "dummy",
logits = super().compute_logits(hidden_states, sampling_metadata) "-tp",
logits.zero_() f"{tp}",
logits[:, 0] += 1.0 ]
return logits with RemoteOpenAIServer(model, server_args) as server:
client = server.get_client()
completion = client.chat.completions.create(
def server_function(port): model=model,
# register our dummy model messages=[{
ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) "role": "system",
sys.argv = ["placeholder.py"] + \ "content": "You are a helpful assistant."
("--model facebook/opt-125m --gpu-memory-utilization 0.10 " }, {
f"--dtype float32 --api-key token-abc123 --port {port}").split() "role": "user",
import runpy "content": "Hello!"
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') }],
temperature=0,
)
def test_oot_registration_for_api_server(): generated_text = completion.choices[0].message.content
port = get_open_port() assert generated_text is not None
ctx = torch.multiprocessing.get_context() # make sure only the first token is generated
server = ctx.Process(target=server_function, args=(port, )) rest = generated_text.replace("<s>", "")
server.start() assert rest == ""
MAX_SERVER_START_WAIT_S = 60
client = OpenAI(
base_url=f"http://localhost:{port}/v1", def test_oot_registration_for_api_server(dummy_opt_path: str):
api_key="token-abc123", run_and_test_dummy_opt_api_server(dummy_opt_path)
)
now = time.time()
while True:
try:
completion = client.chat.completions.create(
model="facebook/opt-125m",
messages=[{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "Hello!"
}],
temperature=0,
)
break
except OpenAIError as e:
if "Connection error" in str(e):
time.sleep(3)
if time.time() - now > MAX_SERVER_START_WAIT_S:
raise RuntimeError("Server did not start in time") from e
else:
raise e
server.kill()
generated_text = completion.choices[0].message.content
# make sure only the first token is generated
rest = generated_text.replace("<s>", "")
assert rest == ""
# imports for guided decoding tests
import re
import openai
import pytest
from ...utils import RemoteOpenAIServer
@pytest.mark.asyncio
async def test_empty_prompt():
model_name = "gpt2"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match=re.compile('.+Prompt cannot be empty.+')):
await client.completions.create(model=model_name,
prompt="",
max_tokens=5,
temperature=0.0)
...@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag( ...@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag(
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_completion_return_tokens_as_token_ids_completion( async def test_completion_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag): server_with_return_tokens_as_token_ids_flag):
client = server_with_return_tokens_as_token_ids_flag.get_async_client() async with server_with_return_tokens_as_token_ids_flag.get_async_client(
) as client:
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
# Include Unicode characters to test for dividing a single # Include Unicode characters to test for dividing a single
# character across multiple tokens: 🎉 is [28705, 31862] for the # character across multiple tokens: 🎉 is [28705, 31862] for the
# Zephyr tokenizer # Zephyr tokenizer
prompt="Say 'Hello, world! 🎉'", prompt="Say 'Hello, world! 🎉'",
echo=True, echo=True,
temperature=0, temperature=0,
max_tokens=10, max_tokens=10,
logprobs=1) logprobs=1)
text = completion.choices[0].text text = completion.choices[0].text
token_strs = completion.choices[0].logprobs.tokens token_strs = completion.choices[0].logprobs.tokens
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Check that the token representations are consistent between raw tokens # Check that the token representations are consistent between raw
# and top_logprobs # tokens and top_logprobs
# Slice off the first one, because there's no scoring associated with BOS # Slice off the first one, because there's no scoring associated
top_logprobs = completion.choices[0].logprobs.top_logprobs[1:] # with BOS
top_logprob_keys = [ top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs top_logprob_keys = [
] next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
assert token_strs[1:] == top_logprob_keys ]
assert token_strs[1:] == top_logprob_keys
# Check that decoding the tokens gives the expected text # Check that decoding the tokens gives the expected text
tokens = [int(token.removeprefix("token_id:")) for token in token_strs] tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
assert text == tokenizer.decode(tokens, skip_special_tokens=True) assert text == tokenizer.decode(tokens, skip_special_tokens=True)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion( async def test_chat_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag): server_with_return_tokens_as_token_ids_flag):
client = server_with_return_tokens_as_token_ids_flag.get_async_client() async with server_with_return_tokens_as_token_ids_flag.get_async_client(
response = await client.chat.completions.create( ) as client:
model=MODEL_NAME, response = await client.chat.completions.create(
# Include Unicode characters to test for dividing a single model=MODEL_NAME,
# character across multiple tokens: 🎉 is [28705, 31862] for the # Include Unicode characters to test for dividing a single
# Zephyr tokenizer # character across multiple tokens: 🎉 is [28705, 31862] for the
messages=[{ # Zephyr tokenizer
"role": "system", messages=[{
"content": "You like to respond in only emojis, like 🎉" "role": "system",
}, { "content": "You like to respond in only emojis, like 🎉"
"role": "user", }, {
"content": "Please write some emojis: 🐱🐶🎉" "role": "user",
}], "content": "Please write some emojis: 🐱🐶🎉"
temperature=0, }],
max_tokens=8, temperature=0,
logprobs=True) max_tokens=8,
logprobs=True)
text = response.choices[0].message.content text = response.choices[0].message.content
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
token_ids = [] token_ids = []
for logprob_content in response.choices[0].logprobs.content: for logprob_content in response.choices[0].logprobs.content:
token_ids.append(int(logprob_content.token.removeprefix("token_id:"))) token_ids.append(
assert tokenizer.decode(token_ids, skip_special_tokens=True) == text int(logprob_content.token.removeprefix("token_id:")))
assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
...@@ -7,13 +7,41 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput ...@@ -7,13 +7,41 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
# ruff: noqa: E501 # ruff: noqa: E501
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
def test_e2e(): def test_empty_file():
with tempfile.NamedTemporaryFile(
"w") as input_file, tempfile.NamedTemporaryFile(
"r") as output_file:
input_file.write("")
input_file.flush()
proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model",
"intfloat/e5-mistral-7b-instruct"
], )
proc.communicate()
proc.wait()
assert proc.returncode == 0, f"{proc=}"
contents = output_file.read()
assert contents.strip() == ""
def test_completions():
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(
"w") as input_file, tempfile.NamedTemporaryFile( "w") as input_file, tempfile.NamedTemporaryFile(
"r") as output_file: "r") as output_file:
...@@ -35,7 +63,7 @@ def test_e2e(): ...@@ -35,7 +63,7 @@ def test_e2e():
BatchRequestOutput.model_validate_json(line) BatchRequestOutput.model_validate_json(line)
def test_e2e_invalid_input(): def test_completions_invalid_input():
""" """
Ensure that we fail when the input doesn't conform to the openai api. Ensure that we fail when the input doesn't conform to the openai api.
""" """
...@@ -52,3 +80,25 @@ def test_e2e_invalid_input(): ...@@ -52,3 +80,25 @@ def test_e2e_invalid_input():
proc.communicate() proc.communicate()
proc.wait() proc.wait()
assert proc.returncode != 0, f"{proc=}" assert proc.returncode != 0, f"{proc=}"
def test_embeddings():
with tempfile.NamedTemporaryFile(
"w") as input_file, tempfile.NamedTemporaryFile(
"r") as output_file:
input_file.write(INPUT_EMBEDDING_BATCH)
input_file.flush()
proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model",
"intfloat/e5-mistral-7b-instruct"
], )
proc.communicate()
proc.wait()
assert proc.returncode == 0, f"{proc=}"
contents = output_file.read()
for line in contents.strip().split("\n"):
# Ensure that the output format conforms to the openai api.
# Validation should throw if the schema is wrong.
BatchRequestOutput.model_validate_json(line)
...@@ -3,13 +3,21 @@ from contextlib import suppress ...@@ -3,13 +3,21 @@ from contextlib import suppress
from dataclasses import dataclass from dataclasses import dataclass
from unittest.mock import MagicMock from unittest.mock import MagicMock
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.config import MultiModalConfig
from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import BaseModelPath
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass
class MockHFConfig:
model_type: str = "any"
@dataclass @dataclass
...@@ -20,6 +28,8 @@ class MockModelConfig: ...@@ -20,6 +28,8 @@ class MockModelConfig:
max_model_len = 100 max_model_len = 100
tokenizer_revision = None tokenizer_revision = None
embedding_mode = False embedding_mode = False
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
@dataclass @dataclass
...@@ -35,7 +45,7 @@ async def _async_serving_chat_init(): ...@@ -35,7 +45,7 @@ async def _async_serving_chat_init():
serving_completion = OpenAIServingChat(engine, serving_completion = OpenAIServingChat(engine,
model_config, model_config,
served_model_names=[MODEL_NAME], BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
lora_modules=None, lora_modules=None,
...@@ -50,12 +60,13 @@ def test_async_serving_chat_init(): ...@@ -50,12 +60,13 @@ def test_async_serving_chat_init():
def test_serving_chat_should_set_correct_max_tokens(): def test_serving_chat_should_set_correct_max_tokens():
mock_engine = MagicMock(spec=AsyncLLMEngine) mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
serving_chat = OpenAIServingChat(mock_engine, serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(), MockModelConfig(),
served_model_names=[MODEL_NAME], BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
lora_modules=None, lora_modules=None,
...@@ -73,7 +84,6 @@ def test_serving_chat_should_set_correct_max_tokens(): ...@@ -73,7 +84,6 @@ def test_serving_chat_should_set_correct_max_tokens():
with suppress(Exception): with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req)) asyncio.run(serving_chat.create_chat_completion(req))
# AsyncLLMEngine.generate(inputs, sampling_params, ...)
assert mock_engine.generate.call_args.args[1].max_tokens == 93 assert mock_engine.generate.call_args.args[1].max_tokens == 93
req.max_tokens = 10 req.max_tokens = 10
......
from http import HTTPStatus
from unittest.mock import MagicMock
import pytest
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
MODEL_NAME = "meta-llama/Llama-2-7b"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
LORA_LOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' added successfully.")
LORA_UNLOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' removed successfully.")
async def _async_serving_engine_init():
mock_engine_client = MagicMock(spec=EngineClient)
mock_model_config = MagicMock(spec=ModelConfig)
# Set the max_model_len attribute to avoid missing attribute
mock_model_config.max_model_len = 2048
serving_engine = OpenAIServing(mock_engine_client,
mock_model_config,
BASE_MODEL_PATHS,
lora_modules=None,
prompt_adapters=None,
request_logger=None)
return serving_engine
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2")
response = await serving_engine.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
assert len(serving_engine.lora_requests) == 1
assert serving_engine.lora_requests[0].lora_name == "adapter"
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
response = await serving_engine.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_engine.lora_requests) == 1
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
assert len(serving_engine.lora_requests) == 1
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request)
assert len(serving_engine.lora_requests) == 1
request = UnloadLoraAdapterRequest(lora_name="adapter1")
response = await serving_engine.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_engine.lora_requests) == 0
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_engine.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
serving_engine = await _async_serving_engine_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_engine.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST
import json
import os
import openai
import pytest
from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path):
# Use a bad adapter to crash the engine
# (This test will fail when that bug is fixed)
adapter_path = tmp_path / "bad_adapter"
os.mkdir(adapter_path)
with open(adapter_path / "adapter_model_config.json", "w") as f:
json.dump({"not": "real"}, f)
with open(adapter_path / "adapter_model.safetensors", "wb") as f:
f.write(b"this is fake")
# dtype, max-len etc set so that this can run in CI
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"8192",
"--enforce-eager",
"--max-num-seqs",
"128",
"--enable-lora",
"--lora-modules",
f"bad-adapter={tmp_path / 'bad_adapter'}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
async with remote_server.get_async_client() as client:
with pytest.raises(
(openai.APIConnectionError, openai.InternalServerError)):
# This crashes the engine
await client.completions.create(model="bad-adapter",
prompt="Hello, my name is")
# Now the server should shut down
return_code = remote_server.proc.wait(timeout=8)
assert return_code is not None
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio
import requests import requests
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
...@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str, ...@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str,
model_name == "zephyr-lora2") else model_name model_name == "zephyr-lora2") else model_name
@pytest.fixture(scope="module") @pytest_asyncio.fixture
def client(server): async def client(server):
return server.get_async_client() async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -102,28 +104,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, ...@@ -102,28 +104,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
"role": "user", "role": "user",
"content": "Can I ask a question? vllm1" "content": "Can I ask a question? vllm1"
}] }]
for continue_final in [False, True]:
prompt = tokenizer.apply_chat_template( if add_generation and continue_final:
add_generation_prompt=add_generation, continue
conversation=conversation, if continue_final:
tokenize=False) conversation.append({
tokens = tokenizer.encode(prompt, add_special_tokens=add_special) "role": "assistant",
"content": "Sure,"
response = requests.post(base_url + "/tokenize", })
json={
"add_generation_prompt": prompt = tokenizer.apply_chat_template(
add_generation, add_generation_prompt=add_generation,
"add_special_tokens": add_special, continue_final_message=continue_final,
"messages": conversation, conversation=conversation,
"model": model_name tokenize=False)
}) tokens = tokenizer.encode(prompt,
response.raise_for_status() add_special_tokens=add_special)
assert response.json() == { response = requests.post(base_url + "/tokenize",
"tokens": tokens, json={
"count": len(tokens), "add_generation_prompt":
"max_model_len": 8192 add_generation,
} "continue_final_message":
continue_final,
"add_special_tokens": add_special,
"messages": conversation,
"model": model_name
})
response.raise_for_status()
assert response.json() == {
"tokens": tokens,
"count": len(tokens),
"max_model_len": 8192
}
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -2,14 +2,14 @@ from typing import Dict, List ...@@ -2,14 +2,14 @@ from typing import Dict, List
import openai import openai
import pytest import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_image_base64, fetch_image from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import VLLM_PATH, RemoteOpenAIServer from ...utils import RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja" MAXIMUM_IMAGES = 2
assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [ TEST_IMAGE_URLS = [
...@@ -26,19 +26,23 @@ def server(): ...@@ -26,19 +26,23 @@ def server():
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--max-model-len", "--max-model-len",
"4096", "2048",
"--max-num-seqs",
"5",
"--enforce-eager", "--enforce-eager",
"--chat-template", "--trust-remote-code",
str(LLAVA_CHAT_TEMPLATE), "--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
@pytest.fixture(scope="module") @pytest_asyncio.fixture
def client(server): async def client(server):
return server.get_async_client() async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
...@@ -82,7 +86,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, ...@@ -82,7 +86,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606) completion_tokens=10, prompt_tokens=772, total_tokens=782)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -137,7 +141,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -137,7 +141,7 @@ async def test_single_chat_session_image_base64encoded(
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606) completion_tokens=10, prompt_tokens=772, total_tokens=782)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -215,26 +219,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, ...@@ -215,26 +219,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize(
"image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_url: str): image_urls: List[str]):
messages = [{ messages = [{
"role": "role":
"user", "user",
"content": [ "content": [
{ *({
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": image_url "url": image_url
} }
}, } for image_url in image_urls),
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{ {
"type": "text", "type": "text",
"text": "What's in this image?" "text": "What's in this image?"
...@@ -242,20 +242,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, ...@@ -242,20 +242,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
], ],
}] }]
with pytest.raises(openai.BadRequestError): # test multi-image input if len(image_urls) > MAXIMUM_IMAGES:
await client.chat.completions.create( with pytest.raises(openai.BadRequestError): # test multi-image input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
else:
chat_completion = await client.chat.completions.create(
model=model_name, model=model_name,
messages=messages, messages=messages,
max_tokens=10, max_tokens=10,
temperature=0.0, temperature=0.0,
) )
message = chat_completion.choices[0].message
# the server should still work afterwards assert message.content is not None and len(message.content) >= 0
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
import warnings
from typing import Optional
import pytest
from PIL import Image
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (parse_chat_messages,
parse_chat_messages_futures)
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.utils import encode_image_base64
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
@pytest.fixture(scope="module")
def phi3v_model_config():
return ModelConfig(PHI3V_MODEL_ID,
PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
limit_mm_per_prompt={
"image": 2,
})
@pytest.fixture(scope="module")
def phi3v_tokenizer():
return TokenizerGroup(
tokenizer_id=PHI3V_MODEL_ID,
enable_lora=False,
max_num_seqs=5,
max_input_length=None,
)
@pytest.fixture(scope="module")
def image_url():
image = ImageAsset('cherry_blossom')
base64 = encode_image_base64(image.pil_image)
return f"data:image/jpeg;base64,{base64}"
def _assert_mm_data_is_image_input(
mm_data: Optional[MultiModalDataDict],
image_count: int,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"image"}
image_data = mm_data.get("image")
assert image_data is not None
if image_count == 1:
assert isinstance(image_data, Image.Image)
else:
assert isinstance(image_data, list) and len(image_data) == image_count
def test_parse_chat_messages_single_image(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in the image?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role": "user",
"content": "<|image_1|>\nWhat's in the image?"
}]
_assert_mm_data_is_image_input(mm_data, 1)
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future = parse_chat_messages_futures([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in the image?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role": "user",
"content": "<|image_1|>\nWhat's in the image?"
}]
_assert_mm_data_is_image_input(await mm_future, 1)
def test_parse_chat_messages_multiple_images(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in these images?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role":
"user",
"content":
"<|image_1|>\n<|image_2|>\nWhat's in these images?"
}]
_assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future = parse_chat_messages_futures([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in these images?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role":
"user",
"content":
"<|image_1|>\n<|image_2|>\nWhat's in these images?"
}]
_assert_mm_data_is_image_input(await mm_future, 2)
def test_parse_chat_messages_placeholder_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type":
"text",
"text":
"What's in <|image_1|> and how does it compare to <|image_2|>?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role":
"user",
"content":
"What's in <|image_1|> and how does it compare to <|image_2|>?"
}]
_assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_placeholder_one_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type":
"text",
"text":
"What's in <|image_1|> and how does it compare to the other one?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role":
"user",
"content":
"<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
"other one?"
}]
_assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_multiple_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in this image?"
}]
}, {
"role": "assistant",
"content": "Some stuff."
}, {
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What about this one?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [
{
"role": "user",
"content": "<|image_1|>\nWhat's in this image?"
},
{
"role": "assistant",
"content": "Some stuff."
},
{
"role": "user",
"content": "<|image_2|>\nWhat about this one?"
},
]
_assert_mm_data_is_image_input(mm_data, 2)
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="coroutine 'async_get_and_parse_image' was never awaited")
with pytest.raises(
ValueError,
match="At most 2 image\\(s\\) may be provided in one request\\."
):
parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in these images?"
}]
}], phi3v_model_config, phi3v_tokenizer)
def test_parse_chat_messages_rejects_too_many_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="coroutine 'async_get_and_parse_image' was never awaited")
with pytest.raises(
ValueError,
match="At most 2 image\\(s\\) may be provided in one request\\."
):
parse_chat_messages([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in this image?"
}]
}, {
"role": "assistant",
"content": "Some stuff."
}, {
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What about these two?"
}]
}], phi3v_model_config, phi3v_tokenizer)
...@@ -2,6 +2,13 @@ from typing import Optional, Tuple, Union ...@@ -2,6 +2,13 @@ from typing import Optional, Tuple, Union
import torch import torch
from vllm.utils import is_hip
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8_MAX = 224.0
FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
return torch.as_tensor(x, dtype=torch.float32, device='cuda') return torch.as_tensor(x, dtype=torch.float32, device='cuda')
...@@ -11,13 +18,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor, ...@@ -11,13 +18,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
scale_ub: Optional[torch.tensor] = None) \ scale_ub: Optional[torch.tensor] = None) \
-> Tuple[torch.tensor, torch.tensor]: -> Tuple[torch.tensor, torch.tensor]:
assert quant_dtype in [torch.int8, torch.float8_e4m3fn] assert quant_dtype in [torch.int8, FP8_DTYPE]
if scale_ub is not None: if scale_ub is not None:
assert quant_dtype == torch.float8_e4m3fn assert quant_dtype == FP8_DTYPE
qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
else torch.finfo(quant_dtype) else torch.finfo(quant_dtype)
qtype_max = as_float32_tensor(qtype_traits.max) qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max
qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min
qtype_max = as_float32_tensor(qtype_traits_max)
s_1 = as_float32_tensor(1.0) s_1 = as_float32_tensor(1.0)
s_512 = as_float32_tensor(512.0) s_512 = as_float32_tensor(512.0)
...@@ -37,15 +46,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor, ...@@ -37,15 +46,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
iscales = as_float32_tensor(s_1 / scales) iscales = as_float32_tensor(s_1 / scales)
torch_out = as_float32_tensor(x) * iscales torch_out = as_float32_tensor(x) * iscales
torch_out = torch_out.round() torch_out = torch_out.round()
torch_out = torch_out.clamp(qtype_traits.min, torch_out = torch_out.clamp(qtype_traits_min,
qtype_traits.max).to(quant_dtype) qtype_traits_max).to(quant_dtype)
else: else:
assert quant_dtype == torch.float8_e4m3fn assert quant_dtype == FP8_DTYPE
min_scaling_factor = s_1 / (qtype_max * s_512) min_scaling_factor = s_1 / (qtype_max * s_512)
scales = scales.clamp(min=min_scaling_factor) scales = scales.clamp(min=min_scaling_factor)
torch_out = as_float32_tensor(x) / scales torch_out = as_float32_tensor(x) / scales
torch_out = torch_out.clamp(qtype_traits.min, torch_out = torch_out.clamp(qtype_traits_min,
qtype_traits.max).to(quant_dtype) qtype_traits_max).to(quant_dtype)
return torch_out, scales return torch_out, scales
...@@ -56,8 +65,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, ...@@ -56,8 +65,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-> Tuple[torch.tensor, torch.tensor]: -> Tuple[torch.tensor, torch.tensor]:
fp8_traits = torch.finfo(torch.float8_e4m3fn) fp8_traits = torch.finfo(FP8_DTYPE)
fp8_max = as_float32_tensor(fp8_traits.max) fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max
fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min
fp8_max = as_float32_tensor(fp8_traits_max)
one = as_float32_tensor(1.0) one = as_float32_tensor(1.0)
# For fp8, in order to match the cuda kernel output, we have to do exactly # For fp8, in order to match the cuda kernel output, we have to do exactly
...@@ -68,5 +79,5 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ ...@@ -68,5 +79,5 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
ref_scale = x_max / fp8_max ref_scale = x_max / fp8_max
ref_iscale = one / ref_scale ref_iscale = one / ref_scale
ref_out = (as_float32_tensor(x) * ref_iscale).clamp( ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn) fp8_traits_min, fp8_traits_max).to(FP8_DTYPE)
return ref_out, ref_scale return ref_out, ref_scale.view((1, ))
...@@ -3,8 +3,11 @@ from typing import Type ...@@ -3,8 +3,11 @@ from typing import Type
import pytest import pytest
import torch import torch
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
NewGELU, SiluAndMul) NewGELU, QuickGELU,
SiluAndMul)
from vllm.utils import seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
...@@ -32,25 +35,33 @@ def test_act_and_mul( ...@@ -32,25 +35,33 @@ def test_act_and_mul(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, 2 * d, dtype=dtype) x = torch.randn(num_tokens, 2 * d, dtype=dtype)
if activation == "silu": if activation == "silu":
layer = SiluAndMul() layer = SiluAndMul()
fn = torch.ops._C.silu_and_mul
elif activation == "gelu": elif activation == "gelu":
layer = GeluAndMul(approximate="none") layer = GeluAndMul(approximate="none")
fn = torch.ops._C.gelu_and_mul
elif activation == "gelu_tanh": elif activation == "gelu_tanh":
layer = GeluAndMul(approximate="tanh") layer = GeluAndMul(approximate="tanh")
fn = torch.ops._C.gelu_tanh_and_mul
out = layer(x) out = layer(x)
ref_out = layer.forward_native(x) ref_out = layer.forward_native(x)
# The SiLU and GELU implementations are equivalent to the native PyTorch # The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison. # implementations, so we can do exact comparison.
assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
opcheck(fn, (out, x))
@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
(NewGELU, torch.ops._C.gelu_new),
(QuickGELU, torch.ops._C.gelu_quick)])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D) @pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
...@@ -65,15 +76,17 @@ def test_activation( ...@@ -65,15 +76,17 @@ def test_activation(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
x = torch.randn(num_tokens, d, dtype=dtype) x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation() layer = activation[0]()
fn = activation[1]
out = layer(x) out = layer(x)
ref_out = layer.forward_native(x) ref_out = layer.forward_native(x)
assert torch.allclose(out, torch.testing.assert_close(out,
ref_out, ref_out,
atol=get_default_atol(out), atol=get_default_atol(out),
rtol=get_default_rtol(out)) rtol=get_default_rtol(out))
out = torch.empty_like(x)
opcheck(fn, (out, x))
import torch
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops # noqa: F401
def test_aqlm_dequant_opcheck():
codes = torch.randint(-32768,
32767, (22016, 512, 1),
device='cuda',
dtype=torch.int16)
codebooks = torch.rand((2, 65536, 1, 8),
device='cuda',
dtype=torch.float16)
codebook_partition_sizes = [11008, 11008]
opcheck(torch.ops._C.aqlm_dequant,
(codes, codebooks, codebook_partition_sizes))
def test_aqlm_gemm_opcheck():
input = torch.rand((4, 4096), device='cuda', dtype=torch.float16)
codes = torch.randint(-32768,
32767, (12288, 512, 1),
device='cuda',
dtype=torch.int16)
codebooks = torch.rand((3, 65536, 1, 8),
device='cuda',
dtype=torch.float16)
scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16)
codebook_partition_sizes = [4096, 4096, 4096]
bias = None
opcheck(torch.ops._C.aqlm_gemm,
(input, codes, codebooks, scales, codebook_partition_sizes, None))
opcheck(torch.ops._C.aqlm_gemm,
(input, codes, codebooks, scales, codebook_partition_sizes, bias))
...@@ -3,14 +3,17 @@ from typing import List, Optional, Tuple ...@@ -3,14 +3,17 @@ from typing import List, Optional, Tuple
import pytest import pytest
import torch import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import get_max_shared_memory_bytes, is_hip from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
from .allclose_default import get_default_atol, get_default_rtol from .allclose_default import get_default_atol, get_default_rtol
if not is_hip():
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
# - 512 as a buffer # - 512 as a buffer
...@@ -29,8 +32,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing ...@@ -29,8 +32,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
# FlashAttention forward only supports head dimension at most 128 # FlashAttention forward only supports head dimension at most 128
# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
] if not is_hip() else [64, 80, 96, 112, 128]
BLOCK_SIZES = [16, 32] BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True] USE_ALIBI = [False, True]
...@@ -112,7 +114,8 @@ def ref_single_query_cached_kv_attention( ...@@ -112,7 +114,8 @@ def ref_single_query_cached_kv_attention(
output[i].copy_(out, non_blocking=True) output[i].copy_(out, non_blocking=True)
@pytest.mark.parametrize("version", ["v1", "v2"]) @pytest.mark.parametrize(
"version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"])
@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
...@@ -135,12 +138,11 @@ def test_paged_attention( ...@@ -135,12 +138,11 @@ def test_paged_attention(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
if kv_cache_dtype == "fp8" and head_size % 16: if ((kv_cache_dtype == "fp8" and head_size % 16)
or (version == "rocm" and head_size not in (64, 128))):
pytest.skip() pytest.skip()
random.seed(seed)
torch.random.manual_seed(seed) seed_everything(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_query_heads, num_kv_heads = num_heads num_query_heads, num_kv_heads = num_heads
...@@ -199,7 +201,15 @@ def test_paged_attention( ...@@ -199,7 +201,15 @@ def test_paged_attention(
k_scale, k_scale,
v_scale, v_scale,
) )
elif version == "v2":
opcheck(torch.ops._C.paged_attention_v1,
(output, query, key_cache, value_cache, num_kv_heads, scale,
block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
cond=(head_size == HEAD_SIZES[0]
and block_size == BLOCK_SIZES[0]))
elif version in ("v2", "rocm"):
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
assert PARTITION_SIZE % block_size == 0 assert PARTITION_SIZE % block_size == 0
num_seqs, num_heads, head_size = output.shape num_seqs, num_heads, head_size = output.shape
...@@ -212,25 +222,64 @@ def test_paged_attention( ...@@ -212,25 +222,64 @@ def test_paged_attention(
dtype=torch.float32, dtype=torch.float32,
) )
max_logits = torch.empty_like(exp_sums) max_logits = torch.empty_like(exp_sums)
ops.paged_attention_v2( if version == "v2":
output, ops.paged_attention_v2(
exp_sums, output,
max_logits, exp_sums,
tmp_output, max_logits,
query, tmp_output,
key_cache, query,
value_cache, key_cache,
num_kv_heads, value_cache,
scale, num_kv_heads,
block_tables, scale,
seq_lens, block_tables,
block_size, seq_lens,
max_seq_len, block_size,
alibi_slopes, max_seq_len,
kv_cache_dtype, alibi_slopes,
k_scale, kv_cache_dtype,
v_scale, k_scale,
) v_scale,
)
opcheck(torch.ops._C.paged_attention_v2,
(output, exp_sums, max_logits, tmp_output, query,
key_cache, value_cache, num_kv_heads, scale, block_tables,
seq_lens, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
cond=(head_size == HEAD_SIZES[0]
and block_size == BLOCK_SIZES[0]))
else:
ops.paged_attention_rocm(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
opcheck(torch.ops._rocm_C.paged_attention,
(output, exp_sums, max_logits, tmp_output, query,
key_cache, value_cache, num_kv_heads, scale, block_tables,
seq_lens, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0]
and block_size == BLOCK_SIZES[0]))
else: else:
raise AssertionError(f"Unknown version: {version}") raise AssertionError(f"Unknown version: {version}")
...@@ -277,7 +326,7 @@ def test_paged_attention( ...@@ -277,7 +326,7 @@ def test_paged_attention(
atol, rtol = 1e-3, 1e-5 atol, rtol = 1e-3, 1e-5
if kv_cache_dtype == "fp8": if kv_cache_dtype == "fp8":
atol, rtol = 1e-2, 1e-5 atol, rtol = 1e-2, 1e-5
assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
def ref_multi_query_kv_attention( def ref_multi_query_kv_attention(
...@@ -320,6 +369,8 @@ def ref_multi_query_kv_attention( ...@@ -320,6 +369,8 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.skipif(is_hip(),
reason="Xformers backend is not supported on ROCm.")
@torch.inference_mode() @torch.inference_mode()
def test_multi_query_kv_attention( def test_multi_query_kv_attention(
num_seqs: int, num_seqs: int,
...@@ -329,10 +380,7 @@ def test_multi_query_kv_attention( ...@@ -329,10 +380,7 @@ def test_multi_query_kv_attention(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
random.seed(seed) seed_everything(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
# MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
# As the xformers library is already tested with its own tests, we can use # As the xformers library is already tested with its own tests, we can use
...@@ -382,4 +430,4 @@ def test_multi_query_kv_attention( ...@@ -382,4 +430,4 @@ def test_multi_query_kv_attention(
) )
atol = get_default_atol(output) if is_hip() else 1e-3 atol = get_default_atol(output) if is_hip() else 1e-3
rtol = get_default_rtol(output) if is_hip() else 1e-5 rtol = get_default_rtol(output) if is_hip() else 1e-5
assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
...@@ -3,9 +3,9 @@ from unittest.mock import patch ...@@ -3,9 +3,9 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL, from tests.kernels.utils import override_backend_env_variable
override_backend_env_variable)
from vllm.attention.selector import which_attn_to_use from vllm.attention.selector import which_attn_to_use
from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -20,22 +20,22 @@ def test_env(name: str, device: str, monkeypatch): ...@@ -20,22 +20,22 @@ def test_env(name: str, device: str, monkeypatch):
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.is_cpu", return_value=True): with patch("vllm.attention.selector.is_cpu", return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(16, None, torch.float16, torch.float16,
torch.float16, 16) 16, False)
assert backend.name == "TORCH_SDPA" assert backend.name == "TORCH_SDPA"
elif device == "hip": elif device == "hip":
with patch("vllm.attention.selector.is_hip", return_value=True): with patch("vllm.attention.selector.is_hip", return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(16, None, torch.float16, torch.float16,
torch.float16, 16) 16, False)
assert backend.name == "ROCM_FLASH" assert backend.name == "ROCM_FLASH"
elif device == "openvino": elif device == "openvino":
with patch("vllm.attention.selector.is_openvino", return_value=True): with patch("vllm.attention.selector.is_openvino", return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(16, None, torch.float16, torch.float16,
torch.float16, 16) 16, False)
assert backend.name == "OPENVINO" assert backend.name == "OPENVINO"
else: else:
backend = which_attn_to_use(8, 16, 8, None, torch.float16, backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16,
torch.float16, 16) False)
assert backend.name == name assert backend.name == name
...@@ -45,33 +45,38 @@ def test_flash_attn(monkeypatch): ...@@ -45,33 +45,38 @@ def test_flash_attn(monkeypatch):
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]): with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) backend = which_attn_to_use(16, None, torch.float16, None, 16, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported data type # Unsupported data type
backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16) backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported kv cache data type # Unsupported kv cache data type
backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16) backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported block size # Unsupported block size
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8) backend = which_attn_to_use(16, None, torch.float16, None, 8, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported sliding window # Unsupported sliding window
backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16) backend = which_attn_to_use(16, 1, torch.float16, None, 16, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# flash-attn is not installed # flash-attn is not installed
with patch.dict('sys.modules', {'vllm_flash_attn': None}): with patch.dict('sys.modules', {'vllm_flash_attn': None}):
backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) backend = which_attn_to_use(16, None, torch.float16, None, 16, False)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
# Unsupported head size # Unsupported head size
backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) backend = which_attn_to_use(17, None, torch.float16, None, 16, False)
assert backend.name != STR_FLASH_ATTN_VAL
# Attention-free models should bypass env and use PlaceholderAttention
backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16,
True)
assert backend.name != STR_FLASH_ATTN_VAL assert backend.name != STR_FLASH_ATTN_VAL
...@@ -79,4 +84,4 @@ def test_invalid_env(monkeypatch): ...@@ -79,4 +84,4 @@ def test_invalid_env(monkeypatch):
"""Throw an exception if the backend name is invalid.""" """Throw an exception if the backend name is invalid."""
override_backend_env_variable(monkeypatch, STR_INVALID_VAL) override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
with pytest.raises(ValueError): with pytest.raises(ValueError):
which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) which_attn_to_use(16, None, torch.float16, None, 16, False)
\ No newline at end of file \ No newline at end of file
import os
import pytest
import torch
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.")
def test_awq_dequantize_opcheck():
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
qweight = torch.randint(-2000000000,
2000000000, (8192, 256),
device='cuda',
dtype=torch.int32)
scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
split_k_iters = 0
thx = 0
thy = 0
opcheck(torch.ops._C.awq_dequantize,
(qweight, scales, zeros, split_k_iters, thx, thy))
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.")
def test_awq_gemm_opcheck():
os.environ["VLLM_USE_TRITON_AWQ"] = "0"
input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
qweight = torch.randint(-2000000000,
2000000000, (8192, 256),
device='cuda',
dtype=torch.int32)
scales = torch.randint(-2000000000,
2000000000, (64, 256),
device='cuda',
dtype=torch.int32)
qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
split_k_iters = 8
opcheck(torch.ops._C.awq_gemm,
(input, qweight, qzeros, scales, split_k_iters))
"""Test AWQ with fused MoE Marlin kernels.
Run `pytest tests/kernels/test_awq_marlin.py`.
"""
import pytest
import torch
from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
torch_moe_single)
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
fused_marlin_moe, single_marlin_moe)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
awq_marlin_quantize)
from vllm.scalar_type import scalar_types
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
@pytest.mark.skipif(not (ops.supports_moe_ops
and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
reason="Marlin is not supported on this GPU type.")
def test_fused_marlin_moe_awq(
m: int,
n: int,
k: int,
e: int,
topk: int,
group_size: int,
):
torch.manual_seed(7)
num_bits = 4
quant_type = scalar_types.uint4
dtype = torch.float16
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
w_ref1_l = []
qweights1_l = []
scales1_l = []
zp1_l = []
for i in range(w1.shape[0]):
w_ref1, qweight1, scales1, zp1 = awq_marlin_quantize(
w1[i].transpose(1, 0), quant_type, group_size)
w_ref1_l.append(w_ref1)
qweights1_l.append(qweight1)
scales1_l.append(scales1)
zp1_l.append(zp1)
w_ref1 = stack_and_dev(w_ref1_l)
qweight1 = stack_and_dev(qweights1_l).contiguous()
scales1 = stack_and_dev(scales1_l)
zp1 = stack_and_dev(zp1_l)
w_ref2_l = []
qweights2_l = []
scales2_l = []
zp2_l = []
for i in range(w2.shape[0]):
w_ref2, qweight2, scales2, zp2 = awq_marlin_quantize(
w2[i].transpose(1, 0), quant_type, group_size)
w_ref2_l.append(w_ref2)
qweights2_l.append(qweight2)
scales2_l.append(scales2)
zp2_l.append(zp2)
w_ref2 = stack_and_dev(w_ref2_l)
qweight2 = stack_and_dev(qweights2_l).contiguous()
scales2 = stack_and_dev(scales2_l)
zp2 = stack_and_dev(zp2_l)
score = torch.randn((m, e), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, False)
marlin_output = fused_marlin_moe(
a,
qweight1,
qweight2,
scales1,
scales2,
score,
topk_weights,
topk_ids,
w1_zeros=zp1,
w2_zeros=zp2,
num_bits=num_bits,
)
torch_output = torch_moe(
a,
w_ref1.transpose(1, 2),
w_ref2.transpose(1, 2),
score,
topk,
)
assert compute_max_diff(marlin_output, torch_output) < 4e-2
@pytest.mark.skip("This test is here for the sake of debugging, "
"don't run it in automated tests.")
@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 1024, 512])
@pytest.mark.parametrize("e", [8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
def test_single_marlin_moe_multiply_awq(
m: int,
n: int,
k: int,
e: int,
topk: int,
group_size: int,
):
torch.manual_seed(7)
num_bits = 4
quant_type = scalar_types.uint4
dtype = torch.float16
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
w_ref_l = []
qweights_l = []
scales_l = []
zp_l = []
for i in range(w.shape[0]):
w_ref, qweight, scales, zp = awq_marlin_quantize(
w[i].transpose(1, 0), quant_type, group_size)
w_ref_l.append(w_ref)
qweights_l.append(qweight)
scales_l.append(scales)
zp_l.append(zp)
w_ref = stack_and_dev(w_ref_l)
qweight = stack_and_dev(qweights_l).contiguous()
scales = stack_and_dev(scales_l).contiguous()
zp = stack_and_dev(zp_l).contiguous()
score = torch.randn((m, e), device="cuda", dtype=dtype)
marlin_output = single_marlin_moe(a,
qweight,
scales,
score,
topk,
renormalize=False,
w_zeros=zp,
num_bits=num_bits)
torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
assert compute_max_diff(marlin_output, torch_output) < 1e-2
"""Tests for the AWQ Triton kernel.
Run `pytest tests/kernels/test_awq_triton.py`.
"""
import pytest
import torch
from vllm.model_executor.layers.quantization.awq_triton import (
AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
from vllm.utils import seed_everything
device = "cuda"
def reverse_awq_order(t: torch.Tensor):
bits = 4
AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
reverse_order_tensor = torch.arange(
t.shape[-1],
dtype=torch.int32,
device=t.device,
)
reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
reverse_order_tensor = reverse_order_tensor.view(-1)
t = t[:, reverse_order_tensor] & 0xF
return t
# qweights - [R , C // 8], int32
# scales - [R // G, C ], float16
# zeros - [R // G, C // 8], int32
def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
qzeros: torch.Tensor,
group_size: int) -> torch.Tensor:
if group_size == -1:
group_size = qweight.shape[0]
bits = 4
shifts = torch.arange(0, 32, bits, device=qzeros.device)
iweights = torch.bitwise_right_shift(qweight[:, :, None],
shifts[None, None, :]).to(torch.int8)
iweights = iweights.view(iweights.shape[0], -1)
zeros = torch.bitwise_right_shift(qzeros[:, :, None],
shifts[None, None, :]).to(torch.int8)
zeros = zeros.view(qzeros.shape[0], -1)
zeros = reverse_awq_order(zeros)
iweights = reverse_awq_order(iweights)
iweights = torch.bitwise_and(iweights, (2**bits) - 1)
zeros = torch.bitwise_and(zeros, (2**bits) - 1)
scales = scales.repeat_interleave(group_size, dim=0)
zeros = zeros.repeat_interleave(group_size, dim=0)
return (iweights - zeros) * scales
# qweights - [R , C // 8], int32
# scales - [R // G, C ], float16
# zeros - [R // G, C // 8], int32
@pytest.mark.parametrize("qweight_rows", [3584, 18944, 128, 256, 512, 1024])
@pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128])
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
def test_dequantize(qweight_rows, qweight_cols, group_size):
if group_size == -1:
group_size = qweight_rows
qweight_dtype = torch.int32
scales_rows = qweight_rows // group_size
scales_cols = qweight_cols * 8
scales_dtype = torch.float16
zeros_rows = scales_rows
zeros_cols = qweight_cols
zeros_dtype = torch.int32
seed_everything(0)
qweight = torch.randint(0,
torch.iinfo(torch.int32).max,
(qweight_rows, qweight_cols),
dtype=qweight_dtype,
device=device)
scales = torch.rand(scales_rows,
scales_cols,
dtype=scales_dtype,
device=device)
zeros = torch.randint(0,
torch.iinfo(torch.int32).max,
(zeros_rows, zeros_cols),
dtype=zeros_dtype,
device=device)
iweights_triton = awq_dequantize_triton(qweight, scales, zeros)
assert (not torch.any(torch.isinf(iweights_triton))
and not torch.any(torch.isnan(iweights_triton)))
iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size)
torch.testing.assert_close(iweights_triton, iweights_torch)
# input - [N, K]
# qweight - [K, M // 8]
# qzeros - [K // G, M // 8]
# scales - [K // G, M]
@pytest.mark.parametrize("N", [1, 2, 4, 8, 14, 17, 23, 32])
@pytest.mark.parametrize("K", [128])
@pytest.mark.parametrize("M", [16, 24, 32])
@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("splitK", [1, 8])
def test_gemm(N, K, M, splitK, group_size):
if group_size == -1:
group_size = K
split_k_iters = splitK
input_rows = N
input_cols = K
input_dtype = torch.float32
qweight_rows = input_cols
qweight_cols = M // 8
scales_rows = qweight_rows // group_size
scales_cols = M
scales_dtype = torch.float32
qzeros_rows = scales_rows
qzeros_cols = qweight_cols
seed_everything(0)
input = torch.rand((input_rows, input_cols),
dtype=input_dtype,
device=device)
qweight = torch.randint(0,
torch.iinfo(torch.int32).max,
(qweight_rows, qweight_cols),
device=device)
qzeros = torch.randint(0,
torch.iinfo(torch.int32).max,
(qzeros_rows, qzeros_cols),
device=device)
scales = torch.rand((scales_rows, scales_cols),
dtype=scales_dtype,
device=device)
output_triton = awq_gemm_triton(input, qweight, scales, qzeros,
split_k_iters)
assert (not torch.any(torch.isinf(output_triton))
and not torch.any(torch.isnan(output_triton)))
dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros)
output_torch = torch.matmul(input, dequantized_weights)
assert (not torch.any(torch.isinf(output_torch))
and not torch.any(torch.isnan(output_torch)))
torch.testing.assert_close(output_triton.cpu(),
output_torch.cpu(),
atol=1e-1,
rtol=1e-1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment