Commit 78c1f9e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1(tests)

parent 86a65417
...@@ -74,7 +74,7 @@ class TestSetting: ...@@ -74,7 +74,7 @@ class TestSetting:
), ),
pytest.param( pytest.param(
TestSetting( TestSetting(
model="BAAI/bge-base-en-v1.5", model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
model_args=["--runner", "pooling"], model_args=["--runner", "pooling"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy import copy
import logging
from contextlib import nullcontext from contextlib import nullcontext
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
...@@ -19,7 +18,6 @@ from vllm.config import ( ...@@ -19,7 +18,6 @@ from vllm.config import (
) )
from vllm.config.compilation import CompilationMode, PassConfig from vllm.config.compilation import CompilationMode, PassConfig
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.logger import _print_warning_once
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import ( from vllm.utils.torch_utils import (
_is_torch_equal_or_newer, _is_torch_equal_or_newer,
......
...@@ -42,7 +42,6 @@ class MockModelConfig: ...@@ -42,7 +42,6 @@ class MockModelConfig:
tokenizer_revision = None tokenizer_revision = None
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
hf_text_config = MockHFConfig()
logits_processor_pattern = None logits_processor_pattern = None
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH, models_path_prefix
# ChatML example template from the repository; every case below renders it.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# All generation cases use the same model, resolved under models_path_prefix.
_OPT_125M_PATH = os.path.join(models_path_prefix, "facebook/opt-125m")

# Each entry is (model, template, add_generation_prompt,
# continue_final_message, expected_output), matching the parametrize
# signature of test_get_gen_prompt.
MODEL_TEMPLATE_GENERATION_OUTPUT = [
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        True,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
    ),
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        False,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
    ),
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        False,
        True,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
    ),
]

# Base conversation sent with every case.
TEST_MESSAGES = [
    dict(role="user", content="Hello"),
    dict(role="assistant", content="Hi there!"),
    dict(role="user", content="What is the capital of"),
]

# Extra trailing assistant turn appended when continue_final_message is set.
ASSISTANT_MESSAGE_TO_CONTINUE = dict(role="assistant", content="The capital of")
def test_load_chat_template():
    """Check that loading the ChatML example file returns its exact text."""
    # Hard coded value for template_chatml.jinja
    expected_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501

    loaded_template = load_chat_template(chat_template=chatml_jinja_path)

    assert loaded_template is not None
    assert loaded_template == expected_template
def test_no_load_chat_template_filelike():
    """Expect a ValueError mentioning a file path for a path-like string."""
    path_like_template = "../../examples/does_not_exist"

    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=path_like_template)
def test_no_load_chat_template_literallike():
    """A literal Jinja expression must be returned unchanged."""
    literal_template = "{{ messages }}"

    assert load_chat_template(chat_template=literal_template) == literal_template
@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Render the test conversation through a chat template and compare.

    Builds a ModelConfig and tokenizer for ``model``, loads ``template``,
    wraps the messages in a ChatCompletionRequest and checks that
    apply_hf_chat_template yields ``expected_output``.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail="skip")

    needs_embed_inputs = hf_info.require_embed_inputs
    model_config = ModelConfig(
        model,
        tokenizer=hf_info.tokenizer or model,
        tokenizer_mode=hf_info.tokenizer_mode,
        trust_remote_code=hf_info.trust_remote_code,
        revision=hf_info.revision,
        hf_overrides=hf_info.hf_overrides,
        skip_tokenizer_init=needs_embed_inputs,
        enable_prompt_embeds=needs_embed_inputs,
        enable_mm_embeds=needs_embed_inputs,
        enforce_eager=hf_info.enforce_eager,
        dtype=hf_info.dtype,
    )

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    template_content = load_chat_template(chat_template=template)

    # When continuing the final message, the conversation ends with the
    # partial assistant turn; otherwise it is the base conversation.
    if continue_final_message:
        conversation = TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
    else:
        conversation = TEST_MESSAGES

    # Create a request object using keyword arguments
    chat_request = ChatCompletionRequest(
        model=model,
        messages=conversation,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # A template set on the request takes precedence over the loaded one.
    rendered_prompt = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=chat_request.messages,
        chat_template=chat_request.chat_template or template_content,
        model_config=model_config,
        tools=None,
        add_generation_prompt=chat_request.add_generation_prompt,
        continue_final_message=chat_request.continue_final_message,
    )

    assert rendered_prompt == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np
import openai
import pytest
import os
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
from utils import RemoteOpenAIServer, models_path_prefix
# Embedding model under test, resolved under models_path_prefix.
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
# Minimal chat template: renders every message as "<role>: <content>\n".
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
# dtype passed to the server (--dtype) and to the HF reference runner.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for MODEL_NAME, shared by the module."""
    cli_flags = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_flags += ["--dtype", DTYPE]
    cli_flags += ["--enforce-eager"]
    cli_flags += ["--max-model-len", "512"]
    cli_flags += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from the ``server`` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Load MODEL_NAME through ``hf_runner`` as the reference implementation."""
    runner_options = {"dtype": DTYPE, "is_sentence_transformer": True}
    with hf_runner(MODEL_NAME, **runner_options) as reference_model:
        yield reference_model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
                                model_name: str):
    """Embed one text prompt, then one token-ID prompt, and check both."""

    async def embed(prompt_input):
        # Request float embeddings and re-validate with the vLLM schema.
        raw_response = await client.embeddings.create(
            model=model_name,
            input=prompt_input,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json"))

    def check_single(parsed, expected_tokens):
        # One 384-dim vector; usage counts prompt tokens only.
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == expected_tokens
        assert parsed.usage.total_tokens == expected_tokens

    # test single embedding
    input_texts = ["The chef prepared a delicious meal."]
    text_embeddings = await embed(input_texts)
    check_single(text_embeddings, 11)

    vllm_outputs = [item.embedding for item in text_embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test using token IDs
    token_embeddings = await embed([1, 1, 1, 1, 1])
    check_single(token_embeddings, 5)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
                               model_name: str):
    """Embed a batch of strings, then a batch of token-ID lists."""

    async def embed(batch_input):
        # Request float embeddings and re-validate with the vLLM schema.
        raw_response = await client.embeddings.create(
            model=model_name,
            input=batch_input,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json"))

    def check_batch(parsed, expected_items, expected_tokens):
        assert parsed.id is not None
        assert len(parsed.data) == expected_items
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == expected_tokens
        assert parsed.usage.total_tokens == expected_tokens

    # test list[str]
    input_texts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_embeddings = await embed(input_texts)
    check_batch(text_embeddings, 3, 33)

    vllm_outputs = [item.embedding for item in text_embeddings.data]
    run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)

    # test list[list[int]]
    input_tokens = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_embeddings = await embed(input_tokens)
    check_batch(token_embeddings, 4, 17)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare a ``messages`` request with the equivalent rendered prompt."""
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]

    # Chat path: post the raw messages to the embeddings endpoint.
    chat_payload = {
        "model": model_name,
        "messages": conversation,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    # Completion path: render the same template locally and embed the text.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    rendered_prompt = tokenizer.apply_chat_template(
        conversation,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )
    completion_response = await client.embeddings.create(
        model=model_name,
        input=rendered_prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        completion_response.model_dump(mode="json"))

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    # id and created differ per request, so leave them out of the comparison.
    per_request_fields = {"id", "created"}
    chat_dump = chat_embeddings.model_dump(exclude=per_request_fields)
    completion_dump = completion_embeddings.model_dump(
        exclude=per_request_fields)
    assert chat_dump == completion_dump
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                      model_name: str):
    """Check float, base64 and default encodings against the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    responses_float = await client.embeddings.create(input=input_texts,
                                                     model=model_name,
                                                     encoding_format="float")
    float_data = [item.embedding for item in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # Explicit base64 encoding, decoded here as float32.
    responses_base64 = await client.embeddings.create(
        input=input_texts,
        model=model_name,
        encoding_format="base64")
    base64_data = [
        np.frombuffer(base64.b64decode(item.embedding),
                      dtype="float32").tolist()
        for item in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(input=input_texts,
                                                       model=model_name)
    default_data = [item.embedding for item in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Check that truncate_prompt_tokens=10 caps the reported prompt tokens."""

    async def embed_truncated(prompt_input):
        raw_response = await client.embeddings.create(
            model=model_name,
            input=prompt_input,
            extra_body={"truncate_prompt_tokens": 10})
        return EmbeddingResponse.model_validate(
            raw_response.model_dump(mode="json"))

    def check_truncated(parsed):
        assert parsed.id is not None
        assert len(parsed.data) == 1
        assert len(parsed.data[0].embedding) == 384
        assert parsed.usage.completion_tokens == 0
        assert parsed.usage.prompt_tokens == 10
        assert parsed.usage.total_tokens == 10

    # test single embedding
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    check_truncated(await embed_truncated(input_texts))

    # Same check with a token-ID prompt longer than the truncation size.
    input_tokens = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    check_truncated(await embed_truncated(input_tokens))
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """Expect a BadRequestError for an oversized truncate_prompt_tokens.

    The request uses truncate_prompt_tokens=8193 and must raise
    ``openai.BadRequestError``; the error text is then checked on the
    captured exception.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # The call raises, so no response object exists afterwards. Assertions
    # on the error must be made against the captured exception instead.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    # NOTE(review): only the trailing fragment of the message is checked;
    # confirm the server wording if this assertion ever fails.
    assert "Please, select a smaller truncation size." in str(exc_info.value)
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer,
                           client: openai.AsyncOpenAI):
    """Compare the client embeddings call with a POST to ``invocations``."""
    request_args = {
        "model": MODEL_NAME,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
    }

    completion_response = await client.embeddings.create(**request_args)

    invocation_response = requests.post(server.url_for("invocations"),
                                        json=request_args)
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()

    assert completion_output.keys() == invocation_output.keys()
    paired_items = zip(completion_output["data"], invocation_output["data"])
    for completion_data, invocation_data in paired_items:
        assert completion_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="completion",
            name_1="invocation")
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """Send one ``messages`` payload to both endpoints and compare results."""
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {
            "role": "user",
            "content": "Stars twinkle brightly in the night sky.",
        },
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": conversation,
        "encoding_format": "float",
    }

    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=request_args)
    chat_response.raise_for_status()

    invocation_response = requests.post(server.url_for("invocations"),
                                        json=request_args)
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()

    assert chat_output.keys() == invocation_output.keys()
    paired_items = zip(chat_output["data"], invocation_output["data"])
    for chat_data, invocation_data in paired_items:
        assert chat_data.keys() == invocation_data.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_data["embedding"]],
            embeddings_1_lst=[invocation_data["embedding"]],
            name_0="chat",
            name_1="invocation")
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check how the ``normalize`` request field changes the embeddings.

    The same input is embedded with normalize=None, True and False. The
    default must match True, True must differ from False, and L2-normalizing
    the False output must reproduce the True output (all within atol=1e-2).
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        request_args = {
            # Use the parametrized name so the parameter is actually honoured.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize
        }

        response = requests.post(server.url_for("v1/embeddings"),
                                 json=request_args)
        # Surface HTTP errors directly rather than as a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()

        return torch.tensor([x['embedding'] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal,
                          atol=1e-2), "Default should use normal."
    assert not torch.allclose(w_normal, wo_normal,
                              atol=1e-2), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1),
        atol=1e-2), "w_normal should be close to normal(wo_normal)."
# SPDX-License-Identifier: Apache-2.0
import os
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import ScoreResponse
from utils import models_path_prefix
# Models exercised by the score tests. ``is_cross_encoder`` selects how the
# HF reference is loaded and scored (see run_transformers and the runner
# fixture).
MODELS = [
    {
        "name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
        "is_cross_encoder": True
    },
    {
        # Resolved under models_path_prefix like the cross-encoder above,
        # instead of the bare hub id.
        "name": os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
        "is_cross_encoder": False
    },
]
# dtype passed to the server (--dtype) and to the HF reference runner.
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
    """Compute reference scores for ``text_pairs`` with the HF model.

    For a cross-encoder entry the pairs go to ``hf_model.predict`` and the
    result is converted with ``tolist()``. Otherwise each pair is encoded
    with ``hf_model.encode`` and scored by the cosine similarity of its two
    embeddings.
    """
    if model["is_cross_encoder"]:
        return hf_model.predict(text_pairs).tolist()

    # Encode every pair first, then score the two embeddings of each pair.
    encoded_pairs = [hf_model.encode(text_pair) for text_pair in text_pairs]
    similarities = []
    for first_embedding, second_embedding, *_ in (
            (pair[0], pair[1]) for pair in encoded_pairs):
        similarities.append(
            F.cosine_similarity(tensor(first_embedding),
                                tensor(second_embedding),
                                dim=0))
    return similarities
@pytest.fixture(scope="class", params=MODELS)
def model(request):
    """Yield one MODELS entry per parametrized run (class scope)."""
    yield request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
    """Start a RemoteOpenAIServer for the current model (class scope)."""
    launch_args = ["--enforce-eager"]
    launch_args += ["--max-model-len", "100"]
    launch_args += ["--dtype", DTYPE]

    with RemoteOpenAIServer(model["name"], launch_args) as running_server:
        yield running_server
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
    """Load the current model through ``hf_runner`` as the reference."""
    # Exactly one loader flag is set, chosen by the model kind.
    if model["is_cross_encoder"]:
        loader_flag = "is_cross_encoder"
    else:
        loader_flag = "is_sentence_transformer"
    hf_kwargs = {"dtype": DTYPE, loader_flag: True}

    with hf_runner(model["name"], **hf_kwargs) as reference_model:
        yield reference_model
class TestModel:
    """Tests for the ``score`` endpoint using the class-scoped fixtures."""

    def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
                                    model: dict[str, Any], runner):
        """Score one string against a list of two strings."""
        query = "What is the capital of France?"
        documents = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris."
        ]
        payload = {
            "model": model["name"],
            "text_1": query,
            "text_2": documents,
        }

        score_response = requests.post(server.url_for("score"), json=payload)
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 2

        vllm_outputs = [item.score for item in score.data]
        text_pairs = [[query, documents[0]], [query, documents[1]]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for idx, vllm_score in enumerate(vllm_outputs):
            assert hf_outputs[idx] == pytest.approx(vllm_score, rel=0.01)

    def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
                                     model: dict[str, Any], runner):
        """Score two lists of equal length element by element."""
        queries = [
            "What is the capital of the United States?",
            "What is the capital of France?"
        ]
        documents = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris."
        ]
        payload = {
            "model": model["name"],
            "text_1": queries,
            "text_2": documents,
        }

        score_response = requests.post(server.url_for("score"), json=payload)
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 2

        vllm_outputs = [item.score for item in score.data]
        text_pairs = [[queries[0], documents[0]], [queries[1], documents[1]]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for idx, vllm_score in enumerate(vllm_outputs):
            assert hf_outputs[idx] == pytest.approx(vllm_score, rel=0.01)

    def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
                                   model: dict[str, Any], runner):
        """Score a single string against a single string."""
        query = "What is the capital of France?"
        document = "The capital of France is Paris."
        payload = {
            "model": model["name"],
            "text_1": query,
            "text_2": document,
        }

        score_response = requests.post(server.url_for("score"), json=payload)
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == 1

        vllm_outputs = [item.score for item in score.data]
        text_pairs = [[query, document]]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for idx, vllm_score in enumerate(vllm_outputs):
            assert hf_outputs[idx] == pytest.approx(vllm_score, rel=0.01)

    def test_score_max_model_len(self, server: RemoteOpenAIServer,
                                 model: dict[str, Any]):
        """Expect HTTP 400 for an over-long input and an oversized truncation."""
        long_query = "What is the capital of France?" * 20
        documents = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris."
        ]

        payload = {
            "model": model["name"],
            "text_1": long_query,
            "text_2": documents,
        }
        score_response = requests.post(server.url_for("score"), json=payload)
        assert score_response.status_code == 400
        # Assert just a small fragments of the response
        assert "Please reduce the length of the input." in \
            score_response.text

        # Test truncation
        truncation_payload = {
            "model": model["name"],
            "text_1": long_query,
            "text_2": documents,
            "truncate_prompt_tokens": 101
        }
        score_response = requests.post(server.url_for("score"),
                                       json=truncation_payload)
        assert score_response.status_code == 400
        assert "Please, select a smaller truncation size." in \
            score_response.text

    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str,
                                                                       Any]):
        """Compare ``score`` and ``invocations`` for the same payload."""
        request_args = {
            "model": model["name"],
            "text_1": "What is the capital of France?",
            "text_2": "The capital of France is Paris.",
        }

        score_response = requests.post(server.url_for("score"),
                                       json=request_args)
        score_response.raise_for_status()

        invocation_response = requests.post(server.url_for("invocations"),
                                            json=request_args)
        invocation_response.raise_for_status()

        score_output = score_response.json()
        invocation_output = invocation_response.json()

        assert score_output.keys() == invocation_output.keys()
        paired_items = zip(score_output["data"], invocation_output["data"])
        for score_data, invocation_data in paired_items:
            assert score_data.keys() == invocation_data.keys()
            assert score_data["score"] == pytest.approx(
                invocation_data["score"], rel=0.05)
            # TODO: reset this tolerance to 0.01 once we find
            # an alternative to flash_attn with bfloat16

    def test_activation(self, server: RemoteOpenAIServer, model: dict[str,
                                                                      Any]):
        """Check the ``activation`` request field for both model kinds."""

        def get_outputs(activation):
            # Returns a score tensor on HTTP 200, else the raw response.
            payload = {
                "model": model["name"],
                "text_1": "What is the capital of France?",
                "text_2": "The capital of France is Paris.",
                "activation": activation
            }
            response = requests.post(server.url_for("score"), json=payload)
            if response.status_code != 200:
                return response

            outputs = response.json()
            return torch.tensor([x['score'] for x in outputs["data"]])

        if not model["is_cross_encoder"]:
            get_outputs(activation=None)

            # The activation parameter only works for the is_cross_encoder model
            response = get_outputs(activation=True)
            assert response.status_code == 400
            return

        default = get_outputs(activation=None)
        w_activation = get_outputs(activation=True)
        wo_activation = get_outputs(activation=False)

        assert torch.allclose(default, w_activation,
                              atol=1e-2), "Default should use activation."
        assert not torch.allclose(
            w_activation, wo_activation,
            atol=1e-2), "wo_activation should not use activation."
        assert torch.allclose(
            F.sigmoid(wo_activation), w_activation, atol=1e-2
        ), "w_activation should be close to activation(wo_activation)."
...@@ -15,9 +15,6 @@ from vllm.platforms import current_platform ...@@ -15,9 +15,6 @@ from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
if current_platform.is_rocm():
from flash_attn import vllm_flash_attn_with_kvcache
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability. # This will change depending on the compute capability.
# - 512 as a buffer # - 512 as a buffer
......
...@@ -162,7 +162,6 @@ def test_reshape_and_cache( ...@@ -162,7 +162,6 @@ def test_reshape_and_cache(
torch.testing.assert_close(key_cache, cloned_key_cache) torch.testing.assert_close(key_cache, cloned_key_cache)
torch.testing.assert_close(value_cache, cloned_value_cache) torch.testing.assert_close(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
......
...@@ -104,16 +104,18 @@ def test_flash_mla( ...@@ -104,16 +104,18 @@ def test_flash_mla(
descale_k = None descale_k = None
def flash_mla(): def flash_mla():
return flash_mla_with_kvcache(q, return flash_mla_with_kvcache(
blocked_k, q,
block_table, blocked_k,
cache_seqlens, block_table,
dv, cache_seqlens,
tile_scheduler_metadata, dv,
num_splits, tile_scheduler_metadata,
causal=causal, num_splits,
descale_q=descale_q, causal=causal,
descale_k=descale_k) descale_q=descale_q,
descale_k=descale_k,
)
def scaled_dot_product_attention(query, key, value, is_causal=False): def scaled_dot_product_attention(query, key, value, is_causal=False):
query = query.float() query = query.float()
......
...@@ -53,7 +53,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE): ...@@ -53,7 +53,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
dtype=torch.float32, dtype=torch.float32,
device="cuda", device="cuda",
) )
best_config = None
# Call the original implementation. # Call the original implementation.
decode_attention_fwd( decode_attention_fwd(
...@@ -67,7 +66,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE): ...@@ -67,7 +66,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
attn_logits, attn_logits,
num_kv_splits, num_kv_splits,
sm_scale, sm_scale,
best_config,
) )
# Page size can be larger than 1. # Page size can be larger than 1.
...@@ -88,7 +86,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE): ...@@ -88,7 +86,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
attn_logits, attn_logits,
num_kv_splits, num_kv_splits,
sm_scale, sm_scale,
best_config,
PAGE_SIZE, PAGE_SIZE,
) )
......
...@@ -146,84 +146,9 @@ def test_fused_rms_norm_quant( ...@@ -146,84 +146,9 @@ def test_fused_rms_norm_quant(
(out_quant_fused, x, weight, quant_scale_t, 1e-6), (out_quant_fused, x, weight, quant_scale_t, 1e-6),
) )
# @pytest.mark.parametrize("num_tokens", NUM_TOKENS) torch.testing.assert_close(
# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) out_quant.to(dtype=torch.float32),
# @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) out_quant_fused.to(dtype=torch.float32),
# @pytest.mark.parametrize("dtype", DTYPES) atol=1e-3,
# @pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0]) rtol=1e-3,
# @pytest.mark.parametrize("seed", SEEDS) )
# @pytest.mark.parametrize("device", CUDA_DEVICES) \ No newline at end of file
# @pytest.mark.parametrize("strided_input", [False, True])
# def test_fused_rms_norm_quant(
# num_tokens: int,
# hidden_size: int,
# add_residual: bool,
# dtype: torch.dtype,
# quant_scale: float,
# seed: int,
# device: str,
# strided_input: bool,
# ) -> None:
# current_platform.seed_everything(seed)
# torch.set_default_device(device)
# weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
# scale = 1 / (2 * hidden_size)
# last_dim = 2 * hidden_size if strided_input else hidden_size
# x_base = torch.randn(num_tokens, last_dim, dtype=dtype)
# x = x_base[..., :hidden_size]
# assert x.is_contiguous() != strided_input
# x *= scale
# if add_residual:
# residual = torch.randn_like(x) * scale
# residual_fused = residual.clone()
# else:
# residual = residual_fused = None
# out_norm = torch.empty_like(x)
# out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
# out_quant_fused = torch.empty_like(out_quant)
# quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)
# if add_residual:
# torch.ops._C.fused_add_rms_norm_static_fp8_quant(
# out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
# )
# # Unfused kernel is in-place so it goes second
# # Also use a separate clone of x to avoid modifying the input
# x_unfused_base = x_base.clone()
# x_unfused = x_unfused_base[..., :hidden_size]
# assert x_unfused.is_contiguous() != strided_input
# torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
# torch.ops._C.static_scaled_fp8_quant(
# out_quant, x_unfused.contiguous(), quant_scale_t
# )
# torch.cuda.synchronize()
# torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
# opcheck(
# torch.ops._C.fused_add_rms_norm_static_fp8_quant,
# (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
# )
# else:
# torch.ops._C.rms_norm_static_fp8_quant(
# out_quant_fused, x, weight, quant_scale_t, 1e-6
# )
# torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
# torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
# opcheck(
# torch.ops._C.rms_norm_static_fp8_quant,
# (out_quant_fused, x, weight, quant_scale_t, 1e-6),
# )
# torch.testing.assert_close(
# out_quant.to(dtype=torch.float32),
# out_quant_fused.to(dtype=torch.float32),
# atol=1e-3,
# rtol=1e-3,
# )
...@@ -22,9 +22,6 @@ from vllm.distributed import ( ...@@ -22,9 +22,6 @@ from vllm.distributed import (
) )
from vllm.forward_context import set_forward_context from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
FusedMoEParallelConfig, FusedMoEParallelConfig,
...@@ -43,6 +40,7 @@ from .mk_objects import ( ...@@ -43,6 +40,7 @@ from .mk_objects import (
TestMoEQuantConfig, TestMoEQuantConfig,
expert_info, expert_info,
make_fused_experts, make_fused_experts,
make_prepare_finalize,
prepare_finalize_info, prepare_finalize_info,
) )
from .parallel_utils import ProcessGroupInfo from .parallel_utils import ProcessGroupInfo
...@@ -605,12 +603,10 @@ def make_modular_kernel( ...@@ -605,12 +603,10 @@ def make_modular_kernel(
routing_method=RoutingMethodType.DeepSeekV3, routing_method=RoutingMethodType.DeepSeekV3,
) )
prepare_finalize = maybe_make_prepare_finalize( # make modular kernel
moe=moe, prepare_finalize = make_prepare_finalize(
quant_config=quant_config, config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
allow_new_interface=True,
) )
assert prepare_finalize is not None
fused_experts = make_fused_experts( fused_experts = make_fused_experts(
config.fused_experts_type, config.fused_experts_type,
...@@ -692,4 +688,4 @@ def run_modular_kernel( ...@@ -692,4 +688,4 @@ def run_modular_kernel(
): ):
out = mk.forward(**mk_kwargs) out = mk.forward(**mk_kwargs)
return out return out
\ No newline at end of file
...@@ -7,6 +7,9 @@ import torch ...@@ -7,6 +7,9 @@ import torch
# Fused experts and PrepareFinalize imports # Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import TritonExperts from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts, BatchedDeepGemmExperts,
) )
...@@ -252,12 +255,13 @@ if has_pplx(): ...@@ -252,12 +255,13 @@ if has_pplx():
) )
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100): if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts, FlashInferExperts,
) )
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
create_flashinfer_prepare_finalize,
)
register_prepare_and_finalize( register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize, FlashInferCutlassMoEPrepareAndFinalize,
...@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): ...@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
] ]
def make_prepare_finalize(
    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
    backend: str | None,
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
    """Build the prepare/finalize object used by a modular-kernel test.

    Selection order:

    1. A backend that is neither ``None`` nor ``"naive"`` is delegated to
       ``maybe_make_prepare_finalize``; the result must not be ``None``.
    2. Otherwise, the FlashInfer CUTLASS prepare/finalize type is built via
       ``create_flashinfer_prepare_finalize``, with ``use_dp`` enabled when
       the MoE parallel config reports ``dp_size > 1``.
    3. Everything else falls back to ``MoEPrepareAndFinalizeNoEP``.
    """
    has_real_backend = backend is not None and backend != "naive"
    if has_real_backend:
        made = maybe_make_prepare_finalize(moe, quant_config)
        assert made is not None
        return made

    if prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
        dp_enabled = moe.moe_parallel_config.dp_size > 1
        return create_flashinfer_prepare_finalize(use_dp=dp_enabled)

    return MoEPrepareAndFinalizeNoEP()


def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
s = rank * num_local_experts s = rank * num_local_experts
e = s + num_local_experts e = s + num_local_experts
...@@ -473,4 +495,4 @@ def make_fused_experts( ...@@ -473,4 +495,4 @@ def make_fused_experts(
torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80) torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80)
return experts return experts
\ No newline at end of file
...@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ...@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
) )
kernel = mk.FusedMoEModularKernel( kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts( FlashInferExperts(
moe_config=moe_config, moe_config=moe_config,
quant_config=quant_config, quant_config=quant_config,
...@@ -315,4 +320,4 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ...@@ -315,4 +320,4 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
) )
torch.testing.assert_close( torch.testing.assert_close(
output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2 output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
) )
\ No newline at end of file
...@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph( ...@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
) )
flashinfer_experts = FusedMoEModularKernel( flashinfer_experts = FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(), MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(moe_config=moe_config, quant_config=quant_config), FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
) )
...@@ -169,4 +174,4 @@ def test_flashinfer_fp4_moe_no_graph( ...@@ -169,4 +174,4 @@ def test_flashinfer_fp4_moe_no_graph(
if __name__ == "__main__": if __name__ == "__main__":
test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half) test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half)
\ No newline at end of file
...@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( ...@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
) )
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
device = "cuda" device = "cuda"
def reverse_awq_order(t: torch.Tensor): def reverse_awq_order(t: torch.Tensor):
...@@ -168,4 +168,4 @@ def test_gemm(N, K, M, splitK, group_size): ...@@ -168,4 +168,4 @@ def test_gemm(N, K, M, splitK, group_size):
torch.testing.assert_close( torch.testing.assert_close(
output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1 output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1
) )
\ No newline at end of file
...@@ -8,6 +8,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant ...@@ -8,6 +8,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant from vllm._custom_ops import scaled_int8_quant
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
from vllm.platforms import current_platform
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
...@@ -63,7 +64,6 @@ def test_dynamic_scaled_int8_quant( ...@@ -63,7 +64,6 @@ def test_dynamic_scaled_int8_quant(
opcheck_int8_quant_dynamic(ops_out, x) opcheck_int8_quant_dynamic(ops_out, x)
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.") reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
...@@ -169,7 +169,6 @@ def test_static_scaled_int8_azp_quant( ...@@ -169,7 +169,6 @@ def test_static_scaled_int8_azp_quant(
torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
opcheck_int8_quant_static(out2, x, scale_arg, azp_arg) opcheck_int8_quant_static(out2, x, scale_arg, azp_arg)
@pytest.mark.parametrize("is_max", [True, False]) @pytest.mark.parametrize("is_max", [True, False])
@torch.inference_mode() @torch.inference_mode()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_flash_attention kernel
Run `pytest tests/kernels/test_triton_flash_attention.py`.
"""
import pytest
import torch
from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
MetaData,
compute_alibi_tensor,
scale_fp8,
triton_attention_rocm)
from vllm.platforms import current_platform
class ReferenceAttention:
    """Plain-PyTorch attention used as the reference for the Triton kernel.

    The constructor only records the problem sizes, the ALiBi switch, the
    working dtype and the metadata object; every forward variant reads its
    scale / mask / bias settings from ``input_metadata``.
    """

    def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype,
                 input_metadata):
        self.Z = Z
        self.HQ = HQ
        self.HK = HK
        self.N_CTX_Q = N_CTX_Q
        self.N_CTX_K = N_CTX_K
        self.D_HEAD = D_HEAD
        self.use_alibi = use_alibi
        self.dtype = dtype
        self.input_metadata = input_metadata

    def fwd(self, q, k, v):
        """Dense reference attention for ``bhsd``-ordered q, k and v.

        Scores are computed in float32, optionally masked causally, offset
        by the metadata bias and the ALiBi tensor, soft-maxed, cast to
        ``self.dtype`` and applied to ``v``. When the metadata layout is
        ``'bshd'`` the result is transposed back to that layout.
        """
        meta = self.input_metadata
        scores = torch.einsum('bhqd,bhkd->bhqk', q, k).float() * meta.sm_scale
        if meta.causal:
            # Build the mask on the same device as the scores instead of a
            # hard-coded "cuda", so the indexing below never mixes devices.
            mask = torch.tril(torch.ones(self.N_CTX_Q,
                                         self.N_CTX_K,
                                         device=scores.device),
                              diagonal=self.N_CTX_K - self.N_CTX_Q)
            scores[:, :, mask == 0] = float("-inf")
        if meta.bias is not None:
            scores += meta.bias
        if self.use_alibi:
            scores += compute_alibi_tensor(meta.alibi_slopes, self.N_CTX_Q,
                                           self.N_CTX_K)

        p = torch.softmax(scores, dim=-1)
        if meta.causal:
            # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs
            # going into softmax. This creates a row of NaNs as
            # -inf - -inf == NaN. So we fix this by converting the NaNs to
            # 0s, which is what they should be out of the softmax.
            p[torch.isnan(p)] = 0
        ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v)
        # compare
        if meta.layout == 'bshd':
            ref_out = ref_out.transpose(1, 2).clone()
        return ref_out

    # The FP8 reference paths below are kept commented out, matching the
    # commented-out FP8 tests further down in this file.
    #
    # def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
    #     q = (q_quantized.to(torch.float16) *
    #          self.input_metadata.q_descale).to(self.dtype)
    #     k = (k_quantized.to(torch.float16) *
    #          self.input_metadata.k_descale).to(self.dtype)
    #     v = (v_quantized.to(torch.float16) *
    #          self.input_metadata.v_descale).to(self.dtype)
    #     result = self.fwd(q, k, v)
    #     if self.input_metadata.o_scale is not None:
    #         result, _ = scale_fp8(result, self.input_metadata.o_scale)
    #     return result
    #
    # def fwd_fp8_kv(self, q, k_quantized, v_quantized):
    #     k_descale, v_descale = (self.input_metadata.k_descale,
    #                             self.input_metadata.v_descale)
    #     k_dequantized = (k_quantized.to(torch.float32) *
    #                      k_descale.to(torch.float32)).to(self.dtype)
    #     v_dequantized = (v_quantized.to(torch.float32) *
    #                      v_descale.to(torch.float32)).to(self.dtype)
    #     return self.fwd(q, k_dequantized, v_dequantized)

    def varlen_fwd(self, q, k, v, is_mqa=False):
        """Reference attention over packed variable-length sequences.

        ``q``, ``k`` and ``v`` are indexed as (token, head, dim); sequence
        boundaries come from ``cu_seqlens_q`` / ``cu_seqlens_k`` on the
        metadata. With ``is_mqa`` each K/V head is replicated
        ``HQ // HK`` times so the head count lines up with ``q``.
        No causal mask, bias or ALiBi term is applied here.
        """
        meta = self.input_metadata
        ref_out = torch.empty_like(q)
        if is_mqa:
            # Make KV look like HQ/HK "groups" of HK. Later, we will reshape
            # so the size aligns with Q.
            group = self.HQ // self.HK
            k_ref = k.view(k.shape[0], k.shape[1], 1,
                           k.shape[2]).expand(-1, -1, group, -1)
            v_ref = v.view(v.shape[0], v.shape[1], 1,
                           v.shape[2]).expand(-1, -1, group, -1)
        else:
            k_ref = k
            v_ref = v

        for i in range(0, meta.num_contexts):
            start_q, start_k = meta.cu_seqlens_q[i], meta.cu_seqlens_k[i]
            end_q, end_k = meta.cu_seqlens_q[i + 1], meta.cu_seqlens_k[i + 1]
            k_curr = k_ref[start_k:end_k]
            v_curr = v_ref[start_k:end_k]
            if is_mqa:
                k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3])
                v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3])
            scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q],
                                  k_curr).float()
            # Cast to the configured dtype (as fwd() does) rather than a
            # hard-coded half, so non-fp16 inputs do not hit a dtype
            # mismatch in the einsum below.
            p = torch.softmax(scores * meta.sm_scale,
                              dim=-1).to(self.dtype)
            ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr)
        return ref_out


def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False):
    """Run q, k and v through ``scale_fp8`` and gather the descale factors.

    ``k`` and ``v`` are always passed through ``scale_fp8``. ``q`` is only
    scaled when ``fp8_kv`` is false; otherwise it is returned untouched and
    ``q_descale`` is ``None``. ``p_scale`` is always ``None``; ``o_scale`` is
    a random one-element CUDA tensor when ``use_o_scale`` is set, else
    ``None``.

    Returns:
        ``(q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale)``.
    """
    if fp8_kv:
        q_descale = None
    else:
        q, q_descale = scale_fp8(q)
    k, k_descale = scale_fp8(k)
    v, v_descale = scale_fp8(v)

    # In real world use case, the p scale would be a parameter trained by the
    # model.
    p_scale = None

    o_scale = None
    if use_o_scale:
        o_scale = torch.rand(1, device="cuda", requires_grad=False)

    return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale


def input_helper(
    Z,
    HQ,
    HK,
    N_CTX_Q,
    N_CTX_K,
    D_HEAD,
    dtype,
    layout=None,
    use_alibi=None,
    causal=None,
    is_fp8=False,
    fp8_kv=False,
    use_o_scale=False,
    use_bias=False,
):
    """Create seeded random q/k/v CUDA tensors plus a matching ``MetaData``.

    Tensor shapes follow ``layout`` (``'bhsd'`` or ``'bshd'``). ALiBi slopes
    and an additive bias are generated on request, and with ``is_fp8`` the
    tensors are passed through ``quantize_input`` so that the descale
    factors can be recorded on the metadata.

    NOTE(review): ``causal`` is accepted but is not forwarded to
    ``MetaData`` in this helper - confirm whether callers expect it to be.

    Returns:
        ``(q, k, v, input_metadata)``.
    """
    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."

    current_platform.seed_everything(0)

    def _randn(shape):
        # All random inputs share dtype/device and never track gradients.
        return torch.randn(shape,
                           dtype=dtype,
                           device="cuda",
                           requires_grad=False)

    # Initialize q, k, v
    if layout == 'bhsd':
        q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD)
        k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD)
    elif layout == 'bshd':
        q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD)
        k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD)

    alibi_slopes = None
    if use_alibi:
        # for n heads the set of slopes is the geometric sequence that starts
        # 2^(-8/n)
        slopes = [2**(-8 / HQ * i) for i in range(1, HQ + 1)]
        alibi_slopes = torch.tensor(slopes,
                                    dtype=torch.float32,
                                    device="cuda").repeat(Z, 1)

    # The bias is drawn before q/k/v so the seeded random stream is consumed
    # in a fixed order.
    bias = _randn((1, HQ, N_CTX_Q, N_CTX_K)) if use_bias else None

    q = _randn(q_tensor_shape)
    k = _randn(k_tensor_shape)
    v = _randn(k_tensor_shape)

    q_descale = k_descale = v_descale = p_scale = o_scale = None
    if is_fp8:
        (q, k, v, q_descale, k_descale, v_descale, p_scale,
         o_scale) = quantize_input(q,
                                   k,
                                   v,
                                   use_o_scale=use_o_scale,
                                   fp8_kv=fp8_kv)

    input_metadata = MetaData(sm_scale=D_HEAD**-0.5,
                              max_seqlens_q=N_CTX_Q,
                              max_seqlens_k=N_CTX_K,
                              layout=layout,
                              alibi_slopes=alibi_slopes,
                              alibi_batch=Z,
                              alibi_nheads=HQ,
                              q_descale=q_descale,
                              k_descale=k_descale,
                              v_descale=v_descale,
                              p_scale=p_scale,
                              o_scale=o_scale,
                              bias=bias,
                              seqlen_q=N_CTX_Q,
                              seqlen_k=N_CTX_K)
    return q, k, v, input_metadata


def varlen_input_helper(Z,
                        HQ,
                        HK,
                        N_CTX_Q,
                        N_CTX_K,
                        D_HEAD,
                        dtype,
                        equal_seqlens=False):
    """Create packed variable-length q/k/v CUDA tensors and their metadata.

    ``Z`` sequence lengths are drawn per tensor (or fixed to
    ``N_CTX // Z`` when ``equal_seqlens`` is set), turned into cumulative
    offsets, and registered on a ``MetaData`` via ``set_varlen_params``.

    Returns:
        ``(q, k, v, input_metadata)`` where q/k/v are indexed as
        (total_tokens, heads, D_HEAD) and require grad.
    """
    current_platform.seed_everything(0)

    # Random sequence lengths. Using N_CTX as kind of max of sum of individual
    # seqs
    if equal_seqlens:
        seqlens_q = torch.full((Z, ), N_CTX_Q // Z)
        seqlens_k = torch.full((Z, ), N_CTX_K // Z)
    else:
        max_seqlens_q = N_CTX_Q // Z
        max_seqlens_k = N_CTX_K // Z
        seqlens_q = torch.randint(1,
                                  max_seqlens_q + 1, (Z, ),
                                  dtype=torch.int32)
        seqlens_k = torch.randint(1,
                                  max_seqlens_k + 1, (Z, ),
                                  dtype=torch.int32)

    def _cumulative(seqlens):
        # Prefix the running sum with 0, then move the offsets to the GPU.
        leading_zero = torch.tensor([0], dtype=torch.int32)
        running = seqlens.cumsum(dim=0, dtype=torch.int32)
        return torch.cat([leading_zero, running]).to(device="cuda")

    def _packed(total_tokens, num_heads):
        sample = torch.randn((total_tokens, num_heads, D_HEAD),
                             dtype=dtype,
                             device="cuda")
        return sample.normal_(mean=0., std=0.5).requires_grad_()

    # Calculate cumulative sequence lengths
    cu_seqlens_q = _cumulative(seqlens_q)
    cu_seqlens_k = _cumulative(seqlens_k)

    # Initialize q, k, v with variable lengths
    total_q = cu_seqlens_q[-1].item()
    total_k = cu_seqlens_k[-1].item()
    q = _packed(total_q, HQ)
    k = _packed(total_k, HK)
    v = _packed(total_k, HK)

    input_metadata = MetaData(sm_scale=D_HEAD**-0.5)
    input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
    return q, k, v, input_metadata


@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [
    (1, 48, 12, 1, 1, 64),
    (4, 4, 4, 128, 128, 65),
    (16, 48, 48, 1, 1, 128),
    (64, 48, 24, 3, 3, 128),
    (4, 4, 4, 113, 123, 1),
])
@pytest.mark.parametrize('causal', [True, False])
@pytest.mark.parametrize('use_alibi', [True, False])
@pytest.mark.parametrize('layout', ['bshd'])
def test_op_fwd(Z,
                HQ,
                HK,
                N_CTX_Q,
                N_CTX_K,
                D_HEAD,
                causal,
                use_alibi,
                layout,
                dtype=torch.float16):
    """Compare the Triton forward pass with ``ReferenceAttention.fwd``."""
    current_platform.seed_everything(0)
    q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
                                           dtype, layout, use_alibi, causal)

    # triton implementation
    out_buffer = torch.empty_like(q)
    tri_out, _ = triton_attention_rocm(q, k, v, out_buffer, input_metadata)

    # Transpose here if layout is bshd so we have same reference code for all
    # layouts
    if layout == 'bshd':
        q, k, v = (t.transpose(1, 2).clone() for t in (q, k, v))

    # Replicate K and V if using MQA/GQA
    if HQ != HK:
        group_size = HQ // HK

        def _replicate_heads(t):
            # (b, h, s, d) -> (b, h * group_size, s, d), each head repeated
            # group_size times back to back.
            batch, heads, seq, dim = t.shape
            grouped = t.view(batch, heads, -1, seq, dim)
            grouped = grouped.expand(-1, -1, group_size, -1, -1)
            return grouped.reshape(batch, -1, seq, dim)

        k = _replicate_heads(k)
        v = _replicate_heads(v)

    reference = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
                                   use_alibi, dtype, input_metadata)
    ref_out = reference.fwd(q, k, v)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)


# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
# (4, 48, 1, 1, 64),
# (4, 48, 1, 1, 128),
# (4, 48, 3, 3, 128),
# (4, 4, 128, 128, 65),
# ])
# @pytest.mark.parametrize('causal', [True, False])
# @pytest.mark.parametrize('layout', ['bhsd'])
# @pytest.mark.parametrize('use_o_scale', [True, False])
# @pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0),
# reason="Triton FP8 requires CUDA 9.0 or higher")
# def test_op_fwd_fp8(Z,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# causal,
# layout,
# use_o_scale,
# dtype=torch.float32):
# current_platform.seed_everything(0)
# # Disable grad to save memory it won't run into OOM on CI machine.
# # q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
# # dtype, layout)
# q_quantized, k_quantized, v_quantized, input_metadata = input_helper(
# Z,
# H,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# dtype,
# causal=causal,
# layout=layout,
# is_fp8=True,
# use_o_scale=use_o_scale)
# o = torch.empty_like(q_quantized) if use_o_scale else None
# tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized,
# o, input_metadata)
# ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
# dtype, input_metadata)
# ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized)
# # compare
# torch.testing.assert_close(ref_out.to(torch.float32),
# tri_out.to(torch.float32),
# atol=7e-2,
# rtol=2e-1)
# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
# (4, 48, 1, 1, 64),
# (4, 48, 1, 1, 128),
# (4, 48, 3, 3, 128),
# (4, 4, 128, 128, 65),
# (4, 4, 113, 123, 1),
# ])
# @pytest.mark.parametrize('causal', [True, False])
# @pytest.mark.parametrize('layout', ['bhsd'])
# def test_op_fwd_fp8_kv(Z,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# causal,
# layout,
# dtype=torch.float32):
# current_platform.seed_everything(0)
# q, k_quantized, v_quantized, input_metadata = input_helper(Z,
# H,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# dtype,
# causal=causal,
# layout=layout,
# is_fp8=True,
# fp8_kv=True)
# o = torch.empty_like(q)
# tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o,
# input_metadata)
# ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
# dtype, input_metadata)
# ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized)
# torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1)
@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
    (4, 48, 1, 1, 64),
    (4, 48, 1, 1, 128),
    (4, 48, 3, 3, 128),
    (4, 4, 128, 128, 65),
])
@pytest.mark.parametrize('causal', [True, False])
@pytest.mark.parametrize('use_bias', [True])
@pytest.mark.parametrize('dtype', [torch.bfloat16])
def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
    """Check the Triton forward pass with an additive bias (``bhsd``)."""
    current_platform.seed_everything(0)
    helper_kwargs = dict(layout='bhsd', causal=causal, use_bias=use_bias)
    q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
                                           dtype, **helper_kwargs)

    # triton implementation
    out_buffer = torch.empty_like(q)
    tri_out, _ = triton_attention_rocm(q, k, v, out_buffer, input_metadata)

    reference = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
                                   dtype, input_metadata)
    ref_out = reference.fwd(q, k, v)

    # compare
    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)


# NOTE: Uses thd layout, so also tests thd.
@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(1, 48, 256, 64),
                                                 (4, 48, 512, 64),
                                                 (16, 48, 512, 64),
                                                 (64, 48, 128, 128)])
@pytest.mark.parametrize('causal', [True, False])
def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16):
    """Check the Triton kernel on packed variable-length inputs.

    ``causal`` is parametrized but is not consumed by the test body.
    """
    q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX,
                                                  D_HEAD, dtype)

    # The kernel writes its result into the preallocated output tensor.
    tri_out = torch.empty_like(q)
    triton_attention_rocm(q, k, v, tri_out, input_metadata)

    reference = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype,
                                   input_metadata)
    ref_out = reference.varlen_fwd(q, k, v, is_mqa=False)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)


# NOTE: Uses thd layout, so also tests thd.
@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [(2, 48, 24, 128, 64),
                                                      (4, 48, 12, 256, 64),
                                                      (4, 48, 4, 512, 64),
                                                      (4, 64, 16, 128, 128)])
@pytest.mark.parametrize('causal', [False])
def test_op_varlen_mqa_fwd(Z,
                           HQ,
                           HK,
                           N_CTX,
                           D_HEAD,
                           causal,
                           dtype=torch.float16):
    """Check the Triton kernel on packed inputs with fewer K/V heads than Q.

    ``causal`` is parametrized but is not consumed by the test body.
    """
    q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX,
                                                  D_HEAD, dtype)

    # The kernel writes its result into the preallocated output tensor.
    tri_out = torch.empty_like(q)
    triton_attention_rocm(q, k, v, tri_out, input_metadata)

    reference = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False,
                                   dtype, input_metadata)
    ref_out = reference.varlen_fwd(q, k, v, is_mqa=True)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)


...@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest ...@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker from vllm.v1.worker.gpu_worker import Worker
from ..utils import models_path_prefix from ..utils import models_path_prefix
MODEL_PATH = "Qwen/Qwen3-0.6B" MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-0.6B")
NUM_LORAS = 16 NUM_LORAS = 16
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random import random
import os
import numpy as np import numpy as np
import pytest import pytest
import torch import torch
...@@ -9,23 +10,10 @@ from transformers import AutoModelForTokenClassification ...@@ -9,23 +10,10 @@ from transformers import AutoModelForTokenClassification
from tests.models.utils import softmax from tests.models.utils import softmax
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ....utils import models_path_prefix
@pytest.fixture(autouse=True) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "boltuix/NeuroBERT-NER")])
def seed_everything():
"""Seed all random number generators for reproducibility."""
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
yield
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
# The float32 is required for this tiny model to pass the test. # The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode @torch.inference_mode
...@@ -68,7 +56,6 @@ def test_bert_models( ...@@ -68,7 +56,6 @@ def test_bert_models(
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"]) @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode @torch.inference_mode
def test_modernbert_models( def test_modernbert_models(
hf_runner, hf_runner,
...@@ -77,14 +64,6 @@ def test_modernbert_models( ...@@ -77,14 +64,6 @@ def test_modernbert_models(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print(
f"\n[NOTE] Testing {model} (randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment