Commit 78c1f9e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1(tests)

parent 86a65417
......@@ -74,7 +74,7 @@ class TestSetting:
),
pytest.param(
TestSetting(
model="BAAI/bge-base-en-v1.5",
model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
model_args=["--runner", "pooling"],
pp_size=1,
tp_size=1,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import logging
from contextlib import nullcontext
from unittest.mock import MagicMock, patch
......@@ -19,7 +18,6 @@ from vllm.config import (
)
from vllm.config.compilation import CompilationMode, PassConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.logger import _print_warning_once
from vllm.platforms import current_platform
from vllm.utils.torch_utils import (
_is_torch_equal_or_newer,
......
......@@ -42,7 +42,6 @@ class MockModelConfig:
tokenizer_revision = None
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
hf_text_config = MockHFConfig()
logits_processor_pattern = None
logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
from ...models.registry import HF_EXAMPLE_MODELS
from ...utils import VLLM_PATH, models_path_prefix
# ChatML template from the repository's examples directory; used by every case below.
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Every case uses the same checkpoint, resolved under the local model prefix.
_OPT_125M_PATH = os.path.join(models_path_prefix, "facebook/opt-125m")

# Define models, templates, and their corresponding expected outputs
# Tuple layout:
#   (model, template, add_generation_prompt, continue_final_message, expected_output)
MODEL_TEMPLATE_GENERATION_OUTPUT = [
    # Generation prompt requested: the last user turn is closed and an
    # assistant header is opened.
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        True,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
""",
    ),
    # Neither flag set: the last user turn is left open.
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        False,
        False,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of""",
    ),
    # Continue the final message: the trailing assistant turn is left open.
    (
        _OPT_125M_PATH,
        chatml_jinja_path,
        False,
        True,
        """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of""",
    ),
]

# Base conversation shared by all cases.
TEST_MESSAGES = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there!"},
    {"role": "user", "content": "What is the capital of"},
]

# Extra assistant turn appended when continue_final_message is exercised.
ASSISTANT_MESSAGE_TO_CONTINUE = {"role": "assistant", "content": "The capital of"}
def test_load_chat_template():
    """Load the ChatML template file and compare it with its known contents."""
    # Hard coded value for template_chatml.jinja
    expected_template = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""  # noqa: E501

    loaded_template = load_chat_template(chat_template=chatml_jinja_path)

    assert loaded_template is not None
    assert loaded_template == expected_template
def test_no_load_chat_template_filelike():
    """A path-looking template string must be rejected with a ValueError."""
    path_like_template = "../../examples/does_not_exist"

    with pytest.raises(ValueError, match="looks like a file path"):
        load_chat_template(chat_template=path_like_template)
def test_no_load_chat_template_literallike():
    """A literal Jinja template string must come back unchanged."""
    literal_template = "{{ messages }}"

    assert load_chat_template(chat_template=literal_template) == literal_template
@pytest.mark.parametrize(
    "model,template,add_generation_prompt,continue_final_message,expected_output",
    MODEL_TEMPLATE_GENERATION_OUTPUT,
)
def test_get_gen_prompt(
    model, template, add_generation_prompt, continue_final_message, expected_output
):
    """Render the test conversation through the chat template and compare.

    Each parametrized case supplies the model, the template file, the two
    rendering flags and the exact prompt text expected from
    ``apply_hf_chat_template``.
    """
    hf_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    hf_info.check_available_online(on_fail="skip")

    # Model configuration is derived from the registry entry for this model.
    config_kwargs = {
        "tokenizer": hf_info.tokenizer or model,
        "tokenizer_mode": hf_info.tokenizer_mode,
        "trust_remote_code": hf_info.trust_remote_code,
        "revision": hf_info.revision,
        "hf_overrides": hf_info.hf_overrides,
        "skip_tokenizer_init": hf_info.require_embed_inputs,
        "enable_prompt_embeds": hf_info.require_embed_inputs,
        "enable_mm_embeds": hf_info.require_embed_inputs,
        "enforce_eager": hf_info.enforce_eager,
        "dtype": hf_info.dtype,
    }
    model_config = ModelConfig(model, **config_kwargs)

    # Initialize the tokenizer
    tokenizer = get_tokenizer(
        tokenizer_name=model_config.tokenizer,
        trust_remote_code=model_config.trust_remote_code,
    )
    chat_template_text = load_chat_template(chat_template=template)

    # When continuing the final message, the conversation must end with the
    # partial assistant turn.
    if continue_final_message:
        conversation = TEST_MESSAGES + [ASSISTANT_MESSAGE_TO_CONTINUE]
    else:
        conversation = TEST_MESSAGES

    # Create a mock request object using keyword arguments
    chat_request = ChatCompletionRequest(
        model=model,
        messages=conversation,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
    )

    # Call the function and get the result
    rendered_prompt = apply_hf_chat_template(
        tokenizer=tokenizer,
        conversation=chat_request.messages,
        chat_template=chat_request.chat_template or chat_template_text,
        model_config=model_config,
        tools=None,
        add_generation_prompt=chat_request.add_generation_prompt,
        continue_final_message=chat_request.continue_final_message,
    )

    # Test assertion
    assert rendered_prompt == expected_output, (
        f"The generated prompt does not match the expected output for "
        f"model {model} and template {template}"
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np
import openai
import pytest
import os
import pytest_asyncio
import requests
import torch
import torch.nn.functional as F
from tests.models.language.pooling.embed_utils import (
run_embedding_correctness_test)
from tests.models.utils import check_embeddings_close
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer
from utils import RemoteOpenAIServer, models_path_prefix
# Embedding model under test, resolved under the local model path prefix.
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
# Minimal chat template: emits "<role>: <content>" plus a newline escape for
# each message. Passed to the server and reused to render the reference prompt.
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
# dtype handed to both the server (--dtype) and the HF reference runner.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Start one pooling-runner server for the module and yield it."""
    cli_args = ["--runner", "pooling"]
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", DTYPE]
    cli_args += ["--enforce-eager"]
    cli_args += ["--max-model-len", "512"]
    cli_args += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client obtained from the running server."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.fixture(scope="module")
def hf_model(hf_runner):
    """Yield the sentence-transformer reference model used for comparisons."""
    reference_runner = hf_runner(MODEL_NAME, dtype=DTYPE, is_sentence_transformer=True)
    with reference_runner as hf_model:
        yield hf_model
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Embed a single prompt, first as text and then as raw token IDs."""

    async def embed(payload):
        # Round-trip through JSON so the response is validated against the
        # server-side protocol model.
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    # test single embedding
    prompts = ["The chef prepared a delicious meal."]
    text_result = await embed(prompts)

    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == 384
    assert text_result.usage.completion_tokens == 0
    assert text_result.usage.prompt_tokens == 11
    assert text_result.usage.total_tokens == 11

    # The text embedding must also agree with the HF reference model.
    served_vectors = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, prompts, served_vectors)

    # test using token IDs
    token_ids = [1, 1, 1, 1, 1]
    token_result = await embed(token_ids)

    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == 384
    assert token_result.usage.completion_tokens == 0
    assert token_result.usage.prompt_tokens == 5
    assert token_result.usage.total_tokens == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Embed a batch of prompts, first as strings and then as token-ID lists."""

    async def embed(payload):
        # Round-trip through JSON so the response is validated against the
        # server-side protocol model.
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            encoding_format="float",
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    # test list[str]
    prompts = [
        "The cat sat on the mat.",
        "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky.",
    ]
    text_result = await embed(prompts)

    assert text_result.id is not None
    assert len(text_result.data) == 3
    assert len(text_result.data[0].embedding) == 384
    assert text_result.usage.completion_tokens == 0
    assert text_result.usage.prompt_tokens == 33
    assert text_result.usage.total_tokens == 33

    # The text embeddings must also agree with the HF reference model.
    served_vectors = [item.embedding for item in text_result.data]
    run_embedding_correctness_test(hf_model, prompts, served_vectors)

    # test list[list[int]]
    token_batches = [
        [4, 5, 7, 9, 20],
        [15, 29, 499],
        [24, 24, 24, 24, 24],
        [25, 32, 64, 77],
    ]
    token_result = await embed(token_batches)

    assert token_result.id is not None
    assert len(token_result.data) == 4
    assert len(token_result.data[0].embedding) == 384
    assert token_result.usage.completion_tokens == 0
    assert token_result.usage.prompt_tokens == 17
    assert token_result.usage.total_tokens == 17
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(
    server: RemoteOpenAIServer, client: openai.AsyncOpenAI, model_name: str
):
    """Chat-style embedding input must match embedding the rendered prompt.

    The conversation is sent as ``messages`` to the embeddings route, then
    rendered locally with the same chat template and sent as plain ``input``.
    Apart from ``id`` and ``created`` the two responses must be equal.
    """
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {"role": "user", "content": "Stars twinkle brightly in the night sky."},
    ]

    # Chat-style request goes over plain HTTP with a "messages" payload.
    chat_payload = {
        "model": model_name,
        "messages": conversation,
        "encoding_format": "float",
    }
    raw_chat = requests.post(server.url_for("v1/embeddings"), json=chat_payload)
    raw_chat.raise_for_status()
    chat_embeddings = EmbeddingResponse.model_validate(raw_chat.json())

    # Render the same conversation locally with the server's template.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    rendered_prompt = tokenizer.apply_chat_template(
        conversation,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )

    raw_completion = await client.embeddings.create(
        model=model_name,
        input=rendered_prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = EmbeddingResponse.model_validate(
        raw_completion.model_dump(mode="json")
    )

    assert chat_embeddings.id is not None
    assert completion_embeddings.id is not None
    assert chat_embeddings.created <= completion_embeddings.created

    chat_dump = chat_embeddings.model_dump(exclude={"id", "created"})
    completion_dump = completion_embeddings.model_dump(exclude={"id", "created"})
    assert chat_dump == completion_dump
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(
    hf_model, client: openai.AsyncOpenAI, model_name: str
):
    """Check float, base64 and default encodings against the HF reference."""
    input_texts = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # Explicit float encoding.
    responses_float = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="float"
    )
    float_data = [item.embedding for item in responses_float.data]
    run_embedding_correctness_test(hf_model, input_texts, float_data)

    # Explicit base64 encoding, decoded here as float32.
    responses_base64 = await client.embeddings.create(
        input=input_texts, model=model_name, encoding_format="base64"
    )
    base64_data = [
        np.frombuffer(base64.b64decode(item.embedding), dtype="float32").tolist()
        for item in responses_base64.data
    ]
    run_embedding_correctness_test(hf_model, input_texts, base64_data)

    # Default response is float32 decoded from base64 by OpenAI Client
    responses_default = await client.embeddings.create(
        input=input_texts, model=model_name
    )
    default_data = [item.embedding for item in responses_default.data]
    run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(
    client: openai.AsyncOpenAI, model_name: str
):
    """``truncate_prompt_tokens`` must cap the reported prompt length at 10."""

    async def embed_truncated(payload):
        # Both requests ask the server to truncate the prompt to 10 tokens.
        raw = await client.embeddings.create(
            model=model_name,
            input=payload,
            extra_body={"truncate_prompt_tokens": 10},
        )
        return EmbeddingResponse.model_validate(raw.model_dump(mode="json"))

    # test single embedding
    prompts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    text_result = await embed_truncated(prompts)

    assert text_result.id is not None
    assert len(text_result.data) == 1
    assert len(text_result.data[0].embedding) == 384
    assert text_result.usage.completion_tokens == 0
    assert text_result.usage.prompt_tokens == 10
    assert text_result.usage.total_tokens == 10

    # Same check with a pre-tokenized prompt longer than the truncation size.
    token_ids = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2,
    ]
    token_result = await embed_truncated(token_ids)

    assert token_result.id is not None
    assert len(token_result.data) == 1
    assert len(token_result.data[0].embedding) == 384
    assert token_result.usage.completion_tokens == 0
    assert token_result.usage.prompt_tokens == 10
    assert token_result.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(
    client: openai.AsyncOpenAI, model_name: str
):
    """A ``truncate_prompt_tokens`` of 8193 must be rejected as a bad request.

    The server fixture is started with ``--max-model-len 512``, so the
    requested truncation size is larger than the configured model length.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # The request itself raises, so nothing after it inside the ``with`` body
    # would ever run. Capture the exception and inspect it after the block.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193},
        )

    # Only the offending parameter name is checked; the full error wording is
    # not pinned here. NOTE(review): tighten once the server message is confirmed.
    assert "truncate_prompt_tokens" in str(exc_info.value)
@pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI):
    """The ``invocations`` route must agree with the embeddings client call."""
    request_args = {
        "model": MODEL_NAME,
        "input": ["The chef prepared a delicious meal."],
        "encoding_format": "float",
    }

    # Same payload through the OpenAI client and through the raw route.
    completion_response = await client.embeddings.create(**request_args)
    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    completion_output = completion_response.model_dump()
    invocation_output = invocation_response.json()
    assert completion_output.keys() == invocation_output.keys()

    paired_items = zip(completion_output["data"], invocation_output["data"])
    for completion_item, invocation_item in paired_items:
        assert completion_item.keys() == invocation_item.keys()
        check_embeddings_close(
            embeddings_0_lst=[completion_item["embedding"]],
            embeddings_1_lst=[invocation_item["embedding"]],
            name_0="completion",
            name_1="invocation",
        )
@pytest.mark.asyncio
async def test_invocations_conversation(server: RemoteOpenAIServer):
    """The ``invocations`` route must agree with chat-style embedding input."""
    conversation = [
        {"role": "user", "content": "The cat sat on the mat."},
        {"role": "assistant", "content": "A feline was resting on a rug."},
        {"role": "user", "content": "Stars twinkle brightly in the night sky."},
    ]
    request_args = {
        "model": MODEL_NAME,
        "messages": conversation,
        "encoding_format": "float",
    }

    # Same payload posted to both routes.
    chat_response = requests.post(server.url_for("v1/embeddings"), json=request_args)
    chat_response.raise_for_status()
    invocation_response = requests.post(
        server.url_for("invocations"), json=request_args
    )
    invocation_response.raise_for_status()

    chat_output = chat_response.json()
    invocation_output = invocation_response.json()
    assert chat_output.keys() == invocation_output.keys()

    paired_items = zip(chat_output["data"], invocation_output["data"])
    for chat_item, invocation_item in paired_items:
        assert chat_item.keys() == invocation_item.keys()
        check_embeddings_close(
            embeddings_0_lst=[chat_item["embedding"]],
            embeddings_1_lst=[invocation_item["embedding"]],
            name_0="chat",
            name_1="invocation",
        )
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    """Check the ``normalize`` request flag of the embeddings route.

    Three requests are compared: flag omitted (``None``), ``True`` and
    ``False``. The default must match ``True``, ``False`` must differ, and
    L2-normalizing the ``False`` output must reproduce the ``True`` output.
    """
    input_text = ["The chef prepared a delicious meal."]

    async def get_outputs(normalize):
        request_args = {
            # Use the parametrized model name rather than the module global so
            # the test honours its own parameter.
            "model": model_name,
            "input": input_text,
            "encoding_format": "float",
            "normalize": normalize,
        }
        response = requests.post(server.url_for("v1/embeddings"), json=request_args)
        # Fail with the HTTP error itself rather than a KeyError on "data".
        response.raise_for_status()
        outputs = response.json()
        return torch.tensor([x["embedding"] for x in outputs["data"]])

    default = await get_outputs(normalize=None)
    w_normal = await get_outputs(normalize=True)
    wo_normal = await get_outputs(normalize=False)

    assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
    assert not torch.allclose(
        w_normal, wo_normal, atol=1e-2
    ), "wo_normal should not use normal."
    assert torch.allclose(
        w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2
    ), "w_normal should be close to normal(wo_normal)."
# SPDX-License-Identifier: Apache-2.0
import os
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
import torch
import torch.nn.functional as F
from torch import tensor
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.openai.protocol import ScoreResponse
from utils import models_path_prefix
# Models exercised by the score tests. "is_cross_encoder" selects how the HF
# reference is loaded and scored (see run_transformers and the runner fixture).
# Both checkpoints are resolved under models_path_prefix so the suite loads
# them from the same local location.
MODELS = [
    {
        "name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
        "is_cross_encoder": True,
    },
    {
        "name": os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
        "is_cross_encoder": False,
    },
]
# dtype handed to both the server (--dtype) and the HF reference runner.
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
    """Compute reference scores for ``text_pairs`` with the HF model.

    For a cross-encoder entry the model's ``predict`` output is returned as a
    list. Otherwise each pair is encoded and the cosine similarity of its two
    embeddings is returned, one tensor per pair.
    """
    if model["is_cross_encoder"]:
        return hf_model.predict(text_pairs).tolist()

    # Encode every pair first, then score each encoded pair.
    encoded_pairs = [hf_model.encode(pair_texts) for pair_texts in text_pairs]
    similarities = []
    for encoded in encoded_pairs:
        first, second = tensor(encoded[0]), tensor(encoded[1])
        similarities.append(F.cosine_similarity(first, second, dim=0))
    return similarities
@pytest.fixture(scope="class", params=MODELS)
def model(request):
    """Yield the ``MODELS`` entry selected for the current parametrization."""
    selected_model = request.param
    yield selected_model
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
    """Start a server for the selected model and yield it."""
    cli_args = ["--enforce-eager"]
    cli_args += ["--max-model-len", "100"]
    cli_args += ["--dtype", DTYPE]

    with RemoteOpenAIServer(model["name"], cli_args) as running_server:
        yield running_server
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
    """Yield the HF reference model, loaded in the mode matching ``model``."""
    # Exactly one of the two loader flags is passed, depending on model kind.
    if model["is_cross_encoder"]:
        loader_flag = "is_cross_encoder"
    else:
        loader_flag = "is_sentence_transformer"

    hf_kwargs = {"dtype": DTYPE, loader_flag: True}
    with hf_runner(model["name"], **hf_kwargs) as hf_model:
        yield hf_model
class TestModel:
    """Tests for the ``score`` route, run for each entry of ``MODELS``.

    The ``model``, ``server`` and ``runner`` fixtures are class-scoped, so
    every test here shares one server and one HF reference per model.
    """

    @staticmethod
    def _post_score(server: RemoteOpenAIServer, model: dict[str, Any], **fields):
        """POST the model name plus ``fields`` to the ``score`` route."""
        payload = {"model": model["name"], **fields}
        return requests.post(server.url_for("score"), json=payload)

    @staticmethod
    def _check_against_hf(score_response, runner, model, text_pairs):
        """Validate a score response and compare it with HF reference scores."""
        score_response.raise_for_status()
        score = ScoreResponse.model_validate(score_response.json())

        assert score.id is not None
        assert score.data is not None
        assert len(score.data) == len(text_pairs)

        vllm_outputs = [item.score for item in score.data]
        hf_outputs = run_transformers(runner, model, text_pairs)

        for idx, vllm_score in enumerate(vllm_outputs):
            assert hf_outputs[idx] == pytest.approx(vllm_score, rel=0.01)

    def test_text_1_str_text_2_list(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        """One query string scored against a list of documents."""
        text_1 = "What is the capital of France?"
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = self._post_score(server, model, text_1=text_1, text_2=text_2)

        text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
        self._check_against_hf(score_response, runner, model, text_pairs)

    def test_text_1_list_text_2_list(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        """Two equally long lists scored element by element."""
        text_1 = [
            "What is the capital of the United States?",
            "What is the capital of France?",
        ]
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = self._post_score(server, model, text_1=text_1, text_2=text_2)

        text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
        self._check_against_hf(score_response, runner, model, text_pairs)

    def test_text_1_str_text_2_str(
        self, server: RemoteOpenAIServer, model: dict[str, Any], runner
    ):
        """A single query string scored against a single document string."""
        text_1 = "What is the capital of France?"
        text_2 = "The capital of France is Paris."

        score_response = self._post_score(server, model, text_1=text_1, text_2=text_2)

        text_pairs = [[text_1, text_2]]
        self._check_against_hf(score_response, runner, model, text_pairs)

    def test_score_max_model_len(
        self, server: RemoteOpenAIServer, model: dict[str, Any]
    ):
        """Over-long input and an over-large truncation size both return 400."""
        text_1 = "What is the capital of France?" * 20
        text_2 = [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ]

        score_response = self._post_score(server, model, text_1=text_1, text_2=text_2)
        assert score_response.status_code == 400
        # Assert just a small fragments of the response
        assert "Please reduce the length of the input." in score_response.text

        # Test truncation
        score_response = self._post_score(
            server,
            model,
            text_1=text_1,
            text_2=text_2,
            truncate_prompt_tokens=101,
        )
        assert score_response.status_code == 400
        assert "Please, select a smaller truncation size." in score_response.text

    def test_invocations(self, server: RemoteOpenAIServer, model: dict[str, Any]):
        """The ``invocations`` route must agree with the ``score`` route."""
        request_args = {
            "model": model["name"],
            "text_1": "What is the capital of France?",
            "text_2": "The capital of France is Paris.",
        }

        score_response = requests.post(server.url_for("score"), json=request_args)
        score_response.raise_for_status()
        invocation_response = requests.post(
            server.url_for("invocations"), json=request_args
        )
        invocation_response.raise_for_status()

        score_output = score_response.json()
        invocation_output = invocation_response.json()
        assert score_output.keys() == invocation_output.keys()

        paired_items = zip(score_output["data"], invocation_output["data"])
        for score_item, invocation_item in paired_items:
            assert score_item.keys() == invocation_item.keys()
            # TODO: reset this tolerance to 0.01 once we find
            # an alternative to flash_attn with bfloat16
            assert score_item["score"] == pytest.approx(
                invocation_item["score"], rel=0.05
            )

    def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]):
        """Check the ``activation`` request flag of the ``score`` route."""

        def get_outputs(activation):
            # Returns the raw response on any non-200 status, otherwise a
            # tensor of the returned scores.
            response = self._post_score(
                server,
                model,
                text_1="What is the capital of France?",
                text_2="The capital of France is Paris.",
                activation=activation,
            )
            if response.status_code != 200:
                return response
            return torch.tensor([item["score"] for item in response.json()["data"]])

        if not model["is_cross_encoder"]:
            get_outputs(activation=None)

            # The activation parameter only works for the is_cross_encoder model
            rejected = get_outputs(activation=True)
            assert rejected.status_code == 400
            return

        default = get_outputs(activation=None)
        w_activation = get_outputs(activation=True)
        wo_activation = get_outputs(activation=False)

        assert torch.allclose(
            default, w_activation, atol=1e-2
        ), "Default should use activation."
        assert not torch.allclose(
            w_activation, wo_activation, atol=1e-2
        ), "wo_activation should not use activation."
        assert torch.allclose(
            F.sigmoid(wo_activation), w_activation, atol=1e-2
        ), "w_activation should be close to activation(wo_activation)."
......@@ -15,9 +15,6 @@ from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed
if current_platform.is_rocm():
from flash_attn import vllm_flash_attn_with_kvcache
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
......
......@@ -162,7 +162,6 @@ def test_reshape_and_cache(
torch.testing.assert_close(key_cache, cloned_key_cache)
torch.testing.assert_close(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
......
......@@ -104,16 +104,18 @@ def test_flash_mla(
descale_k = None
def flash_mla():
return flash_mla_with_kvcache(q,
blocked_k,
block_table,
cache_seqlens,
dv,
tile_scheduler_metadata,
num_splits,
causal=causal,
descale_q=descale_q,
descale_k=descale_k)
return flash_mla_with_kvcache(
q,
blocked_k,
block_table,
cache_seqlens,
dv,
tile_scheduler_metadata,
num_splits,
causal=causal,
descale_q=descale_q,
descale_k=descale_k,
)
def scaled_dot_product_attention(query, key, value, is_causal=False):
query = query.float()
......
......@@ -53,7 +53,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
dtype=torch.float32,
device="cuda",
)
best_config = None
# Call the original implementation.
decode_attention_fwd(
......@@ -67,7 +66,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
attn_logits,
num_kv_splits,
sm_scale,
best_config,
)
# Page size can be larger than 1.
......@@ -88,7 +86,6 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
attn_logits,
num_kv_splits,
sm_scale,
best_config,
PAGE_SIZE,
)
......
......@@ -146,84 +146,9 @@ def test_fused_rms_norm_quant(
(out_quant_fused, x, weight, quant_scale_t, 1e-6),
)
# @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
# @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
# @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
# @pytest.mark.parametrize("dtype", DTYPES)
# @pytest.mark.parametrize("quant_scale", [0.01, 1.0, 10.0])
# @pytest.mark.parametrize("seed", SEEDS)
# @pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("strided_input", [False, True])
# def test_fused_rms_norm_quant(
# num_tokens: int,
# hidden_size: int,
# add_residual: bool,
# dtype: torch.dtype,
# quant_scale: float,
# seed: int,
# device: str,
# strided_input: bool,
# ) -> None:
# current_platform.seed_everything(seed)
# torch.set_default_device(device)
# weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1)
# scale = 1 / (2 * hidden_size)
# last_dim = 2 * hidden_size if strided_input else hidden_size
# x_base = torch.randn(num_tokens, last_dim, dtype=dtype)
# x = x_base[..., :hidden_size]
# assert x.is_contiguous() != strided_input
# x *= scale
# if add_residual:
# residual = torch.randn_like(x) * scale
# residual_fused = residual.clone()
# else:
# residual = residual_fused = None
# out_norm = torch.empty_like(x)
# out_quant = torch.empty_like(x, dtype=FP8_DTYPE)
# out_quant_fused = torch.empty_like(out_quant)
# quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32)
# if add_residual:
# torch.ops._C.fused_add_rms_norm_static_fp8_quant(
# out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6
# )
# # Unfused kernel is in-place so it goes second
# # Also use a separate clone of x to avoid modifying the input
# x_unfused_base = x_base.clone()
# x_unfused = x_unfused_base[..., :hidden_size]
# assert x_unfused.is_contiguous() != strided_input
# torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6)
# torch.ops._C.static_scaled_fp8_quant(
# out_quant, x_unfused.contiguous(), quant_scale_t
# )
# torch.cuda.synchronize()
# torch.testing.assert_close(residual_fused, residual, atol=1e-2, rtol=1e-2)
# opcheck(
# torch.ops._C.fused_add_rms_norm_static_fp8_quant,
# (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6),
# )
# else:
# torch.ops._C.rms_norm_static_fp8_quant(
# out_quant_fused, x, weight, quant_scale_t, 1e-6
# )
# torch.ops._C.rms_norm(out_norm, x, weight, 1e-6)
# torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, quant_scale_t)
# opcheck(
# torch.ops._C.rms_norm_static_fp8_quant,
# (out_quant_fused, x, weight, quant_scale_t, 1e-6),
# )
# torch.testing.assert_close(
# out_quant.to(dtype=torch.float32),
# out_quant_fused.to(dtype=torch.float32),
# atol=1e-3,
# rtol=1e-3,
# )
torch.testing.assert_close(
out_quant.to(dtype=torch.float32),
out_quant_fused.to(dtype=torch.float32),
atol=1e-3,
rtol=1e-3,
)
\ No newline at end of file
......@@ -22,9 +22,6 @@ from vllm.distributed import (
)
from vllm.forward_context import set_forward_context
from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
FusedMoEParallelConfig,
......@@ -43,6 +40,7 @@ from .mk_objects import (
TestMoEQuantConfig,
expert_info,
make_fused_experts,
make_prepare_finalize,
prepare_finalize_info,
)
from .parallel_utils import ProcessGroupInfo
......@@ -605,12 +603,10 @@ def make_modular_kernel(
routing_method=RoutingMethodType.DeepSeekV3,
)
prepare_finalize = maybe_make_prepare_finalize(
moe=moe,
quant_config=quant_config,
allow_new_interface=True,
# make modular kernel
prepare_finalize = make_prepare_finalize(
config.prepare_finalize_type, config.all2all_backend(), moe, quant_config
)
assert prepare_finalize is not None
fused_experts = make_fused_experts(
config.fused_experts_type,
......@@ -692,4 +688,4 @@ def run_modular_kernel(
):
out = mk.forward(**mk_kwargs)
return out
return out
\ No newline at end of file
......@@ -7,6 +7,9 @@ import torch
# Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts,
)
......@@ -252,12 +255,13 @@ if has_pplx():
)
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
create_flashinfer_prepare_finalize,
)
register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize,
......@@ -425,6 +429,24 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
]
def make_prepare_finalize(
    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
    backend: str | None,
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
    """Build the prepare/finalize object for a modular-kernel test.

    Selection order:
      1. Any backend other than ``None`` or ``"naive"`` goes through
         ``maybe_make_prepare_finalize``, which must return an object.
      2. Otherwise the FlashInfer CUTLASS type gets its dedicated factory,
         with ``use_dp`` set when the DP size is greater than one.
      3. Everything else falls back to ``MoEPrepareAndFinalizeNoEP``.
    """
    backend_is_default = backend is None or backend == "naive"
    if not backend_is_default:
        built = maybe_make_prepare_finalize(moe, quant_config)
        assert built is not None
        return built

    if prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
        data_parallel = moe.moe_parallel_config.dp_size > 1
        return create_flashinfer_prepare_finalize(use_dp=data_parallel)

    return MoEPrepareAndFinalizeNoEP()
def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
s = rank * num_local_experts
e = s + num_local_experts
......@@ -473,4 +495,4 @@ def make_fused_experts(
torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80)
return experts
return experts
\ No newline at end of file
......@@ -294,7 +294,12 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(
moe_config=moe_config,
quant_config=quant_config,
......@@ -315,4 +320,4 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
torch.testing.assert_close(
output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2
)
)
\ No newline at end of file
......@@ -106,7 +106,12 @@ def test_flashinfer_fp4_moe_no_graph(
)
flashinfer_experts = FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
MoEPrepareAndFinalizeNoEP(
defer_input_quant=FlashInferExperts.expects_unquantized_inputs(
moe_config=moe_config,
quant_config=quant_config,
)
),
FlashInferExperts(moe_config=moe_config, quant_config=quant_config),
)
......@@ -169,4 +174,4 @@ def test_flashinfer_fp4_moe_no_graph(
if __name__ == "__main__":
test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half)
test_flashinfer_fp4_moe_no_graph((2, 1024, 1024), 40, 1, torch.half)
\ No newline at end of file
......@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
)
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
device = "cuda"
def reverse_awq_order(t: torch.Tensor):
......@@ -168,4 +168,4 @@ def test_gemm(N, K, M, splitK, group_size):
torch.testing.assert_close(
output_triton.cpu(), output_torch.cpu(), atol=1e-1, rtol=1e-1
)
)
\ No newline at end of file
......@@ -8,6 +8,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant
from vllm.utils.torch_utils import set_random_seed
from vllm.platforms import current_platform
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
......@@ -63,7 +64,6 @@ def test_dynamic_scaled_int8_quant(
opcheck_int8_quant_dynamic(ops_out, x)
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Currently, there is not supported on ROCm.")
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
......@@ -169,7 +169,6 @@ def test_static_scaled_int8_azp_quant(
torch.testing.assert_close(out1, out2, atol=1, rtol=0.0)
opcheck_int8_quant_static(out2, x, scale_arg, azp_arg)
@pytest.mark.parametrize("is_max", [True, False])
@torch.inference_mode()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_flash_attention kernel
Run `pytest tests/kernels/test_triton_flash_attention.py`.
"""
import pytest
import torch
from vllm.attention.ops.triton_flash_attention import (SUPPORTED_LAYOUTS,
MetaData,
compute_alibi_tensor,
scale_fp8,
triton_attention_rocm)
from vllm.platforms import current_platform
class ReferenceAttention:
    """Plain PyTorch attention used as ground truth for the Triton kernel.

    ``fwd`` handles dense rank-4 inputs and ``varlen_fwd`` handles packed
    variable-length rank-3 inputs. Both read their scaling and masking
    options from the ``input_metadata`` object given at construction.
    """

    def __init__(self, Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, use_alibi, dtype,
                 input_metadata):
        """Record the problem description.

        Args:
            Z: Stored as-is; not read by the methods of this class.
            HQ: Number of query heads (``HQ // HK`` is the KV replication
                factor in ``varlen_fwd``).
            HK: Number of key/value heads.
            N_CTX_Q: Query length; row count of the causal mask.
            N_CTX_K: Key length; column count of the causal mask.
            D_HEAD: Stored as-is; not read by the methods of this class.
            use_alibi: Whether ``fwd`` adds the ALiBi tensor to the scores.
            dtype: Dtype the probabilities are cast to in ``fwd``.
            input_metadata: Object providing ``sm_scale``, ``causal``,
                ``bias``, ``alibi_slopes``, ``layout`` and, for the varlen
                path, ``num_contexts``, ``cu_seqlens_q`` and
                ``cu_seqlens_k``.
        """
        self.Z = Z
        self.HQ = HQ
        self.HK = HK
        self.N_CTX_Q = N_CTX_Q
        self.N_CTX_K = N_CTX_K
        self.D_HEAD = D_HEAD
        self.use_alibi = use_alibi
        self.dtype = dtype
        self.input_metadata = input_metadata

    def fwd(self, q, k, v):
        """Compute dense reference attention.

        Args:
            q: Rank-4 tensor indexed as ``bhqd``.
            k: Rank-4 tensor indexed as ``bhkd``.
            v: Rank-4 tensor indexed as ``bhkd``.

        Returns:
            The attention output indexed as ``bhqd``; dims 1 and 2 are
            swapped when ``input_metadata.layout == 'bshd'``.
        """
        meta = self.input_metadata
        scores = torch.einsum('bhqd,bhkd->bhqk', q, k).float() * meta.sm_scale
        if meta.causal:
            # Build the mask on the same device as the scores instead of a
            # hard-coded "cuda" so the reference works wherever q/k live.
            mask = torch.tril(torch.ones(self.N_CTX_Q,
                                         self.N_CTX_K,
                                         device=scores.device),
                              diagonal=self.N_CTX_K - self.N_CTX_Q)
            scores[:, :, mask == 0] = float("-inf")
        if meta.bias is not None:
            scores += meta.bias
        if self.use_alibi:
            scores += compute_alibi_tensor(meta.alibi_slopes, self.N_CTX_Q,
                                           self.N_CTX_K)

        p = torch.softmax(scores, dim=-1)
        if meta.causal:
            # If N_CTX_Q > N_CTX_K, there's at least one row of all -infs
            # going into softmax. This creates a row of NaNs as
            # -inf - -inf == NaN. So we fix this by converting the NaNs to
            # 0s, which is what they should be out of the softmax.
            p[torch.isnan(p)] = 0
        ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(self.dtype), v)
        # The kernel output for 'bshd' has sequence before heads, so bring
        # the reference into the same layout before it is compared.
        if meta.layout == 'bshd':
            ref_out = ref_out.transpose(1, 2).clone()
        return ref_out

    # FP8 reference paths are currently disabled (commented out).
    #
    # def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
    #     q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
    #         self.dtype)
    #     k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
    #         self.dtype)
    #     v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
    #         self.dtype)
    #     result = self.fwd(q, k, v)
    #     if self.input_metadata.o_scale is not None:
    #         result, _ = scale_fp8(result, self.input_metadata.o_scale)
    #     return result
    #
    # def fwd_fp8_kv(self, q, k_quantized, v_quantized):
    #     k_descale, v_descale = (self.input_metadata.k_descale,
    #                             self.input_metadata.v_descale)
    #     k_dequantized = (k_quantized.to(torch.float32) *
    #                      k_descale.to(torch.float32)).to(self.dtype)
    #     v_dequantized = (v_quantized.to(torch.float32) *
    #                      v_descale.to(torch.float32)).to(self.dtype)
    #     return self.fwd(q, k_dequantized, v_dequantized)

    def varlen_fwd(self, q, k, v, is_mqa=False):
        """Compute reference attention over packed variable-length inputs.

        Each of the ``input_metadata.num_contexts`` sequences is sliced out
        of the packed tensors via ``cu_seqlens_q`` / ``cu_seqlens_k`` and
        attended independently. No causal mask, bias or ALiBi is applied on
        this path.

        Args:
            q: Rank-3 tensor indexed as ``qhd`` (packed tokens first).
            k: Rank-3 tensor indexed as ``khd``.
            v: Rank-3 tensor indexed as ``khd``.
            is_mqa: When True, each KV head is repeated ``HQ // HK`` times
                so the head count matches ``q``.

        Returns:
            A tensor shaped like ``q`` holding the attention output.
        """
        meta = self.input_metadata
        ref_out = torch.empty_like(q)
        if is_mqa:
            # Make KV look like HQ/HK "groups" of HK. Later, we will reshape
            # so the size aligns with Q.
            group = self.HQ // self.HK
            k_ref = k.view(k.shape[0], k.shape[1], 1,
                           k.shape[2]).expand(-1, -1, group, -1)
            v_ref = v.view(v.shape[0], v.shape[1], 1,
                           v.shape[2]).expand(-1, -1, group, -1)
        else:
            k_ref = k
            v_ref = v

        for i in range(meta.num_contexts):
            start_q, end_q = meta.cu_seqlens_q[i], meta.cu_seqlens_q[i + 1]
            start_k, end_k = meta.cu_seqlens_k[i], meta.cu_seqlens_k[i + 1]
            k_curr = k_ref[start_k:end_k]
            v_curr = v_ref[start_k:end_k]
            if is_mqa:
                # Fold the (kv_head, group) pair into a single head axis.
                k_curr = k_curr.reshape(k_curr.shape[0], -1, k_curr.shape[3])
                v_curr = v_curr.reshape(v_curr.shape[0], -1, v_curr.shape[3])
            scores = torch.einsum('qhd,khd->qhk', q[start_q:end_q],
                                  k_curr).float()
            # Cast to the dtype of V rather than a hard-coded half so the
            # final einsum also works for non-fp16 inputs.
            p = torch.softmax(scores * meta.sm_scale,
                              dim=-1).to(v_curr.dtype)
            ref_out[start_q:end_q] = torch.einsum('qhk,khd->qhd', p, v_curr)
        return ref_out
def quantize_input(q, k, v, fp8_kv=False, use_o_scale=False):
    """Run q/k/v through ``scale_fp8`` and collect the resulting scales.

    Args:
        q: Query tensor; left untouched when ``fp8_kv`` is True.
        k: Key tensor; always passed through ``scale_fp8``.
        v: Value tensor; always passed through ``scale_fp8``.
        fp8_kv: When True only k and v are processed and ``q_descale`` is
            returned as ``None``.
        use_o_scale: When True a random single-element output scale is
            created on the CUDA device; otherwise ``o_scale`` is ``None``.

    Returns:
        ``(q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale)``
        where ``p_scale`` is always ``None``.
    """
    # presumably scale_fp8 returns (scaled tensor, descale) — confirm
    # against vllm.attention.ops.triton_flash_attention.
    if fp8_kv:
        q_descale = None
    else:
        q, q_descale = scale_fp8(q)
    k, k_descale = scale_fp8(k)
    v, v_descale = scale_fp8(v)

    # In real world use case, the p scale would be a parameter trained by
    # the model.
    p_scale = None

    if use_o_scale:
        o_scale = torch.rand(1, device="cuda", requires_grad=False)
    else:
        o_scale = None

    return q, k, v, q_descale, k_descale, v_descale, p_scale, o_scale
def input_helper(
    Z,
    HQ,
    HK,
    N_CTX_Q,
    N_CTX_K,
    D_HEAD,
    dtype,
    layout=None,
    use_alibi=None,
    causal=None,
    is_fp8=False,
    fp8_kv=False,
    use_o_scale=False,
    use_bias=False,
):
    """Create seeded random q/k/v tensors plus a matching ``MetaData``.

    Args:
        Z: Leading (batch) dimension of the generated tensors.
        HQ: Head count used for q, the ALiBi slopes and the bias.
        HK: Head count used for k and v.
        N_CTX_Q: Sequence length of q.
        N_CTX_K: Sequence length of k and v.
        D_HEAD: Last dimension of q/k/v; also sets ``sm_scale``.
        dtype: Dtype of q/k/v and of the bias.
        layout: Must be in ``SUPPORTED_LAYOUTS``; shapes are only defined
            here for ``'bhsd'`` and ``'bshd'``.
        use_alibi: When truthy, ALiBi slopes are generated.
        causal: Accepted but not forwarded to ``MetaData`` in this helper.
            NOTE(review): confirm whether that is intended.
        is_fp8: When True the tensors go through ``quantize_input``.
        fp8_kv: Forwarded to ``quantize_input``.
        use_o_scale: Forwarded to ``quantize_input``.
        use_bias: When True a random bias of shape
            ``(1, HQ, N_CTX_Q, N_CTX_K)`` is generated.

    Returns:
        ``(q, k, v, input_metadata)``.
    """
    assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."

    current_platform.seed_everything(0)

    # Initialize q, k, v
    if layout == 'bhsd':
        q_tensor_shape = (Z, HQ, N_CTX_Q, D_HEAD)
        k_tensor_shape = (Z, HK, N_CTX_K, D_HEAD)
    elif layout == 'bshd':
        q_tensor_shape = (Z, N_CTX_Q, HQ, D_HEAD)
        k_tensor_shape = (Z, N_CTX_K, HK, D_HEAD)

    alibi_slopes = None
    if use_alibi:
        # for n heads the set of slopes is the geometric sequence that
        # starts 2^(-8/n)
        slopes = [2**(-8 / HQ * i) for i in range(1, HQ + 1)]
        alibi_slopes = torch.tensor(slopes,
                                    dtype=torch.float32,
                                    device="cuda").repeat(Z, 1)

    # Options shared by every random tensor created below. The creation
    # order (bias, q, k, v) is kept so the seeded values do not change.
    randn_opts = dict(dtype=dtype, device="cuda", requires_grad=False)

    bias = None
    if use_bias:
        bias = torch.randn((1, HQ, N_CTX_Q, N_CTX_K), **randn_opts)

    q = torch.randn(q_tensor_shape, **randn_opts)
    k = torch.randn(k_tensor_shape, **randn_opts)
    v = torch.randn(k_tensor_shape, **randn_opts)

    q_descale = k_descale = v_descale = p_scale = o_scale = None
    if is_fp8:
        (q, k, v, q_descale, k_descale, v_descale, p_scale,
         o_scale) = quantize_input(q,
                                   k,
                                   v,
                                   use_o_scale=use_o_scale,
                                   fp8_kv=fp8_kv)

    input_metadata = MetaData(
        sm_scale=D_HEAD**-0.5,
        max_seqlens_q=N_CTX_Q,
        max_seqlens_k=N_CTX_K,
        layout=layout,
        alibi_slopes=alibi_slopes,
        alibi_batch=Z,
        alibi_nheads=HQ,
        q_descale=q_descale,
        k_descale=k_descale,
        v_descale=v_descale,
        p_scale=p_scale,
        o_scale=o_scale,
        bias=bias,
        seqlen_q=N_CTX_Q,
        seqlen_k=N_CTX_K,
    )
    return q, k, v, input_metadata
def varlen_input_helper(Z,
                        HQ,
                        HK,
                        N_CTX_Q,
                        N_CTX_K,
                        D_HEAD,
                        dtype,
                        equal_seqlens=False):
    """Create packed variable-length q/k/v tensors and their ``MetaData``.

    Args:
        Z: Number of sequences packed together.
        HQ: Head count of q.
        HK: Head count of k and v.
        N_CTX_Q: Upper bound on the summed q lengths; each sequence gets at
            most ``N_CTX_Q // Z`` tokens.
        N_CTX_K: Same bound for k and v.
        D_HEAD: Last dimension of q/k/v; also sets ``sm_scale``.
        dtype: Dtype of q/k/v.
        equal_seqlens: When True every sequence has exactly
            ``N_CTX // Z`` tokens instead of a random length.

    Returns:
        ``(q, k, v, input_metadata)`` with q/k/v shaped
        ``(total_tokens, heads, D_HEAD)`` on the CUDA device and the
        cumulative sequence lengths registered on the metadata.
    """
    current_platform.seed_everything(0)

    # Random sequence lengths. Using N_CTX as kind of max of sum of
    # individual seqs
    if equal_seqlens:
        seqlens_q = torch.full((Z, ), N_CTX_Q // Z)
        seqlens_k = torch.full((Z, ), N_CTX_K // Z)
    else:
        max_seqlens_q = N_CTX_Q // Z
        max_seqlens_k = N_CTX_K // Z
        seqlens_q = torch.randint(1,
                                  max_seqlens_q + 1, (Z, ),
                                  dtype=torch.int32)
        seqlens_k = torch.randint(1,
                                  max_seqlens_k + 1, (Z, ),
                                  dtype=torch.int32)

    def _cumulative(lengths):
        # Prefix-sum with a leading zero, then moved to the CUDA device.
        leading_zero = torch.tensor([0], dtype=torch.int32)
        running = lengths.cumsum(dim=0, dtype=torch.int32)
        return torch.cat([leading_zero, running]).to(device="cuda")

    def _packed(total_tokens, heads):
        # Allocate with randn, then redraw in place from N(0, 0.5).
        packed = torch.randn((total_tokens, heads, D_HEAD),
                             dtype=dtype,
                             device="cuda")
        return packed.normal_(mean=0., std=0.5).requires_grad_()

    # Calculate cumulative sequence lengths
    cu_seqlens_q = _cumulative(seqlens_q)
    cu_seqlens_k = _cumulative(seqlens_k)

    # Initialize q, k, v with variable lengths
    total_q = cu_seqlens_q[-1].item()
    total_k = cu_seqlens_k[-1].item()
    q = _packed(total_q, HQ)
    k = _packed(total_k, HK)
    v = _packed(total_k, HK)

    input_metadata = MetaData(sm_scale=D_HEAD**-0.5)
    input_metadata.set_varlen_params(cu_seqlens_q, cu_seqlens_k)
    return q, k, v, input_metadata
@pytest.mark.parametrize('Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD', [
    (1, 48, 12, 1, 1, 64),
    (4, 4, 4, 128, 128, 65),
    (16, 48, 48, 1, 1, 128),
    (64, 48, 24, 3, 3, 128),
    (4, 4, 4, 113, 123, 1),
])
@pytest.mark.parametrize('causal', [True, False])
@pytest.mark.parametrize('use_alibi', [True, False])
@pytest.mark.parametrize('layout', ['bshd'])
def test_op_fwd(Z,
                HQ,
                HK,
                N_CTX_Q,
                N_CTX_K,
                D_HEAD,
                causal,
                use_alibi,
                layout,
                dtype=torch.float16):
    """Compare the Triton forward kernel with the PyTorch reference.

    Inputs come from ``input_helper``; K and V are replicated across query
    head groups before the reference is evaluated when ``HQ != HK``.
    """
    current_platform.seed_everything(0)
    q, k, v, input_metadata = input_helper(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
                                           dtype, layout, use_alibi, causal)

    out_buffer = torch.empty_like(q)

    # triton implementation
    tri_out, _ = triton_attention_rocm(q, k, v, out_buffer, input_metadata)

    # Transpose here if layout is bshd so we have same reference code for
    # all layouts
    if layout == 'bshd':
        q, k, v = (t.transpose(1, 2).clone() for t in (q, k, v))

    def _repeat_kv_heads(t):
        # Insert a group axis after the head axis, broadcast it to
        # HQ // HK entries, then fold it back into the head axis.
        batch, kv_heads, seq, dim = t.shape
        grouped = t.view(batch, kv_heads, -1, seq, dim)
        widened = grouped.expand(-1, -1, HQ // HK, -1, -1)
        return widened.reshape(batch, -1, seq, dim)

    # Replicate K and V if using MQA/GQA
    if HQ != HK:
        k = _repeat_kv_heads(k)
        v = _repeat_kv_heads(v)

    reference = ReferenceAttention(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD,
                                   use_alibi, dtype, input_metadata)
    ref_out = reference.fwd(q, k, v)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
# (4, 48, 1, 1, 64),
# (4, 48, 1, 1, 128),
# (4, 48, 3, 3, 128),
# (4, 4, 128, 128, 65),
# ])
# @pytest.mark.parametrize('causal', [True, False])
# @pytest.mark.parametrize('layout', ['bhsd'])
# @pytest.mark.parametrize('use_o_scale', [True, False])
# @pytest.mark.skipif(torch.cuda.get_device_capability() < (9, 0),
# reason="Triton FP8 requires CUDA 9.0 or higher")
# def test_op_fwd_fp8(Z,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# causal,
# layout,
# use_o_scale,
# dtype=torch.float32):
# current_platform.seed_everything(0)
# # Disable grad to save memory it won't run into OOM on CI machine.
# # q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
# # dtype, layout)
# q_quantized, k_quantized, v_quantized, input_metadata = input_helper(
# Z,
# H,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# dtype,
# causal=causal,
# layout=layout,
# is_fp8=True,
# use_o_scale=use_o_scale)
# o = torch.empty_like(q_quantized) if use_o_scale else None
# tri_out, _ = triton_attention_rocm(q_quantized, k_quantized, v_quantized,
# o, input_metadata)
# ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
# dtype, input_metadata)
# ref_out = ref_impl.fwd_fp8(q_quantized, k_quantized, v_quantized)
# # compare
# torch.testing.assert_close(ref_out.to(torch.float32),
# tri_out.to(torch.float32),
# atol=7e-2,
# rtol=2e-1)
# @pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
# (4, 48, 1, 1, 64),
# (4, 48, 1, 1, 128),
# (4, 48, 3, 3, 128),
# (4, 4, 128, 128, 65),
# (4, 4, 113, 123, 1),
# ])
# @pytest.mark.parametrize('causal', [True, False])
# @pytest.mark.parametrize('layout', ['bhsd'])
# def test_op_fwd_fp8_kv(Z,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# causal,
# layout,
# dtype=torch.float32):
# current_platform.seed_everything(0)
# q, k_quantized, v_quantized, input_metadata = input_helper(Z,
# H,
# H,
# N_CTX_Q,
# N_CTX_K,
# D_HEAD,
# dtype,
# causal=causal,
# layout=layout,
# is_fp8=True,
# fp8_kv=True)
# o = torch.empty_like(q)
# tri_out, _ = triton_attention_rocm(q, k_quantized, v_quantized, o,
# input_metadata)
# ref_impl = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
# dtype, input_metadata)
# ref_out = ref_impl.fwd_fp8_kv(q, k_quantized, v_quantized)
# torch.testing.assert_close(ref_out, tri_out, atol=3e-2, rtol=8e-1)
@pytest.mark.parametrize('Z, H, N_CTX_Q, N_CTX_K, D_HEAD', [
    (4, 48, 1, 1, 64),
    (4, 48, 1, 1, 128),
    (4, 48, 3, 3, 128),
    (4, 4, 128, 128, 65),
])
@pytest.mark.parametrize('causal', [True, False])
@pytest.mark.parametrize('use_bias', [True])
@pytest.mark.parametrize('dtype', [torch.bfloat16])
def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
    """Compare the Triton kernel with the reference when a bias is added.

    Query and key/value head counts are both ``H`` and the layout is fixed
    to ``'bhsd'``.
    """
    current_platform.seed_everything(0)
    helper_options = dict(layout='bhsd', causal=causal, use_bias=use_bias)
    q, k, v, input_metadata = input_helper(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD,
                                           dtype, **helper_options)
    out_buffer = torch.empty_like(q)

    # triton implementation
    tri_out, _ = triton_attention_rocm(q, k, v, out_buffer, input_metadata)

    reference = ReferenceAttention(Z, H, H, N_CTX_Q, N_CTX_K, D_HEAD, False,
                                   dtype, input_metadata)
    ref_out = reference.fwd(q, k, v)

    # compare
    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
# NOTE: Uses thd layout, so also tests thd.
@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [
    (1, 48, 256, 64),
    (4, 48, 512, 64),
    (16, 48, 512, 64),
    (64, 48, 128, 128),
])
@pytest.mark.parametrize('causal', [True, False])
def test_op_varlen_fwd(Z, H, N_CTX, D_HEAD, causal, dtype=torch.float16):
    """Compare the Triton kernel with the reference on packed varlen input.

    The kernel result is read back from the preallocated output tensor.
    ``causal`` is parametrized but not used inside this test.
    """
    q, k, v, input_metadata = varlen_input_helper(Z, H, H, N_CTX, N_CTX,
                                                  D_HEAD, dtype)

    tri_out = torch.empty_like(q)
    triton_attention_rocm(q, k, v, tri_out, input_metadata)

    reference = ReferenceAttention(Z, H, H, N_CTX, N_CTX, D_HEAD, False, dtype,
                                   input_metadata)
    ref_out = reference.varlen_fwd(q, k, v, is_mqa=False)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
# NOTE: Uses thd layout, so also tests thd.
@pytest.mark.parametrize('Z, HQ, HK, N_CTX, D_HEAD', [
    (2, 48, 24, 128, 64),
    (4, 48, 12, 256, 64),
    (4, 48, 4, 512, 64),
    (4, 64, 16, 128, 128),
])
@pytest.mark.parametrize('causal', [False])
def test_op_varlen_mqa_fwd(Z,
                           HQ,
                           HK,
                           N_CTX,
                           D_HEAD,
                           causal,
                           dtype=torch.float16):
    """Compare kernel and reference on packed varlen input with HQ != HK.

    The reference replicates the KV heads (``is_mqa=True``) and the kernel
    result is read back from the preallocated output tensor.
    """
    q, k, v, input_metadata = varlen_input_helper(Z, HQ, HK, N_CTX, N_CTX,
                                                  D_HEAD, dtype)

    tri_out = torch.empty_like(q)
    triton_attention_rocm(q, k, v, tri_out, input_metadata)

    reference = ReferenceAttention(Z, HQ, HK, N_CTX, N_CTX, D_HEAD, False,
                                   dtype, input_metadata)
    ref_out = reference.varlen_fwd(q, k, v, is_mqa=True)

    torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)
......@@ -22,7 +22,7 @@ from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker
from ..utils import models_path_prefix
MODEL_PATH = "Qwen/Qwen3-0.6B"
MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-0.6B")
NUM_LORAS = 16
......
......@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import os
import numpy as np
import pytest
import torch
......@@ -9,23 +10,10 @@ from transformers import AutoModelForTokenClassification
from tests.models.utils import softmax
from vllm.platforms import current_platform
from ....utils import models_path_prefix
# Autouse fixture: applied to every test in this module without being requested.
@pytest.fixture(autouse=True)
def seed_everything():
"""Seed all random number generators for reproducibility."""
# A single fixed seed is shared by Python's `random`, NumPy and torch.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Seed every CUDA device as well when CUDA is present.
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic algorithms and turn off its autotuner.
# NOTE(review): confirm whether these two flags are meant to be set only
# when CUDA is available or unconditionally.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# No teardown follows; the test body runs at this point.
yield
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "boltuix/NeuroBERT-NER")])
# The float32 is required for this tiny model to pass the test.
@pytest.mark.parametrize("dtype", ["float"])
@torch.inference_mode
......@@ -68,7 +56,6 @@ def test_bert_models(
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode
def test_modernbert_models(
hf_runner,
......@@ -77,14 +64,6 @@ def test_modernbert_models(
model: str,
dtype: str,
) -> None:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print(
f"\n[NOTE] Testing {model} (randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment