Commit f48954a4 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.5.0

parents 1dba29d3 8f89d720
......@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list
from ..utils import create_seq_group, create_seq_group_encoder_decoder
from ..utils import (create_dummy_prompt, create_seq_group,
create_seq_group_encoder_decoder)
@pytest.mark.parametrize("block_size", [16])
......@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
assert num_consumed_blocks == expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
enable_caching):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager = BlockSpaceManagerV2(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_in(seq_group)
cpu_blocks = block_manager.get_block_table(prompt)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == [cpu_blocks[0]]
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
......
......@@ -42,18 +42,16 @@ def test_models(
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
enforce_eager = backend_by_env_var == "FLASHINFER"
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=enforce_eager,
distributed_executor_backend=distributed_executor_backend)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model,
dtype=dtype,
tensor_parallel_size=2,
enforce_eager=enforce_eager,
distributed_executor_backend=distributed_executor_backend
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......
......@@ -45,21 +45,19 @@ def test_models(
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
vllm_model = vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
with vllm_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
......
import os
import torch
from vllm.distributed.parallel_state import is_in_the_same_node
torch.distributed.init_process_group(backend="gloo")
test_result = is_in_the_same_node(torch.distributed.group.WORLD)
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
......@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
@pytest.fixture
def vllm_model(vllm_runner):
vllm_model = vllm_runner(MODEL)
yield vllm_model
del vllm_model
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
def test_stop_reason(vllm_model, example_prompts):
......
......@@ -10,7 +10,8 @@ MAX_TOKENS = 200
@pytest.fixture(scope="session")
def vllm_model(vllm_runner):
return vllm_runner(MODEL)
with vllm_runner(MODEL) as vllm_model:
yield vllm_model
@pytest.mark.skip_global_cleanup
......
......@@ -63,7 +63,6 @@ def test_guided_logits_processors():
tokenizer,
whitespace_pattern=None)
regex_LP.init_state()
token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {TEST_REGEX}")
tensor = torch.rand(32000)
......@@ -72,7 +71,6 @@ def test_guided_logits_processors():
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
json_LP.init_state()
token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
tensor = torch.rand(32000)
......
import weakref
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from vllm import LLM
from vllm.lora.request import LoRARequest
from ..conftest import cleanup
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
LORA_NAME = "typeof/zephyr-7b-beta-lora"
pytestmark = pytest.mark.llm
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
tensor_parallel_size=1,
max_model_len=8192,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
max_num_seqs=128,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup()
@pytest.fixture(scope="session")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.mark.skip_global_cleanup
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
lora_request = [
LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
......@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
choice = completion.choices[0]
assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
......@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
max_tokens=5,
temperature=0.0,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert len(completion.choices[0].text) >= 5
@pytest.mark.asyncio
......@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
# first test base model, then test loras
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
[MODEL_NAME, "zephyr-lora"],
)
async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
model_name: str):
......@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) <= 1
assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
......@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) <= 6
assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
......@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
logprobs=6,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=21,
)
...
with pytest.raises(
......@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
logprobs=6,
# vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=30,
stream=True,
)
async for chunk in stream:
......@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert chat_completion.choices is not None and len(
chat_completion.choices) == 1
assert chat_completion.choices[0].message is not None
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content[
0].top_logprobs is not None
assert len(
chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
assert len(completion.choices[0].text) >= 0
@pytest.mark.asyncio
......@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) <= 1
assert len(choice.logprobs.content[0].top_logprobs) == 0
@pytest.mark.asyncio
......@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.logprobs is not None
assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) <= 6
assert len(choice.logprobs.content[0].top_logprobs) == 5
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
......@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
......@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature=0.0,
)
single_output = single_completion.choices[0].text
single_usage = single_completion.usage
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
......@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == "length"
assert chunk.choices[0].text
assert chunk.usage == single_usage
assert "".join(chunks) == single_output
......@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(server,
client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is the capital of France?"
}]
# Test stream=True, stream_options={"include_usage": False}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": True})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options={"include_usage": None}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options={"include_usage": True}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True}
stream = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={"include_usage": True})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options={"include_usage": None}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options={"include_usage": True}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
......@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
logit_bias={str(token_id): 100},
seed=42,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert len(completion.choices[0].text) >= 5
response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"]
expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
......@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
assert len(completion.choices) == 3
for i in range(3):
assert completion.choices[i].text is not None
output_json = json.loads(completion.choices[i].text)
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
......@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
assert len(completion.choices) == 3
for i in range(3):
assert completion.choices[i].text is not None
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
......@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 2
assert len(completion.choices) == 2
for i in range(2):
assert completion.choices[i].text in TEST_CHOICE
......@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
top_logprobs=5,
extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend))
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content is not None
top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI
assert all(
isinstance(token.logprob, float) and token.logprob >= -9999.0
for token in top_logprobs)
for item in top_logprobs:
assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
# non-streaming
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
})
message = chat_completion.choices[0].message
assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments
json1 = json.loads(json_string)
jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
messages.append({"role": "assistant", "content": json_string})
messages.append({
"role":
"user",
"content":
"Give me another one with a different name and age"
})
# streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
},
stream=True)
output = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
assert delta.content is None or len(delta.content) == 0
if delta.tool_calls:
output.append(delta.tool_calls[0].function.arguments)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
json2 = json.loads("".join(output))
jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice="required")
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice="auto")
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tool_choice={
"type": "function",
"function": {
"name":
"dummy_function_name"
}
})
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "nondefined_function_name"
}
})
@pytest.mark.asyncio
......@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
response_format={"type": "json_object"})
content = resp.choices[0].message.content
assert content is not None
loaded = json.loads(content)
assert loaded == {"result": 2}, loaded
......@@ -1032,8 +1350,9 @@ number: "1" | "2"
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
model_name: str):
model_name: str, logprobs_arg: int):
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# test using text and token IDs
for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
......@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
max_tokens=5,
temperature=0.0,
echo=True,
logprobs=1)
logprobs=logprobs_arg)
prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
list) else prompt
assert (completion.choices[0].text is not None
and re.search(r"^" + prompt_text, completion.choices[0].text))
assert re.search(r"^" + prompt_text, completion.choices[0].text)
logprobs = completion.choices[0].logprobs
assert logprobs is not None
assert len(logprobs.text_offset) > 5
......@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
and logprobs.token_logprobs[0] is None)
assert (len(logprobs.top_logprobs) > 5
and logprobs.top_logprobs[0] is None)
for top_logprobs in logprobs.top_logprobs[1:]:
assert max(logprobs_arg,
1) <= len(top_logprobs) <= logprobs_arg + 1
assert len(logprobs.tokens) > 5
......@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
)
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str):
input = [
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await client.embeddings.create(
model=model_name,
input=input,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 1
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9
# test using token IDs
input = [1, 1, 1, 1, 1]
input_tokens = [1, 1, 1, 1, 1]
embeddings = await client.embeddings.create(
model=model_name,
input=input,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 1
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
......@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str):
# test List[str]
inputs = [
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await client.embeddings.create(
model=model_name,
input=inputs,
input=input_texts,
encoding_format="float",
)
assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 3
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
# test List[List[int]]
inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await client.embeddings.create(
model=model_name,
input=inputs,
input=input_tokens,
encoding_format="float",
)
assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 4
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
......
from pathlib import Path
from typing import Dict
import openai
import pytest
import pytest_asyncio
import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
from ..utils import ServerRunner
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
"examples/template_llava.jinja")
assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--image-input-type",
"pixel_values",
"--image-token-id",
"32000",
"--image-input-shape",
"1,3,336,336",
"--image-feature-size",
"576",
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest_asyncio.fixture(scope="session")
async def base64_encoded_image() -> Dict[str, str]:
return {
image_url:
encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
with pytest.raises(openai.BadRequestError): # test multi-image input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
if __name__ == "__main__":
pytest.main([__file__])
......@@ -44,7 +44,7 @@ def test_act_and_mul(
elif activation == "gelu_tanh":
layer = GeluAndMul(approximate="tanh")
out = layer(x)
ref_out = layer._forward(x)
ref_out = layer.forward_native(x)
# The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison.
assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
......@@ -72,7 +72,7 @@ def test_activation(
x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation()
out = layer(x)
ref_out = layer._forward(x)
ref_out = layer.forward_native(x)
assert torch.allclose(out,
ref_out,
atol=get_default_atol(out),
......
import os
from unittest.mock import patch
import pytest
import torch
from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL,
override_backend_env_variable)
from vllm.attention.selector import which_attn_to_use
@pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
def test_env(name: str, device: str):
def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
"""
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
os.environ["VLLM_ATTENTION_BACKEND"] = name
override_backend_env_variable(monkeypatch, name)
if device == "cpu":
with patch("vllm.attention.selector.is_cpu", return_value=True):
......@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
torch.float16, 16)
assert backend.name == name
if name_backup is not None:
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
def test_flash_attn():
def test_flash_attn(monkeypatch):
"""Test FlashAttn validation."""
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
......@@ -71,14 +69,9 @@ def test_flash_attn():
backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN"
if name_backup is not None:
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
def test_invalid_env():
def test_invalid_env(monkeypatch):
"""Throw an exception if the backend name is invalid."""
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
os.environ["VLLM_ATTENTION_BACKEND"] = "INVALID"
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
with pytest.raises(ValueError):
which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
......@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
@pytest.mark.parametrize("m", [512, 222, 33, 1])
@pytest.mark.parametrize("m", [512, 222, 100, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 496, 1024])
@pytest.mark.parametrize("per_act_token", [True, False])
......@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
self.out_dtype)
def test_cutlass_cuda_graph():
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
m, n, k = 512, 512, 512
a = to_int8(torch.randn((m, k), device="cuda"))
b = to_int8(torch.randn((n, k), device="cuda").t())
scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10)
scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10)
m_a_scales = m if per_act_token else 1
n_b_scales = n if per_out_ch else 1
scale_a = (torch.randn(
(m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
scale_b = (torch.randn(
(1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
# Construct a trivial model with a single layer that calls a CUTLASS kernel
model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)
......
import pytest
import torch
from vllm._C import ops
# ruff: noqa: F401
import vllm._C
DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
8193] # Arbitrary values for testing
NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing
SEEDS = [0]
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int) -> None:
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
x_token_max, _ = x.max(dim=1)
x_token_max = x_token_max.to(dtype=torch.float32)
scales = (x_token_max / float(127.0))[:, None].to(device="cuda",
dtype=torch.float32)
torch_out = (x / scales).round().clamp(int8_traits.min,
int8_traits.max).to(torch.int8)
ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)
assert torch.allclose(scales_out, scales)
assert torch.allclose(torch_out, ops_out,
atol=1) # big atol to account for rounding errors
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("scale", SCALE)
@torch.inference_mode()
def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
seed: int, scale: float) -> None:
def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int,
scale: float) -> None:
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
out1 = (x / scale).round().clamp(
torch.iinfo(torch.int8).min,
torch.iinfo(torch.int8).max).to(torch.int8)
out1 = (x / scale).round().clamp(int8_traits.min,
int8_traits.max).to(torch.int8)
out2 = torch.empty_like(x, dtype=torch.int8)
ops.static_scaled_int8_quant(out2, x, scale)
scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument)
assert torch.allclose(out1, out2,
atol=1) # big atol to account for rounding errors
......@@ -42,7 +42,7 @@ def test_rms_norm(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_out = layer._forward(x, residual)
ref_out = layer.forward_native(x, residual)
out = layer(x, residual)
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# numerical errors than other operators because they involve reductions.
......
......@@ -64,7 +64,7 @@ def test_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key)
ref_query, ref_key = rope.forward_native(positions, query, key)
out_query, out_key = rope.forward(positions, query, key)
# Compare the results.
assert torch.allclose(out_query,
......@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key)
ref_query, ref_key = rope.forward_native(positions, query, key)
out_query, out_key = rope.forward(positions,
query,
key,
......@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
# NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key, query_offsets)
ref_query, ref_key = rope.forward_native(positions, query, key,
query_offsets)
out_query, out_key = rope.forward(positions, query, key,
query_offsets.flatten())
# Compare the results.
......
"""Kernel test utils"""
import pytest
STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"
def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
backend_name: str) -> None:
'''
Override the environment variable indicating the vLLM backend temporarily,
using pytest monkeypatch to ensure that the env vars get
reset once the test context exits.
Arguments:
* mpatch: pytest monkeypatch instance
* backend_name: attention backend name to force
'''
mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
......@@ -42,10 +42,24 @@ def cleanup():
ray.shutdown()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
This can provide a ~10x speedup for non-GPU unit tests since they don't need
to initialize torch.
"""
if request.node.get_closest_marker("skip_global_cleanup"):
return False
return True
@pytest.fixture(autouse=True)
def cleanup_fixture():
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield
cleanup()
if should_do_global_cleanup_after_test:
cleanup()
@pytest.fixture
......
......@@ -2,6 +2,7 @@ import random
from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from unittest.mock import patch
import pytest
import torch
......@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
from vllm.model_executor.utils import set_random_seed
from .utils import DummyLoRAManager
......@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
logits_processor = LogitsProcessor(
vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
lora_logits_processor = LogitsProcessorWithLoRA(
logits_processor, 1024, linear.weight.dtype, linear.weight.device)
logits_processor, 1024, linear.weight.dtype, linear.weight.device,
None)
lora_logits_processor.create_lora_weights(max_loras, lora_config)
return linear, logits_processor, lora_logits_processor
......@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
torch.allclose(ref_q, actual_q)
torch.allclose(ref_k, actual_k)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("seed", list(range(256)))
def test_vocab_parallel_embedding_indices(tp_size, seed):
random.seed(seed)
vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024)
org_vocab_size = vocab_size - added_vocab_size
last_org_vocab_end_index = 0
last_added_vocab_end_index = org_vocab_size
computed_vocab_size = 0
computed_org_vocab_size = 0
computed_added_vocab_size = 0
vocab_size_padded = -1
all_org_tokens = []
all_added_tokens = []
token_ids = []
for tp_rank in range(tp_size):
with patch(
"vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
return_value=tp_rank
), patch(
"vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
return_value=tp_size):
vocab_embedding = VocabParallelEmbedding(
vocab_size, 1, org_num_embeddings=org_vocab_size)
vocab_size_padded = vocab_embedding.num_embeddings_padded
shard_indices = vocab_embedding.shard_indices
# Assert that the ranges are contiguous
assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
assert (shard_indices.added_vocab_start_index ==
last_added_vocab_end_index)
# Ensure that we are not exceeding the vocab size
computed_vocab_size += shard_indices.num_elements_padded
computed_org_vocab_size += shard_indices.num_org_elements
computed_added_vocab_size += shard_indices.num_added_elements
# Ensure that the ranges are not overlapping
all_org_tokens.extend(
range(shard_indices.org_vocab_start_index,
shard_indices.org_vocab_end_index))
all_added_tokens.extend(
range(shard_indices.added_vocab_start_index,
shard_indices.added_vocab_end_index))
token_ids.extend(
range(shard_indices.org_vocab_start_index,
shard_indices.org_vocab_end_index))
token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
shard_indices.num_org_elements))
token_ids.extend(
range(shard_indices.added_vocab_start_index,
shard_indices.added_vocab_end_index))
token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
shard_indices.num_added_elements))
last_org_vocab_end_index = shard_indices.org_vocab_end_index
last_added_vocab_end_index = shard_indices.added_vocab_end_index
assert computed_vocab_size == vocab_size_padded
assert computed_org_vocab_size == org_vocab_size
assert computed_added_vocab_size == added_vocab_size
# Ensure that the ranges are not overlapping
assert len(all_org_tokens) == len(set(all_org_tokens))
assert len(all_added_tokens) == len(set(all_added_tokens))
assert not set(all_org_tokens).intersection(set(all_added_tokens))
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
assert reindex_mapping is not None or tp_size == 1
if reindex_mapping is not None:
reindexed_token_ids = token_ids_tensor[reindex_mapping]
expected = torch.tensor(list(range(0, vocab_size)))
assert reindexed_token_ids[:vocab_size].equal(expected)
assert torch.all(reindexed_token_ids[vocab_size:] == -1)
def test_get_masked_input_and_mask():
x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
# base tp 1 case, no padding
modified_x, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=8,
added_vocab_start_index=8,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(x, modified_x)
# tp 2 case, no padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=4,
added_vocab_start_index=8,
added_vocab_end_index=10,
num_org_vocab_padding=0)
modified_x_rank_1, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=8,
added_vocab_start_index=10,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))
# tp 4 case, no padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=2,
added_vocab_start_index=8,
added_vocab_end_index=9,
num_org_vocab_padding=0)
modified_x_rank_1, _ = get_masked_input_and_mask(x,
org_vocab_start_index=2,
org_vocab_end_index=4,
added_vocab_start_index=9,
added_vocab_end_index=10,
num_org_vocab_padding=0)
modified_x_rank_2, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=6,
added_vocab_start_index=10,
added_vocab_end_index=11,
num_org_vocab_padding=0)
modified_x_rank_3, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=6,
org_vocab_end_index=8,
added_vocab_start_index=11,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
assert torch.equal(modified_x_rank_2,
torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
assert torch.equal(modified_x_rank_3,
torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))
# base tp 1 case, with padding
modified_x, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=8,
added_vocab_start_index=8,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x,
torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))
# tp 2 case, with padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=4,
added_vocab_start_index=8,
added_vocab_end_index=10,
num_org_vocab_padding=2)
modified_x_rank_1, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=8,
added_vocab_start_index=10,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))
# tp 4 case, with padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=2,
added_vocab_start_index=8,
added_vocab_end_index=9,
num_org_vocab_padding=2)
modified_x_rank_1, _ = get_masked_input_and_mask(x,
org_vocab_start_index=2,
org_vocab_end_index=4,
added_vocab_start_index=9,
added_vocab_end_index=10,
num_org_vocab_padding=2)
modified_x_rank_2, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=6,
added_vocab_start_index=10,
added_vocab_end_index=11,
num_org_vocab_padding=2)
modified_x_rank_3, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=6,
org_vocab_end_index=8,
added_vocab_start_index=11,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
assert torch.equal(modified_x_rank_2,
torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
assert torch.equal(modified_x_rank_3,
torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
......@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
return generated_texts
@pytest.mark.parametrize("tp_size", [1])
def test_llama_lora(sql_lora_files, tp_size):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
@pytest.mark.parametrize("tp_size", [1, 2, 4])
def test_llama_lora(sql_lora_files, tp_size, num_gpus_available):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH,
enable_lora=True,
......@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
print("removing lora")
@pytest.mark.skip("Requires multiple GPUs")
def test_llama_tensor_parallel_equality(sql_lora_files):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
if num_gpus_available < 4:
pytest.skip("Not enough GPUs for tensor parallelism 4")
llm_tp1 = vllm.LLM(MODEL_PATH,
enable_lora=True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment