Commit f48954a4 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.5.0

parents 1dba29d3 8f89d720
...@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus ...@@ -7,7 +7,8 @@ from vllm.core.interfaces import AllocStatus
from vllm.sequence import Logprob, SequenceStatus from vllm.sequence import Logprob, SequenceStatus
from vllm.utils import chunk_list from vllm.utils import chunk_list
from ..utils import create_seq_group, create_seq_group_encoder_decoder from ..utils import (create_dummy_prompt, create_seq_group,
create_seq_group_encoder_decoder)
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [16])
...@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, ...@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
assert num_consumed_blocks == expected_consumed_blocks assert num_consumed_blocks == expected_consumed_blocks
@pytest.mark.parametrize("block_size", [8])
@pytest.mark.parametrize("num_cpu_blocks", [4])
@pytest.mark.parametrize("num_gpu_blocks", [4])
@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
@pytest.mark.parametrize("enable_caching", [False, True])
def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
enable_caching):
"""Verify blocks number on src/desc device is correct after swapping in/out
sequence group (not missing or extra blocks).
"""
block_manager = BlockSpaceManagerV2(block_size,
num_cpu_blocks,
num_gpu_blocks,
watermark=0,
enable_caching=enable_caching)
prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
prompt.status = SequenceStatus.WAITING
block_manager.allocate(seq_group)
# Emulate a forward pass by appending a single token.
# The block manager then knows how many unprocessed
# tokens will be written in the next forward pass.
token_id = 0
prompt.status = SequenceStatus.RUNNING
prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
# Swap seq group from GPU -> CPU.
gpu_blocks = block_manager.get_block_table(prompt)
assert block_manager.can_swap_out(seq_group)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_out(seq_group)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == gpu_blocks
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
prompt.status = SequenceStatus.SWAPPED
# Swap seq group from CPU -> GPU.
assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
mapping = block_manager.swap_in(seq_group)
cpu_blocks = block_manager.get_block_table(prompt)
mapping_keys = [key for key, _ in mapping]
assert mapping_keys == [cpu_blocks[0]]
after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
@pytest.mark.parametrize("block_size", [8, 16]) @pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000]) @pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50]) @pytest.mark.parametrize("num_slots_to_append", [50])
......
...@@ -42,18 +42,16 @@ def test_models( ...@@ -42,18 +42,16 @@ def test_models(
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND) backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
enforce_eager = backend_by_env_var == "FLASHINFER" enforce_eager = backend_by_env_var == "FLASHINFER"
hf_model = hf_runner(model, dtype=dtype) with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
with vllm_runner(model,
vllm_model = vllm_runner( dtype=dtype,
model, tensor_parallel_size=2,
dtype=dtype, enforce_eager=enforce_eager,
tensor_parallel_size=2, distributed_executor_backend=distributed_executor_backend
enforce_eager=enforce_eager, ) as vllm_model:
distributed_executor_backend=distributed_executor_backend) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
...@@ -45,21 +45,19 @@ def test_models( ...@@ -45,21 +45,19 @@ def test_models(
enable_chunked_prefill = True enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size
hf_model = hf_runner(model, dtype=dtype) with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
tensor_parallel_size=2, tensor_parallel_size=2,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
......
import os
import torch
from vllm.distributed.parallel_state import is_in_the_same_node
torch.distributed.init_process_group(backend="gloo")
test_result = is_in_the_same_node(torch.distributed.group.WORLD)
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
...@@ -19,9 +19,8 @@ MAX_TOKENS = 1024 ...@@ -19,9 +19,8 @@ MAX_TOKENS = 1024
@pytest.fixture @pytest.fixture
def vllm_model(vllm_runner): def vllm_model(vllm_runner):
vllm_model = vllm_runner(MODEL) with vllm_runner(MODEL) as vllm_model:
yield vllm_model yield vllm_model
del vllm_model
def test_stop_reason(vllm_model, example_prompts): def test_stop_reason(vllm_model, example_prompts):
......
...@@ -10,7 +10,8 @@ MAX_TOKENS = 200 ...@@ -10,7 +10,8 @@ MAX_TOKENS = 200
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def vllm_model(vllm_runner): def vllm_model(vllm_runner):
return vllm_runner(MODEL) with vllm_runner(MODEL) as vllm_model:
yield vllm_model
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
......
...@@ -63,7 +63,6 @@ def test_guided_logits_processors(): ...@@ -63,7 +63,6 @@ def test_guided_logits_processors():
tokenizer, tokenizer,
whitespace_pattern=None) whitespace_pattern=None)
regex_LP.init_state()
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {TEST_REGEX}") f"Give an example IPv4 address with this regex: {TEST_REGEX}")
tensor = torch.rand(32000) tensor = torch.rand(32000)
...@@ -72,7 +71,6 @@ def test_guided_logits_processors(): ...@@ -72,7 +71,6 @@ def test_guided_logits_processors():
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
json_LP.init_state()
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {TEST_SCHEMA}") f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
tensor = torch.rand(32000) tensor = torch.rand(32000)
......
import weakref
import pytest
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from vllm import LLM
from vllm.lora.request import LoRARequest
from ..conftest import cleanup
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
LORA_NAME = "typeof/zephyr-7b-beta-lora"
pytestmark = pytest.mark.llm
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME,
tensor_parallel_size=1,
max_model_len=8192,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
max_num_seqs=128,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
del llm
cleanup()
@pytest.fixture(scope="session")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.mark.skip_global_cleanup
def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
lora_request = [
LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
...@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, ...@@ -167,9 +167,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
assert completion.id is not None assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1 assert completion.choices is not None and len(completion.choices) == 1
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5 choice = completion.choices[0]
assert completion.choices[0].finish_reason == "length" assert len(choice.text) >= 5
assert choice.finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage( assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11) completion_tokens=5, prompt_tokens=6, total_tokens=11)
...@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, ...@@ -180,8 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI,
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
) )
assert completion.choices[0].text is not None and len( assert len(completion.choices[0].text) >= 5
completion.choices[0].text) >= 5
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI, ...@@ -206,9 +206,9 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# first test base model, then test loras # just test 1 lora hereafter
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora"],
) )
async def test_zero_logprobs(server, client: openai.AsyncOpenAI, async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
model_name: str): model_name: str):
...@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, ...@@ -224,7 +224,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
assert choice.logprobs is not None assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) <= 1 assert len(choice.logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI, ...@@ -246,7 +246,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
assert choice.logprobs is not None assert choice.logprobs is not None
assert choice.logprobs.token_logprobs is not None assert choice.logprobs.token_logprobs is not None
assert choice.logprobs.top_logprobs is not None assert choice.logprobs.top_logprobs is not None
assert len(choice.logprobs.top_logprobs[0]) <= 6 assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, ...@@ -264,7 +264,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
logprobs=6, # vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=21,
) )
... ...
with pytest.raises( with pytest.raises(
...@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, ...@@ -274,7 +276,9 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
prompt=[0, 0, 0, 0, 0], prompt=[0, 0, 0, 0, 0],
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
logprobs=6, # vLLM has higher default max_logprobs (20 instead of 5) to support
# both Completion API and Chat Completion API
logprobs=30,
stream=True, stream=True,
) )
async for chunk in stream: async for chunk in stream:
...@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, ...@@ -287,55 +291,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI,
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
) )
completion = completion.choices[0].text assert len(completion.choices[0].text) >= 0
assert completion is not None and len(completion) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert chat_completion.choices is not None and len(
chat_completion.choices) == 1
assert chat_completion.choices[0].message is not None
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content[
0].top_logprobs is not None
assert len(
chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -390,7 +346,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI,
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.logprobs is not None assert choice.logprobs is not None
assert choice.logprobs.content is not None assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) <= 1 assert len(choice.logprobs.content[0].top_logprobs) == 0
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, ...@@ -418,11 +374,14 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI,
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.logprobs is not None assert choice.logprobs is not None
assert choice.logprobs.content is not None assert choice.logprobs.content is not None
assert len(choice.logprobs.content[0].top_logprobs) <= 6 assert len(choice.logprobs.content[0].top_logprobs) == 5
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize(
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
model_name: str): model_name: str):
messages = [{ messages = [{
...@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -463,7 +422,51 @@ async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter "model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora"], [MODEL_NAME, "zephyr-lora"],
) )
...@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, ...@@ -478,8 +481,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
) )
single_output = single_completion.choices[0].text single_output = single_completion.choices[0].text
single_usage = single_completion.usage
stream = await client.completions.create(model=model_name, stream = await client.completions.create(model=model_name,
prompt=prompt, prompt=prompt,
max_tokens=5, max_tokens=5,
...@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, ...@@ -495,7 +496,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
assert finish_reason_count == 1 assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == "length" assert chunk.choices[0].finish_reason == "length"
assert chunk.choices[0].text assert chunk.choices[0].text
assert chunk.usage == single_usage
assert "".join(chunks) == single_output assert "".join(chunks) == single_output
...@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ...@@ -550,6 +550,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
assert "".join(chunks) == output assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_chat_completion_stream_options(server,
client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "What is the capital of France?"
}]
# Test stream=True, stream_options={"include_usage": False}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True}
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
stream_options={"include_usage": True})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options={"include_usage": None}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options={"include_usage": True}
with pytest.raises(BadRequestError):
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
)
async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is the capital of France?"
# Test stream=True, stream_options={"include_usage": False}
stream = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={"include_usage": False})
async for chunk in stream:
assert chunk.usage is None
# Test stream=True, stream_options={"include_usage": True}
stream = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
stream_options={"include_usage": True})
async for chunk in stream:
if chunk.choices[0].finish_reason is None:
assert chunk.usage is None
else:
assert chunk.usage is None
final_chunk = await stream.__anext__()
assert final_chunk.usage is not None
assert final_chunk.usage.prompt_tokens > 0
assert final_chunk.usage.completion_tokens > 0
assert final_chunk.usage.total_tokens == (
final_chunk.usage.prompt_tokens +
final_chunk.usage.completion_tokens)
assert final_chunk.choices == []
# Test stream=False, stream_options={"include_usage": None}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": None})
# Test stream=False, stream_options={"include_usage": True}
with pytest.raises(BadRequestError):
await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=False,
stream_options={"include_usage": True})
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize(
# just test 1 lora hereafter # just test 1 lora hereafter
...@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): ...@@ -620,8 +752,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
logit_bias={str(token_id): 100}, logit_bias={str(token_id): 100},
seed=42, seed=42,
) )
assert completion.choices[0].text is not None and len( assert len(completion.choices[0].text) >= 5
completion.choices[0].text) >= 5
response_tokens = tokenizer(completion.choices[0].text, response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"] add_special_tokens=False)["input_ids"]
expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
...@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, ...@@ -668,9 +799,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3 assert len(completion.choices) == 3
for i in range(3): for i in range(3):
assert completion.choices[i].text is not None
output_json = json.loads(completion.choices[i].text) output_json = json.loads(completion.choices[i].text)
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
...@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, ...@@ -737,9 +867,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3 assert len(completion.choices) == 3
for i in range(3): for i in range(3):
assert completion.choices[i].text is not None
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
...@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, ...@@ -796,7 +925,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 2 assert len(completion.choices) == 2
for i in range(2): for i in range(2):
assert completion.choices[i].text in TEST_CHOICE assert completion.choices[i].text in TEST_CHOICE
...@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, ...@@ -898,12 +1027,199 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
top_logprobs=5, top_logprobs=5,
extra_body=dict(guided_choice=TEST_CHOICE, extra_body=dict(guided_choice=TEST_CHOICE,
guided_decoding_backend=guided_decoding_backend)) guided_decoding_backend=guided_decoding_backend))
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content is not None
top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs
# -9999.0 is the minimum logprob returned by OpenAI # -9999.0 is the minimum logprob returned by OpenAI
assert all( for item in top_logprobs:
isinstance(token.logprob, float) and token.logprob >= -9999.0 assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})"
for token in top_logprobs)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(server, client: openai.AsyncOpenAI,
guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
# non-streaming
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
})
message = chat_completion.choices[0].message
assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments
json1 = json.loads(json_string)
jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
messages.append({"role": "assistant", "content": json_string})
messages.append({
"role":
"user",
"content":
"Give me another one with a different name and age"
})
# streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
},
stream=True)
output = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
assert delta.content is None or len(delta.content) == 0
if delta.tool_calls:
output.append(delta.tool_calls[0].function.arguments)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
json2 = json.loads("".join(output))
jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice="required")
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice="auto")
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_inconsistent_tool_choice_and_tools(
server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example JSON for an employee profile that "
f"fits this schema: {TEST_SCHEMA}"
}]
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tool_choice={
"type": "function",
"function": {
"name":
"dummy_function_name"
}
})
with pytest.raises(openai.BadRequestError):
await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
tools=[{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": TEST_SCHEMA
}
}],
tool_choice={
"type": "function",
"function": {
"name": "nondefined_function_name"
}
})
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): ...@@ -920,6 +1236,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
response_format={"type": "json_object"}) response_format={"type": "json_object"})
content = resp.choices[0].message.content content = resp.choices[0].message.content
assert content is not None
loaded = json.loads(content) loaded = json.loads(content)
assert loaded == {"result": 2}, loaded assert loaded == {"result": 2}, loaded
...@@ -1032,8 +1350,9 @@ number: "1" | "2" ...@@ -1032,8 +1350,9 @@ number: "1" | "2"
"model_name", "model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"], [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
) )
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
model_name: str): model_name: str, logprobs_arg: int):
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# test using text and token IDs # test using text and token IDs
for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
...@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, ...@@ -1042,12 +1361,11 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
max_tokens=5, max_tokens=5,
temperature=0.0, temperature=0.0,
echo=True, echo=True,
logprobs=1) logprobs=logprobs_arg)
prompt_text = tokenizer.decode(prompt) if isinstance(prompt, prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
list) else prompt list) else prompt
assert (completion.choices[0].text is not None assert re.search(r"^" + prompt_text, completion.choices[0].text)
and re.search(r"^" + prompt_text, completion.choices[0].text))
logprobs = completion.choices[0].logprobs logprobs = completion.choices[0].logprobs
assert logprobs is not None assert logprobs is not None
assert len(logprobs.text_offset) > 5 assert len(logprobs.text_offset) > 5
...@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, ...@@ -1055,6 +1373,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
and logprobs.token_logprobs[0] is None) and logprobs.token_logprobs[0] is None)
assert (len(logprobs.top_logprobs) > 5 assert (len(logprobs.top_logprobs) > 5
and logprobs.top_logprobs[0] is None) and logprobs.top_logprobs[0] is None)
for top_logprobs in logprobs.top_logprobs[1:]:
assert max(logprobs_arg,
1) <= len(top_logprobs) <= logprobs_arg + 1
assert len(logprobs.tokens) > 5 assert len(logprobs.tokens) > 5
...@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): ...@@ -1085,32 +1406,32 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
) )
async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str): model_name: str):
input = [ input_texts = [
"The chef prepared a delicious meal.", "The chef prepared a delicious meal.",
] ]
# test single embedding # test single embedding
embeddings = await client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input, input=input_texts,
encoding_format="float", encoding_format="float",
) )
assert embeddings.id is not None assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9 assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9 assert embeddings.usage.total_tokens == 9
# test using token IDs # test using token IDs
input = [1, 1, 1, 1, 1] input_tokens = [1, 1, 1, 1, 1]
embeddings = await client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input, input=input_tokens,
encoding_format="float", encoding_format="float",
) )
assert embeddings.id is not None assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5 assert embeddings.usage.prompt_tokens == 5
...@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, ...@@ -1125,29 +1446,29 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
model_name: str): model_name: str):
# test List[str] # test List[str]
inputs = [ input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.", "The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky." "Stars twinkle brightly in the night sky."
] ]
embeddings = await client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=inputs, input=input_texts,
encoding_format="float", encoding_format="float",
) )
assert embeddings.id is not None assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 3 assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
# test List[List[int]] # test List[List[int]]
inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]] [25, 32, 64, 77]]
embeddings = await client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=inputs, input=input_tokens,
encoding_format="float", encoding_format="float",
) )
assert embeddings.id is not None assert embeddings.id is not None
assert embeddings.data is not None and len(embeddings.data) == 4 assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17 assert embeddings.usage.prompt_tokens == 17
......
from pathlib import Path
from typing import Dict
import openai
import pytest
import pytest_asyncio
import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
from ..utils import ServerRunner
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
"examples/template_llava.jinja")
assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
pytestmark = pytest.mark.openai
@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--image-input-type",
"pixel_values",
"--image-token-id",
"32000",
"--image-input-shape",
"1,3,336,336",
"--image-feature-size",
"576",
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
@pytest_asyncio.fixture(scope="session")
async def base64_encoded_image() -> Dict[str, str]:
return {
image_url:
encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
with pytest.raises(openai.BadRequestError): # test multi-image input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
if __name__ == "__main__":
pytest.main([__file__])
...@@ -44,7 +44,7 @@ def test_act_and_mul( ...@@ -44,7 +44,7 @@ def test_act_and_mul(
elif activation == "gelu_tanh": elif activation == "gelu_tanh":
layer = GeluAndMul(approximate="tanh") layer = GeluAndMul(approximate="tanh")
out = layer(x) out = layer(x)
ref_out = layer._forward(x) ref_out = layer.forward_native(x)
# The SiLU and GELU implementations are equivalent to the native PyTorch # The SiLU and GELU implementations are equivalent to the native PyTorch
# implementations, so we can do exact comparison. # implementations, so we can do exact comparison.
assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
...@@ -72,7 +72,7 @@ def test_activation( ...@@ -72,7 +72,7 @@ def test_activation(
x = torch.randn(num_tokens, d, dtype=dtype) x = torch.randn(num_tokens, d, dtype=dtype)
layer = activation() layer = activation()
out = layer(x) out = layer(x)
ref_out = layer._forward(x) ref_out = layer.forward_native(x)
assert torch.allclose(out, assert torch.allclose(out,
ref_out, ref_out,
atol=get_default_atol(out), atol=get_default_atol(out),
......
import os
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
import torch import torch
from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL,
override_backend_env_variable)
from vllm.attention.selector import which_attn_to_use from vllm.attention.selector import which_attn_to_use
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
def test_env(name: str, device: str): def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable. """Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend. Note that we do not test FlashAttn because it is the default backend.
""" """
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
os.environ["VLLM_ATTENTION_BACKEND"] = name override_backend_env_variable(monkeypatch, name)
if device == "cpu": if device == "cpu":
with patch("vllm.attention.selector.is_cpu", return_value=True): with patch("vllm.attention.selector.is_cpu", return_value=True):
...@@ -32,14 +33,11 @@ def test_env(name: str, device: str): ...@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
torch.float16, 16) torch.float16, 16)
assert backend.name == name assert backend.name == name
if name_backup is not None:
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
def test_flash_attn(): def test_flash_attn(monkeypatch):
"""Test FlashAttn validation.""" """Test FlashAttn validation."""
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN" override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
# Unsupported CUDA arch # Unsupported CUDA arch
with patch("torch.cuda.get_device_capability", return_value=[7, 5]): with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
...@@ -71,14 +69,9 @@ def test_flash_attn(): ...@@ -71,14 +69,9 @@ def test_flash_attn():
backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16) backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
assert backend.name != "FLASH_ATTN" assert backend.name != "FLASH_ATTN"
if name_backup is not None:
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
def test_invalid_env(): def test_invalid_env(monkeypatch):
"""Throw an exception if the backend name is invalid.""" """Throw an exception if the backend name is invalid."""
name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None) override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
os.environ["VLLM_ATTENTION_BACKEND"] = "INVALID"
with pytest.raises(ValueError): with pytest.raises(ValueError):
which_attn_to_use(8, 16, 8, None, torch.float16, None, 16) which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
...@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int, ...@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0) assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
@pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("m", [512, 222, 100, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 496, 1024]) @pytest.mark.parametrize("k", [128, 496, 1024])
@pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_act_token", [True, False])
...@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module): ...@@ -207,14 +207,21 @@ class CutlassLayer(torch.nn.Module):
self.out_dtype) self.out_dtype)
def test_cutlass_cuda_graph(): @pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
m, n, k = 512, 512, 512 m, n, k = 512, 512, 512
a = to_int8(torch.randn((m, k), device="cuda")) a = to_int8(torch.randn((m, k), device="cuda"))
b = to_int8(torch.randn((n, k), device="cuda").t()) b = to_int8(torch.randn((n, k), device="cuda").t())
scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10) m_a_scales = m if per_act_token else 1
scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10) n_b_scales = n if per_out_ch else 1
scale_a = (torch.randn(
(m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
scale_b = (torch.randn(
(1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
# Construct a trivial model with a single layer that calls a CUTLASS kernel # Construct a trivial model with a single layer that calls a CUTLASS kernel
model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16) model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)
......
import pytest import pytest
import torch import torch
from vllm._C import ops # ruff: noqa: F401
import vllm._C
DTYPES = [torch.half, torch.bfloat16, torch.float] DTYPES = [torch.half, torch.bfloat16, torch.float]
HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192] # Arbitrary values for testing HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
8193] # Arbitrary values for testing
NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
dtype: torch.dtype, seed: int) -> None:
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
x_token_max, _ = x.max(dim=1)
x_token_max = x_token_max.to(dtype=torch.float32)
scales = (x_token_max / float(127.0))[:, None].to(device="cuda",
dtype=torch.float32)
torch_out = (x / scales).round().clamp(int8_traits.min,
int8_traits.max).to(torch.int8)
ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)
assert torch.allclose(scales_out, scales)
assert torch.allclose(torch_out, ops_out,
atol=1) # big atol to account for rounding errors
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("scale", SCALE)
@torch.inference_mode() @torch.inference_mode()
def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
seed: int, scale: float) -> None: dtype: torch.dtype, seed: int,
scale: float) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) torch.cuda.manual_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
out1 = (x / scale).round().clamp( out1 = (x / scale).round().clamp(int8_traits.min,
torch.iinfo(torch.int8).min, int8_traits.max).to(torch.int8)
torch.iinfo(torch.int8).max).to(torch.int8)
out2 = torch.empty_like(x, dtype=torch.int8) out2 = torch.empty_like(x, dtype=torch.int8)
ops.static_scaled_int8_quant(out2, x, scale) scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument)
assert torch.allclose(out1, out2, assert torch.allclose(out1, out2,
atol=1) # big atol to account for rounding errors atol=1) # big atol to account for rounding errors
...@@ -42,7 +42,7 @@ def test_rms_norm( ...@@ -42,7 +42,7 @@ def test_rms_norm(
# NOTE(woosuk): The reference implementation should be executed first # NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place. # because the custom kernel is in-place.
ref_out = layer._forward(x, residual) ref_out = layer.forward_native(x, residual)
out = layer(x, residual) out = layer(x, residual)
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
# numerical errors than other operators because they involve reductions. # numerical errors than other operators because they involve reductions.
......
...@@ -64,7 +64,7 @@ def test_rotary_embedding( ...@@ -64,7 +64,7 @@ def test_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first # NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place. # because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key) ref_query, ref_key = rope.forward_native(positions, query, key)
out_query, out_key = rope.forward(positions, query, key) out_query, out_key = rope.forward(positions, query, key)
# Compare the results. # Compare the results.
assert torch.allclose(out_query, assert torch.allclose(out_query,
...@@ -121,7 +121,7 @@ def test_batched_rotary_embedding( ...@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
# NOTE(woosuk): The reference implementation should be executed first # NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place. # because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key) ref_query, ref_key = rope.forward_native(positions, query, key)
out_query, out_key = rope.forward(positions, out_query, out_key = rope.forward(positions,
query, query,
key, key,
...@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora( ...@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
# NOTE(woosuk): The reference implementation should be executed first # NOTE(woosuk): The reference implementation should be executed first
# because the custom kernel is in-place. # because the custom kernel is in-place.
ref_query, ref_key = rope._forward(positions, query, key, query_offsets) ref_query, ref_key = rope.forward_native(positions, query, key,
query_offsets)
out_query, out_key = rope.forward(positions, query, key, out_query, out_key = rope.forward(positions, query, key,
query_offsets.flatten()) query_offsets.flatten())
# Compare the results. # Compare the results.
......
"""Kernel test utils"""
import pytest
STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"
def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
backend_name: str) -> None:
'''
Override the environment variable indicating the vLLM backend temporarily,
using pytest monkeypatch to ensure that the env vars get
reset once the test context exits.
Arguments:
* mpatch: pytest monkeypatch instance
* backend_name: attention backend name to force
'''
mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)
...@@ -42,10 +42,24 @@ def cleanup(): ...@@ -42,10 +42,24 @@ def cleanup():
ray.shutdown() ray.shutdown()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
This can provide a ~10x speedup for non-GPU unit tests since they don't need
to initialize torch.
"""
if request.node.get_closest_marker("skip_global_cleanup"):
return False
return True
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def cleanup_fixture(): def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield yield
cleanup() if should_do_global_cleanup_after_test:
cleanup()
@pytest.fixture @pytest.fixture
......
...@@ -2,6 +2,7 @@ import random ...@@ -2,6 +2,7 @@ import random
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
from unittest.mock import patch
import pytest import pytest
import torch import torch
...@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -32,7 +33,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from .utils import DummyLoRAManager from .utils import DummyLoRAManager
...@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, ...@@ -427,7 +428,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
logits_processor = LogitsProcessor( logits_processor = LogitsProcessor(
vocab_size + lora_config.lora_extra_vocab_size, vocab_size) vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
lora_logits_processor = LogitsProcessorWithLoRA( lora_logits_processor = LogitsProcessorWithLoRA(
logits_processor, 1024, linear.weight.dtype, linear.weight.device) logits_processor, 1024, linear.weight.dtype, linear.weight.device,
None)
lora_logits_processor.create_lora_weights(max_loras, lora_config) lora_logits_processor.create_lora_weights(max_loras, lora_config)
return linear, logits_processor, lora_logits_processor return linear, logits_processor, lora_logits_processor
...@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, ...@@ -867,3 +869,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
torch.allclose(ref_q, actual_q) torch.allclose(ref_q, actual_q)
torch.allclose(ref_k, actual_k) torch.allclose(ref_k, actual_k)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize("seed", list(range(256)))
def test_vocab_parallel_embedding_indices(tp_size, seed):
random.seed(seed)
vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024)
org_vocab_size = vocab_size - added_vocab_size
last_org_vocab_end_index = 0
last_added_vocab_end_index = org_vocab_size
computed_vocab_size = 0
computed_org_vocab_size = 0
computed_added_vocab_size = 0
vocab_size_padded = -1
all_org_tokens = []
all_added_tokens = []
token_ids = []
for tp_rank in range(tp_size):
with patch(
"vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
return_value=tp_rank
), patch(
"vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
return_value=tp_size):
vocab_embedding = VocabParallelEmbedding(
vocab_size, 1, org_num_embeddings=org_vocab_size)
vocab_size_padded = vocab_embedding.num_embeddings_padded
shard_indices = vocab_embedding.shard_indices
# Assert that the ranges are contiguous
assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
assert (shard_indices.added_vocab_start_index ==
last_added_vocab_end_index)
# Ensure that we are not exceeding the vocab size
computed_vocab_size += shard_indices.num_elements_padded
computed_org_vocab_size += shard_indices.num_org_elements
computed_added_vocab_size += shard_indices.num_added_elements
# Ensure that the ranges are not overlapping
all_org_tokens.extend(
range(shard_indices.org_vocab_start_index,
shard_indices.org_vocab_end_index))
all_added_tokens.extend(
range(shard_indices.added_vocab_start_index,
shard_indices.added_vocab_end_index))
token_ids.extend(
range(shard_indices.org_vocab_start_index,
shard_indices.org_vocab_end_index))
token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
shard_indices.num_org_elements))
token_ids.extend(
range(shard_indices.added_vocab_start_index,
shard_indices.added_vocab_end_index))
token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
shard_indices.num_added_elements))
last_org_vocab_end_index = shard_indices.org_vocab_end_index
last_added_vocab_end_index = shard_indices.added_vocab_end_index
assert computed_vocab_size == vocab_size_padded
assert computed_org_vocab_size == org_vocab_size
assert computed_added_vocab_size == added_vocab_size
# Ensure that the ranges are not overlapping
assert len(all_org_tokens) == len(set(all_org_tokens))
assert len(all_added_tokens) == len(set(all_added_tokens))
assert not set(all_org_tokens).intersection(set(all_added_tokens))
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
assert reindex_mapping is not None or tp_size == 1
if reindex_mapping is not None:
reindexed_token_ids = token_ids_tensor[reindex_mapping]
expected = torch.tensor(list(range(0, vocab_size)))
assert reindexed_token_ids[:vocab_size].equal(expected)
assert torch.all(reindexed_token_ids[vocab_size:] == -1)
def test_get_masked_input_and_mask():
x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
# base tp 1 case, no padding
modified_x, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=8,
added_vocab_start_index=8,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(x, modified_x)
# tp 2 case, no padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=4,
added_vocab_start_index=8,
added_vocab_end_index=10,
num_org_vocab_padding=0)
modified_x_rank_1, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=8,
added_vocab_start_index=10,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))
# tp 4 case, no padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=2,
added_vocab_start_index=8,
added_vocab_end_index=9,
num_org_vocab_padding=0)
modified_x_rank_1, _ = get_masked_input_and_mask(x,
org_vocab_start_index=2,
org_vocab_end_index=4,
added_vocab_start_index=9,
added_vocab_end_index=10,
num_org_vocab_padding=0)
modified_x_rank_2, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=6,
added_vocab_start_index=10,
added_vocab_end_index=11,
num_org_vocab_padding=0)
modified_x_rank_3, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=6,
org_vocab_end_index=8,
added_vocab_start_index=11,
added_vocab_end_index=12,
num_org_vocab_padding=0)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
assert torch.equal(modified_x_rank_2,
torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
assert torch.equal(modified_x_rank_3,
torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))
# base tp 1 case, with padding
modified_x, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=8,
added_vocab_start_index=8,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x,
torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))
# tp 2 case, with padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=4,
added_vocab_start_index=8,
added_vocab_end_index=10,
num_org_vocab_padding=2)
modified_x_rank_1, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=8,
added_vocab_start_index=10,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))
# tp 4 case, with padding
modified_x_rank_0, _ = get_masked_input_and_mask(x,
org_vocab_start_index=0,
org_vocab_end_index=2,
added_vocab_start_index=8,
added_vocab_end_index=9,
num_org_vocab_padding=2)
modified_x_rank_1, _ = get_masked_input_and_mask(x,
org_vocab_start_index=2,
org_vocab_end_index=4,
added_vocab_start_index=9,
added_vocab_end_index=10,
num_org_vocab_padding=2)
modified_x_rank_2, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=4,
org_vocab_end_index=6,
added_vocab_start_index=10,
added_vocab_end_index=11,
num_org_vocab_padding=2)
modified_x_rank_3, _ = get_masked_input_and_mask(
x,
org_vocab_start_index=6,
org_vocab_end_index=8,
added_vocab_start_index=11,
added_vocab_end_index=12,
num_org_vocab_padding=2)
assert torch.equal(modified_x_rank_0,
torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
assert torch.equal(modified_x_rank_1,
torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
assert torch.equal(modified_x_rank_2,
torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
assert torch.equal(modified_x_rank_3,
torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
...@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int): ...@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
return generated_texts return generated_texts
@pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("tp_size", [1, 2, 4])
def test_llama_lora(sql_lora_files, tp_size): def test_llama_lora(sql_lora_files, tp_size, num_gpus_available):
# Cannot use as it will initialize torch.cuda too early... if num_gpus_available < tp_size:
# if torch.cuda.device_count() < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
enable_lora=True, enable_lora=True,
...@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size): ...@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
print("removing lora") print("removing lora")
@pytest.mark.skip("Requires multiple GPUs") def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
def test_llama_tensor_parallel_equality(sql_lora_files): if num_gpus_available < 4:
# Cannot use as it will initialize torch.cuda too early... pytest.skip("Not enough GPUs for tensor parallelism 4")
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
llm_tp1 = vllm.LLM(MODEL_PATH, llm_tp1 = vllm.LLM(MODEL_PATH,
enable_lora=True, enable_lora=True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment