Commit 1591c68f authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.2

parents 09bcf00b c7f2cf2b
import asyncio
from dataclasses import dataclass
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
# Model id used both as the mock config's tokenizer and as the served name.
MODEL_NAME = "openai-community/gpt2"
# Placeholder template; the test only checks that it ends up on the tokenizer.
CHAT_TEMPLATE = "Dummy chat template for testing {}"
@dataclass
class MockModelConfig:
    """Minimal stand-in for a model config, used by the serving-chat test.

    The attributes carry no annotations, so ``@dataclass`` generates no
    fields; the values are plain class attributes and can be read directly
    from the class object.
    """
    # Tokenizer-related settings.
    tokenizer = MODEL_NAME
    tokenizer_mode = "auto"
    tokenizer_revision = None
    trust_remote_code = False
    # Remaining settings.
    max_model_len = 100
@dataclass
class MockEngine:
    """Fake engine exposing only the async ``get_model_config`` method."""

    async def get_model_config(self):
        """Return the ``MockModelConfig`` class itself (not an instance)."""
        model_config = MockModelConfig
        return model_config
async def _async_serving_chat_init():
    """Build an ``OpenAIServingChat`` on top of the mock engine.

    Declared as a coroutine so the caller can drive the construction
    through ``asyncio.run``.
    """
    engine = MockEngine()
    serving_chat = OpenAIServingChat(
        engine,
        served_model_names=[MODEL_NAME],
        response_role="assistant",
        chat_template=CHAT_TEMPLATE,
    )
    return serving_chat
def test_async_serving_chat_init():
    """The serving object must end up with a tokenizer using our template."""
    serving_chat = asyncio.run(_async_serving_chat_init())
    tokenizer = serving_chat.tokenizer
    assert tokenizer is not None
    assert tokenizer.chat_template == CHAT_TEMPLATE
...@@ -57,7 +57,9 @@ def test_guided_logits_processors(): ...@@ -57,7 +57,9 @@ def test_guided_logits_processors():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer) json_LP = JSONLogitsProcessor(TEST_SCHEMA,
tokenizer,
whitespace_pattern=None)
regex_LP.init_state() regex_LP.init_state()
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
......
...@@ -13,8 +13,10 @@ import pytest ...@@ -13,8 +13,10 @@ import pytest
# and debugging. # and debugging.
import ray import ray
import requests import requests
import torch
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from openai import BadRequestError
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
...@@ -148,7 +150,7 @@ def server(zephyr_lora_files): ...@@ -148,7 +150,7 @@ def server(zephyr_lora_files):
ray.shutdown() ray.shutdown()
@pytest.fixture(scope="session") @pytest.fixture(scope="module")
def client(): def client():
client = openai.AsyncOpenAI( client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1", base_url="http://localhost:8000/v1",
...@@ -770,6 +772,40 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): ...@@ -770,6 +772,40 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
assert loaded == {"result": 2}, loaded assert loaded == {"result": 2}, loaded
async def test_extra_fields(server, client: openai.AsyncOpenAI):
    """A chat message with an unknown key must be rejected as a bad request."""
    message_with_extra_key = {
        "role": "system",
        "content": "You are a helpful assistant.",
        "extra_field": "0",
    }
    with pytest.raises(BadRequestError) as exc_info:
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[message_with_extra_key],  # type: ignore
            temperature=0,
            seed=0)

    # The error text is expected to name the validation failure.
    error_text = exc_info.value.message
    assert "extra_forbidden" in error_text
async def test_complex_message_content(server, client: openai.AsyncOpenAI):
    """Message content given as a list of typed parts must be accepted."""
    question = ("what is 1+1? "
                "please provide the result without any other text.")
    user_message = {
        "role": "user",
        "content": [{
            "type": "text",
            "text": question,
        }],
    }
    resp = await client.chat.completions.create(model=MODEL_NAME,
                                                messages=[user_message],
                                                temperature=0,
                                                seed=0)
    answer = resp.choices[0].message.content
    assert answer == "2"
async def test_guided_grammar(server, client: openai.AsyncOpenAI): async def test_guided_grammar(server, client: openai.AsyncOpenAI):
simple_sql_grammar = """ simple_sql_grammar = """
start: select_statement start: select_statement
...@@ -835,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, ...@@ -835,5 +871,24 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
assert len(logprobs.tokens) > 5 assert len(logprobs.tokens) > 5
async def test_long_seed(server, client: openai.AsyncOpenAI):
    """Seeds one step outside the ``torch.long`` range must be rejected."""
    long_info = torch.iinfo(torch.long)
    out_of_range_seeds = [long_info.min - 1, long_info.max + 1]
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant.",
    }
    for seed in out_of_range_seeds:
        with pytest.raises(BadRequestError) as exc_info:
            await client.chat.completions.create(model=MODEL_NAME,
                                                 messages=[system_message],
                                                 temperature=0,
                                                 seed=seed)

        # Either bound violation is acceptable, depending on the seed's sign.
        error_text = exc_info.value.message
        assert ("greater_than_equal" in error_text
                or "less_than_equal" in error_text)
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__]) pytest.main([__file__])
import pytest import pytest
from vllm.utils import create_kv_caches_with_random from vllm.utils import (create_kv_caches_with_random,
create_kv_caches_with_random_flash)
@pytest.fixture() @pytest.fixture()
def kv_cache_factory(): def kv_cache_factory():
return create_kv_caches_with_random return create_kv_caches_with_random
@pytest.fixture()
def kv_cache_factory_flashinfer():
    """Expose ``create_kv_caches_with_random_flash`` to tests as a fixture."""
    return create_kv_caches_with_random_flash
...@@ -66,7 +66,7 @@ def ref_single_query_cached_kv_attention( ...@@ -66,7 +66,7 @@ def ref_single_query_cached_kv_attention(
key_cache: torch.Tensor, key_cache: torch.Tensor,
value_cache: torch.Tensor, value_cache: torch.Tensor,
block_tables: torch.Tensor, block_tables: torch.Tensor,
context_lens: torch.Tensor, seq_lens: torch.Tensor,
scale: float, scale: float,
alibi_slopes: Optional[torch.Tensor], alibi_slopes: Optional[torch.Tensor],
) -> None: ) -> None:
...@@ -77,15 +77,15 @@ def ref_single_query_cached_kv_attention( ...@@ -77,15 +77,15 @@ def ref_single_query_cached_kv_attention(
num_seqs = query.shape[0] num_seqs = query.shape[0]
block_tables = block_tables.cpu().tolist() block_tables = block_tables.cpu().tolist()
context_lens = context_lens.cpu().tolist() seq_lens = seq_lens.cpu().tolist()
for i in range(num_seqs): for i in range(num_seqs):
q = query[i].unsqueeze(0) q = query[i].unsqueeze(0)
block_table = block_tables[i] block_table = block_tables[i]
context_len = int(context_lens[i]) seq_len = int(seq_lens[i])
keys = [] keys = []
values = [] values = []
for j in range(context_len): for j in range(seq_len):
block_number = int(block_table[j // block_size]) block_number = int(block_table[j // block_size])
block_offset = j % block_size block_offset = j % block_size
...@@ -105,8 +105,8 @@ def ref_single_query_cached_kv_attention( ...@@ -105,8 +105,8 @@ def ref_single_query_cached_kv_attention(
alibi_bias = None alibi_bias = None
if alibi_slopes is not None: if alibi_slopes is not None:
# Create the ALiBi bias used in the paged attention kernel. # Create the ALiBi bias used in the paged attention kernel.
position_ids = torch.arange(context_len).int() position_ids = torch.arange(seq_len).int()
alibi_bias = (position_ids - context_len + 1).float() alibi_bias = (position_ids - seq_len + 1).float()
alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
1, 1, -1) 1, 1, -1)
...@@ -154,13 +154,13 @@ def test_paged_attention( ...@@ -154,13 +154,13 @@ def test_paged_attention(
if use_alibi: if use_alibi:
alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
context_lens[-1] = MAX_SEQ_LEN seq_lens[-1] = MAX_SEQ_LEN
max_context_len = max(context_lens) max_seq_len = max(seq_lens)
context_lens = torch.tensor(context_lens, dtype=torch.int) seq_lens = torch.tensor(seq_lens, dtype=torch.int)
# Create the block tables. # Create the block tables.
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = [] block_tables = []
for _ in range(num_seqs): for _ in range(num_seqs):
block_table = [ block_table = [
...@@ -191,16 +191,15 @@ def test_paged_attention( ...@@ -191,16 +191,15 @@ def test_paged_attention(
num_kv_heads, num_kv_heads,
scale, scale,
block_tables, block_tables,
context_lens, seq_lens,
block_size, block_size,
max_context_len, max_seq_len,
alibi_slopes, alibi_slopes,
kv_cache_dtype, kv_cache_dtype,
kv_scale, kv_scale,
) )
elif version == "v2": elif version == "v2":
num_partitions = ((max_context_len + PARTITION_SIZE - 1) // num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
PARTITION_SIZE)
assert PARTITION_SIZE % block_size == 0 assert PARTITION_SIZE % block_size == 0
num_seqs, num_heads, head_size = output.shape num_seqs, num_heads, head_size = output.shape
tmp_output = torch.empty( tmp_output = torch.empty(
...@@ -223,9 +222,9 @@ def test_paged_attention( ...@@ -223,9 +222,9 @@ def test_paged_attention(
num_kv_heads, num_kv_heads,
scale, scale,
block_tables, block_tables,
context_lens, seq_lens,
block_size, block_size,
max_context_len, max_seq_len,
alibi_slopes, alibi_slopes,
kv_cache_dtype, kv_cache_dtype,
kv_scale, kv_scale,
...@@ -260,7 +259,7 @@ def test_paged_attention( ...@@ -260,7 +259,7 @@ def test_paged_attention(
key_cache, key_cache,
value_cache, value_cache,
block_tables, block_tables,
context_lens, seq_lens,
scale, scale,
alibi_slopes, alibi_slopes,
) )
......
...@@ -5,6 +5,7 @@ import pytest ...@@ -5,6 +5,7 @@ import pytest
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm._C import cache_ops
from vllm.utils import is_hip from vllm.utils import is_hip
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
...@@ -195,6 +196,82 @@ def test_reshape_and_cache( ...@@ -195,6 +196,82 @@ def test_reshape_and_cache(
assert torch.allclose(value_cache, cloned_value_cache) assert torch.allclose(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_reshape_and_cache_flash(
    kv_cache_factory_flashinfer,
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
) -> None:
    """Compare ``reshape_and_cache_flash`` against a Python scatter loop.

    Random keys/values are written into the caches by the kernel and, into
    clones of the same caches, by a per-token reference loop; both results
    must match. The ``fp8`` cache dtype is skipped.

    All tensors are created on the parametrized ``device`` so the kernel
    never receives inputs that live on different GPUs.
    """
    if kv_cache_dtype == "fp8":
        pytest.skip()
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Create a random slot mapping (distinct slots, one per token).
    num_slots = block_size * num_blocks
    slot_mapping = random.sample(range(num_slots), num_tokens)
    # Use the parametrized device rather than a hard-coded 'cuda' so the
    # mapping is co-located with the key/value tensors below.
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device)

    qkv = torch.randn(num_tokens,
                      3,
                      num_heads,
                      head_size,
                      dtype=dtype,
                      device=device)
    _, key, value = qkv.unbind(dim=1)

    # Create the KV caches on the same device as the inputs.
    # NOTE(review): relies on the factory accepting a ``device`` keyword.
    key_caches, value_caches = kv_cache_factory_flashinfer(
        num_blocks,
        block_size,
        1,
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Clone the KV caches; the clones receive the reference result.
    cloned_key_cache = key_cache.clone()
    cloned_value_cache = value_cache.clone()

    # Call the reshape_and_cache_flash kernel.
    cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
                                      slot_mapping, kv_cache_dtype)

    # Run the reference implementation: slot -> (block index, block offset).
    block_indices = torch.div(slot_mapping, block_size, rounding_mode='floor')
    block_indices = block_indices.cpu().tolist()
    block_offsets = slot_mapping % block_size
    block_offsets = block_offsets.cpu().tolist()
    for i in range(num_tokens):
        block_idx = block_indices[i]
        block_offset = block_offsets[i]
        cloned_key_cache[block_idx, block_offset, :, :] = key[i]
        cloned_value_cache[block_idx, block_offset, :, :] = value[i]

    assert torch.allclose(key_cache, cloned_key_cache)
    assert torch.allclose(value_cache, cloned_value_cache)
@pytest.mark.parametrize("direction", COPYING_DIRECTION) @pytest.mark.parametrize("direction", COPYING_DIRECTION)
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
......
...@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype): ...@@ -77,8 +77,8 @@ def test_mixtral_moe(dtype: torch.dtype):
for i in range(config.num_local_experts): for i in range(config.num_local_experts):
weights = (hf_moe.experts[i].w1.weight.data, weights = (hf_moe.experts[i].w1.weight.data,
hf_moe.experts[i].w3.weight.data) hf_moe.experts[i].w3.weight.data)
vllm_moe.ws[i][:] = torch.cat(weights, dim=0) vllm_moe.w13_weight[i][:] = torch.cat(weights, dim=0)
vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data vllm_moe.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim] # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda") hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
......
...@@ -15,6 +15,7 @@ DTYPES = [torch.float16] ...@@ -15,6 +15,7 @@ DTYPES = [torch.float16]
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ]
SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
...@@ -22,11 +23,13 @@ CUDA_DEVICES = [ ...@@ -22,11 +23,13 @@ CUDA_DEVICES = [
@pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW)
@torch.inference_mode() @torch.inference_mode()
def test_contexted_kv_attention( def test_contexted_kv_attention(
num_heads: int, num_heads: int,
num_queries_per_kv: int, num_queries_per_kv: int,
head_size: int, head_size: int,
sliding_window: int,
dtype: torch.dtype, dtype: torch.dtype,
device: str, device: str,
) -> None: ) -> None:
...@@ -48,12 +51,12 @@ def test_contexted_kv_attention( ...@@ -48,12 +51,12 @@ def test_contexted_kv_attention(
cache_size = 640 cache_size = 640
block_size = 32 block_size = 32
max_block_per_request = 64 max_block_per_request = 64
subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)] seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
num_kv_heads = num_heads // num_queries_per_kv num_kv_heads = num_heads // num_queries_per_kv
num_tokens = sum(subquery_lens) num_tokens = sum(query_lens)
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
query.uniform_(-1e-3, 1e-3) query.uniform_(-1e-3, 1e-3)
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
...@@ -72,15 +75,15 @@ def test_contexted_kv_attention( ...@@ -72,15 +75,15 @@ def test_contexted_kv_attention(
num_kv_heads, num_kv_heads,
head_size, head_size,
dtype=dtype) dtype=dtype)
k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long) values = torch.arange(0, cache_size, dtype=torch.long)
values = values[torch.randperm(cache_size)] values = values[torch.randperm(cache_size)]
block_table = values[:BS * max_block_per_request].view( block_table = values[:BS * max_block_per_request].view(
BS, max_block_per_request) BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.long) b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1], b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
dtype=torch.long), dtype=torch.long),
dim=0) dim=0)
max_input_len = MAX_SEQ_LEN max_input_len = MAX_SEQ_LEN
...@@ -89,7 +92,7 @@ def test_contexted_kv_attention( ...@@ -89,7 +92,7 @@ def test_contexted_kv_attention(
dtype=torch.long), dtype=torch.long),
dim=0) dim=0)
for i in range(BS): for i in range(BS):
for j in range(subquery_lens[i]): for j in range(query_lens[i]):
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
j]) j])
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
...@@ -123,12 +126,32 @@ def test_contexted_kv_attention( ...@@ -123,12 +126,32 @@ def test_contexted_kv_attention(
# Warm up the Triton kernel by calling it once before actually measuring # Warm up the Triton kernel by calling it once before actually measuring
# generation time # generation time
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, context_attention_fwd(query,
b_start_loc, b_seq_len, b_ctx_len, max_input_len) k,
v,
output,
k_cache,
v_cache,
block_table,
b_start_loc,
b_seq_len,
b_ctx_len,
max_input_len,
sliding_window=sliding_window)
torch.cuda.synchronize() torch.cuda.synchronize()
start_time = time.time() start_time = time.time()
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, context_attention_fwd(query,
b_start_loc, b_seq_len, b_ctx_len, max_input_len) k,
v,
output,
k_cache,
v_cache,
block_table,
b_start_loc,
b_seq_len,
b_ctx_len,
max_input_len,
sliding_window=sliding_window)
torch.cuda.synchronize() torch.cuda.synchronize()
end_time = time.time() end_time = time.time()
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms") print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
...@@ -155,7 +178,10 @@ def test_contexted_kv_attention( ...@@ -155,7 +178,10 @@ def test_contexted_kv_attention(
value = value.unsqueeze(0) value = value.unsqueeze(0)
attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
subquery_lens, seq_lens) query_lens, seq_lens)
if sliding_window > 0:
attn_bias = attn_bias.make_local_attention_from_bottomright(
sliding_window)
output_ref = xops.memory_efficient_attention_forward( output_ref = xops.memory_efficient_attention_forward(
query, query,
key, key,
......
...@@ -8,6 +8,10 @@ import torch ...@@ -8,6 +8,10 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
from vllm.lora.fully_sharded_layers import (
ColumnParallelLinearWithShardedLoRA,
MergedColumnParallelLinearWithShardedLoRA,
MergedQKVParallelLinearWithShardedLora, RowParallelLinearWithShardedLoRA)
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
# yapf: disable # yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
...@@ -524,13 +528,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, ...@@ -524,13 +528,16 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
@torch.inference_mode() @torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_linear_parallel(dist_init, num_loras, orientation, device) -> None: def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
device) -> None:
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras, lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8, max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16) lora_dtype=torch.float16)
def create_random_linear_parallel_layer(): def create_random_linear_parallel_layer():
...@@ -540,14 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None: ...@@ -540,14 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
bias=False, bias=False,
params_dtype=torch.float16) params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = RowParallelLinearWithLoRA(linear) lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard
else RowParallelLinearWithShardedLoRA(linear))
else: else:
linear = ColumnParallelLinear(4096, linear = ColumnParallelLinear(4096,
4096, 4096,
bias=False, bias=False,
params_dtype=torch.float16) params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = ColumnParallelLinearWithLoRA(linear) lora_linear = (ColumnParallelLinearWithLoRA(linear)
if not fully_shard else
ColumnParallelLinearWithShardedLoRA(linear))
lora_linear.create_lora_weights(max_loras, lora_config) lora_linear.create_lora_weights(max_loras, lora_config)
return linear, lora_linear return linear, lora_linear
...@@ -629,13 +639,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None: ...@@ -629,13 +639,16 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
@torch.inference_mode() @torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None: def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
device) -> None:
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras, lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8, max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16) lora_dtype=torch.float16)
def create_column_parallel_packed_layer(): def create_column_parallel_packed_layer():
...@@ -644,7 +657,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None: ...@@ -644,7 +657,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
bias=False, bias=False,
params_dtype=torch.float16) params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedColumnParallelLinearWithLoRA(linear) lora_linear = (MergedColumnParallelLinearWithLoRA(linear)
if not fully_shard else
MergedColumnParallelLinearWithShardedLoRA(linear))
elif repeats == 3: elif repeats == 3:
linear = QKVParallelLinear(4096, linear = QKVParallelLinear(4096,
64, 64,
...@@ -652,7 +667,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None: ...@@ -652,7 +667,9 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
bias=False, bias=False,
params_dtype=torch.float16) params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedQKVParallelLinearWithLora(linear) lora_linear = (MergedQKVParallelLinearWithLora(linear)
if not fully_shard else
MergedQKVParallelLinearWithShardedLora(linear))
else: else:
linear = QKVParallelLinear(4096, linear = QKVParallelLinear(4096,
64, 64,
......
...@@ -34,11 +34,14 @@ def _lora_ref_impl( ...@@ -34,11 +34,14 @@ def _lora_ref_impl(
for i, lora_idx in zip(range(bs), indicies.cpu().tolist()): for i, lora_idx in zip(range(bs), indicies.cpu().tolist()):
xi = x[i].unsqueeze(0).to(torch.float32) xi = x[i].unsqueeze(0).to(torch.float32)
wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
wb = wb_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32) if wb_T_all is not None:
wb = wb_T_all[lora_idx, layer_idx].transpose(-1,
-2).to(torch.float32)
tmp = xi @ wa tmp = xi @ wa
y_stage_1[i] = tmp.squeeze(0) y_stage_1[i] = tmp.squeeze(0)
y_final[i] += (tmp @ wb).squeeze(0) * s y_final[i] += ((tmp @ wb).squeeze(0) *
s if wb_T_all is not None else y_stage_1[i])
return y_final, y_stage_1 return y_final, y_stage_1
...@@ -91,12 +94,56 @@ H1 = H2 = [ ...@@ -91,12 +94,56 @@ H1 = H2 = [
128000, 128000,
128256, 128256,
] ]
H2 = [64] + H2
R = [1, 2, 4]
SEED = [0xabcdabcd987] SEED = [0xabcdabcd987]
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ]
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("r", R)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_a_extra_shapes(dtype_str, h1, r, seed):
    """Check ``punica.bgmv`` against the reference when only ``wa`` is given.

    The reference is called with ``None`` for the second weight stack, and
    the output width equals the rank ``r``.
    """
    torch.manual_seed(seed)

    lora_count, layer_count, batch = 4, 1, 32
    dtype = getattr(torch, dtype_str)
    device = torch.device("cuda")

    wa_T_all = torch.randn(lora_count,
                           layer_count,
                           r,
                           h1,
                           dtype=dtype,
                           device=device)
    indices = torch.randint(lora_count, (batch, ),
                            dtype=torch.long,
                            device=device)

    for layer_idx in range(layer_count):
        x = torch.randn(batch, h1, dtype=dtype, device=device)
        y = torch.randn(batch, r, dtype=dtype, device=device)

        # Both implementations start from identical copies of ``y``.
        expected = y.clone()
        _lora_ref_impl(expected, x, wa_T_all, None, indices, layer_idx, 1.0)

        actual = y.clone()
        punica.bgmv(actual, x, wa_T_all, indices, layer_idx, 1.0)

        assert_close(expected, actual)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2) @pytest.mark.parametrize("h2", H2)
......
from typing import List
import pytest import pytest
from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
...@@ -68,3 +76,119 @@ def test_metric_counter_generation_tokens( ...@@ -68,3 +76,119 @@ def test_metric_counter_generation_tokens(
assert vllm_generation_count == metric_count, ( assert vllm_generation_count == metric_count, (
f"generation token count: {vllm_generation_count!r}\n" f"generation token count: {vllm_generation_count!r}\n"
f"metric: {metric_count!r}") f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                   served_model_name: List[str]) -> None:
    """The ``model_name`` metrics label must follow the served model name.

    With no served names (``None`` or an empty list) the label is the model
    itself; otherwise it is the first served name.
    """
    vllm_model = vllm_runner(model,
                             dtype=dtype,
                             disable_log_stats=False,
                             gpu_memory_utilization=0.3,
                             served_model_name=served_model_name)
    stat_logger = vllm_model.model.llm_engine.stat_logger
    metrics_tag_content = stat_logger.labels["model_name"]

    del vllm_model

    # ``None`` and ``[]`` are both falsy and share the same expectation.
    expected_tag = served_model_name[0] if served_model_name else model
    assert metrics_tag_content == expected_tag, (
        f"Metrics tag model_name is wrong! expect: {expected_tag!r}\n"
        f"actual: {metrics_tag_content!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
@pytest.mark.asyncio
async def test_async_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
    """
    Regression test: the async engine must produce metrics whenever
    disable_log_stats=False
    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
    """
    async_engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model=model,
                        dtype=dtype,
                        disable_log_stats=disable_log_stats))

    for request_index, prompt in enumerate(example_prompts):
        stream = async_engine.generate(
            prompt,
            SamplingParams(max_tokens=max_tokens),
            f"request-id-{request_index}",
        )
        # Drain the async iterator; the engine only does work while the
        # stream is being consumed.
        async for _ in stream:
            pass

    prompt_count = len(example_prompts)
    assert_metrics(async_engine.engine, disable_log_stats, prompt_count)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("disable_log_stats", [True, False])
def test_engine_log_metrics_regression(
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    disable_log_stats: bool,
) -> None:
    """Synchronous-engine counterpart of the metrics regression check.

    Queues every example prompt, steps the engine until nothing is left
    unfinished, then validates the metrics via ``assert_metrics``.
    """
    engine = LLMEngine.from_engine_args(
        EngineArgs(model=model,
                   dtype=dtype,
                   disable_log_stats=disable_log_stats))

    for request_index, prompt in enumerate(example_prompts):
        engine.add_request(
            f"request-id-{request_index}",
            prompt,
            SamplingParams(max_tokens=max_tokens),
        )

    # Step until every queued request has finished.
    while engine.has_unfinished_requests():
        engine.step()

    prompt_count = len(example_prompts)
    assert_metrics(engine, disable_log_stats, prompt_count)
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
    """Validate the engine's metrics state after serving ``num_requests``.

    With stats disabled, accessing ``engine.stat_logger`` must raise
    ``AttributeError``. Otherwise the logger must be set and the ``_count``
    sample of every request-level histogram must equal ``num_requests``.
    """
    if disable_log_stats:
        with pytest.raises(AttributeError):
            _ = engine.stat_logger
        return

    assert (engine.stat_logger
            is not None), "engine.stat_logger should be set"

    # Comparing each histogram's count bucket with the number of requests
    # is a simple sanity check that metrics were actually generated.
    labels = {'model_name': engine.model_config.model}
    histogram_names = (
        "vllm:e2e_request_latency_seconds",
        "vllm:request_prompt_tokens",
        "vllm:request_generation_tokens",
        "vllm:request_params_best_of",
        "vllm:request_params_n",
    )
    for metric_name in histogram_names:
        sample_name = f"{metric_name}_count"
        metric_value = REGISTRY.get_sample_value(sample_name, labels)
        assert metric_value == num_requests, "Metrics should be collected"
import os import os
import tempfile
import huggingface_hub.constants import huggingface_hub.constants
import pytest import pytest
from huggingface_hub.utils import LocalEntryNotFoundError
from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, enable_hf_transfer)
def test_hf_transfer_auto_activation(): def test_hf_transfer_auto_activation():
...@@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation(): ...@@ -22,5 +25,30 @@ def test_hf_transfer_auto_activation():
HF_TRANFER_ACTIVE) HF_TRANFER_ACTIVE)
def test_download_weights_from_hf():
    """Check ``download_weights_from_hf`` with the hub offline and online.

    A fresh temporary directory is used as the cache, so the first offline
    attempt cannot find the model. The model is then downloaded with offline
    mode off, and a final offline call must succeed from that cache.
    """
    # HF_HUB_OFFLINE is a process-wide module attribute. Remember its value so
    # that toggling it here cannot leak offline mode into tests that run
    # later in the same process, even if an assertion below fails.
    saved_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # assert LocalEntryNotFoundError error is thrown
            # if offline is set and model is not cached
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            with pytest.raises(LocalEntryNotFoundError):
                download_weights_from_hf(
                    "facebook/opt-125m",
                    allow_patterns=["*.safetensors", "*.bin"],
                    cache_dir=tmpdir)

            # download the model
            huggingface_hub.constants.HF_HUB_OFFLINE = False
            download_weights_from_hf(
                "facebook/opt-125m",
                allow_patterns=["*.safetensors", "*.bin"],
                cache_dir=tmpdir)

            # now it should work offline
            huggingface_hub.constants.HF_HUB_OFFLINE = True
            assert download_weights_from_hf(
                "facebook/opt-125m",
                allow_patterns=["*.safetensors", "*.bin"],
                cache_dir=tmpdir) is not None
    finally:
        huggingface_hub.constants.HF_HUB_OFFLINE = saved_offline
if __name__ == "__main__": if __name__ == "__main__":
test_hf_transfer_auto_activation() test_hf_transfer_auto_activation()
test_download_weights_from_hf()
...@@ -43,3 +43,18 @@ def test_models( ...@@ -43,3 +43,18 @@ def test_models(
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, ( assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Smoke-test that the loaded model object can be printed.

    The test passes as long as printing the model (and therefore building
    its repr, including any ``extra_repr``) does not raise.
    """
    runner = vllm_runner(model, dtype=dtype)
    # The model is reached in one expression on purpose: no extra local keeps
    # a reference to it once the runner is deleted below.
    print(
        runner.model.llm_engine.model_executor.driver_worker.model_runner.model
    )
    del runner
# flake8: noqa
"""Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU.
"""
import os
import pytest
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
# Set for the whole process at import time.
# NOTE(review): presumably read by the HF tokenizers library - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Passed to the LLM constructor as max_model_len in test_models.
MAX_MODEL_LEN = 1024

# Model names test_models is parametrized over; each one must also be a key
# of EXPECTED_STRS_MAP.
MODELS = [
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
    "meta-llama/Meta-Llama-3-8B-Instruct",
]

# Expected generations keyed by model name. Entry i of each list is compared
# for exact equality with the text generated for example prompt i, sampled
# with max_tokens=20 and temperature=0.
EXPECTED_STRS_MAP = {
    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
    ],
    "meta-llama/Meta-Llama-3-8B-Instruct": [
        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
    ],
}
# Device compute capability folded into one integer: major * 10 + minor.
# pytest imports this module at collection time, and querying the device
# capability raises when CUDA is unavailable. In that case fall back to -1
# (assumed to be below every minimum capability) so the skipif marker skips
# the test instead of the import crashing collection.
if torch.cuda.is_available():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
else:
    capability = -1
fp8_not_supported = (capability <
                     QUANTIZATION_METHODS["fp8"].get_min_capability())
@pytest.mark.skipif(fp8_not_supported,
                    reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(
    example_prompts,
    model_name,
) -> None:
    """Compare fp8 generations against the recorded expected strings.

    Every example prompt is wrapped as a single-turn user message with the
    model's chat template, generated with ``max_tokens=20`` and
    ``temperature=0``, and the resulting text must equal the matching entry
    of ``EXPECTED_STRS_MAP[model_name]``.
    """
    llm = LLM(model=model_name,
              max_model_len=MAX_MODEL_LEN,
              enforce_eager=True,
              quantization="fp8")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    chat_prompts = []
    for raw_prompt in example_prompts:
        conversation = [{"role": "user", "content": raw_prompt}]
        chat_prompts.append(
            tokenizer.apply_chat_template(conversation,
                                          tokenize=False,
                                          add_generation_prompt=True))

    sampling = SamplingParams(max_tokens=20, temperature=0)

    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations = []
    for chat_prompt in chat_prompts:
        request_outputs = llm.generate(chat_prompt, sampling)
        generations.append(request_outputs[0].outputs[0].text)
    del llm

    print(generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i, generated_str in enumerate(generations):
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
"""Compares the outputs of gptq vs gptq_marlin
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Note: This test currently fails running with --forked with the following:
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
To use CUDA with multiprocessing, you must use the 'spawn' start method
Run `pytest tests/models/test_gptq_marlin.py`.
"""
import os
import pytest
import torch
from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
# Set for the whole process at import time.
# NOTE(review): presumably read by the HF tokenizers library - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Passed to vllm_runner as max_model_len for both runs in test_models.
MAX_MODEL_LEN = 1024
# Device compute capability folded into one integer: major * 10 + minor.
# pytest imports this module at collection time, and querying the device
# capability raises when CUDA is unavailable. In that case fall back to -1
# (assumed to be below every minimum capability) so the skipif marker skips
# the test instead of the import crashing collection.
if torch.cuda.is_available():
    capability = torch.cuda.get_device_capability()
    capability = capability[0] * 10 + capability[1]
else:
    capability = -1
gptq_marlin_not_supported = (
    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
# (model name, revision) pairs test_models is parametrized over; the comment
# above each entry records the quantization settings of that checkpoint.
MODELS = [
    # act_order==False, group_size=channelwise
    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
    # act_order==False, group_size=128
    ("TheBloke/Llama-2-7B-GPTQ", "main"),
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
    # act_order==True, group_size=64
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
    # act_order==True, group_size=32
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
    # 8-bit, act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
    # 8-bit, act_order==True, group_size=32
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
]
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(gptq_marlin_not_supported,
                    reason="gptq_marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that the marlin and gptq runs of one checkpoint stay close.

    The same ``(model_name, revision)`` checkpoint is loaded twice, first
    with ``quantization="marlin"`` and then with ``quantization="gptq"``.
    Greedy logprobs are generated for the example prompts in each run and
    the two result lists are compared with ``check_logprobs_close``.
    """
    model_name, revision = model

    def greedy_logprobs_with(quantization):
        # Load the checkpoint, generate, and drop the runner before
        # returning so the next load does not overlap with this one.
        runner = vllm_runner(model_name=model_name,
                             revision=revision,
                             dtype=dtype,
                             quantization=quantization,
                             max_model_len=MAX_MODEL_LEN,
                             tensor_parallel_size=1)
        results = runner.generate_greedy_logprobs(example_prompts,
                                                  max_tokens, num_logprobs)
        del runner
        return results

    # Run marlin.
    gptq_marlin_outputs = greedy_logprobs_with("marlin")

    # Run gptq.
    gptq_outputs = greedy_logprobs_with("gptq")

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )
...@@ -10,12 +10,12 @@ up to 3 times to see if we pass. ...@@ -10,12 +10,12 @@ up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py`. Run `pytest tests/models/test_marlin.py`.
""" """
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import torch import torch
from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
capability = torch.cuda.get_device_capability() capability = torch.cuda.get_device_capability()
...@@ -55,43 +55,24 @@ def test_models( ...@@ -55,43 +55,24 @@ def test_models(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype) marlin_model = vllm_runner(model_pair.model_marlin,
dtype=dtype,
quantization="marlin")
marlin_outputs = marlin_model.generate_greedy_logprobs( marlin_outputs = marlin_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting the just model
# frees the memory.
del marlin_model del marlin_model
gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype) gptq_model = vllm_runner(model_pair.model_gptq,
dtype=dtype,
quantization="gptq")
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
max_tokens, max_tokens,
num_logprobs) num_logprobs)
# Note: not sure why, but deleting just the model on Ada Lovelace
# does not free the GPU memory. On Ampere, deleting the just model
# frees the memory.
del gptq_model del gptq_model
# loop through the prompts check_logprobs_close(
for prompt_idx in range(len(example_prompts)): outputs_0_lst=gptq_outputs,
gptq_output_ids, gptq_output_str, gptq_logprobs = gptq_outputs[ outputs_1_lst=marlin_outputs,
prompt_idx] name_0="gptq",
marlin_output_ids, marlin_output_str, marlin_logprobs = marlin_outputs[ name_1="marlin",
prompt_idx] )
for idx, (gptq_output_id, marlin_output_id) in enumerate(
zip(gptq_output_ids, marlin_output_ids)):
# If sequence is not an exact match,
if marlin_output_id != gptq_output_id:
# Each predicted token must be in top 5 of the other's
assert gptq_output_id in marlin_logprobs[idx], (
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
f"Marlin:\t{marlin_output_str!r}")
assert marlin_output_id in gptq_logprobs[idx], (
f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n"
f"Marlin:\t{marlin_output_str!r}")
# Break out since sequences will now diverge.
break
...@@ -49,3 +49,18 @@ def test_models( ...@@ -49,3 +49,18 @@ def test_models(
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, ( assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Smoke-test that the loaded model object can be printed.

    The test passes as long as printing the model (and therefore building
    its repr, including any ``extra_repr``) does not raise.
    """
    runner = vllm_runner(model, dtype=dtype)
    # The model is reached in one expression on purpose: no extra local keeps
    # a reference to it once the runner is deleted below.
    print(
        runner.model.llm_engine.model_executor.driver_worker.model_runner.model
    )
    del runner
def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
    """Assert that two models' generations agree up to near-ties.

    Each element of the two lists is an ``(output_ids, output_str, logprobs)``
    triple for one prompt. The token ids are walked in lockstep; at the first
    position where they differ, each model's token must be contained in the
    other model's ``logprobs`` entry for that position. Nothing after that
    position is compared, because the sequences diverge from there on.

    Raises:
        AssertionError: if either token at the first differing position is
            missing from the other model's logprobs for that position.
    """
    paired_outputs = zip(outputs_0_lst, outputs_1_lst)
    for prompt_idx, (result_0, result_1) in enumerate(paired_outputs):
        token_ids_0, text_0, top_logprobs_0 = result_0
        token_ids_1, text_1, top_logprobs_1 = result_1

        for pos, (token_0, token_1) in enumerate(zip(token_ids_0,
                                                     token_ids_1)):
            if token_0 == token_1:
                continue

            # First divergence: both checks report the same pair of texts.
            failure_msg = (f"Test{prompt_idx}:"
                           f"\n{name_0}:\t{text_0!r}"
                           f"\n{name_1}:\t{text_1!r}")
            assert token_0 in top_logprobs_1[pos], failure_msg
            assert token_1 in top_logprobs_0[pos], failure_msg

            # Later positions are conditioned on different prefixes.
            break
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
"""
from dataclasses import dataclass
import pytest
from vllm.config import ModelConfig
@dataclass
class ModelPair:
    """Two model identifiers held together as strings.

    NOTE(review): judging by the field names this pairs a Marlin checkpoint
    with a GPTQ one, but nothing in the visible part of this module uses the
    class - confirm whether it is still needed.
    """
    model_marlin: str
    model_gptq: str
# Model Id // Expected Kernel
# Each entry is a (model id, expected quantization) pair; test_auto_gptq
# compares the second element with ModelConfig.quantization.
MODELS_QUANT_TYPE = [
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq")
]
@pytest.mark.parametrize("model_quant_type", MODELS_QUANT_TYPE)
def test_auto_gptq(model_quant_type: tuple) -> None:
    """Check which quantization ``ModelConfig`` reports for a checkpoint.

    The config is built twice for the same model, once with no quantization
    argument and once with ``"gptq"``, and in both cases
    ``ModelConfig.quantization`` must equal the expected type.

    Args:
        model_quant_type: A ``(model_path, expected_quant_type)`` pair taken
            from ``MODELS_QUANT_TYPE``.
    """
    model_path, quant_type = model_quant_type

    def build_config(quantization):
        # The two cases differ only in the quantization argument.
        return ModelConfig(model_path,
                           model_path,
                           tokenizer_mode="auto",
                           trust_remote_code=False,
                           seed=0,
                           dtype="float16",
                           revision=None,
                           quantization=quantization)

    model_config_no_quant_arg = build_config(None)  # case 1
    model_config_quant_arg = build_config("gptq")  # case 2

    assert model_config_no_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_no_quant_arg.quantization} "
        "for no --quantization None case")

    assert model_config_quant_arg.quantization == quant_type, (
        f"Expected quant_type == {quant_type} for {model_path}, "
        f"but found {model_config_quant_arg.quantization} "
        "for --quantization gptq case")
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_configs.py --forked`.
"""
from dataclasses import dataclass
import pytest
from vllm.config import ModelConfig
@dataclass
class ModelPair:
    """Two model identifiers held together as strings.

    NOTE(review): judging by the field names this pairs a Marlin checkpoint
    with a GPTQ one, but nothing in the visible part of this module uses the
    class - confirm whether it is still needed.
    """
    model_marlin: str
    model_gptq: str
# Model Id // Quantization Arg // Expected Type
# Each entry is (model id, quantization argument passed to ModelConfig,
# expected ModelConfig.quantization). The expected value "ERROR" means that
# constructing ModelConfig is expected to raise ValueError.
MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
    # AUTOAWQ
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
]
@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: tuple) -> None:
    """Check the quantization ``ModelConfig`` resolves for one table entry.

    Args:
        model_arg_exptype: A ``(model_path, quantization_arg, expected_type)``
            triple from ``MODEL_ARG_EXPTYPES``. An ``expected_type`` of
            ``"ERROR"`` means that constructing ``ModelConfig`` is expected
            to raise ``ValueError``.
    """
    model_path, quantization_arg, expected_type = model_arg_exptype

    # Only the constructor is expected to raise, so keep the try body to it.
    try:
        model_config = ModelConfig(model_path,
                                   model_path,
                                   tokenizer_mode="auto",
                                   trust_remote_code=False,
                                   seed=0,
                                   dtype="float16",
                                   revision=None,
                                   quantization=quantization_arg)
    except ValueError:
        found_quantization_type = "ERROR"
    else:
        found_quantization_type = model_config.quantization

    assert found_quantization_type == expected_type, (
        f"Expected quant_type == {expected_type} for {model_path}, "
        f"but found {found_quantization_type} "
        f"for --quantization {quantization_arg} case")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment