Commit 469e903b authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
# SPDX-License-Identifier: Apache-2.0
import neuronxcc.nki.language as nl
import pytest
import torch
import torch.nn.functional as F
from neuronxcc import nki
from vllm.attention.ops.nki_flash_attn import (
load_block_tables, transform_block_tables_for_indirect_load)
def is_power_of_2(n):
    """Return True when ``n`` is a positive integer power of two.

    A power of two has exactly one bit set, so clearing the lowest set bit
    with ``n & (n - 1)`` leaves zero.
    """
    if n <= 0:
        return False
    return (n & (n - 1)) == 0
def nki_load_and_transform_block_tables(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """Load a block table, transform it for indirect load, and copy it out.

    This is the function wrapped with ``nki.jit`` by the test below. It chains
    ``load_block_tables`` and ``transform_block_tables_for_indirect_load``
    (both imported from ``vllm.attention.ops.nki_flash_attn``) and stores the
    transformed result into a newly allocated ``nl.shared_hbm`` array, which
    is returned.

    Args:
        block_tables: block table passed straight to ``load_block_tables``.
        num_tiles: number of tiles, forwarded to ``load_block_tables``.
        num_blocks_per_tile: blocks per tile; must be a power of two.
        num_head: forwarded to the transform helper.
        head_id: plain head index; converted to an NKI index expression below.
        block_size_tiling_factor: forwarded to the transform helper.

    Returns:
        An ``nl.int32`` array in ``nl.shared_hbm`` with the same shape as the
        transformed block table.
    """
    assert is_power_of_2(
        num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
    # NOTE(review): the variable name suggests the table is staged in SBUF;
    # the actual placement is decided inside load_block_tables -- confirm.
    block_tables_sbuf = load_block_tables(block_tables, num_tiles,
                                          num_blocks_per_tile)

    # we need to pass an Index as head_id
    head_id = nl.arange(1)[None, :] + head_id

    block_tables_transposed = transform_block_tables_for_indirect_load(
        block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
    # The second dimension of the transformed table is expected to be 128.
    B_P_SIZE = 128
    assert block_tables_transposed.shape[1] == B_P_SIZE

    # Output buffer with the same shape as the transformed table.
    out = nl.ndarray(
        block_tables_transposed.shape,
        dtype=nl.int32,
        buffer=nl.shared_hbm,
    )
    # Copy the transformed table to the output one leading-dim slice at a time.
    for i in nl.affine_range(block_tables_transposed.shape[0]):
        nl.store(dst=out[i], value=block_tables_transposed[i])
    return out
def ref_block_tables_transform(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """CPU reference for the block-table transform checked by the NKI test.

    Steps, in order:
      1. view the flat table as ``(num_tiles, num_blocks_per_tile)``;
      2. zero-pad the tile dimension up to a multiple of 128;
      3. map every block id ``b`` to ``b * num_head + head_id``;
      4. expand each id into ``block_size_tiling_factor`` consecutive ids
         (``id * factor + k`` for ``k`` in ``range(factor)``);
      5. transpose so the expanded block dimension comes first and split it
         into chunks of 128 rows.

    Returns:
        Tensor of shape ``(expanded_blocks // 128, 128, num_tiles_padded)``.
    """
    assert block_tables.numel() == num_tiles * num_blocks_per_tile

    B_F_SIZE = 128
    # Rows needed to round the tile count up to a multiple of B_F_SIZE.
    pad_rows = (-num_tiles) % B_F_SIZE
    num_tiles_padded = num_tiles + pad_rows

    table = F.pad(
        block_tables.view(num_tiles, num_blocks_per_tile),
        (0, 0, 0, pad_rows),
        "constant",
        0,
    )

    # Select the slot belonging to ``head_id`` for every block.
    table = (table * num_head + head_id).view(num_tiles_padded,
                                              num_blocks_per_tile, 1)

    # Fan each block out into ``block_size_tiling_factor`` sub-blocks.
    sub_block = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
    table = table * block_size_tiling_factor + sub_block

    transposed = table.view(num_tiles_padded, -1).t()
    expanded_blocks = transposed.shape[0]
    assert expanded_blocks % B_F_SIZE == 0
    return transposed.view(expanded_blocks // B_F_SIZE, B_F_SIZE,
                           num_tiles_padded)
@pytest.mark.parametrize(
    "q_head_per_kv_head,head_id",
    [
        (1, 0),
        (3, 1),
    ],
)
@pytest.mark.parametrize(
    "num_tiles,num_blocks_per_tile",
    [
        (1, 1),
        (13, 16),
        (17, 128),
        (35, 512),
        (128, 128),
        (130, 64),
        (280, 256),
        (315, 1),
    ],
)
@torch.inference_mode()
def test_load_and_transform_block_tables(
    monkeypatch: pytest.MonkeyPatch,
    num_tiles,
    num_blocks_per_tile,
    q_head_per_kv_head,
    head_id,
) -> None:
    """Compare the NKI block-table transform against the torch reference.

    A random int32 block table is run through the jitted kernel on the XLA
    device and through ``ref_block_tables_transform`` on CPU; shapes and all
    values must match exactly.
    """
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()

    neuron_cc_flags = " ".join((
        "-O1",
        "--retry_failed_compilation",
    ))

    with monkeypatch.context() as patched_env:
        patched_env.setenv("NEURON_CC_FLAGS", neuron_cc_flags)

        torch.manual_seed(10000)
        torch.set_printoptions(sci_mode=False)

        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
        B_P_SIZE = 128
        if num_blocks_per_tile >= B_P_SIZE:
            block_size_tiling_factor = 1
        else:
            assert B_P_SIZE % num_blocks_per_tile == 0
            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile

        max_num_blocks = 100000
        block_tables = torch.randint(
            0,
            max_num_blocks,
            (num_tiles * num_blocks_per_tile, ),
            dtype=torch.int32,
        )

        # Arguments shared by the kernel and the reference implementation.
        transform_args = (
            num_tiles,
            num_blocks_per_tile,
            q_head_per_kv_head,
            head_id,
            block_size_tiling_factor,
        )

        kernel = nki.jit(nki_load_and_transform_block_tables)[1, 1]
        nki_out = kernel(block_tables.to(device=xla_device),
                         *transform_args).cpu()
        ref_out = ref_block_tables_transform(block_tables, *transform_args)

        assert (nki_out.shape == ref_out.shape
                ), f"{nki_out.shape=} != {ref_out.shape=}"
        assert torch.all(nki_out == ref_out)
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from vllm.attention.ops.nki_flash_attn import reshape_and_cache
@pytest.mark.parametrize(
    "num_tokens, n_kv_head, d_head, num_blocks, block_size",
    [
        # Small model configuration (e.g., GPT-2 small)
        (32, 12, 64, 4, 128),  # Typical sequence processing
        (1, 12, 64, 4, 128),  # Single token update
        (128, 12, 64, 4, 128),  # Longer sequence

        # Medium model configuration (e.g., GPT-2 medium)
        (64, 16, 96, 8, 256),  # Standard batch
        (256, 16, 96, 8, 256),  # Large batch

        # Large model configuration (e.g., GPT-3 style)
        (48, 32, 128, 16, 512),  # Typical processing window
        (512, 32, 128, 16, 512),  # Full context window

        # Edge cases and stress tests
        (1024, 8, 32, 32, 32),  # Many tokens, small heads
        (16, 64, 256, 4, 64),  # Few tokens, many heads
        (2048, 24, 128, 64, 128),  # Large scale test

        # Minimal configurations for debugging
        (4, 2, 16, 2, 16),  # Tiny test case
        (1, 1, 8, 1, 8),  # Minimal possible
    ])
def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
                           block_size):
    """Check ``reshape_and_cache`` on XLA against a per-token CPU scatter.

    Each token is written to a distinct slot (the slot mapping is a prefix of
    a random permutation); the expected caches are built token by token on
    CPU and compared with the caches updated on the XLA device.
    """
    # Fixed seed so every run sees the same data
    torch.manual_seed(42)

    token_shape = (num_tokens, n_kv_head, d_head)
    cache_shape = (num_blocks, n_kv_head, block_size, d_head)

    # Inputs and expected caches live on CPU
    key_cpu = torch.randn(token_shape) / torch.sqrt(torch.tensor(d_head))
    value_cpu = torch.randn(token_shape) / torch.sqrt(torch.tensor(d_head))
    expected_key_cache = torch.zeros(cache_shape)
    expected_value_cache = torch.zeros(cache_shape)
    slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]

    # Reference scatter: slot -> (block index, offset within block)
    for token_idx, slot in enumerate(slot_mapping_cpu.tolist()):
        block_idx, block_offset = divmod(slot, block_size)
        expected_key_cache[block_idx, :, block_offset, :] = key_cpu[token_idx]
        expected_value_cache[block_idx, :,
                             block_offset, :] = value_cpu[token_idx]

    # Mirror the inputs on the XLA device, starting from empty caches
    device = torch.device('xla')
    key = key_cpu.to(device)
    value = value_cpu.to(device)
    key_cache = torch.zeros_like(expected_key_cache, device=device)
    value_cache = torch.zeros_like(expected_value_cache, device=device)
    slot_mapping = slot_mapping_cpu.to(device)

    # Implementation under test updates the caches in place
    reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)

    # Compare on CPU
    comparisons = (
        (key_cache.cpu(), expected_key_cache),
        (value_cache.cpu(), expected_value_cache),
    )
    for actual, expected in comparisons:
        torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
    (7, 8, False, torch.half),
    (83, 768, False, torch.half),
    (83, 768, True, torch.half),
    (83, 768, True, torch.bfloat16),
    (83, 768, True, torch.float32),
])
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
) -> None:
    """Compare ``RMSNorm`` on an XLA device with ``forward_native`` on CPU."""
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)

    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=xla_device)
    x *= scale
    if add_residual:
        residual = torch.randn_like(x) * scale
        residual_cpu = residual.cpu()
    else:
        residual = None
        residual_cpu = None

    # Reference result from the native implementation on CPU copies.
    ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)

    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
    out = layer.to(device=xla_device)(x, residual)

    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    tolerance = dict(atol=1e-2, rtol=1e-2)
    if not add_residual:
        assert out.is_xla, "output tensor is expected to be XLA tensor"
        torch.testing.assert_close(out.cpu(), ref_out, **tolerance)
        return

    # With a residual the layer returns a pair; check both entries.
    assert out[0].is_xla, "output tensor is expected to be XLA tensor"
    for idx in (0, 1):
        torch.testing.assert_close(out[idx].cpu(), ref_out[idx], **tolerance)
# SPDX-License-Identifier: Apache-2.0
import random
from unittest.mock import patch
import pytest
import torch
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
from vllm.utils import is_pin_memory_available
class MockLogitsProcessor(LogitsProcessor):
    """``LogitsProcessor`` whose logits are replaced by a fixed tensor.

    While ``forward`` runs, hidden-state pruning is patched to a pass-through
    and ``_get_logits`` is patched to return the stored ``fake_logits``.
    """

    def __init__(self, vocab_size: int, scale: float,
                 fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size, scale=scale)
        # Private copy so later edits to the caller's tensor do not leak in.
        self.fake_logits = fake_logits.clone()

    def forward(self, *args, **kwargs):

        def _keep_hidden_states(x, y):
            # Pass the hidden states through untouched.
            return x

        def _return_fake_logits(*args, **kwargs):
            return self.fake_logits

        with patch(
                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
                _keep_hidden_states):
            with patch(
                    "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
                    _return_fake_logits):
                return super().forward(*args, **kwargs)
def _prepare_test(
    batch_size: int
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    """Build the inputs shared by the logits-processor tests.

    Returns:
        ``(input_tensor, fake_logits, logits_processor)`` where
        ``input_tensor`` is a random float16 ``(batch_size, 1024)`` tensor,
        ``fake_logits`` is a ``(batch_size, 32000)`` tensor filled with 1e-2,
        and ``logits_processor`` is a ``MockLogitsProcessor`` with scale 0.5
        that serves a copy of ``fake_logits``.
    """
    vocab_size = 32000
    logits_scale = 0.5

    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    logits_shape = (batch_size, vocab_size)
    fake_logits = torch.full(logits_shape, 1e-2, dtype=input_tensor.dtype)

    logits_processor = MockLogitsProcessor(vocab_size, logits_scale,
                                           fake_logits)
    return input_tensor, fake_logits, logits_processor
RANDOM_SEEDS = list(range(8))


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_logits_processors(seed: int):
    """Run a per-request logits processor through ``MockLogitsProcessor``.

    Every request carries ``pick_ith``, which writes ``inf`` at index
    ``len(token_ids)``. The check compares column 1 of the output with the
    scaled fake logits.
    """
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    set_random_seed(seed)
    torch.set_default_device("cpu")
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    # One prompt-phase sequence group per batch entry.
    seq_group_metadata_list = [
        SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=True,
            seq_data={0: SequenceData.from_seqs([1, 2, 3])},
            sampling_params=SamplingParams(temperature=0,
                                           logits_processors=[pick_ith]),
            block_tables={0: [1]},
        ) for i in range(batch_size)
    ]
    seq_lens = [
        group.seq_data[0].get_len() for group in seq_group_metadata_list
    ]

    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
        device=xla_device,
        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        lm_head=None,
        hidden_states=input_tensor,
        sampling_metadata=sampling_metadata)

    # The processor scales the logits, so scale the expectation as well.
    fake_logits *= logits_processor.scale
    expected_column = fake_logits[:, 1]
    actual_column = logits_processor_output[:, 1]
    torch.testing.assert_close(actual_column,
                               expected_column,
                               rtol=1e-4,
                               atol=0.0)
......@@ -107,7 +107,7 @@ def ref_masked_attention(
masked_score, dim=-1, return_max_reduce=True)
else:
norm_score = ref_softmax(masked_score, dim=-1)
out = torch.einsum("hqk,khd->qhd", norm_score, value)
out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
if return_max_reduce:
return (
out,
......@@ -118,7 +118,7 @@ def ref_masked_attention(
scaled_qk,
)
else:
return out
return (out, )
def ref_context_attention(
......@@ -128,8 +128,6 @@ def ref_context_attention(
query_lens,
seq_lens,
head_size,
num_kv_heads,
num_heads,
num_queries_per_kv,
return_max_reduce=False,
):
......@@ -146,18 +144,19 @@ def ref_context_attention(
attn_mask = torch.logical_not(attn_mask)
attn_mask = attn_mask.float() * -30000
output, cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
ref_masked_attention(
query,
key,
value,
scale,
attn_mask,
return_max_reduce=return_max_reduce,
))
output, *debug_tensors = ref_masked_attention(
query,
key,
value,
scale,
attn_mask,
return_max_reduce=return_max_reduce,
)
output = output.unsqueeze(1)
if return_max_reduce:
cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
debug_tensors)
return (
output,
cached_max,
......@@ -170,65 +169,22 @@ def ref_context_attention(
return output
@pytest.mark.parametrize(
"block_size, large_tile_size",
[
(32, 2048), # 64 blocks
(32, 4096), # 128 blocks
(32, 8192), # 256 blocks
(64, 8192), # 128 blocks
],
)
@pytest.mark.parametrize(
"num_heads,num_queries_per_kv,head_size,mixed_precision",
[
(4, 2, 8, False),
(4, 2, 8, True),
(32, 8, 64, True),
(16, 2, 128, True),
],
)
@torch.inference_mode()
def test_contexted_kv_attention(
num_heads: int,
num_queries_per_kv: int,
head_size: int,
block_size: int,
large_tile_size,
mixed_precision: bool,
) -> None:
import os
import torch_xla.core.xla_model as xm
from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc
assert large_tile_size % block_size == 0
device = xm.xla_device()
compiler_flags = [
"--model-type=transformer -O1",
"--internal-hlo2tensorizer-options='--verify-hlo'",
"--retry_failed_compilation",
]
compiler_flags_str = " ".join(compiler_flags)
os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
torch.manual_seed(0)
torch.set_printoptions(sci_mode=False)
min_ctx_len = 32
max_ctx_len = 1024
min_query_len = 16
max_query_len = 512
prefill_batch_size = 4
decode_batch_size = 12
def sample_inputs(
prefill_batch_size,
decode_batch_size,
min_query_len,
max_query_len,
min_ctx_len,
max_ctx_len,
block_size,
num_heads,
num_kv_heads,
head_size,
dtype,
):
batch_size = prefill_batch_size + decode_batch_size
max_model_len = (max_query_len + max_ctx_len) * 4
max_block_per_request = max_model_len // block_size
dtype = torch.float32
cache_size = (batch_size * max_block_per_request) + 2
prefill_ctx_lens = torch.randint(min_ctx_len,
max_ctx_len + 1, (prefill_batch_size, ),
......@@ -244,7 +200,6 @@ def test_contexted_kv_attention(
dtype=torch.long,
).tolist() + [1 for _ in range(decode_batch_size)]
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
num_kv_heads = num_heads // num_queries_per_kv
num_tokens = sum(query_lens)
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
......@@ -304,171 +259,259 @@ def test_contexted_kv_attention(
cur_ctx += block_size
block_id += 1
(
output_ref,
cached_max,
cached_sum_reciprocal,
lse,
masked_score,
scaled_qk,
) = ref_context_attention(
return (
query,
k,
v,
k_cache,
v_cache,
block_table,
key,
value,
query_lens,
seq_lens,
head_size,
num_kv_heads,
num_heads,
num_queries_per_kv,
return_max_reduce=True,
)
# build neuron program
return_debug_tensors = False
B_P_SIZE = 128
LARGE_TILE_SZ = large_tile_size
def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
num_blocks):
context_lens = seq_lens - query_lens
blocks_per_seq = (context_lens + block_size - 1) // block_size
num_seqs = len(seq_lens)
active_blocks: list[int] = []
for seq_id in range(num_seqs):
active_blocks = (
active_blocks +
block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
return F.pad(
torch.tensor(active_blocks),
(0, num_blocks - len(active_blocks)),
"constant",
0,
)
def ceil_div(a, b):
return (a + b - 1) // b
def pad_to_multiple(a, b):
return ceil_div(a, b) * b
def pad_to_next_power_of_2(a):
assert a > 0
return 2**int(a - 1).bit_length()
# calculate input shapes
max_num_queries = pad_to_multiple(sum(query_lens), block_size)
max_num_queries = pad_to_next_power_of_2(max_num_queries)
head_size_padded = B_P_SIZE
assert head_size_padded >= head_size
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = ceil_div(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks,
LARGE_TILE_SZ // block_size)
context_kv_len = num_active_blocks * block_size
assert (context_kv_len %
LARGE_TILE_SZ == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors
pad_dims = (
0,
head_size_padded - query.shape[2],
0,
0,
def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
num_blocks):
context_lens = seq_lens - query_lens
blocks_per_seq = (context_lens + block_size - 1) // block_size
num_seqs = len(seq_lens)
active_blocks: list[int] = []
for seq_id in range(num_seqs):
active_blocks = (
active_blocks +
block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
return F.pad(
torch.tensor(active_blocks, dtype=torch.int32),
(0, num_blocks - len(active_blocks)),
"constant",
0,
max_num_queries - query.shape[0],
)
query = F.pad(query, pad_dims, "constant", 0)
k = F.pad(k, pad_dims, "constant", 0)
v = F.pad(v, pad_dims, "constant", 0)
k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0)
v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
# transform block table
active_block_table = get_active_block_tables(
block_table,
torch.tensor(query_lens),
torch.tensor(seq_lens),
block_size,
num_active_blocks,
)
# Build attention masks
prior_mask, active_mask = (
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens, block_size=block_size))
attn_mask = torch.concat(
[
F.pad(
prior_mask,
(
0,
context_kv_len - prior_mask.shape[1],
0,
max_num_queries - prior_mask.shape[0],
),
"constant",
@pytest.mark.parametrize(
"prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
[
# Test minimal configurations (small block size)
(1, 199, 1, 512, 4, 2, 8, False
), # minimal block size, small dimensions
(1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision
# Test common/medium configurations
(4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads
(4, 12, 32, 2048, 16, 4, 32,
True), # medium size, mixed precision, grouped-query attention (GQA)
# Test large configurations
(4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size
(4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads
# Test asymmetric configurations
(2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes
(8, 8, 128, 2048, 24, 2, 48, True), # balanced batches
# Test edge cases
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
])
@torch.inference_mode()
def test_contexted_kv_attention(
monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int,
decode_batch_size: int,
num_heads: int,
num_queries_per_kv: int,
head_size: int,
block_size: int,
large_tile_size,
mixed_precision: bool,
) -> None:
import torch_xla.core.xla_model as xm
from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
reorder_context_mask)
assert large_tile_size % block_size == 0
device = xm.xla_device()
compiler_flags_str = " ".join([
"-O1",
"--retry_failed_compilation",
])
with monkeypatch.context() as m:
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
torch.manual_seed(0)
torch.set_printoptions(sci_mode=False)
torch.set_default_device("cpu")
dtype = torch.float32
min_ctx_len = 32
max_ctx_len = 1024
min_query_len = 16
max_query_len = 512
num_kv_heads = num_heads // num_queries_per_kv
(
query,
k_active,
v_active,
k_cache,
v_cache,
block_table,
key,
value,
query_lens,
seq_lens,
) = sample_inputs(
prefill_batch_size=prefill_batch_size,
decode_batch_size=decode_batch_size,
min_query_len=min_query_len,
max_query_len=max_query_len,
min_ctx_len=min_ctx_len,
max_ctx_len=max_ctx_len,
block_size=block_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
head_size=head_size,
dtype=dtype,
)
output_ref = ref_context_attention(
query,
key,
value,
query_lens,
seq_lens,
head_size,
num_queries_per_kv,
return_max_reduce=False,
)
# build neuron program
B_P_SIZE = 128
assert (large_tile_size >= B_P_SIZE
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
def ceil_div(a, b):
return (a + b - 1) // b
def pad_to_multiple(a, b):
return ceil_div(a, b) * b
def pad_to_next_power_of_2(a):
assert a > 0
return 2**int(a - 1).bit_length()
# calculate input shapes
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
num_active_blocks = ceil_div(context_lens, block_size).sum().item()
num_active_blocks = pad_to_multiple(num_active_blocks,
large_tile_size // block_size)
context_kv_len = num_active_blocks * block_size
assert (
context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
# pad QKV tensors
pad_dims = (
0,
0,
0,
0,
0,
max_num_queries - query.shape[0],
)
query = F.pad(query, pad_dims, "constant", 0)
k = F.pad(k_active, pad_dims, "constant", 0)
v = F.pad(v_active, pad_dims, "constant", 0)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
# transform block table
active_block_table = get_active_block_tables(
block_table.cpu(),
torch.tensor(query_lens).cpu(),
torch.tensor(seq_lens).cpu(),
block_size,
num_active_blocks,
)
# Build attention masks
prior_mask, active_mask = (
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
query_lens, seq_lens, block_size=block_size))
prior_mask_padded = F.pad(
prior_mask,
(
0,
).bool(),
F.pad(
active_mask,
(
0,
max_num_queries - active_mask.shape[1],
0,
max_num_queries - active_mask.shape[0],
),
"constant",
context_kv_len - prior_mask.shape[1],
0,
).bool(),
],
dim=1,
)
input_args = (
query.to(device=device),
k.to(device=device),
v.to(device=device),
k_cache.to(device=device),
v_cache.to(device=device),
active_block_table.to(torch.int32).to(device=device),
attn_mask.to(device=device),
)
input_kwargs = dict(
n_kv_head=num_kv_heads,
head_size=head_size,
mixed_precision=mixed_precision,
LARGE_TILE_SZ=LARGE_TILE_SZ,
return_debug_tensors=return_debug_tensors,
)
max_num_queries - prior_mask.shape[0],
),
"constant",
0,
).bool()
active_mask_padded = F.pad(
active_mask,
(
0,
max_num_queries - active_mask.shape[1],
0,
max_num_queries - active_mask.shape[0],
),
"constant",
0,
).bool()
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
dim=1)
attn_mask = reorder_context_mask(attn_mask, large_tile_size,
block_size)
input_args = (
query.to(device=device),
k.to(device=device),
v.to(device=device),
k_cache.to(device=device),
v_cache.to(device=device),
active_block_table.to(device=device),
attn_mask.to(device=device),
)
input_kwargs = dict(
n_kv_head=num_kv_heads,
head_size=head_size,
mixed_precision=mixed_precision,
LARGE_TILE_SZ=large_tile_size,
)
if return_debug_tensors:
output_nki, *debug_tensors = flash_attn_varlen_nkifunc(
*input_args, **input_kwargs)
else:
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
debug_tensors = []
debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors]
num_actual_tokens = sum(query_lens)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)[:, :, :, :head_size]
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
output_ref,
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
"constant",
0,
)
output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
num_actual_tokens = sum(query_lens)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
output_nki = output_nki[0, :num_actual_tokens, :, :]
output_ref_padded = F.pad(
output_ref,
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
"constant",
0,
)
output_ref = output_ref_padded.transpose(
0, 1)[0, :num_actual_tokens, :, :]
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
# SPDX-License-Identifier: Apache-2.0
"""
Tests for the rotary embedding layer on Neuron (XLA) devices
"""
import pytest
import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
@pytest.mark.parametrize(
    "max_position,is_neox_style,rotary_dim,head_size,seq_len", [
        (16, False, 32, 32, 1024),
        (16, False, 32, 128, 1024),
        (16, True, 32, 32, 1024),
        (16, True, 32, 128, 1024),
    ])
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
                                  head_size, seq_len):
    """Compare ``forward_neuron`` on XLA with ``forward_native`` on CPU."""
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    batch_size = 1
    base = 10000
    num_heads = 8

    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                          is_neox_style, torch.float32)

    # Reference inputs are created on CPU.
    positions = torch.randint(0,
                              max_position, (batch_size, seq_len),
                              device="cpu")
    hidden_shape = (batch_size, seq_len, num_heads * head_size)
    query = torch.randn(hidden_shape, dtype=torch.float32, device="cpu")
    key = torch.randn_like(query)

    assert positions.is_cpu, \
        "reference input tensor is expected to be CPU tensor."
    cpu_rot = rot.to(device="cpu")
    ref_query, ref_key = cpu_rot.forward_native(positions, query, key)

    # Same inputs, moved to the XLA device for the implementation under test.
    neuron_rot = rot.to(device=xla_device)
    out_query, out_key = neuron_rot.forward_neuron(
        positions.to(device=xla_device),
        query.to(device=xla_device),
        key.to(device=xla_device),
    )
    assert out_query.is_xla and out_key.is_xla, \
        "output tensor is expected to be XLA tensor"

    for actual, expected in ((out_query, ref_query), (out_key, ref_key)):
        torch.testing.assert_close(actual.cpu(),
                                   expected,
                                   atol=1e-2,
                                   rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
import functools
from typing import Callable
from unittest.mock import patch
import pytest
import torch
import torch_xla.distributed.xla_multiprocessing as xmp
from typing_extensions import ParamSpec
from vllm.distributed.communication_op import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.utils import get_distributed_init_method, get_open_port
_P = ParamSpec("_P")
def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
"""Decorator to reinitialize the Neuron Runtime before executing a test.
This is necessary for distributed tests which need to reallocate Neuron
Cores to separate subprocesses.
"""
@functools.wraps(f)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
runtime = torch.classes.neuron.Runtime()
runtime.initialize()
runtime.unsafe_close()
f(*args, **kwargs)
runtime.initialize()
return wrapper
def all_gather_test_worker(index, tp_degree, distributed_init_method):
    """Per-process body of the tensor-parallel all-gather check.

    Each rank contributes ``arange(24).reshape(2, 3, 4) * (rank + 1)`` and the
    gathered tensor must equal the concatenation of every rank's tensor along
    the last dimension.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    num_dimensions = 3
    tensor_size = [dim + 2 for dim in range(num_dimensions)]
    total_size = 1
    for extent in tensor_size:
        total_size *= extent

    all_gather_dimension = -1
    all_tensors = []
    for rank in range(tp_degree):
        base = torch.arange(total_size, dtype=torch.float32,
                            device="xla").reshape(tensor_size)
        all_tensors.append(base * (rank + 1))
    expected = torch.cat(all_tensors, dim=all_gather_dimension)

    # This process contributes the tensor that belongs to its own rank.
    local_tensor = all_tensors[index % tp_degree]
    gathered = tensor_model_parallel_all_gather(local_tensor,
                                                all_gather_dimension)
    torch.testing.assert_close(gathered, expected)
def all_reduce_test_worker(index, tp_degree, distributed_init_method):
    """Per-process body of the tensor-parallel all-reduce check.

    Each rank contributes ``arange(8) * (rank + 1)`` and the reduced tensor
    must equal the element-wise sum over all ranks.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    num_elements = 8
    all_tensors = []
    for rank in range(tp_degree):
        base = torch.arange(num_elements, dtype=torch.float32, device="xla")
        all_tensors.append(base * (rank + 1))
    stacked = torch.stack(all_tensors, dim=0)
    expected = torch.sum(stacked, dim=0)

    # This process contributes the tensor that belongs to its own rank.
    local_tensor = all_tensors[index % tp_degree]
    reduced = tensor_model_parallel_all_reduce(local_tensor)
    torch.testing.assert_close(reduced, expected)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
@reinitialize_neuron_runtime
def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
                                              test_target):
    """Spawn ``test_target`` through ``xmp.spawn`` for a tensor-parallel run.

    The XLA runtime is reported as uninitialized for the duration of the
    spawn, and the environment is set to one device per process.
    """
    runtime_not_initialized = patch(
        'torch_xla._XLAC._xla_runtime_is_initialized', return_value=False)
    with runtime_not_initialized:
        free_port = get_open_port()
        distributed_init_method = get_distributed_init_method(
            "127.0.0.1", free_port)

        # One device per spawned process.
        devices_per_process = ','.join(['1'] * tp_size)
        monkeypatch.setenv("VLLM_USE_V1", "1")
        monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
        monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
                           devices_per_process)

        worker_args = (tp_size, distributed_init_method)
        xmp.spawn(test_target, args=worker_args)
# SPDX-License-Identifier: Apache-2.0
from typing import Iterable, List, Optional, Tuple, Union
from collections.abc import Iterable
from typing import Optional, Union
import torch
import torch.nn as nn
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.models.gemma2 import Gemma2Model
......@@ -37,16 +37,12 @@ class MyGemma2Embedding(nn.Module):
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
hidden_states = self.model(
input_ids,
positions,
kv_caches,
attn_metadata,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
......@@ -64,7 +60,7 @@ class MyGemma2Embedding(nn.Module):
) -> Optional[PoolerOutput]:
return self._pooler(hidden_states, pooling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
weights = self.hf_to_vllm_mapper.apply(weights)
weights = ((name, data) for name, data in weights
......
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force ``VLLM_USE_V1=0`` for every test in this module.

    The module only covers V0, so the fixture is autouse and sets the
    variable through ``monkeypatch`` for each test function.
    """
    env_name = 'VLLM_USE_V1'
    env_value = '0'
    monkeypatch.setenv(env_name, env_value)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend
from vllm.utils import STR_INVALID_VAL
from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins():
......@@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}")
def test_oot_attention_backend(monkeypatch):
def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set
override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend"
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "Dummy_Backend"
# SPDX-License-Identifier: Apache-2.0
import pytest
from vllm.core.scheduler import Scheduler
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
class DummyV0Scheduler(Scheduler):
    """Scheduler subclass whose ``schedule`` always raises.

    The fixed message lets a test recognise that this class was the one
    invoked.
    """

    def schedule(self):
        message = "Exception raised by DummyV0Scheduler"
        raise Exception(message)
class DummyScheduler(Scheduler):
class DummyV1Scheduler(V1Scheduler):
def schedule(self):
raise Exception("Exception raised by DummyScheduler")
raise Exception("Exception raised by DummyV1Scheduler")
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
    """Check that a V0 engine built with ``scheduler_cls`` uses that class.

    With ``VLLM_USE_V1=0`` the engine is created with ``DummyV0Scheduler``;
    stepping it must surface the exception raised by that scheduler.
    """
    expected_message = "Exception raised by DummyV0Scheduler"

    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "0")

        with pytest.raises(Exception) as exception_info:
            engine = LLMEngine.from_engine_args(engine_args=EngineArgs(
                model="facebook/opt-125m",
                enforce_eager=True,  # reduce test time
                scheduler_cls=DummyV0Scheduler,
            ))

            engine.add_request("0", "foo", SamplingParams(max_tokens=1))
            engine.step()

        raised_message = str(exception_info.value)
        assert raised_message == expected_message
def test_scheduler_plugins():
import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with pytest.raises(Exception) as exception_info:
with pytest.raises(Exception) as exception_info:
engine_args = EngineArgs(
model="facebook/opt-125m",
enforce_eager=True, # reduce test time
scheduler_cls=DummyScheduler,
)
engine_args = EngineArgs(
model="facebook/opt-125m",
enforce_eager=True, # reduce test time
scheduler_cls=DummyV1Scheduler,
)
engine = LLMEngine.from_engine_args(engine_args=engine_args)
engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
engine.step()
sampling_params = SamplingParams(max_tokens=1)
engine.add_request("0", "foo", sampling_params)
engine.step()
assert str(exception_info.value) == "Exception raised by DummyScheduler"
assert str(
exception_info.value) == "Exception raised by DummyV1Scheduler"
......@@ -36,7 +36,10 @@ def test_disable_sliding_window(model_len_len, ):
del vllm_disabled_model
cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model, disable_sliding_window=False)
vllm_enabled_model = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
vllm_enabled_model.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config
assert model_config.max_model_len == full_len, (
......
......@@ -4,21 +4,35 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from __future__ import annotations
import pytest
import os
from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt
from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """
    Keep VLLM_USE_V1=0 set while each test in this module runs,
    because the module relies on V0 internals.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv('VLLM_USE_V1', '0')
        # Hand control to the test while the override is still active.
        yield
MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "distilbert/distilgpt2"),
]
UNSTABLE_PROMPT_SEQUENCE = [
......@@ -49,74 +63,88 @@ def test_mixed_requests(
cached_position: int,
enable_chunked_prefill: bool,
block_size: int,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Test the case when some sequences have the prefix cache hit
and the others don't. The cached position determines where
the sequence is at among the batch of prefills.
"""
override_backend_env_variable(monkeypatch, backend)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
cached_prompt = example_prompts[cached_position]
with vllm_runner(
model,
dtype=dtype,
enable_prefix_caching=True,
enable_chunked_prefill=enable_chunked_prefill,
block_size=block_size,
) as vllm_model:
# Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
# Run all the prompts
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
# Verify number of cached tokens
for i in range(len(req_outputs)):
if i == cached_position:
expected_num_cached_tokens = (
len(req_outputs[i].prompt_token_ids) //
block_size) * block_size
else:
expected_num_cached_tokens = 0
assert (
req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids),
output.prompt + output.outputs[0].text,
) for output in req_outputs]
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
cached_prompt = example_prompts[cached_position]
with vllm_runner(
model,
dtype=dtype,
enable_prefix_caching=True,
enable_chunked_prefill=enable_chunked_prefill,
block_size=block_size,
) as vllm_model:
# Run the first prompt so the cache is populated
vllm_outputs = vllm_model.generate_greedy([cached_prompt],
max_tokens)
# Run all the prompts
greedy_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
# Verify number of cached tokens
for i in range(len(req_outputs)):
if i == cached_position:
expected_num_cached_tokens = (
len(req_outputs[i].prompt_token_ids) //
block_size) * block_size
else:
expected_num_cached_tokens = 0
assert (req_outputs[i].num_cached_tokens ==
expected_num_cached_tokens)
vllm_outputs = [(
output.prompt_token_ids + list(output.outputs[0].token_ids),
output.prompt + output.outputs[0].text,
) for output in req_outputs]
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence(
vllm_runner,
backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
override_backend_env_variable(monkeypatch, backend)
with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct",
enable_chunked_prefill=True,
enable_prefix_caching=True,
max_model_len=4096,
) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1))
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, backend)
with vllm_runner(
"Qwen/Qwen2.5-0.5B-Instruct",
enable_chunked_prefill=True,
enable_prefix_caching=True,
max_model_len=4096,
) as vllm_model:
for prompt in UNSTABLE_PROMPT_SEQUENCE:
vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
SamplingParams(max_tokens=1))
@pytest.mark.parametrize("model", MODELS)
......
......@@ -15,10 +15,12 @@ from ..utils import models_path_prefix
from vllm.platforms import current_platform
from tests.utils import compare_two_settings, fork_new_process_for_each_test
from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test = [
(os.path.join(models_path_prefix, "facebook/opt-125m"), "quantize opt model inflight"),
(os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
"quantize inflight model with both HF and Mistral format weights")
]
models_pre_qaunt_4bit_to_test = [
......@@ -37,7 +39,7 @@ models_pre_quant_8bit_to_test = [
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -50,7 +52,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -62,7 +64,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -75,7 +77,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, description) -> None:
......@@ -93,7 +95,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
common_args = [
"--disable-log-stats",
......
......@@ -24,6 +24,14 @@ from vllm.platforms import current_platform
from ..utils import models_path_prefix
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    """
    Set VLLM_USE_V1=0 for each test in this module, since the module
    relies on V0 internals.
    """
    env_name, env_value = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(env_name, env_value)
@pytest.mark.parametrize(
"model_args",
[
......@@ -220,8 +228,6 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert qkv_proj.scheme.group_size == (-1
if group is None else group)
assert qkv_proj.weight_packed.dtype is torch.int32
assert qkv_proj.weight_scale.dtype is torch.float16
assert qkv_proj.scheme.pack_factor == pack_factor
llm.apply_model(check_model)
......
......@@ -5,7 +5,6 @@ Run `pytest tests/quantization/test_configs.py --forked`.
"""
from dataclasses import dataclass
from typing import Tuple
import pytest
import os
......@@ -55,7 +54,7 @@ MODEL_ARG_EXPTYPES = [
@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
model_path, quantization_arg, expected_type = model_arg_exptype
try:
......
......@@ -9,10 +9,17 @@ import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings, models_path_prefix
from vllm.utils import is_hip
from vllm.platforms import current_platform
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
@pytest.fixture(autouse=True, scope="function")
def use_v0_only(monkeypatch):
    # V0 is the fallback when CPU offloading is enabled.
    # This fixture is required so that the baseline also uses V0.
    env_name, env_value = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(env_name, env_value)
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
......@@ -26,7 +33,7 @@ def test_cpu_offload_fp8():
# max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
# Test GPTQ Marlin
......@@ -40,7 +47,7 @@ def test_cpu_offload_gptq():
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
# Test AWQ Marlin
......@@ -54,7 +61,7 @@ def test_cpu_offload_awq():
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
# Test wNa16
......
......@@ -9,12 +9,12 @@ import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import models_path_prefix
from vllm.utils import is_hip
from vllm.platforms import current_platform
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("experts_int8") or current_platform.is_rocm(),
reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
......
......@@ -13,7 +13,6 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod)
from vllm.platforms import current_platform
from ..utils import models_path_prefix
from vllm.utils import is_hip
MODELS = [
os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
......@@ -22,7 +21,7 @@ MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
......@@ -47,10 +46,12 @@ KV_CACHE_MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
def check_model(model):
......@@ -83,12 +84,15 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
print(outputs[0][1])
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
monkeypatch) -> None:
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
......@@ -106,8 +110,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
assert attn._v_scale == 1.0
if current_platform.is_cuda():
if current_platform.has_device_capability(
89) and not force_marlin:
if current_platform.supports_fp8() and not force_marlin:
# For GPUs with hardware support, we keep weights in fp8
assert fc1.weight.dtype == torch.float8_e4m3fn
else:
......@@ -115,11 +118,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
# for weight-only quantization using Marlin kernels
assert fc1.weight.dtype == torch.int32
elif current_platform.is_rocm():
# Only MI300 and above support quantization='fp8'
if current_platform.has_device_capability(
94) and not force_marlin:
if current_platform.supports_fp8() and not force_marlin:
# For GPUs with hardware support, we keep weights in fp8
assert fc1.weight.dtype == torch.float8_e4m3fnuz
assert fc1.weight.dtype == current_platform.fp8_dtype()
else: # unsupported ROCm platform
pytest.skip(
"Skip `test_load_fp16_model`. "
......@@ -132,7 +133,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
llm.apply_model(check_model)
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
......
......@@ -28,8 +28,10 @@ MODEL_QUANT = [
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
def test_gptq_with_dynamic(vllm_runner, model_id: str,
use_marlin_kernel: bool):
def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment