Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
...@@ -752,3 +752,72 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size, ...@@ -752,3 +752,72 @@ def test_gather_cache_mla(kv_lora_rank, qk_rope_head_dim, block_size,
ops.gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size) ops.gather_cache(src_cache, dst, block_table, cu_seq_lens, batch_size)
torch.testing.assert_close(dst, expected) torch.testing.assert_close(dst, expected)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
@torch.inference_mode()
def test_concat_and_cache_mla_cpu(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    num_tokens: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
) -> None:
    """Check ``ops.concat_and_cache_mla`` on CPU against a PyTorch scatter.

    Each token's ``kv_c`` row and ``k_pe`` row are written side by side into
    the cache entry selected by ``slot_mapping``; the reference builds the
    same layout with plain tensor indexing and the two caches are compared.
    """
    device = "cpu"
    kv_cache_dtype = "auto"
    current_platform.seed_everything(seed)
    torch.set_default_device(device)

    # Pick distinct slots so no two tokens target the same cache entry.
    total_slots = num_blocks * block_size
    slot_mapping_lst = random.sample(range(total_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst,
                                dtype=torch.long,
                                device=device)

    kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device)
    k_pe = torch.randn(num_tokens,
                       qk_rope_head_dim,
                       dtype=dtype,
                       device=device)
    entry_size = kv_lora_rank + qk_rope_head_dim

    scale = torch.tensor(0.1, dtype=torch.float32, device=device)
    kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype,
                                 kv_cache_dtype, device)

    # Reference cache: scatter every token into (block, offset) by hand.
    ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device)
    for token_idx, slot in enumerate(slot_mapping_lst):
        block_idx, block_offset = divmod(slot, block_size)
        ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[token_idx]
        ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[token_idx]

    if kv_cache_dtype != "fp8":
        ref_kv_cache = ref_temp
    else:
        ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype)
        ops.convert_fp8(ref_kv_cache,
                        ref_temp,
                        scale.item(),
                        kv_dtype=kv_cache_dtype)

    op_args = (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale)
    opcheck(
        torch.ops._C_cache_ops.concat_and_cache_mla,
        op_args,
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )

    ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping,
                             kv_cache_dtype, scale)
    torch.testing.assert_close(kv_cache, ref_kv_cache)
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
Run `pytest tests/kernels/test_cutlass.py`. Run `pytest tests/kernels/test_cutlass.py`.
""" """
import random
import pytest import pytest
import torch import torch
...@@ -499,3 +500,140 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool): ...@@ -499,3 +500,140 @@ def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
torch.float16) torch.float16)
#print("out:",out) #print("out:",out)
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
# def test_cutlass_support_opcheck():
# opcheck(torch.ops._C.cutlass_scaled_mm_supports_fp8, (capability, ))
# NOTE: the decorators and the first line of the signature used to be
# commented out while the rest of the definition was not, which left a
# dangling fragment that cannot be parsed. The test is restored in full; the
# skipif guard below still keeps it off GPUs without grouped-GEMM support.
@pytest.mark.parametrize("num_experts", [8, 64])
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("use_bias", [False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
                                per_out_ch: bool, use_bias: bool):
    """Compare ``ops.cutlass_moe_mm`` with a per-expert baseline matmul.

    One (A, B) pair is generated per expert. All experts share ``n_g`` and
    ``k_g``; only the row count ``m_g`` varies. The per-expert operands are
    stacked into single tensors, passed through the grouped kernel in one
    call, and each expert's output slice is checked against
    ``baseline_scaled_mm`` on that expert's operands.

    Args:
        num_experts: Number of groups (experts) in the grouped GEMM.
        per_act_token: Use one activation scale per row of A instead of a
            single shared scale.
        per_out_ch: Use one weight scale per output column instead of one
            scale per expert.
        use_bias: Only parametrized with ``False``; not read by the body.
    """
    # Device and dtype setup
    device = "cuda"
    out_dtype = torch.half

    # Create separate A, B, C tensors for each group
    a_tensors = []
    b_tensors = []
    a_scales_tensors = []
    b_scales_tensors = []
    baseline_tensors = []

    # expert_offsets[g] is the first row of expert g in the stacked tensors.
    expert_offsets = torch.zeros((num_experts + 1),
                                 device=device,
                                 dtype=torch.int32)
    problem_sizes = torch.zeros((num_experts, 3),
                                device=device,
                                dtype=torch.int32)

    if not per_act_token:
        # A single activation scale shared by every expert.
        one_scale_a = torch.randn((1, 1), device=device, dtype=torch.float32)

    alignment = 16  # 128 // 8
    # For variation, each group has dimensions
    n_g = alignment * random.randint(1, 64)
    k_g = alignment * random.randint(1, 64)
    # Loop-invariant because n_g is shared by all experts.
    n_b_scales = n_g if per_out_ch else 1

    for g in range(num_experts):
        m_g = alignment * random.randint(1, 64)

        expert_offsets[g + 1] = expert_offsets[g] + m_g
        problem_sizes[g][0] = m_g
        problem_sizes[g][1] = n_g
        problem_sizes[g][2] = k_g

        # Create group-specific A and B (via the file's to_fp8 helper)
        a_g = to_fp8(torch.randn((m_g, k_g), device=device))
        b_g = to_fp8(torch.randn((n_g, k_g), device=device).t())
        a_tensors.append(a_g)
        b_tensors.append(b_g)

        # Set up A/B scales
        scale_b = torch.randn((1, n_b_scales),
                              device=device,
                              dtype=torch.float32)
        b_scales_tensors.append(scale_b)

        if per_act_token:
            scale_a = torch.randn((m_g, 1),
                                  device=device,
                                  dtype=torch.float32)
            a_scales_tensors.append(scale_a)
        else:
            scale_a = one_scale_a

        # Compute baseline result for this group
        baseline_g = baseline_scaled_mm(a_g, b_g, scale_a, scale_b, out_dtype,
                                        None)
        baseline_tensors.append(baseline_g)

    # Stack the per-expert operands into the layout passed to the kernel.
    a_tensors_stacked = torch.empty((expert_offsets[num_experts], k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)
    b_tensors_stacked = torch.empty((num_experts, n_g, k_g),
                                    device=device,
                                    dtype=torch.float8_e4m3fn)

    for g in range(num_experts):
        a_tensors_stacked[expert_offsets[g]:expert_offsets[g +
                                                           1]] = a_tensors[g]
        b_tensors_stacked[g] = b_tensors[g].t()
    b_tensors_stacked = b_tensors_stacked.transpose(1, 2)

    if per_act_token:
        a_scales_tensors_stacked = torch.empty(
            (expert_offsets[num_experts], 1),
            device=device,
            dtype=torch.float32)
        for g in range(num_experts):
            a_scales_tensors_stacked[
                expert_offsets[g]:expert_offsets[g + 1]] = a_scales_tensors[g]
    else:
        a_scales_tensors_stacked = one_scale_a

    b_scales_tensors_stacked = torch.empty((num_experts, n_b_scales),
                                           device=device,
                                           dtype=torch.float32)
    for g in range(num_experts):
        b_scales_tensors_stacked[g] = b_scales_tensors[g]

    out_tensors_stacked = torch.zeros((expert_offsets[num_experts], n_g),
                                      device=device,
                                      dtype=out_dtype)

    ab_strides = torch.full((num_experts, ),
                            a_tensors_stacked.stride(0),
                            device="cuda",
                            dtype=torch.int64)
    c_strides = torch.full((num_experts, ),
                           out_tensors_stacked.stride(0),
                           device="cuda",
                           dtype=torch.int64)

    ops.cutlass_moe_mm(out_tensors_stacked, a_tensors_stacked,
                       b_tensors_stacked, a_scales_tensors_stacked,
                       b_scales_tensors_stacked, expert_offsets[:-1],
                       problem_sizes, ab_strides, ab_strides, c_strides)

    # Validate each group's result against the baseline
    for g in range(num_experts):
        baseline = baseline_tensors[g]
        c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]]
        torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4)
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
fused_topk)
from vllm.platforms import current_platform
# Expert counts used for the "e" parameter of the CUTLASS MoE tests below.
NUM_EXPERTS = [40, 64]
# Values used for the "topk" parameter, which is forwarded to fused_topk.
TOP_KS = [6, 8]
def run(a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
        w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
        ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
    """Call ``cutlass_moe_fp8`` under a single-pipeline-stage vLLM config.

    All arguments are forwarded unchanged; ``a_scale`` is passed as the
    ``a1_scale`` keyword. Returns whatever ``cutlass_moe_fp8`` returns.
    """
    parallel_config = ParallelConfig(pipeline_parallel_size=1)
    vllm_config = VllmConfig(parallel_config=parallel_config)
    with set_current_vllm_config(vllm_config):
        moe_output = cutlass_moe_fp8(a,
                                     w1_q,
                                     w2_q,
                                     w1_scale,
                                     w2_scale,
                                     topk_weights,
                                     topk_ids,
                                     ab_strides1,
                                     c_strides1,
                                     ab_strides2,
                                     c_strides2,
                                     a1_scale=a_scale)
    return moe_output
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_no_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare ``cutlass_moe_fp8`` (eager) with ``fused_experts``.

    Activations and expert weights are quantized with
    ``ops.scaled_fp8_quant``. The reference path runs ``fused_experts`` on
    the dequantized copies (quantized values times their scales), while the
    path under test receives the quantized weights and scales directly.

    Args:
        m: Number of rows in the activation matrix.
        n: Half of the first dimension of each ``w1`` expert weight.
        k: Hidden size (columns of the activations).
        e: Number of experts.
        topk: Number of experts selected per row by ``fused_topk``.
        per_act_token: Passed as ``use_per_token_if_dynamic`` when quantizing
            the activations.
        per_out_ch: Passed as ``use_per_token_if_dynamic`` when quantizing
            the weights; also selects the scale tensor shapes.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # Dequantized activations fed to the reference implementation.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Quantize each expert's weights independently.
        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Stride tensors are built once here; the original built them twice.
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        # Dequantized weights for the reference implementation.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        cutlass_output = cutlass_moe_fp8(a,
                                         w1_q,
                                         w2_q,
                                         w1_scale,
                                         w2_scale,
                                         topk_weights,
                                         topk_ids,
                                         ab_strides1,
                                         c_strides1,
                                         ab_strides2,
                                         c_strides2,
                                         a1_scale=a_scale1)

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=5e-2,
                                   rtol=1e-2)
@pytest.mark.parametrize("m", [2, 64, 224])
@pytest.mark.parametrize("n", [1024, 3072])
@pytest.mark.parametrize("k", [1024, 1536])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("per_act_token", [True, False])
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.skipif(
    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
        current_platform.get_device_capability()),
    reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_cuda_graph(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
):
    """Compare ``cutlass_moe_fp8`` replayed from a CUDA graph with
    ``fused_experts``.

    The inputs are prepared as in the eager test: activations and weights
    are quantized with ``ops.scaled_fp8_quant`` and the reference runs on
    their dequantized copies. The path under test is captured into a
    ``torch.cuda.CUDAGraph`` through ``run`` and replayed once before the
    outputs are compared.

    Args:
        m: Number of rows in the activation matrix.
        n: Half of the first dimension of each ``w1`` expert weight.
        k: Hidden size (columns of the activations).
        e: Number of experts.
        topk: Number of experts selected per row by ``fused_topk``.
        per_act_token: Passed as ``use_per_token_if_dynamic`` when quantizing
            the activations.
        per_out_ch: Passed as ``use_per_token_if_dynamic`` when quantizing
            the weights; also selects the scale tensor shapes.
    """
    current_platform.seed_everything(7)
    with set_current_vllm_config(
            VllmConfig(parallel_config=ParallelConfig(
                pipeline_parallel_size=1))):

        dtype = torch.half

        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
        w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
        w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

        # Get the right scale for tests.
        _, a_scale1 = ops.scaled_fp8_quant(
            a, use_per_token_if_dynamic=per_act_token)
        a_q, _ = ops.scaled_fp8_quant(a,
                                      a_scale1,
                                      use_per_token_if_dynamic=per_act_token)

        # Dequantized activations fed to the reference implementation.
        a_d = a_q.float().mul(a_scale1).to(dtype)

        n_b_scales = 2 * n if per_out_ch else 1
        k_b_scales = k if per_out_ch else 1

        w1_q = torch.empty((e, 2 * n, k),
                           device="cuda",
                           dtype=torch.float8_e4m3fn)
        w2_q = torch.empty((e, k, n), device="cuda", dtype=torch.float8_e4m3fn)
        w1_scale = torch.empty((e, n_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)
        w2_scale = torch.empty((e, k_b_scales, 1),
                               device="cuda",
                               dtype=torch.float32)

        # Quantize each expert's weights independently.
        for expert in range(e):
            w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
                w1[expert], use_per_token_if_dynamic=per_out_ch)
            w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
                w2[expert], use_per_token_if_dynamic=per_out_ch)
        w1_q = w1_q.transpose(1, 2)
        w2_q = w2_q.transpose(1, 2)

        # Stride tensors are built once here; the original built them twice.
        ab_strides1 = torch.full((e, ), k, device="cuda", dtype=torch.int64)
        c_strides1 = torch.full((e, ), 2 * n, device="cuda", dtype=torch.int64)
        ab_strides2 = torch.full((e, ), n, device="cuda", dtype=torch.int64)
        c_strides2 = torch.full((e, ), k, device="cuda", dtype=torch.int64)

        # Dequantized weights for the reference implementation.
        w1_d = torch.empty_like(w1)
        w2_d = torch.empty_like(w2)
        for expert in range(e):
            w1_d[expert] = (w1_q[expert].t().float() * w1_scale[expert]).half()
            w2_d[expert] = (w2_q[expert].t().float() * w2_scale[expert]).half()

        score = torch.randn((m, e), device="cuda", dtype=dtype)
        topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)

        triton_output = fused_experts(a_d, w1_d, w2_d, topk_weights, topk_ids)

        # Capture the CUTLASS path on a side stream, then replay it once.
        stream = torch.cuda.Stream()
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, stream=stream):
            cutlass_output = run(a, a_scale1, w1_q, w2_q, w1_scale, w2_scale,
                                 topk_weights, topk_ids, ab_strides1,
                                 c_strides1, ab_strides2, c_strides2)
        torch.cuda.synchronize()
        graph.replay()
        torch.cuda.synchronize()

        torch.testing.assert_close(triton_output,
                                   cutlass_output,
                                   atol=9e-2,
                                   rtol=1e-2)
...@@ -69,7 +69,7 @@ QUANT_TYPES = [ ...@@ -69,7 +69,7 @@ QUANT_TYPES = [
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("dtype", [torch.half]) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("quant_type", QUANT_TYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode() @torch.inference_mode()
def test_dequantize(hidden_size: int, dtype: torch.dtype, def test_dequantize(hidden_size: int, dtype: torch.dtype,
...@@ -82,7 +82,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, ...@@ -82,7 +82,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype,
ref_output = torch.tensor(dequantize(tensor.data, quant_type), ref_output = torch.tensor(dequantize(tensor.data, quant_type),
device="cuda").to(dtype) device="cuda").to(dtype)
output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"), output = ops.ggml_dequantize(torch.tensor(tensor.data, device="cuda"),
quant_type, *list(shape)).to(dtype) quant_type, *list(shape), dtype)
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=4e-2)
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from vllm.model_executor.layers.lightning_attn import (
linear_decode_forward_triton)
from vllm.platforms import current_platform
# Parameter grids consumed by the pytest.mark.parametrize decorators below.
NUM_HEADS = [4, 8]
HEAD_SIZES = [64]
BATCH_SIZES = [1, 2]
SEQ_LENGTHS = [16]
# Only float32 is exercised by these tests.
DTYPES = [torch.float32]
def reference_lightning_attention(q, k, v, ed, block_size, kv_history):
    """Step-by-step reference for the lightning attention recurrence.

    For every batch entry, position and head the running KV state is updated
    as ``state = decay * state + outer(k, v)`` and the output is
    ``q @ state``. ``block_size`` is accepted for signature parity with the
    real implementation but is not used here.

    Args:
        q, k: Tensors indexed as [batch, head, step, D].
        v: Tensor indexed as [batch, head, step, E].
        ed: Decay exponents. A 1-D tensor is reshaped to (1, H, 1, 1);
            otherwise it must already be indexable as ``[0, h, 0, 0]``.
        block_size: Unused.
        kv_history: Optional initial state of shape [B, H, D, E]; it is
            cloned, so the caller's tensor is not modified.

    Returns:
        ``(output, final_kv_cache)`` where ``output`` is [B, H, S, E] and
        ``final_kv_cache`` is the final state duplicated along a new
        dimension 2, giving [B, H, 2, D, E].
    """
    num_batches, num_heads, seq_len, qk_dim = q.shape
    v_dim = v.shape[-1]
    work_dtype = q.dtype

    output = torch.zeros((num_batches, num_heads, seq_len, v_dim),
                         dtype=work_dtype,
                         device=q.device)

    # Start from an independent copy of the history, or from zeros.
    if kv_history is not None:
        kv_state = kv_history.clone()
    else:
        kv_state = torch.zeros((num_batches, num_heads, qk_dim, v_dim),
                               dtype=work_dtype,
                               device=q.device)

    # Per-head decay factors, broadcast-shaped when ed is a flat vector.
    decay = torch.exp(-ed)
    if ed.dim() == 1:
        decay = decay.view(1, -1, 1, 1)

    for b in range(num_batches):
        for step in range(seq_len):
            q_heads = q[b, :, step]  # [H, D]
            k_heads = k[b, :, step]  # [H, D]
            v_heads = v[b, :, step]  # [H, E]

            for h in range(num_heads):
                # Decay the old state first, then add the new outer product.
                kv_state[b, h] = (decay[0, h, 0, 0] * kv_state[b, h] +
                                  torch.outer(k_heads[h], v_heads[h]))
                output[b, h, step] = torch.matmul(q_heads[h], kv_state[b, h])

    # Duplicate the final state along a new axis: [B, H, 2, D, E].
    final_kv_cache = torch.stack([kv_state, kv_state], dim=2)

    return output, final_kv_cache
def reference_linear_decode(q, k, v, kv_caches, slope_rate, slot_idx):
    """Reference single-step linear attention decode.

    For each batch entry whose slot id is not -1, and for each head, the
    cache is updated in place as ``outer(k, v) + exp(-slope) * cache`` and
    the head's output is ``q @ new_cache``.

    Args:
        q, k, v: Tensors indexed as [batch, head, 0, D].
        kv_caches: Tensor indexed as [batch, head]; modified in place.
        slope_rate: Per-head decay exponents.
        slot_idx: Per-batch slot ids; entries equal to -1 are skipped.

    Returns:
        Tensor of shape [B, H * D]; rows of skipped entries stay zero.
    """
    num_batches, num_heads, _, head_dim = q.shape
    output = torch.zeros(num_batches,
                         num_heads * head_dim,
                         dtype=q.dtype,
                         device=q.device)

    # Decay factors are computed once for all heads: [H, 1, 1].
    decay = torch.exp(-slope_rate).view(-1, 1, 1)

    for b in range(num_batches):
        # Padding entries produce no output and leave the cache untouched.
        if slot_idx[b].item() == -1:
            continue

        for h in range(num_heads):
            # NOTE(review): the cache is indexed by batch position b, not by
            # the slot id -- confirm this matches the kernel under test.
            kv_new = (torch.outer(k[b, h, 0], v[b, h, 0]) +
                      decay[h, 0, 0] * kv_caches[b, h])

            start = h * head_dim
            output[b, start:start + head_dim] = torch.matmul(
                q[b, h, 0], kv_new)
            kv_caches[b, h] = kv_new

    return output
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton(
    batch_size: int,
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Compare ``linear_decode_forward_triton`` with the Python reference.

    Both the returned output and the in-place updated KV cache are checked,
    and the output shape must be (batch_size, num_heads * head_size).
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    base = 0.01
    qkv_shape = (batch_size, num_heads, 1, head_size)
    q = base * torch.randn(*qkv_shape, dtype=dtype)
    k = base * torch.randn(*qkv_shape, dtype=dtype)
    v = base * torch.randn(*qkv_shape, dtype=dtype)

    kv_caches = base * torch.randn(batch_size,
                                   num_heads,
                                   head_size,
                                   head_size,
                                   dtype=dtype,
                                   device="cuda")

    # The reference mutates its cache, so give it a separate copy.
    kv_caches_copy = kv_caches.clone()

    # Head h gets slope 0.1 * (h + 1).
    slope_rate = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                              device="cuda")

    slot_idx = torch.arange(batch_size, device="cuda")

    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                 slope_rate, slot_idx)

    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                               slope_rate, slot_idx)

    tolerances = dict(rtol=1e-1, atol=1e-1)
    torch.testing.assert_close(triton_output, reference_output, **tolerances)
    torch.testing.assert_close(kv_caches, kv_caches_copy, **tolerances)

    assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_linear_decode_forward_triton_with_padding(
    num_heads: int,
    head_size: int,
    dtype: torch.dtype,
):
    """Compare the Triton decode with the reference when a slot is padding.

    The batch has four entries and the third has slot id -1. Only entries
    with a valid slot id are compared, both for the KV cache and the output.
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    batch_size = 4
    base = 0.01
    qkv_shape = (batch_size, num_heads, 1, head_size)
    q = base * torch.randn(*qkv_shape, dtype=dtype)
    k = base * torch.randn(*qkv_shape, dtype=dtype)
    v = base * torch.randn(*qkv_shape, dtype=dtype)

    kv_caches = base * torch.randn(batch_size,
                                   num_heads,
                                   head_size,
                                   head_size,
                                   dtype=dtype,
                                   device="cuda")

    # The reference mutates its cache, so give it a separate copy.
    kv_caches_copy = kv_caches.clone()

    # Head h gets slope 0.1 * (h + 1).
    slope_rate = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                              device="cuda")

    # Entry 2 is a padding position.
    slot_idx = torch.tensor([0, 1, -1, 2], device="cuda")

    triton_output = linear_decode_forward_triton(q, k, v, kv_caches,
                                                 slope_rate, slot_idx)

    reference_output = reference_linear_decode(q, k, v, kv_caches_copy,
                                               slope_rate, slot_idx)

    # Boolean mask selecting the output rows of non-padding entries.
    is_valid = slot_idx != -1
    padding_mask = is_valid.unsqueeze(1).expand(-1, num_heads * head_size)
    triton_masked = triton_output[padding_mask]
    reference_masked = reference_output[padding_mask]

    atol, rtol = 1.5e-1, 1.5e-1

    for i in range(batch_size):
        if not is_valid[i]:
            continue
        torch.testing.assert_close(kv_caches[i],
                                   kv_caches_copy[i],
                                   rtol=rtol,
                                   atol=atol)

    torch.testing.assert_close(triton_masked,
                               reference_masked,
                               rtol=rtol,
                               atol=atol)

    assert triton_output.shape == (batch_size, num_heads * head_size)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENGTHS)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_lightning_attention_reference(
    batch_size: int,
    num_heads: int,
    head_size: int,
    seq_len: int,
    dtype: torch.dtype,
):
    """Compare ``lightning_attention`` with the sequential reference.

    Both implementations start from the same KV history (each gets its own
    copy) and must agree on the attention output and on the returned KV
    cache.
    """
    torch.set_default_device("cuda")
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    current_platform.seed_everything(42)

    base = 0.01
    qkv_shape = (batch_size, num_heads, seq_len, head_size)
    q = base * torch.randn(*qkv_shape, dtype=dtype)
    k = base * torch.randn(*qkv_shape, dtype=dtype)
    v = base * torch.randn(*qkv_shape, dtype=dtype)

    # Head h gets decay exponent 0.1 * (h + 1).
    ed = torch.tensor([0.1 * (h + 1) for h in range(num_heads)],
                      device="cuda")

    kv_history = base * torch.randn(batch_size,
                                    num_heads,
                                    head_size,
                                    head_size,
                                    dtype=dtype,
                                    device="cuda")

    # Separate copy for the implementation under test.
    kv_history_clone = kv_history.clone()

    ref_output, ref_kv_cache = reference_lightning_attention(
        q, k, v, ed, 256, kv_history)

    from vllm.model_executor.layers.lightning_attn import lightning_attention
    actual_output, actual_kv_cache = lightning_attention(
        q, k, v, ed, 256, kv_history_clone)

    tolerances = dict(rtol=1.5e-1, atol=1.5e-1)
    torch.testing.assert_close(ref_output, actual_output, **tolerances)
    torch.testing.assert_close(ref_kv_cache, actual_kv_cache, **tolerances)

    assert ref_output.shape == (batch_size, num_heads, seq_len, head_size)
    assert ref_kv_cache.shape == actual_kv_cache.shape
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
import torch.nn.functional as F
from torch import Tensor
import vllm._custom_ops as ops
from vllm.platforms import current_platform
def cdiv(a, b):
    """Return ``(a + b - 1) // b``, i.e. ``a / b`` rounded up for positive
    integers."""
    padded = a + b - 1
    return padded // b
def ref_mla(
out: Tensor, # (bs, num_heads, v_head_dim)
query: Tensor, # (bs, num_heads, head_dim)
kv_cache: Tensor, # (num_blocks, block_size, head_dim)
scale: float,
block_tables: Tensor, # (bs, max_num_blocks)
seq_lens: Tensor, # (bs,)
):
bs, num_heads, v_head_dim = out.shape
head_dim = query.shape[2]
for i in range(bs):
# gather and flatten KV-cache
kv = kv_cache[
block_tables[i]] # (max_num_blocks, block_size, head_dim)
kv = kv.view(1, -1,
head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim)
v = kv[:, :, :v_head_dim]
q = query[i].view(num_heads, 1, head_dim)
o = F.scaled_dot_product_attention(q,
kv,
v,
scale=scale,
enable_gqa=True)
out[i] = o.view(num_heads, v_head_dim)
return out
@pytest.mark.parametrize("bs", [4])
@pytest.mark.parametrize("mean_seq_len", [256])
@pytest.mark.parametrize("h_q", [16])
@pytest.mark.parametrize("d", [576])
@pytest.mark.parametrize("dv", [512])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("dtype", [torch.float, torch.half, torch.bfloat16])
@pytest.mark.parametrize("varlen", [False, True])
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_mla_decode_cpu(
    bs: int,
    mean_seq_len: int,
    h_q: int,
    d: int,
    dv: int,
    block_size: int,
    dtype: torch.dtype,
    varlen: bool,
):
    """Compare ``ops.mla_decode_kvcache_cpu`` with ``ref_mla``.

    Cache positions past each sequence's length are filled with NaN, so any
    NaN in the kernel output indicates a read outside the valid range.

    Args:
        bs: Batch size.
        mean_seq_len: Sequence length, or the mean of the sampled lengths
            when ``varlen`` is true.
        h_q: Number of query heads.
        d: Head dimension of the queries and of the KV cache entries.
        dv: Head dimension of the output.
        block_size: Number of cache entries per block.
        dtype: Default dtype used for every tensor created in the test.
        varlen: Sample a different length per sequence instead of using
            ``mean_seq_len`` for all of them.
    """
    # The default dtype is process-wide state; restore it afterwards so it
    # does not leak into tests that run later in the same process.
    prev_default_dtype = torch.get_default_dtype()
    torch.set_default_dtype(dtype)
    try:
        torch.manual_seed(0)

        scale = d**(-0.5)
        if varlen:
            seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2)
            seq_lens = seq_lens.clip(2).to(torch.int32)
        else:
            seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32)
        max_seq_len = seq_lens.max().item()
        seqlen_pad = cdiv(max_seq_len, 256) * 256  # is this necessary?

        q = torch.randn(bs, h_q, d)
        # Each sequence owns a contiguous run of seqlen_pad // block_size
        # blocks.
        block_table = torch.arange(bs * seqlen_pad // block_size,
                                   dtype=torch.int32)
        block_table = block_table.view(bs, seqlen_pad // block_size)

        kv_cache = torch.randn(block_table.numel(), block_size, d)
        # Poison everything past the valid length of each sequence.
        for i, seq_len in enumerate(seq_lens.tolist()):
            kv_cache.view(bs, seqlen_pad, d)[i, seq_len:] = float("nan")

        out_mla = q.new_zeros(bs, h_q, dv)
        ops.mla_decode_kvcache_cpu(out_mla, q, kv_cache, scale, block_table,
                                   seq_lens)

        out_ref = q.new_zeros(bs, h_q, dv)
        ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens)

        assert not out_mla.isnan().any(), "Likely read out of bounds"
        torch.testing.assert_close(out_mla, out_ref)
    finally:
        torch.set_default_dtype(prev_default_dtype)
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
Run `pytest tests/kernels/test_moe.py`. Run `pytest tests/kernels/test_moe.py`.
""" """
import pytest import pytest
import torch import torch
from torch.nn import Parameter from torch.nn import Parameter
...@@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, ...@@ -216,11 +215,17 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
@pytest.mark.parametrize("dtype", @pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16]) [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False]) @pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
@torch.inference_mode() @torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype, padding: bool): def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
monkeypatch):
"""Make sure our Mixtral MoE implementation agrees with the one from """Make sure our Mixtral MoE implementation agrees with the one from
huggingface.""" huggingface."""
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Instantiate our and huggingface's MoE blocks # Instantiate our and huggingface's MoE blocks
config = MixtralConfig() config = MixtralConfig()
hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda") hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
...@@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool): ...@@ -268,10 +273,18 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool):
torch.bfloat16: 1e-2, torch.bfloat16: 1e-2,
} }
torch.testing.assert_close(hf_states.flatten(0, 1), if use_rocm_aiter:
vllm_states, # The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501
rtol=mixtral_moe_tol[dtype], # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501
atol=mixtral_moe_tol[dtype]) torch.testing.assert_close(hf_states.flatten(0, 1),
vllm_states,
rtol=0.01,
atol=100)
else:
torch.testing.assert_close(hf_states.flatten(0, 1),
vllm_states,
rtol=mixtral_moe_tol[dtype],
atol=mixtral_moe_tol[dtype])
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
......
...@@ -167,6 +167,7 @@ def test_contexted_kv_attention( ...@@ -167,6 +167,7 @@ def test_contexted_kv_attention(
block_table, block_table,
b_start_loc, b_start_loc,
b_seq_len, b_seq_len,
MAX_CTX_LEN,
max_input_len, max_input_len,
k_scale, k_scale,
v_scale, v_scale,
...@@ -183,6 +184,7 @@ def test_contexted_kv_attention( ...@@ -183,6 +184,7 @@ def test_contexted_kv_attention(
block_table, block_table,
b_start_loc, b_start_loc,
b_seq_len, b_seq_len,
MAX_CTX_LEN,
max_input_len, max_input_len,
k_scale, k_scale,
v_scale, v_scale,
...@@ -401,6 +403,7 @@ def test_contexted_kv_attention_alibi( ...@@ -401,6 +403,7 @@ def test_contexted_kv_attention_alibi(
block_table, block_table,
b_start_loc, b_start_loc,
b_seq_len, b_seq_len,
MAX_CTX_LEN,
max_input_len, max_input_len,
k_scale, k_scale,
v_scale, v_scale,
...@@ -417,6 +420,7 @@ def test_contexted_kv_attention_alibi( ...@@ -417,6 +420,7 @@ def test_contexted_kv_attention_alibi(
block_table, block_table,
b_start_loc, b_start_loc,
b_seq_len, b_seq_len,
MAX_CTX_LEN,
max_input_len, max_input_len,
k_scale, k_scale,
v_scale, v_scale,
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
# Device strings the tests are parametrized over: only "cuda:0" when exactly
# one CUDA device is reported, otherwise "cuda:0" and "cuda:1".
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_cpu_write(device):
    """Write through the CPU tensor and read the result via the CUDA view.

    Three elements of a pinned CPU tensor are set, the CUDA view is doubled
    in place, and the doubled values are read back through the CUDA view.
    """
    torch.set_default_device(device)
    cpu_tensor = torch.zeros(10,
                             10,
                             device="cpu",
                             pin_memory=True,
                             dtype=torch.int32)
    cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
    assert cuda_view.device.type == "cuda"

    probes = ((0, 0), (2, 3), (4, 5))
    written = (1, 2, -1)
    doubled = (2, 4, -2)

    # The view starts out all zeros.
    for row, col in probes:
        assert cuda_view[row, col] == 0

    # Write on the CPU side, then scale through the CUDA view.
    for (row, col), value in zip(probes, written):
        cpu_tensor[row, col] = value

    cuda_view.mul_(2)

    for (row, col), expected in zip(probes, doubled):
        assert cuda_view[row, col] == expected
@pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.")
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_gpu_write(device):
    """Write through the CUDA view and read the result via the CPU tensor.

    Three elements are set through the CUDA view, the view is doubled in
    place, and the doubled values are read back from the pinned CPU tensor.
    """
    torch.set_default_device(device)
    cpu_tensor = torch.zeros(10,
                             10,
                             device="cpu",
                             pin_memory=True,
                             dtype=torch.int32)
    cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)
    assert cuda_view.device.type == "cuda"

    probes = ((0, 0), (2, 3), (4, 5))
    written = (1, 2, -1)
    doubled = (2, 4, -2)

    # The view starts out all zeros.
    for row, col in probes:
        assert cuda_view[row, col] == 0

    # Write and scale on the CUDA side.
    for (row, col), value in zip(probes, written):
        cuda_view[row, col] = value

    cuda_view.mul_(2)

    # The CPU tensor must observe the writes made through the view.
    for (row, col), expected in zip(probes, doubled):
        assert cpu_tensor[row, col] == expected
\ No newline at end of file
...@@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type): ...@@ -15,7 +15,8 @@ def test_ggml_opcheck(quant_type):
qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8) qweight = torch.randint(0, 100, shape, device='cuda', dtype=torch.uint8)
m = qweight.shape[0] m = qweight.shape[0]
n = qweight.shape[1] // type_size * block_size n = qweight.shape[1] // type_size * block_size
opcheck(torch.ops._C.ggml_dequantize, (qweight, quant_type, m, n)) opcheck(torch.ops._C.ggml_dequantize,
(qweight, quant_type, m, n, torch.float16))
x = torch.rand((m, 512), device='cuda', dtype=torch.float16) x = torch.rand((m, 512), device='cuda', dtype=torch.float16)
opcheck(torch.ops._C.ggml_mul_mat_a8, opcheck(torch.ops._C.ggml_mul_mat_a8,
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
import tempfile import tempfile
from collections import OrderedDict from collections import OrderedDict
from typing import TypedDict
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest import pytest
...@@ -28,28 +27,6 @@ from vllm.platforms import current_platform ...@@ -28,28 +27,6 @@ from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
class ContextIDInfo(TypedDict):
    """Typed record pairing a LoRA id with a context-length label."""

    # Integer identifier of the LoRA adapter.
    lora_id: int
    # Context length stored as a string label.
    context_length: str
class ContextInfo(TypedDict):
    """Typed record pairing a LoRA string with a context-length label."""

    # String identifying the LoRA.
    lora: str
    # Context length stored as a string label.
    context_length: str
# One record per long-context LoRA checkpoint: its id and context-length label.
LONG_LORA_INFOS: list[ContextIDInfo] = [
    {"lora_id": 1, "context_length": "16k"},
    {"lora_id": 2, "context_length": "16k"},
    {"lora_id": 3, "context_length": "32k"},
]
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture. """Allow subdirectories to skip global cleanup by overriding this fixture.
...@@ -256,41 +233,6 @@ def long_context_lora_files_16k_1(): ...@@ -256,41 +233,6 @@ def long_context_lora_files_16k_1():
return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1") return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
@pytest.fixture(scope="session")
def long_context_lora_files_16k_2():
    """Return the path of the long_context_16k_testing_2 LoRA checkpoint."""
    # Resolved under models_path_prefix instead of being fetched with
    # snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2").
    checkpoint = "SangBinCho/long_context_16k_testing_2"
    return os.path.join(models_path_prefix, checkpoint)
@pytest.fixture(scope="session")
def long_context_lora_files_32k():
    """Return the path of the long_context_32k_testing LoRA checkpoint."""
    # Resolved under models_path_prefix instead of being fetched with
    # snapshot_download(repo_id="SangBinCho/long_context_32k_testing").
    checkpoint = "SangBinCho/long_context_32k_testing"
    return os.path.join(models_path_prefix, checkpoint)
@pytest.fixture(scope="session")
def long_context_infos(long_context_lora_files_16k_1,
                       long_context_lora_files_16k_2,
                       long_context_lora_files_32k):
    """Map every LoRA id in LONG_LORA_INFOS to its checkpoint and length.

    Returns a dict keyed by LoRA id whose values hold the checkpoint path
    ("lora") and the context-length label ("context_length").

    Raises AssertionError if LONG_LORA_INFOS lists an id with no matching
    checkpoint fixture.
    """
    cleanup_dist_env_and_memory(shutdown_ray=True)

    # LoRA id -> checkpoint path supplied by the corresponding fixture.
    checkpoint_by_id = {
        1: long_context_lora_files_16k_1,
        2: long_context_lora_files_16k_2,
        3: long_context_lora_files_32k,
    }

    infos: dict[int, ContextInfo] = {}
    for entry in LONG_LORA_INFOS:
        lora_id = entry["lora_id"]
        if lora_id not in checkpoint_by_id:
            raise AssertionError("Unknown lora id")
        infos[lora_id] = {
            "context_length": entry["context_length"],
            "lora": checkpoint_by_id[lora_id],
        }
    return infos
@pytest.fixture @pytest.fixture
def llama_2_7b_engine_extra_embeddings(): def llama_2_7b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True) cleanup_dist_env_and_memory(shutdown_ray=True)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -43,14 +43,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -43,14 +43,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
def test_baichuan_lora(baichuan_lora_files): def test_baichuan_lora(baichuan_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
......
...@@ -20,6 +20,14 @@ EXPECTED_LORA_OUTPUT = [ ...@@ -20,6 +20,14 @@ EXPECTED_LORA_OUTPUT = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format(query="How many singers do we have?"),
...@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_chatglm3_lora(chatglm3_lora_files): def test_chatglm3_lora(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import importlib
import random import random
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass from dataclasses import dataclass
...@@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import ( ...@@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
# yapf: disable # yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
LinearScalingRotaryEmbeddingWithLoRA,
LogitsProcessorWithLoRA, LoRAMapping, LogitsProcessorWithLoRA, LoRAMapping,
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
MergedQKVParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA,
...@@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, ...@@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA, RowParallelLinearWithLoRA,
VocabParallelEmbeddingWithLoRA) VocabParallelEmbeddingWithLoRA)
# yapf: enable # yapf: enable
from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
PackedLoRALayerWeights)
from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.punica_wrapper import get_punica_wrapper
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear, MergedColumnParallelLinear,
...@@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
ReplicatedLinear, ReplicatedLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
...@@ -60,32 +56,16 @@ DEVICES = ([ ...@@ -60,32 +56,16 @@ DEVICES = ([
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] if current_platform.is_cuda_alike() else ["cpu"]) ] if current_platform.is_cuda_alike() else ["cpu"])
#For GPU, we will launch different triton kernels between the prefill and decode # prefill stage(True) or decode stage(False)
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
STAGES = [True, False] STAGES = [True, False]
# With the inclusion of V1 tests (look at the run_with_both_engines_lora), NUM_RANDOM_SEEDS = 6
# the tests in this file run twice, once with the V0 engine and then with
# the V1 engine.
# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
# with the inclusion of V1 tests to maintain the CI test times.
NUM_RANDOM_SEEDS = 5
# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
# 256 before. It is cut to half with the inclusion of V1 tests to maintain
# the CI test times.
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora): def clean_cache():
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
# Reload punica_gpu as the kernels used are tied to engine type.
from vllm.lora.punica_wrapper import punica_gpu
importlib.reload(punica_gpu)
# Release any memory we might be holding on to. CI runs OOMs otherwise. # Release any memory we might be holding on to. CI runs OOMs otherwise.
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
_LORA_B_PTR_DICT) _LORA_B_PTR_DICT)
...@@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora): ...@@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora):
yield yield
@pytest.fixture(autouse=True)
def skip_cuda_with_stage_false(request):
    """Skip the ``stage=False`` parametrization on CUDA-like platforms.

    On cuda-like platforms, we use the same kernels for prefill and decode
    stage, and 'stage' is generally ignored, so we only need to test once.

    Tests that are not parametrized, or have no ``stage`` parameter, are
    left untouched.
    """
    if current_platform.is_cuda_alike():
        # Only parametrized test items carry a ``callspec``; fall back to an
        # empty mapping instead of guarding with a catch-all ``except``.
        callspec = getattr(request.node, "callspec", None)
        params = getattr(callspec, "params", None) or {}
        if params.get("stage") is False:
            pytest.skip("Skip test when stage=False")
    yield
def get_random_id_to_index(num_loras: int, def get_random_id_to_index(num_loras: int,
num_slots: int, num_slots: int,
log: bool = True) -> list[Optional[int]]: log: bool = True) -> list[Optional[int]]:
...@@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, ...@@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
atol=atol) atol=atol)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 8])
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
                                             (6.0, 1.0)])
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
@pytest.mark.parametrize("is_neox_style", [True, False])
@pytest.mark.parametrize("rotary_dim", [None, 32])
@pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
                    reason="Only CUDA backends are supported")
def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                       scaling_factors, max_position,
                                       is_neox_style, rotary_dim, head_size,
                                       seq_len) -> None:
    """Compare the LoRA rotary embedding with linear-scaling RoPE.

    Builds a ``LinearScalingRotaryEmbeddingWithLoRA`` around a plain rope and
    a reference rope created with ``rope_type="linear"``, checks the
    per-scaling-factor offsets, then runs both on the same random inputs and
    requires the rotated query/key tensors to match.
    """
    dtype = torch.float16
    max_loras = 8
    seed = 0
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
    assert check_punica_wrapper(punica_wrapper)
    lora_config = LoRAConfig(max_loras=max_loras,
                             max_lora_rank=8,
                             long_lora_scaling_factors=scaling_factors,
                             lora_dtype=dtype)

    if rotary_dim is None:
        rotary_dim = head_size
    base = 10000
    batch_size = 5 * num_loras
    num_heads = 7

    # Verify lora is equivalent to linear scaling rotary embedding.
    rope = get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
    )
    lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
    lora_rope.set_mapping(punica_wrapper)
    lora_rope.create_lora_weights(max_loras, lora_config)
    linear_rope = get_rope(head_size, rotary_dim, max_position, base,
                           is_neox_style, {
                               "rope_type": "linear",
                               "factor": scaling_factors
                           })
    linear_rope = linear_rope.to(dtype=dtype)
    id_to_index = get_random_id_to_index(num_loras, max_loras)
    _, index_mapping, prompt_mapping = create_random_inputs(
        active_lora_ids=[0],
        num_inputs=batch_size,
        input_size=(1, max_position),
        input_range=(0, lora_config.lora_extra_vocab_size),
        input_type=torch.float16,
        device=device)

    lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
    long_lora_context = LongContextLoRAContext(list(scaling_factors),
                                               rotary_dim)

    next_expected_offset = 0
    # Make sure the offset is correct.
    scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
    for scaling_factor, offset in scaling_factor_to_offset.items():
        assert offset == next_expected_offset
        next_expected_offset += scaling_factor * max_position

    for i, scaling_factor in enumerate(scaling_factors):
        long_lora_context.offsets_by_lora_id[i] = (
            scaling_factor_to_offset.get(scaling_factor, 0))
    punica_wrapper.update_metadata(
        lora_mapping,
        id_to_index,
        max_loras,
        512,
        lora_config.lora_extra_vocab_size,
        long_lora_context=long_lora_context,
    )

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=dtype)
    key = torch.randn_like(query)
    ref_q, ref_k = linear_rope(positions, query, key)
    actual_q, actual_k = lora_rope(positions, query, key)

    # The comparison results used to be discarded (bare torch.allclose
    # calls), so the test could never fail here; assert them explicitly.
    torch.testing.assert_close(actual_q, ref_q)
    torch.testing.assert_close(actual_k, ref_k)
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) @pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))) "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
......
...@@ -29,6 +29,14 @@ EXPECTED_LORA_OUTPUT = [ ...@@ -29,6 +29,14 @@ EXPECTED_LORA_OUTPUT = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
...@@ -72,16 +80,6 @@ def generate_and_test(llm, sql_lora_files): ...@@ -72,16 +80,6 @@ def generate_and_test(llm, sql_lora_files):
print("removing lora") print("removing lora")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_llama_lora(sql_lora_files): def test_llama_lora(sql_lora_files):
...@@ -127,8 +125,6 @@ def test_llama_lora_warmup(sql_lora_files): ...@@ -127,8 +125,6 @@ def test_llama_lora_warmup(sql_lora_files):
"less when using lora than when not using lora") "less when using lora than when not using lora")
# V1 Test: Failing due to numerics on V1.
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files): def test_llama_lora_tp4(sql_lora_files):
...@@ -158,20 +154,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): ...@@ -158,20 +154,3 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill=True, enable_chunked_prefill=True,
) )
generate_and_test(llm, sql_lora_files) generate_and_test(llm, sql_lora_files)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files):
    """Run generate_and_test on a TP=4 engine with sharded LoRAs and bias."""
    # Engine options under test: tensor parallel size 4, fully sharded LoRA
    # weights and LoRA bias enabled, with chunked prefill turned on.
    engine_options = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "tensor_parallel_size": 4,
        "fully_sharded_loras": True,
        "enable_lora_bias": True,
        "enable_chunked_prefill": True,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_options)
    generate_and_test(llm, sql_lora_files)
...@@ -7,7 +7,6 @@ import torch ...@@ -7,7 +7,6 @@ import torch
from safetensors.torch import load_file from safetensors.torch import load_file
from torch import nn from torch import nn
from vllm import envs
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA, from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
...@@ -33,6 +32,17 @@ DEVICES = ([ ...@@ -33,6 +32,17 @@ DEVICES = ([
] if current_platform.is_cuda_alike() else ["cpu"]) ] if current_platform.is_cuda_alike() else ["cpu"])
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """Force ``VLLM_USE_V1=0`` for the duration of every test in this file.

    Some tests here depend on V0 internals. V0 and V1 use the same
    LoRAModelManager, so testing V0 alone is sufficient.
    """
    # The variable is set inside a monkeypatch context, so it is restored
    # when the context exits after the test has run.
    with monkeypatch.context() as patched_env:
        patched_env.setenv('VLLM_USE_V1', '0')
        yield
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device): def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file( tensors = load_file(
...@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): ...@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device assert manager.device == device
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
...@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, ...@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device) device)
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
......
...@@ -60,7 +60,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -60,7 +60,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
@create_new_process_for_each_test()
def test_minicpmv_lora(minicpmv_lora_files): def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
...@@ -80,6 +79,8 @@ def test_minicpmv_lora(minicpmv_lora_files): ...@@ -80,6 +79,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
assert EXPECTED_OUTPUT[i].startswith(output2[i]) assert EXPECTED_OUTPUT[i].startswith(output2[i])
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
...@@ -101,6 +102,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): ...@@ -101,6 +102,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm") reason="MiniCPM-V dependency xformers incompatible with ROCm")
......
...@@ -12,6 +12,14 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2") ...@@ -12,6 +12,14 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
...@@ -50,14 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -50,14 +58,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
# Skipping for V1 for now as we are hitting, # Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error. # "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1 @pytest.mark.skip_v1
......
...@@ -40,6 +40,14 @@ else: ...@@ -40,6 +40,14 @@ else:
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
def do_sample(llm: vllm.LLM, def do_sample(llm: vllm.LLM,
lora_path: str, lora_path: str,
lora_id: int, lora_id: int,
...@@ -72,14 +80,6 @@ def do_sample(llm: vllm.LLM, ...@@ -72,14 +80,6 @@ def do_sample(llm: vllm.LLM,
return generated_texts return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` per test.

    The body is intentionally empty: depending on the fixture is what makes
    each test run with both engines. It could be promoted to conftest.py to
    apply to every test in a package.
    """
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment