Commit fe851fbc authored by zhouxiang

Supplement newly added files for version 0.2.6

parent e2d98ddc
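
# --- New test file: sampling logits processors ---
# Checks _process_temperature, _process_bad_words, _process_repetition_penalty,
# _filter_topk_sorted and _filter_topp_sorted against the equivalent
# transformers logits warpers/processors.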
import pytest
import torch
from transformers.generation.logits_process import (
RepetitionPenaltyLogitsProcessor, TemperatureLogitsWarper,
TopKLogitsWarper, TopPLogitsWarper)
@pytest.mark.parametrize('inplace', [True, False])
def test_process_temperature(inplace):
from lmdeploy.pytorch.engine.logits_process import _process_temperature
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
temperatures = torch.rand(batch_size)
gt = []
for score, temperature in zip(scores, temperatures):
warper = TemperatureLogitsWarper(temperature.item())
gt.append(warper(None, score[None]))
gt = torch.cat(gt)
out = _process_temperature(scores, temperatures, inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_process_bad_words(inplace):
from lmdeploy.pytorch.engine.logits_process import _process_bad_words
filter_value: float = -float('inf')
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
bad_words = torch.tensor([
[0, 1],
[3, -1],
[4, 4],
[-1, -1],
])
out_scores = _process_bad_words(scores, bad_words, inplace=inplace)
for score, bw in zip(out_scores, bad_words):
bw = bw.tolist()
for w in bw:
if w >= 0:
assert score[w] == filter_value
@pytest.mark.parametrize('inplace', [True, False])
def test_process_repetition_penalty(inplace):
from lmdeploy.pytorch.engine.logits_process import \
_process_repetition_penalty
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens)
input_ids = torch.tensor([
[0, 1],
[3, 6],
[4, 4],
[0, 0],
])
penalties = 1 + torch.rand(batch_size)
gt = []
for score, ids, penalty in zip(scores, input_ids, penalties):
warper = RepetitionPenaltyLogitsProcessor(penalty.item())
gt.append(warper(ids[None], score[None].clone()))
gt = torch.cat(gt)
out = _process_repetition_penalty(scores,
input_ids,
penalties,
inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topk_sorted(inplace):
from lmdeploy.pytorch.engine.logits_process import _filter_topk_sorted
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
top_k = torch.randint(4, num_tokens - 4, (batch_size, ))
gt = []
for score, k in zip(scores, top_k):
warper = TopKLogitsWarper(k.item())
gt.append(warper(None, score[None].clone()))
gt = torch.cat(gt)
out = _filter_topk_sorted(scores, top_k, inplace=inplace)
torch.testing.assert_close(out, gt)
@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topp_sorted(inplace):
from lmdeploy.pytorch.engine.logits_process import _filter_topp_sorted
batch_size = 4
num_tokens = 16
scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
top_p = torch.rand(batch_size)
gt = []
for score, p in zip(scores, top_p):
warper = TopPLogitsWarper(p.item())
gt.append(warper(None, score[None].clone()))
gt = torch.cat(gt)
out = _filter_topp_sorted(scores, top_p, inplace=inplace)
torch.testing.assert_close(out, gt)
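
# --- New test file: engine request manager ---
# Verifies RequestManager callback binding and response handling, in both
# thread-safe and non-thread-safe modes, using a dummy step loop.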
import asyncio
import pytest
from lmdeploy.pytorch.engine.request import (RequestManager, RequestType,
Response, ResponseType)
class TestRequestHandler:
@pytest.fixture
def event_loop(self):
old_loop = asyncio.get_event_loop()
new_loop = asyncio.new_event_loop()
yield new_loop
new_loop.stop()
asyncio.set_event_loop(old_loop)
@pytest.fixture
def thread_safe(self, request):
yield request.param
@pytest.fixture
def manager(self, thread_safe):
yield RequestManager(thread_safe=thread_safe)
@pytest.mark.parametrize('thread_safe', [True, False])
def test_bind(self, manager, event_loop):
def __stop_engine_callback(reqs, **kwargs):
for req in reqs:
manager.response(
Response(type=ResponseType.SUCCESS,
sender_id=req.sender_id,
req_id=req.req_id,
data=f'{req.data} success'))
async def __dummy_loop():
while True:
manager.step()
await asyncio.sleep(0.1)
asyncio.set_event_loop(event_loop)
sender = manager.build_sender()
manager.start_loop(__dummy_loop)
# test not bind
req_id = sender.send_async(RequestType.STOP_ENGINE, None)
resp = sender.recv(req_id)
assert resp.type == ResponseType.HANDLER_NOT_EXIST
assert manager.is_loop_alive()
# test bind success
sender.send_async(RequestType.STOP_ENGINE, None)
manager.bind_func(RequestType.STOP_ENGINE, __stop_engine_callback)
req_id = sender.send_async(RequestType.STOP_ENGINE, 'test')
resp = sender.recv(req_id)
assert resp.data == 'test success'
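
# --- New test file: apply_rotary_pos_emb kernel ---
# Compares the apply_rotary_pos_emb kernel against a reference cos/sin
# rotation (_rotate_half) for several dtypes and query/key head counts.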
import pytest
import torch
from lmdeploy.pytorch.kernels import apply_rotary_pos_emb
def _rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
class TestApplyRotary:
@pytest.fixture
def dtype(self, request):
yield request.param
@pytest.fixture
def batch_size(self):
yield 4
@pytest.fixture
def num_heads_q(self, request):
yield request.param
@pytest.fixture
def num_heads_k(self, request):
yield request.param
@pytest.fixture
def feature_dim(self):
yield 16
@pytest.fixture
def seq_length(self, batch_size):
yield torch.randint(8, 16, (batch_size, ), device='cuda')
@pytest.fixture
def max_seqlen(self, seq_length):
yield seq_length.max()
@pytest.fixture
def q_states(self, seq_length, num_heads_q, feature_dim, dtype):
yield torch.rand(seq_length.sum(),
num_heads_q,
feature_dim,
dtype=dtype,
device='cuda')
@pytest.fixture
def k_states(self, seq_length, num_heads_k, feature_dim, dtype):
yield torch.rand(seq_length.sum(),
num_heads_k,
feature_dim,
dtype=dtype,
device='cuda')
@pytest.fixture
def position_ids_1d(self, seq_length, max_seqlen):
yield torch.randint(0,
max_seqlen.item(), (seq_length.sum().item(), ),
device='cuda')
@pytest.fixture
def cached_cos(self, max_seqlen, feature_dim, dtype):
yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
@pytest.fixture
def cached_sin(self, max_seqlen, feature_dim, dtype):
yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
@pytest.fixture
def gt(self, q_states, k_states, cached_cos, cached_sin, position_ids_1d):
cos = cached_cos[position_ids_1d, None, :]
sin = cached_sin[position_ids_1d, None, :]
q_embed = q_states * cos + _rotate_half(q_states) * sin
k_embed = k_states * cos + _rotate_half(k_states) * sin
yield q_embed, k_embed
@pytest.mark.parametrize('dtype',
[torch.bfloat16, torch.float16, torch.float32],
indirect=True)
@pytest.mark.parametrize(('num_heads_q', 'num_heads_k'), [(8, 8), (8, 4)],
indirect=True)
def test_apply_rotary(self, q_states, k_states, cached_cos, cached_sin,
position_ids_1d, gt):
q_embed, k_embed = apply_rotary_pos_emb(q_states, k_states, cached_cos,
cached_sin, None,
position_ids_1d)
q_gt, k_gt = gt
rtol = None
atol = None
if q_states.dtype == torch.float16:
rtol = 1e-5
atol = 1e-3
torch.testing.assert_close(q_embed, q_gt, rtol=rtol, atol=atol)
torch.testing.assert_close(k_embed, k_gt, rtol=rtol, atol=atol)
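
# --- New test file: fill_kv_cache kernel ---
# Fills blocked (paged) key/value caches with fill_kv_cache and compares
# against a Python loop that writes the same blocks by hand.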
import pytest
import torch
from lmdeploy.pytorch.kernels.fill_kv_cache import fill_kv_cache
def _div_up(a, b):
return (a + b - 1) // b
class TestFillKVCache:
@pytest.fixture
def num_heads(self):
yield 4
@pytest.fixture
def head_dim(self):
yield 32
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def seq_lens(self, request):
yield request.param
@pytest.fixture
def history_lens(self, request):
yield request.param
@pytest.fixture
def batch_size(self, seq_lens):
yield len(seq_lens)
@pytest.fixture
def kv_lens(self, seq_lens, history_lens):
yield [s + h for s, h in zip(seq_lens, history_lens)]
@pytest.fixture
def max_q_seq_length(self, seq_lens):
yield max(seq_lens)
@pytest.fixture
def num_tokens(self, seq_lens):
yield sum(seq_lens)
@pytest.fixture
def num_blocks_per_input(self, kv_lens, block_size):
yield [_div_up(kv_len, block_size) for kv_len in kv_lens]
@pytest.fixture
def max_num_blocks(self, num_blocks_per_input):
yield max(num_blocks_per_input)
@pytest.fixture
def q_seq_length(self, seq_lens):
yield torch.tensor(seq_lens).cuda()
@pytest.fixture
def q_start_loc(self, q_seq_length):
cum_seq_length = q_seq_length.cumsum(0)
yield cum_seq_length - q_seq_length
@pytest.fixture
def kv_seq_length(self, kv_lens):
yield torch.tensor(kv_lens).cuda()
@pytest.fixture
def k_states(self, num_tokens, num_heads, head_dim):
yield torch.rand(num_tokens, num_heads, head_dim).cuda()
@pytest.fixture
def v_states(self, k_states):
yield torch.rand_like(k_states)
@pytest.fixture
def k_caches(self, batch_size, max_num_blocks, block_size, num_heads,
head_dim):
shape = (batch_size * max_num_blocks, block_size, num_heads, head_dim)
yield torch.full(shape, 0.0).cuda()
@pytest.fixture
def v_caches(self, k_caches):
yield torch.rand_like(k_caches)
@pytest.fixture
def block_offsets(self, num_blocks_per_input):
batch_size = len(num_blocks_per_input)
max_num_blocks = max(num_blocks_per_input)
batch_ids = torch.arange(batch_size)
ret = torch.arange(max_num_blocks)
ret = batch_ids[:, None] + ret[None, :] * batch_size
yield ret.cuda()
@pytest.fixture
def gt(self, k_states, v_states, k_caches, v_caches, seq_lens,
history_lens, block_offsets, block_size):
batch_size = len(seq_lens)
k_caches = k_caches.clone()
v_caches = v_caches.clone()
splited_k_states = k_states.split(seq_lens)
splited_v_states = v_states.split(seq_lens)
for bidx in range(batch_size):
k_state = splited_k_states[bidx]
v_state = splited_v_states[bidx]
h_len = history_lens[bidx]
b_offs = block_offsets[bidx]
block_id = _div_up(h_len + 1, block_size) - 1
fill_start = h_len % block_size
fill_size = min(block_size - fill_start, k_state.size(0))
while True:
boff = b_offs[block_id]
tmp_ks = k_state[:fill_size]
tmp_vs = v_state[:fill_size]
fill_end = fill_start + fill_size
k_caches[boff, fill_start:fill_end] = tmp_ks
v_caches[boff, fill_start:fill_end] = tmp_vs
k_state = k_state[fill_size:]
v_state = v_state[fill_size:]
block_id += 1
fill_start = 0
fill_size = min(block_size, k_state.size(0))
if fill_size == 0:
break
yield k_caches, v_caches
@pytest.mark.parametrize(['seq_lens', 'history_lens'], [
((1, 1, 1, 1), (1, 16, 31, 24)),
((1, 8, 16, 24), (1, 16, 31, 24)),
],
indirect=True)
def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches,
block_offsets, q_start_loc, q_seq_length,
kv_seq_length, max_q_seq_length, gt):
fill_kv_cache(k_states, v_states, k_caches, v_caches, q_start_loc,
q_seq_length, kv_seq_length, max_q_seq_length,
block_offsets)
torch.testing.assert_close(k_caches, gt[0])
torch.testing.assert_close(v_caches, gt[1])
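
# --- New test file: fused_rotary_emb kernel ---
# Checks fused_rotary_emb against a reference built from
# DummyLinearScalingRotaryEmbedding and apply_rotary_pos_emb.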
import pytest
import torch
from torch import nn
from lmdeploy.pytorch.kernels.fused_rotary_emb import fused_rotary_emb
class DummyRotaryEmbedding(nn.Module):
def __init__(self,
dim,
max_position_embeddings=2048,
base=10000,
device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base**(torch.arange(
0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
self.register_buffer('inv_freq', inv_freq, persistent=False)
def forward(self, x, position_ids, seq_len=None):
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
emb = torch.cat((freqs, freqs), dim=-1)
cos = emb.cos().to(dtype=x.dtype)
sin = emb.sin().to(dtype=x.dtype)
# backwards compatibility
return cos, sin
class DummyLinearScalingRotaryEmbedding(DummyRotaryEmbedding):
def __init__(self,
dim,
max_position_embeddings=2048,
base=10000,
device=None,
scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def forward(self, x, position_ids, seq_len=None):
position_ids = position_ids.float() / self.scaling_factor
cos, sin = super().forward(x, position_ids, seq_len)
return cos, sin
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., :x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=2):
"""Applies Rotary Position Embedding to the query and key tensors."""
cos = cos.unsqueeze(unsqueeze_dim)
sin = sin.unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
class TestFusedRotaryEmb:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def batch_size(self):
yield 2
@pytest.fixture
def head_dim(self):
yield 64
@pytest.fixture
def q_num_heads(self):
yield 4
@pytest.fixture
def k_num_heads(self):
yield 2
@pytest.fixture
def seq_len(self):
yield 100
@pytest.fixture
def q(self, batch_size, seq_len, q_num_heads, head_dim, dtype):
yield torch.rand(batch_size,
seq_len,
q_num_heads,
head_dim,
dtype=dtype).to('cuda')
@pytest.fixture
def k(self, batch_size, seq_len, k_num_heads, head_dim, dtype):
yield torch.rand(batch_size,
seq_len,
k_num_heads,
head_dim,
dtype=dtype).to('cuda')
@pytest.fixture
def position_ids(self, batch_size, seq_len):
yield torch.randint(0, seq_len + 100, (batch_size, seq_len)).cuda()
@pytest.fixture
def rotary_emb(self, head_dim):
yield DummyLinearScalingRotaryEmbedding(head_dim,
scaling_factor=1.0).to('cuda')
@pytest.fixture
def gt(self, q, k, position_ids, rotary_emb):
with torch.inference_mode():
cos, sin = rotary_emb(q, position_ids)
yield apply_rotary_pos_emb(q,
k,
cos,
sin,
position_ids=position_ids)
def test_fused_rotary_emb(self, q, k, position_ids, rotary_emb, gt):
inv_freq = rotary_emb.inv_freq
scaling_factor = rotary_emb.scaling_factor
with torch.inference_mode():
outq, outk = fused_rotary_emb(q,
k,
position_ids,
inv_freq,
scaling_factor=scaling_factor)
gtq, gtk = gt
torch.testing.assert_close(outq, gtq, atol=1e-3, rtol=1e-5)
torch.testing.assert_close(outk, gtk, atol=1e-3, rtol=1e-5)
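
# --- New test file: mbgmm kernels ---
# Tests the paged LoRA kernels mbgmm_a/mbgmm_b: the result should match
# input @ lora_a @ lora_b computed per sequence.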
import pytest
import torch
from torch.nn.utils.rnn import pad_sequence
from lmdeploy.pytorch.kernels.mbgmm import mbgmm_a, mbgmm_b
class TestMBGMM:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def head_size(self):
yield 32
@pytest.fixture
def out_head_size(self):
yield 16
@pytest.fixture
def seq_lens(self):
yield torch.tensor([2, 4, 6, 8]).cuda()
@pytest.fixture
def ranks(self):
yield torch.tensor([2, 4]).cuda()
@pytest.fixture
def page_start(self, ranks):
yield torch.zeros_like(ranks)
@pytest.fixture
def start_loc(self, seq_lens):
yield seq_lens.cumsum(0) - seq_lens
@pytest.fixture
def input(self, seq_lens, head_size, dtype):
total_len = seq_lens.sum()
yield torch.rand(total_len, head_size, dtype=dtype).cuda()
@pytest.fixture
def adapter_ids(self, seq_lens, ranks):
num_ranks = len(ranks)
num_seqs = len(seq_lens)
ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
yield ret
@pytest.fixture
def scaling(self, adapter_ids):
yield torch.ones(adapter_ids.size(0)).cuda()
@pytest.fixture
def lora_a(self, ranks, head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(head_size, rank, dtype=dtype).cuda()
out.append(w)
yield out
@pytest.fixture
def lora_b(self, ranks, out_head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
out.append(w)
yield out
@pytest.fixture
def page_table(self, ranks):
total_ranks = sum(ranks)
index = torch.randperm(total_ranks)
index = index.split(ranks.tolist())
yield pad_sequence(index, batch_first=True).cuda()
@pytest.fixture
def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_a):
cache[index[:r]] = w.t()
yield cache
@pytest.fixture
def paged_lora_b(self, lora_b, ranks, page_table, head_size, out_head_size,
dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_b):
cache[index[:r], :out_head_size] = w
yield cache
@pytest.fixture
def gt(self, input, start_loc, seq_lens, adapter_ids, lora_a, lora_b):
out = []
for loc, s_len, r_id in zip(start_loc, seq_lens, adapter_ids):
inp = input[loc:loc + s_len]
l_a = lora_a[r_id]
l_b = lora_b[r_id]
out.append(inp @ l_a @ l_b)
yield torch.cat(out)
def test_mbgmm(self, input, paged_lora_a, paged_lora_b, out_head_size,
start_loc, seq_lens, adapter_ids, scaling, page_table,
ranks, page_start, gt):
max_seq_len = max(seq_lens).item()
max_rank = page_table.size(-1)
xa = mbgmm_a(input,
paged_lora_a,
q_start_loc=start_loc,
q_seqlens=seq_lens,
adapter_ids=adapter_ids,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_seq_len=max_seq_len,
max_rank=max_rank)
output = mbgmm_b(xa,
paged_lora_b[..., :out_head_size],
q_start_loc=start_loc,
q_seqlens=seq_lens,
adapter_ids=adapter_ids,
scaling=scaling,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_seq_len=max_seq_len,
max_rank=max_rank)
torch.testing.assert_close(gt, output)
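
# --- New test file: mbgmv kernels ---
# Same idea as the mbgmm tests but for mbgmv_a/mbgmv_b, which take one
# input vector per sequence.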
import pytest
import torch
from torch.nn.utils.rnn import pad_sequence
from lmdeploy.pytorch.kernels.mbgmv import mbgmv_a, mbgmv_b
class TestMBGMV:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def head_size(self):
yield 64
@pytest.fixture
def out_head_size(self):
yield 32
@pytest.fixture
def batch_size(self):
yield 8
@pytest.fixture
def ranks(self):
yield torch.tensor([2, 4]).cuda()
@pytest.fixture
def page_start(self, ranks):
yield torch.zeros_like(ranks)
@pytest.fixture
def input(self, batch_size, head_size, dtype):
x = torch.rand(batch_size, head_size, dtype=dtype).cuda()
x -= 0.5
yield x
@pytest.fixture
def adapter_ids(self, batch_size, ranks):
num_ranks = len(ranks)
ret = torch.randint(0, num_ranks, (batch_size, )).cuda()
yield ret
@pytest.fixture
def scaling(self, adapter_ids):
yield torch.ones(adapter_ids.size(0)).cuda()
@pytest.fixture
def lora_a(self, ranks, head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(head_size, rank, dtype=dtype).cuda()
w -= 0.5
out.append(w)
yield out
@pytest.fixture
def lora_b(self, ranks, out_head_size, dtype):
out = []
for rank in ranks:
w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
w -= 0.5
out.append(w)
yield out
@pytest.fixture
def page_table(self, ranks):
total_ranks = sum(ranks)
index = torch.randperm(total_ranks)
index = index.split(ranks.tolist())
yield pad_sequence(index, batch_first=True).cuda()
@pytest.fixture
def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_a):
cache[index[:r]] = w.t()
yield cache
@pytest.fixture
def paged_lora_b(self, lora_b, ranks, page_table, head_size, out_head_size,
dtype):
num_pages = sum(ranks)
cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
for index, r, w in zip(page_table, ranks, lora_b):
cache[index[:r], :out_head_size] = w
yield cache
@pytest.fixture
def gt(self, input, adapter_ids, lora_a, lora_b):
out = []
for inp, r_id in zip(input, adapter_ids):
inp = inp.unsqueeze(0)
l_a = lora_a[r_id]
l_b = lora_b[r_id]
out.append(inp @ l_a @ l_b)
yield torch.cat(out)
def test_mbgmv(self, input, paged_lora_a, paged_lora_b, out_head_size,
adapter_ids, scaling, page_table, ranks, page_start, gt):
max_rank = page_table.size(-1)
xa = mbgmv_a(input,
paged_lora_a,
adapter_ids=adapter_ids,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_rank=max_rank)
output = mbgmv_b(xa,
paged_lora_b[..., :out_head_size],
adapter_ids=adapter_ids,
scaling=scaling,
rank_page_table=page_table,
rank_page_start=page_start,
ranks=ranks,
max_rank=max_rank)
torch.testing.assert_close(gt, output, atol=2e-3, rtol=1e-5)
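
# --- New test file: multinomial_sampling kernel ---
# Scores are one-hot per row, so sampling must return the index given by
# select_ids after remapping through the permuted `indices`.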
import pytest
import torch
from lmdeploy.pytorch.kernels import multinomial_sampling
class TestMultinomialSampling:
@pytest.fixture
def num_tokens(self, request):
yield request.param
@pytest.fixture
def select_ids(self, request):
yield request.param
@pytest.fixture
def batch_size(self, select_ids):
yield len(select_ids)
@pytest.fixture
def dtype(self, request):
yield request.param
@pytest.fixture
def scores(self, num_tokens, batch_size, select_ids, dtype):
ret = torch.zeros(batch_size, num_tokens).cuda()
batch_ids = torch.arange(batch_size).cuda()
ret[batch_ids, select_ids] = 1
ret = ret.to(dtype)
yield ret
@pytest.fixture
def seeds(self, batch_size):
yield torch.randint(1000, 2000, (batch_size, )).cuda()
@pytest.fixture
def offsets(self, batch_size):
yield torch.randint(1000, 2000, (batch_size, )).cuda()
@pytest.fixture
def indices(self, scores):
num_tokens = scores.size(1)
ret = [torch.randperm(num_tokens) for _ in scores]
ret = torch.stack(ret, 0).cuda()
yield ret
@pytest.fixture
def gt(self, batch_size, select_ids, indices):
batch_ids = torch.arange(batch_size).cuda()
yield indices[batch_ids, select_ids]
@pytest.mark.parametrize('dtype',
[torch.float32, torch.half, torch.bfloat16])
@pytest.mark.parametrize(['num_tokens', 'select_ids'], [
(8, (4, 2) * 30),
(200, (50, 150)),
],
indirect=True)
def test_multinomial_sampling(self, scores, seeds, offsets, indices, gt):
output = multinomial_sampling(scores, seeds, offsets, indices)
torch.testing.assert_close(output, gt)
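
# --- New test file: paged attention kernels ---
# Compares paged_attention_fwd against a naive masked attention reference,
# and the sliding-window path against flash_attn_varlen_func.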
import math
import pytest
import torch
def _conti_input(data, seq_lens):
data = [x[:l] for x, l in zip(data, seq_lens)]
data = torch.cat(data, dim=0)
return data
def _make_bias(seq_lens, history_lens, neg_val):
full_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
max_full_len = full_seq_lens.max().item()
seq_ranges = [torch.arange(max_seq_len) for _ in seq_lens]
for r, l in zip(seq_ranges, seq_lens):
r[l:] = -max_full_len
seq_ranges = torch.stack(seq_ranges, dim=0).cuda()
kv_ranges = [torch.arange(max_full_len) for _ in full_seq_lens]
kv_ranges = torch.stack(kv_ranges, 0).cuda()
mask = kv_ranges[:, None, :] - seq_ranges[:, :, None] > history_lens[:,
None,
None]
return mask.float() * neg_val
def _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens,
block_offsets, block_size, num_heads_k, feat_dim):
max_blocks_nums = block_offsets.max() + 1
full_seq_lens = seq_lens + history_lens
blocked_k = batched_k.new_zeros(max_blocks_nums, block_size, num_heads_k,
feat_dim)
blocked_v = batched_v.new_zeros(max_blocks_nums, block_size, num_heads_k,
feat_dim)
for batch_id, offset in enumerate(block_offsets):
ori_k = batched_k[batch_id]
ori_v = batched_v[batch_id]
seq_len = full_seq_lens[batch_id]
for block_id, block_start in enumerate(range(0, seq_len, block_size)):
block_off = offset[block_id]
tmp_k = ori_k[block_start:block_start + block_size]
tmp_v = ori_v[block_start:block_start + block_size]
size = tmp_k.size(0)
blocked_k[block_off, :size] = tmp_k
blocked_v[block_off, :size] = tmp_v
return blocked_k, blocked_v
def _naive_attention(batched_q, batched_kv, bias):
batched_k, batched_v = batched_kv
num_heads_q = batched_q.shape[2]
num_heads_k = batched_k.shape[2]
head_dim = batched_q.shape[-1]
group = num_heads_q // num_heads_k
q = batched_q.transpose(1, 2)
k = batched_k.permute(0, 2, 3, 1)
v = batched_v.transpose(1, 2)
# expand group
k = k.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)
v = v.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)
qk = torch.matmul(q, k) / math.sqrt(head_dim)
attn_weight = qk + bias[:, None]
attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
attn_weight = attn_weight.to(q.dtype)
attn_output = torch.matmul(attn_weight, v)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output
def _naive_window_attention(q, k, v, seqlens_q, seqlens_k, window_size):
from flash_attn import flash_attn_varlen_func
def _make_cu_seqlens(seqlens):
cu_seqlens = seqlens.cumsum(0)
cu_zero = cu_seqlens.new_zeros(1)
cu_seqlens = torch.cat([cu_zero, cu_seqlens])
return cu_seqlens
max_seqlen_q = seqlens_q.max().item()
max_seqlen_k = seqlens_k.max().item()
cu_seqlens_q = _make_cu_seqlens(seqlens_q).int()
cu_seqlens_k = _make_cu_seqlens(seqlens_k).int()
output = flash_attn_varlen_func(q,
k,
v,
cu_seqlens_q,
cu_seqlens_k,
max_seqlen_q=max_seqlen_q,
max_seqlen_k=max_seqlen_k,
causal=True,
window_size=window_size)
return output
class TestPagedAttention:
@pytest.fixture
def dtype(self):
yield torch.float16
@pytest.fixture
def feat_dim(self):
yield 16
@pytest.fixture
def num_heads_q(self, request):
yield request.param
@pytest.fixture
def num_heads_k(self, request):
yield request.param
@pytest.fixture
def block_size(self, request):
yield request.param
@pytest.fixture
def seq_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def start_loc(self, seq_lens):
seq_sum = seq_lens.cumsum(0)
start_loc = torch.cat([seq_sum.new_zeros(1), seq_sum[:-1]], dim=0)
yield start_loc
@pytest.fixture
def history_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def batched_q(self, seq_lens, num_heads_q, feat_dim, dtype):
torch.manual_seed(123)
batch_size = len(seq_lens)
max_seq_len = seq_lens.max().item()
inputs = torch.rand(batch_size,
max_seq_len,
num_heads_q,
feat_dim,
dtype=dtype,
device='cuda')
yield inputs
@pytest.fixture
def batched_kv(self, seq_lens, history_lens, num_heads_k, feat_dim, dtype):
torch.manual_seed(123)
batch_size = len(seq_lens)
full_seq_lens = seq_lens + history_lens
max_seq_len = full_seq_lens.max().item()
k = torch.rand(batch_size,
max_seq_len,
num_heads_k,
feat_dim,
dtype=dtype,
device='cuda')
v = torch.rand(batch_size,
max_seq_len,
num_heads_k,
feat_dim,
dtype=dtype,
device='cuda')
yield k, v
@pytest.fixture
def conti_q(self, seq_lens, batched_q):
yield _conti_input(batched_q, seq_lens)
@pytest.fixture
def block_offsets(self, seq_lens, history_lens, block_size):
full_seq_lens = seq_lens + history_lens
batch_size = full_seq_lens.size(0)
num_blocks = (full_seq_lens + block_size - 1) // block_size
offset = [
torch.arange(size) * batch_size + idx
for idx, size in enumerate(num_blocks)
]
max_len = max(len(o) for o in offset)
new_offset = offset[0].new_zeros(batch_size, max_len)
for o, no in zip(offset, new_offset):
len_o = o.size(0)
no[:len_o] = o
yield new_offset.cuda()
@pytest.fixture
def conti_kv(self, batched_kv, seq_lens, history_lens):
full_seq_lens = seq_lens + history_lens
conti_k = _conti_input(batched_kv[0], full_seq_lens)
conti_v = _conti_input(batched_kv[1], full_seq_lens)
yield (conti_k, conti_v)
@pytest.fixture
def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets,
block_size, num_heads_k, feat_dim):
batched_k, batched_v = batched_kv
yield _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens,
block_offsets, block_size, num_heads_k,
feat_dim)
@pytest.fixture
def mask(self, seq_lens, history_lens):
neg_val = -1e30
yield _make_bias(seq_lens, history_lens, neg_val)
@pytest.fixture
def gt(self, batched_q, batched_kv, mask):
yield _naive_attention(batched_q, batched_kv, mask)
@pytest.fixture
def conti_gt(self, gt, seq_lens):
yield _conti_input(gt, seq_lens)
@pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
indirect=True)
@pytest.mark.parametrize(['seq_lens', 'history_lens'],
[([30, 50, 70, 90], [50, 40, 30, 20]),
([1, 1, 1, 1], [50, 40, 30, 20])],
indirect=True)
@pytest.mark.parametrize('block_size', [2, 16], indirect=True)
def test_paged_attention(self, conti_q, blocked_kv, block_offsets,
start_loc, seq_lens, history_lens, conti_gt):
from lmdeploy.pytorch.kernels import paged_attention_fwd
kv_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
blocked_k, blocked_v = blocked_kv
out = torch.empty_like(conti_q)
paged_attention_fwd(conti_q,
blocked_k,
blocked_v,
out,
block_offsets=block_offsets,
q_start_loc=start_loc,
q_seqlens=seq_lens,
kv_seqlens=kv_seq_lens,
max_seqlen=max_seq_len)
torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5)
@pytest.fixture
def win_size(self, request):
yield request.param
@pytest.fixture
def window_gt(self, conti_q, conti_kv, seq_lens, history_lens, win_size):
kv_lens = seq_lens + history_lens
yield _naive_window_attention(conti_q,
conti_kv[0],
conti_kv[1],
seq_lens,
kv_lens,
window_size=(win_size, win_size))
@pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
indirect=True)
@pytest.mark.parametrize(['seq_lens', 'history_lens'], [
([30, 50, 70, 90], [50, 40, 30, 20]),
([1, 1, 1, 1], [50, 40, 30, 20]),
],
indirect=True)
@pytest.mark.parametrize('win_size', (32, ), indirect=True)
@pytest.mark.parametrize('block_size', [16], indirect=True)
def test_window_attention(self, conti_q, blocked_kv, block_offsets,
start_loc, seq_lens, history_lens, win_size,
window_gt):
from lmdeploy.pytorch.kernels import paged_attention_fwd
kv_seq_lens = seq_lens + history_lens
max_seq_len = seq_lens.max().item()
blocked_k, blocked_v = blocked_kv
out = torch.empty_like(conti_q)
paged_attention_fwd(conti_q,
blocked_k,
blocked_v,
out,
block_offsets=block_offsets,
q_start_loc=start_loc,
q_seqlens=seq_lens,
kv_seqlens=kv_seq_lens,
max_seqlen=max_seq_len,
window_size=win_size)
torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5)
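
# --- New test file: rearange_all_gather kernel ---
# Checks the gather rearrangement of per-adapter LoRA ranks for a world size
# of 2; positions beyond each adapter's rank are masked out before comparison.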
import pytest
import torch
from lmdeploy.pytorch.kernels.rearange_all_gather import rearange_all_gather
class TestRearangeAllGather:
@pytest.fixture
def seq_lens(self, request):
yield torch.tensor(request.param, device='cuda')
@pytest.fixture
def start_loc(self, seq_lens):
yield seq_lens.cumsum(0) - seq_lens
@pytest.fixture
def ranks(self):
yield torch.tensor([4, 8]).cuda()
@pytest.fixture
def adapter_ids(self, seq_lens, ranks):
num_ranks = len(ranks)
num_seqs = len(seq_lens)
ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
yield ret
@pytest.fixture
def world_size(self):
yield 2
@pytest.fixture
def input(self, seq_lens, ranks):
max_rank = max(ranks)
total_len = seq_lens.sum()
yield torch.rand(total_len, max_rank).cuda()
@pytest.fixture
def rank_per_input(self, seq_lens, ranks, adapter_ids):
token_adapter_ids = [
torch.full((slen, ), ada_id)
for slen, ada_id in zip(seq_lens, adapter_ids)
]
token_adapter_ids = torch.cat(token_adapter_ids).cuda()
yield ranks[token_adapter_ids]
@pytest.fixture
def valid_mask(self, rank_per_input, seq_lens, ranks):
max_rank = max(ranks)
total_len = seq_lens.sum()
mask = torch.zeros(total_len, max_rank).to(bool)
for r, m in zip(rank_per_input, mask):
m[:r] = True
yield mask.cuda()
@pytest.fixture
def gt(self, input, rank_per_input, ranks, world_size):
max_rank = max(ranks)
pranks = rank_per_input // world_size
pmax_rank = max_rank // world_size
output = torch.empty_like(input)
for pr, inp, out in zip(pranks, input, output):
pindex = torch.arange(pr).cuda()
index = [pindex + ws * pmax_rank for ws in range(world_size)]
index = torch.cat(index)
out[:index.size(0)] = inp[index]
yield output
@pytest.mark.parametrize('seq_lens', [[30, 50, 70, 90], [1, 1, 1, 1]],
indirect=True)
def test_gather(self, input, start_loc, seq_lens, adapter_ids, ranks,
world_size, gt, valid_mask):
max_seq_len = max(seq_lens)
output = rearange_all_gather(input,
start_loc,
seq_lens,
adapter_ids,
ranks,
world_size,
max_seq_len=max_seq_len)
output = output.where(valid_mask, output.new_tensor(0))
gt = gt.where(valid_mask, gt.new_tensor(0))
torch.testing.assert_close(output, gt)
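
# --- New test file: rms_norm kernel ---
# Compares the rms_norm kernel with a float32 reference implementation for
# bfloat16/float16/float32 inputs.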
import pytest
import torch
class TestRMSNorm:
@pytest.fixture(scope='class')
def dtype(self, request):
yield request.param
@pytest.fixture(scope='class')
def input(self, dtype):
yield torch.rand(4, 8, dtype=dtype, device='cuda')
@pytest.fixture(scope='class')
def weight(self, dtype):
yield torch.rand(8, dtype=dtype, device='cuda')
@pytest.fixture(scope='class')
def eps(self):
yield 1e-6
@pytest.fixture(scope='class')
def gt(self, input, weight, eps):
input_dtype = input.dtype
input = input.to(torch.float32)
variance = input.pow(2).mean(-1, keepdim=True)
input = input * torch.rsqrt(variance + eps)
return weight * input.to(input_dtype)
@pytest.mark.parametrize('dtype',
[torch.bfloat16, torch.float16, torch.float32],
indirect=True)
def test_rms_norm(self, input, weight, eps, gt):
from lmdeploy.pytorch.kernels import rms_norm
out = rms_norm(input, weight, eps)
torch.testing.assert_close(out, gt)
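
# --- New test file: paging block managers ---
# Covers LogicalAllocator plus the default and window block managers:
# allocation, free, append_slot, fork and swap in/out.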
import pytest
import torch
from lmdeploy.pytorch.messages import SchedulerSession
from lmdeploy.pytorch.paging.block_manager import (DefaultBlockManager,
WindowBlockManager)
from lmdeploy.pytorch.paging.block_manager.base_block_manager import \
LogicalAllocator # noqa: E501
class TestAllocator:
@pytest.fixture
def num_gpu_blocks(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def allocator(self, num_cpu_blocks, num_gpu_blocks):
yield LogicalAllocator(num_cpu_blocks, num_gpu_blocks)
def test_alloc(self, allocator, num_cpu_blocks, num_gpu_blocks):
# initialize
num_blocks = num_cpu_blocks + num_gpu_blocks
gpu_allocator = allocator.get_phy_allocator('gpu')
cpu_allocator = allocator.get_phy_allocator('cpu')
assert allocator.get_num_free_blocks() == num_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
# test allocate
block_size = 4
blocks = allocator.allocate(block_size, 'gpu')
assert len(blocks) == block_size
assert allocator.get_num_free_blocks() == num_blocks - block_size
assert gpu_allocator.get_num_free_blocks(
) == num_gpu_blocks - block_size
# test free
allocator.add_ref_count(blocks, 1)
allocator.free(blocks)
assert allocator.get_num_free_blocks() == num_blocks - block_size
allocator.free(blocks)
assert allocator.get_num_free_blocks() == num_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
def test_full(self, allocator, num_cpu_blocks, num_gpu_blocks):
num_blocks = num_cpu_blocks + num_gpu_blocks
gpu_allocator = allocator.get_phy_allocator('gpu')
cpu_allocator = allocator.get_phy_allocator('cpu')
# no free blocks
gpu_block_size = num_gpu_blocks
gpu_blocks = allocator.allocate(gpu_block_size, 'gpu')
cpu_block_size = num_cpu_blocks
cpu_blocks = allocator.allocate(cpu_block_size, 'cpu')
assert cpu_allocator.get_num_free_blocks() == 0
assert gpu_allocator.get_num_free_blocks() == 0
with pytest.raises(MemoryError):
allocator.allocate(1, 'gpu')
allocator.free(gpu_blocks)
allocator.free(cpu_blocks)
assert allocator.get_num_free_blocks() == num_blocks
assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
class TestDefaultBlockManager:
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def block_mgr(self, num_cpu_blocks, num_gpu_blocks):
yield DefaultBlockManager(num_cpu_blocks, num_gpu_blocks)
def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test alloc
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
assert block_mgr.can_allocate(msg)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
assert block_table is not None
assert len(block_table) == 1
# test free
block_mgr.free(msg)
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 0
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
# alloc over limit
token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
dtype=torch.int64)
msg = sess.add_sequence(token_ids)
assert not block_mgr.can_allocate(msg)
def test_append_slot(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test append
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 1
# no new logical block
msg.update_token_ids(torch.tensor([1] * (block_size - 1)))
assert block_mgr.can_append_slot(msg)
block_mgr.append_slot(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 1
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
# with new logical block
msg.update_token_ids(torch.tensor([1]))
block_mgr.append_slot(msg)
block_table = block_mgr.get_block_table(msg)
assert len(block_table) == 2
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
def test_fork(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
token_ids = torch.tensor([1] * (block_size * 2 + 1))
from_msg = sess.add_sequence(token_ids)
block_mgr.allocate(from_msg)
from_block_table = block_mgr.get_block_table(from_msg)
assert len(from_block_table) == 3
to_msg = sess.fork_sequence(torch.tensor([1]), from_msg)
# fork
assert block_mgr.can_fork(from_msg)
copy_map = block_mgr.fork(from_msg, to_msg)
block_table = block_mgr.get_block_table(to_msg)
assert len(block_table) == 3
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 4
assert block_table[0] == from_block_table[0]
assert block_table[1] == from_block_table[1]
assert block_table[2] != from_block_table[2]
assert len(copy_map) == 1
assert copy_map[from_block_table[2]] == block_table[2]
# can not fork
assert not block_mgr.can_fork(from_msg)
def test_swap(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
token_ids = torch.tensor([1] * (block_size + 1))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
old_phy_blocks = block_mgr.get_block_table(msg)
success, swap_map = block_mgr.try_swap_out(msg)
new_phy_blocks = block_mgr.get_block_table(msg)
assert success
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks - 2
assert len(swap_map) == 2
for block_id in old_phy_blocks:
assert block_id in swap_map
for block_id in new_phy_blocks:
assert block_id - num_gpu_blocks in swap_map.values()
old_phy_blocks = block_mgr.get_block_table(msg)
success, swap_map = block_mgr.try_swap_in(msg)
new_phy_blocks = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks
assert len(swap_map) == 2
for block_id in old_phy_blocks:
assert block_id - num_gpu_blocks in swap_map
for block_id in new_phy_blocks:
assert block_id in swap_map.values()
success, swap_map = block_mgr.try_swap_out(msg)
assert success
token_ids = torch.tensor([1] * (block_size * 4))
msg_full = sess.add_sequence(token_ids)
block_mgr.allocate(msg_full)
success, swap_map = block_mgr.try_swap_out(msg)
assert not success
class TestWindowBlockManager:
@pytest.fixture
def window_size(self):
yield 32
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def block_mgr(self, num_cpu_blocks, num_gpu_blocks, window_size):
yield WindowBlockManager(num_cpu_blocks, num_gpu_blocks, window_size)
def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
sess = SchedulerSession(0, block_size)
# test alloc
token_ids = torch.tensor([1])
msg = sess.add_sequence(token_ids)
assert block_mgr.can_allocate(msg)
block_mgr.allocate(msg)
block_table = block_mgr.get_block_table(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
assert block_table is not None
assert len(block_table) == 1
# test free
block_mgr.free(msg)
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 0
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
# alloc over limit
token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
dtype=torch.int64)
msg = sess.add_sequence(token_ids)
assert not block_mgr.can_allocate(msg)
def test_win_alloc(self, block_mgr, block_size, num_gpu_blocks,
window_size):
sess = SchedulerSession(0, block_size)
# 2 win block
token_ids = torch.tensor([1] * window_size)
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 3
block_mgr.free(msg)
# 3 win block
token_ids = torch.tensor([1] * (window_size + 2))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 3
block_mgr.free(msg)
# not full win
token_ids = torch.tensor([1] * (window_size - 2))
msg = sess.add_sequence(token_ids)
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
msg.update_token_ids(torch.tensor([1]))
block_mgr.allocate(msg)
assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
block_table = block_mgr.get_block_table(msg)
assert block_table is None or len(block_table) == 2
block_mgr.free(msg)
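
# --- New test file: paging scheduler ---
# Exercises prefill/decoding scheduling, sequence and session state updates,
# and the resulting swap-in/swap-out maps.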
import pytest
import torch
from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
from lmdeploy.pytorch.messages import MessageStatus
from lmdeploy.pytorch.paging.scheduler import Scheduler
class TestScheduler:
@pytest.fixture
def block_size(self):
yield 16
@pytest.fixture
def num_cpu_blocks(self):
yield 4
@pytest.fixture
def num_gpu_blocks(self):
yield 4
@pytest.fixture
def cache_config(self, block_size, num_cpu_blocks, num_gpu_blocks):
yield CacheConfig(block_size=block_size,
num_cpu_blocks=num_cpu_blocks,
num_gpu_blocks=num_gpu_blocks)
@pytest.fixture
def scheduler_config(self):
yield SchedulerConfig(max_batches=4,
max_session_len=128,
max_request_output_len=64,
eviction_type='copy')
@pytest.fixture
def scheduler(self, cache_config, scheduler_config):
yield Scheduler(scheduler_config=scheduler_config,
cache_config=cache_config)
def test_schedule_base(self, scheduler, block_size, num_gpu_blocks):
block_manager = scheduler.block_manager
session_id = 0
session = scheduler.add_session(session_id)
assert session_id in scheduler.sessions
assert scheduler.sessions[session_id] == session
num_blocks = 2
token_ids = torch.tensor([0] * block_size * num_blocks)
seq = session.add_sequence(token_ids)
scheduler.add_sequence(seq)
assert seq.status == MessageStatus.WAITING
assert seq in scheduler.waiting
output = scheduler.schedule(is_prefill=True)
block_tables = scheduler.get_block_tables(output.running)
assert seq.status == MessageStatus.RUNNING
assert seq in output.running
assert len(block_tables) == 1
assert len(block_tables[0]) == num_blocks
assert block_manager.get_num_free_gpu_blocks(
) == num_gpu_blocks - num_blocks
assert scheduler.has_unfinished()
def test_update(self, scheduler, block_size, num_gpu_blocks):
block_manager = scheduler.block_manager
session_id1 = 0
session1 = scheduler.add_session(session_id1)
token_ids1 = torch.tensor([0] * block_size * 1)
seq1 = session1.add_sequence(token_ids1)
scheduler.add_sequence(seq1)
session_id2 = 1
session2 = scheduler.add_session(session_id2)
token_ids2 = torch.tensor([0] * block_size * 2)
seq2 = session2.add_sequence(token_ids2)
scheduler.add_sequence(seq2)
token_ids3 = torch.tensor([0] * block_size * 3)
seq3 = session2.add_sequence(token_ids3)
scheduler.add_sequence(seq3)
scheduler.schedule(is_prefill=True)
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert seq3.status == MessageStatus.WAITING
# stop seq
seq1.status = MessageStatus.STOPPED
scheduler.update()
assert len(scheduler.running) == 1
assert seq1 in scheduler.hanging
# end seq
seq1.status = MessageStatus.ENDED
scheduler.update()
assert session_id1 in scheduler.sessions
assert seq1 not in scheduler.running
assert seq1 not in scheduler.hanging
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 2
# stop session
scheduler.stop_session(session_id2)
scheduler.update()
assert len(scheduler.running) == 0
assert len(scheduler.waiting) == 0
assert len(scheduler.hanging) == 2
# end session
scheduler.end_session(session_id2)
scheduler.update()
assert seq2.status == MessageStatus.ENDED
assert seq3.status == MessageStatus.ENDED
assert session_id2 not in scheduler.sessions
assert len(scheduler.hanging) == 0
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks
def test_swap(self, scheduler, block_size, num_gpu_blocks, num_cpu_blocks):
block_manager = scheduler.block_manager
session_id = 0
session = scheduler.add_session(session_id)
# test: add 3 seq
token_ids1 = torch.tensor([0] * block_size * 1)
seq1 = session.add_sequence(token_ids1)
scheduler.add_sequence(seq1)
token_ids2 = torch.tensor([0] * block_size * 2)
seq2 = session.add_sequence(token_ids2)
scheduler.add_sequence(seq2)
token_ids3 = torch.tensor([0] * block_size * 3)
seq3 = session.add_sequence(token_ids3)
scheduler.add_sequence(seq3)
scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 2 running gpu
# seq3: 3 waiting empty
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert seq3.status == MessageStatus.WAITING
assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 3
# test: waiting alloc
seq2.status = MessageStatus.STOPPED
scheduler.update()
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
assert len(scheduler.hanging) == 1
output = scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 2 hanging cpu
# seq3: 3 waiting gpu
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.STOPPED
assert seq3.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 2
assert len(output.swap_out_map) == 2
# test: waiting append token
seq2.status = MessageStatus.WAITING
seq3.status = MessageStatus.ENDED
seq2.update_token_ids(torch.tensor([1] * block_size))
scheduler.update()
assert len(scheduler.running) == 1
assert len(scheduler.waiting) == 1
assert len(scheduler.hanging) == 0
output = scheduler.schedule(is_prefill=True)
# seq1: 1 running gpu
# seq2: 3 running gpu
# seq3: 3 nan
assert seq1.status == MessageStatus.RUNNING
assert seq2.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks
assert len(output.swap_in_map) == 2
# test running append
seq1.update_token_ids(torch.tensor([1] * block_size))
seq2.update_token_ids(torch.tensor([1] * block_size))
scheduler.update()
assert len(scheduler.running) == 2
output = scheduler.schedule(is_prefill=False)
# seq1: 1 waiting cpu
# seq2: 4 running gpu
# seq3: 3 nan
assert seq1.status == MessageStatus.WAITING
assert seq2.status == MessageStatus.RUNNING
assert block_manager.get_num_free_gpu_blocks() == 0
assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 1
assert len(output.swap_out_map) == 1
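
# --- New test file: layout conversion tools ---
# Round-trips between padded batched tensors and continuous (packed) tensors
# via continuous_tensor and batch_tensor.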
import pytest
import torch
from lmdeploy.pytorch.tools.layout_convert import (batch_tensor,
continuous_tensor)
class TestContinuous:
@pytest.fixture
def batched_tensor(self):
yield torch.tensor([[1, 2, 3, 0, 0], [4, 5, 6, 7, 8], [9, 10, 0, 0,
0]])
@pytest.fixture
def seq_len(self):
yield torch.tensor([3, 5, 2])
@pytest.fixture
def conti_tensor(self):
yield torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
def test_conti_tensor(self, batched_tensor, seq_len, conti_tensor):
conti_out = continuous_tensor(batched_tensor, seq_len)
torch.testing.assert_close(conti_out, conti_tensor)
batched_out = batch_tensor(conti_tensor, seq_len)
torch.testing.assert_close(batched_out, batched_tensor)
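
# --- New test file: input construction tools ---
# Checks position ids, q_start_loc and block offsets produced by
# make_model_inputs / make_step_context.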
import pytest
import torch
from lmdeploy.pytorch.tools.make_inputs import (make_model_inputs,
make_step_context)
class TestMakeInputs:
@pytest.fixture
def seq_length(self):
yield torch.tensor([2, 4, 3])
@pytest.fixture
def history_length(self):
yield [10, 12, 6]
@pytest.fixture
def input_ids(self, seq_length):
batch_size = len(seq_length)
max_seq_len = max(seq_length)
yield torch.randint(0, 100, (batch_size, max_seq_len))
@pytest.fixture
def block_size(self):
yield 4
@pytest.fixture
def num_key_value_heads(self):
yield 1
@pytest.fixture
def head_size(self):
yield 4
@pytest.fixture
def kv_cache_dtype(self):
yield torch.float16
@pytest.fixture
def past_key_values(self, history_length, num_key_value_heads, head_size):
max_len = max(history_length)
batch_size = len(history_length)
k_cache = torch.rand(batch_size, num_key_value_heads, max_len,
head_size)
v_cache = k_cache + 1
yield [(k_cache, v_cache)]
def test_make_inputs(self, input_ids, seq_length, history_length):
model_inputs = make_model_inputs(input_ids,
seq_length=seq_length,
block_offsets=None,
history_length=history_length)
position_ids = torch.tensor([
[10, 11, 11, 11],
[12, 13, 14, 15],
[6, 7, 8, 8],
])
q_start_loc = torch.tensor([0, 2, 6])
torch.testing.assert_close(model_inputs.position_ids, position_ids)
torch.testing.assert_close(model_inputs.q_start_loc, q_start_loc)
def test_make_step_context(self, input_ids, seq_length, history_length,
past_key_values, block_size,
num_key_value_heads, head_size, kv_cache_dtype):
step_ctx = make_step_context(input_ids,
seq_length=seq_length,
history_length=history_length,
past_key_values=past_key_values,
world_size=1,
device='cuda',
block_size=block_size,
num_key_value_heads=num_key_value_heads,
head_size=head_size,
kv_cache_dtype=kv_cache_dtype)
block_offsets = step_ctx.block_offsets
assert block_offsets[0][3] == 0
assert block_offsets[1][3] != 0
assert block_offsets[2][3] == 0
kv_caches = step_ctx.kv_caches
assert len(kv_caches) == len(past_key_values)
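
# --- New test file: model name deduction ---
# deduce_a_name should resolve 'internlm' from any combination of model path,
# model name, chat template config and backend config.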
import pytest
from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.async_engine import deduce_a_name
@pytest.mark.parametrize(
'backend_config',
[TurbomindEngineConfig('internlm'),
PytorchEngineConfig(None), None])
@pytest.mark.parametrize(
'chat_template_config',
[ChatTemplateConfig('internlm'),
ChatTemplateConfig(None), None])
@pytest.mark.parametrize('model_name', ['internlm', None])
@pytest.mark.parametrize('model_path', ['/path/to/internlm-chat-7b'])
def test_deduce_a_name(model_path, model_name, chat_template_config,
backend_config):
name = deduce_a_name(model_path, model_name, chat_template_config,
backend_config)
assert name == 'internlm'
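
# --- New test file: automatic backend selection ---
# Checks is_supported for the pytorch and turbomind backends and
# autoget_backend(_config) against a table of example models.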
import os
import tempfile
import numpy as np
import pytest
class TestAutoBackend:
@pytest.fixture
def turbomind_workspace(self):
workspace = tempfile.TemporaryDirectory(
'internlm-chat-7b-turbomind').name
os.makedirs(os.path.join(workspace, 'triton_models'), exist_ok=True)
return workspace
@pytest.fixture
def models(self):
# example models to test
# format (model_path, is_pytorch_supported, is_turbomind_supported)
models = [
('baichuan-inc/Baichuan-7B', False, True),
('baichuan-inc/Baichuan2-7B-Chat', True, True),
('baichuan-inc/Baichuan-13B-Chat', False, False),
('baichuan-inc/Baichuan2-13B-Chat', True, False),
('internlm/internlm-chat-7b', True, True),
('internlm/internlm2-chat-7b', True, True),
('internlm/internlm-xcomposer2-7b', False, False),
('internlm/internlm-xcomposer-7b', False, True),
('THUDM/chatglm2-6b', True, False),
('THUDM/chatglm3-6b', True, False),
('deepseek-ai/deepseek-moe-16b-chat', True, False),
('tiiuae/falcon-7b-instruct', True, False),
('01-ai/Yi-34B-Chat', True, True),
('codellama/CodeLlama-7b-Instruct-hf', True, True),
('mistralai/Mistral-7B-Instruct-v0.1', True, False),
('mistralai/Mixtral-8x7B-Instruct-v0.1', True, False),
('Qwen/Qwen-7B-Chat', False, True),
('Qwen/Qwen-VL-Chat', False, True),
('Qwen/Qwen1.5-4B-Chat', True, False),
]
return models
    def test_pytorch_is_supported(self, turbomind_workspace, models):
from lmdeploy.pytorch.supported_models import is_supported
assert is_supported(turbomind_workspace) is False
for m, flag, _ in models:
assert is_supported(m) is flag
    def test_turbomind_is_supported(self, turbomind_workspace, models):
from lmdeploy.turbomind.supported_models import is_supported
assert is_supported(turbomind_workspace) is True
for m, _, flag in models:
assert is_supported(m) is flag
def test_autoget_backend(self, turbomind_workspace, models):
from lmdeploy.archs import autoget_backend
assert autoget_backend(turbomind_workspace) == 'turbomind'
n = len(models)
choices = np.random.choice(n, n // 2, replace=False)
for i in choices:
model, is_support_pytorch, is_support_turbomind = models[i]
target = 'turbomind' if is_support_turbomind else 'pytorch'
backend = autoget_backend(model)
assert backend == target
def test_autoget_backend_config(self, turbomind_workspace):
from lmdeploy.archs import autoget_backend_config
from lmdeploy.messages import (PytorchEngineConfig,
TurbomindEngineConfig)
assert type(autoget_backend_config(
turbomind_workspace)) is TurbomindEngineConfig
assert type(autoget_backend_config(
'internlm/internlm-chat-7b')) is TurbomindEngineConfig
assert type(
autoget_backend_config(
'mistralai/Mistral-7B-Instruct-v0.1')) is PytorchEngineConfig
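
# --- New test file: turbomind model-type detection ---
# get_model_from_config should map each downloaded config.json to the
# expected turbomind model name.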
import os
import pytest
from lmdeploy.turbomind.utils import get_model_from_config
@pytest.mark.parametrize('item',
[('baichuan-inc/Baichuan-7B', 'baichuan'),
('baichuan-inc/Baichuan2-7B-Base', 'baichuan2'),
('internlm/internlm2-7b', 'internlm2'),
('internlm/internlm2-chat-7b', 'internlm2'),
('internlm/internlm2-math-20b', 'internlm2'),
('internlm/internlm-20b', 'llama'),
('NousResearch/Llama-2-7b-chat-hf', 'llama'),
('Qwen/Qwen-7B-Chat', 'qwen'),
('Qwen/Qwen-14B', 'qwen'),
('NousResearch/Nous-Hermes-2-SOLAR-10.7B', 'llama'),
('01-ai/Yi-34B-Chat', 'llama')])
def test_get_model_from_config(item):
from transformers.utils import cached_file
model_id, result = item
local_file = cached_file(model_id, 'config.json')
local_dir = os.path.dirname(local_file)
print(get_model_from_config(local_dir))
assert get_model_from_config(local_dir) == result
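
# --- New test file: generation config conversion ---
# EngineGenerationConfig.From should copy GenerationConfig fields and convert
# stop words to token ids.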
from typing import List
from lmdeploy import EngineGenerationConfig, GenerationConfig, Tokenizer
def test_engine_generation_config():
tokenizer = Tokenizer('internlm/internlm-chat-7b')
config = GenerationConfig(n=3, stop_words=['<eoa>'])
_config = EngineGenerationConfig.From(config, tokenizer)
assert _config.n == config.n == 3 and \
_config.max_new_tokens == config.max_new_tokens and \
_config.temperature == config.temperature
assert isinstance(_config.stop_words, List) and \
isinstance(_config.stop_words[0], int)
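
# --- New test file: vision-language chat templates ---
# VLChatTemplateWrapper should turn prompts (optionally with PIL images) into
# messages and render messages into a prompt containing IMAGE_TOKEN.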
import PIL
from lmdeploy.model import MODELS
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy.vl.templates import VLChatTemplateWrapper
def test_prompt_to_messages():
model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
    out = template.prompt_to_messages('hi')
assert isinstance(out, list) and isinstance(out[0], dict)
im = PIL.Image.new(mode='RGB', size=(200, 200))
    out = template.prompt_to_messages(('hi', [im]))
assert isinstance(out, list) and isinstance(out[0], dict)
def test_messages2prompt():
model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
messages = [{
'role':
'user',
'content': [{
'type': 'text',
'text': 'hi'
}, {
'type': 'image_url',
'image_url': {
'url': 'xxx'
}
}]
}]
    prompt = template.messages2prompt(messages)
assert isinstance(prompt, str)
assert prompt.count(IMAGE_TOKEN) == 1