OpenDAS / Lmdeploy · Commits

Commit fe851fbc, authored Mar 24, 2024 by zhouxiang
Commit message: Add supplementary files for version 0.2.6
parent e2d98ddc
Changes: 220. Showing 20 changed files with 2065 additions and 0 deletions (+2065 −0).
tests/pytorch/engine/test_logits_process.py        +115  −0
tests/pytorch/engine/test_request.py                +59  −0
tests/pytorch/kernel/test_apply_rotary.py          +102  −0
tests/pytorch/kernel/test_fill_kv_cache.py         +143  −0
tests/pytorch/kernel/test_fused_rotary_emb.py      +142  −0
tests/pytorch/kernel/test_mbgmm.py                 +134  −0
tests/pytorch/kernel/test_mbgmv.py                 +123  −0
tests/pytorch/kernel/test_multinomial_sampling.py   +62  −0
tests/pytorch/kernel/test_paged_attention.py       +293  −0
tests/pytorch/kernel/test_rearange_all_gather.py    +83  −0
tests/pytorch/kernel/test_rms_norm.py               +38  −0
tests/pytorch/paging/test_block_manager.py         +291  −0
tests/pytorch/paging/test_scheduler.py             +194  −0
tests/pytorch/tools/test_layout_convert.py          +28  −0
tests/pytorch/tools/test_make_inputs.py             +82  −0
tests/test_lmdeploy/test_async_engine.py            +22  −0
tests/test_lmdeploy/test_auto_backend.py            +77  −0
tests/test_lmdeploy/test_get_model.py               +26  −0
tests/test_lmdeploy/test_messages.py                +15  −0
tests/test_lmdeploy/test_vl_template.py             +36  −0
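All of the added files are pytest-based unit tests. As a rough, hypothetical illustration only (not part of this commit), the new suite could be collected and run from the repository root roughly as sketched below; the kernel tests assume a CUDA-capable GPU, and the window-attention case in test_paged_attention.py additionally imports flash-attn.

    # Hypothetical invocation sketch, not part of the commit.
    # Runs the newly added PyTorch and lmdeploy tests verbosely.
    import pytest

    # Assumes this is executed from the repository root and that a CUDA
    # device (plus flash-attn for the window-attention test) is available.
    pytest.main(['tests/pytorch', 'tests/test_lmdeploy', '-v'])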
tests/pytorch/engine/test_logits_process.py  (new file, mode 100644)

import pytest
import torch
from transformers.generation.logits_process import (RepetitionPenaltyLogitsProcessor,
                                                     TemperatureLogitsWarper,
                                                     TopKLogitsWarper,
                                                     TopPLogitsWarper)


@pytest.mark.parametrize('inplace', [True, False])
def test_process_temperature(inplace):
    from lmdeploy.pytorch.engine.logits_process import _process_temperature

    batch_size = 4
    num_tokens = 16
    scores = torch.rand(batch_size, num_tokens)
    temperatures = torch.rand(batch_size)

    gt = []
    for score, temperature in zip(scores, temperatures):
        warper = TemperatureLogitsWarper(temperature.item())
        gt.append(warper(None, score[None]))
    gt = torch.cat(gt)

    out = _process_temperature(scores, temperatures, inplace=inplace)
    torch.testing.assert_close(out, gt)


@pytest.mark.parametrize('inplace', [True, False])
def test_process_bad_words(inplace):
    from lmdeploy.pytorch.engine.logits_process import _process_bad_words

    filter_value: float = -float('inf')
    batch_size = 4
    num_tokens = 16
    scores = torch.rand(batch_size, num_tokens)
    bad_words = torch.tensor([
        [0, 1],
        [3, -1],
        [4, 4],
        [-1, -1],
    ])

    out_scores = _process_bad_words(scores, bad_words, inplace=inplace)

    for score, bw in zip(out_scores, bad_words):
        bw = bw.tolist()
        for w in bw:
            if w >= 0:
                assert score[w] == filter_value


@pytest.mark.parametrize('inplace', [True, False])
def test_process_repetition_penalty(inplace):
    from lmdeploy.pytorch.engine.logits_process import \
        _process_repetition_penalty

    batch_size = 4
    num_tokens = 16
    scores = torch.rand(batch_size, num_tokens)
    input_ids = torch.tensor([
        [0, 1],
        [3, 6],
        [4, 4],
        [0, 0],
    ])
    penalties = 1 + torch.rand(batch_size)

    gt = []
    for score, ids, penalty in zip(scores, input_ids, penalties):
        warper = RepetitionPenaltyLogitsProcessor(penalty.item())
        gt.append(warper(ids[None], score[None].clone()))
    gt = torch.cat(gt)

    out = _process_repetition_penalty(scores, input_ids, penalties,
                                      inplace=inplace)
    torch.testing.assert_close(out, gt)


@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topk_sorted(inplace):
    from lmdeploy.pytorch.engine.logits_process import _filter_topk_sorted

    batch_size = 4
    num_tokens = 16
    scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
    top_k = torch.randint(4, num_tokens - 4, (batch_size, ))

    gt = []
    for score, k in zip(scores, top_k):
        warper = TopKLogitsWarper(k.item())
        gt.append(warper(None, score[None].clone()))
    gt = torch.cat(gt)

    out = _filter_topk_sorted(scores, top_k, inplace=inplace)
    torch.testing.assert_close(out, gt)


@pytest.mark.parametrize('inplace', [True, False])
def test_filter_topp_sorted(inplace):
    from lmdeploy.pytorch.engine.logits_process import _filter_topp_sorted

    batch_size = 4
    num_tokens = 16
    scores = torch.rand(batch_size, num_tokens).sort(1, descending=True)[0]
    top_p = torch.rand(batch_size)

    gt = []
    for score, p in zip(scores, top_p):
        warper = TopPLogitsWarper(p.item())
        gt.append(warper(None, score[None].clone()))
    gt = torch.cat(gt)

    out = _filter_topp_sorted(scores, top_p, inplace=inplace)
    torch.testing.assert_close(out, gt)

tests/pytorch/engine/test_request.py  (new file, mode 100644)

import asyncio

import pytest

from lmdeploy.pytorch.engine.request import (RequestManager, RequestType,
                                             Response, ResponseType)


class TestRequestHander:

    @pytest.fixture
    def event_loop(self):
        old_loop = asyncio.get_event_loop()
        new_loop = asyncio.new_event_loop()
        yield new_loop
        new_loop.stop()
        asyncio.set_event_loop(old_loop)

    @pytest.fixture
    def thread_safe(self, request):
        yield request.param

    @pytest.fixture
    def manager(self, thread_safe):
        yield RequestManager(thread_safe=thread_safe)

    @pytest.mark.parametrize('thread_safe', [True, False])
    def test_bind(self, manager, event_loop):

        def __stop_engine_callback(reqs, **kwargs):
            for req in reqs:
                manager.response(
                    Response(type=ResponseType.SUCCESS,
                             sender_id=req.sender_id,
                             req_id=req.req_id,
                             data=f'{req.data} success'))

        async def __dummy_loop():
            while True:
                manager.step()
                await asyncio.sleep(0.1)

        asyncio.set_event_loop(event_loop)
        sender = manager.build_sender()
        manager.start_loop(__dummy_loop)

        # test not bind
        req_id = sender.send_async(RequestType.STOP_ENGINE, None)
        resp = sender.recv(req_id)
        assert resp.type == ResponseType.HANDLER_NOT_EXIST
        assert manager.is_loop_alive()

        # test bind success
        sender.send_async(RequestType.STOP_ENGINE, None)
        manager.bind_func(RequestType.STOP_ENGINE, __stop_engine_callback)
        req_id = sender.send_async(RequestType.STOP_ENGINE, 'test')
        resp = sender.recv(req_id)
        assert resp.data == 'test success'

tests/pytorch/kernel/test_apply_rotary.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.kernels import apply_rotary_pos_emb


def _rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)


class TestApplyRotary:

    @pytest.fixture
    def dtype(self, request):
        yield request.param

    @pytest.fixture
    def batch_size(self):
        yield 4

    @pytest.fixture
    def num_heads_q(self, request):
        yield request.param

    @pytest.fixture
    def num_heads_k(self, request):
        yield request.param

    @pytest.fixture
    def feature_dim(self):
        yield 16

    @pytest.fixture
    def seq_length(self, batch_size):
        yield torch.randint(8, 16, (batch_size, ), device='cuda')

    @pytest.fixture
    def max_seqlen(self, seq_length):
        yield seq_length.max()

    @pytest.fixture
    def q_states(self, seq_length, num_heads_q, feature_dim, dtype):
        yield torch.rand(seq_length.sum(),
                         num_heads_q,
                         feature_dim,
                         dtype=dtype,
                         device='cuda')

    @pytest.fixture
    def k_states(self, seq_length, num_heads_k, feature_dim, dtype):
        yield torch.rand(seq_length.sum(),
                         num_heads_k,
                         feature_dim,
                         dtype=dtype,
                         device='cuda')

    @pytest.fixture
    def position_ids_1d(self, seq_length, max_seqlen):
        yield torch.randint(0,
                            max_seqlen.item(),
                            (seq_length.sum().item(), ),
                            device='cuda')

    @pytest.fixture
    def cached_cos(self, max_seqlen, feature_dim, dtype):
        yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')

    @pytest.fixture
    def cached_sin(self, max_seqlen, feature_dim, dtype):
        yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')

    @pytest.fixture
    def gt(self, q_states, k_states, cached_cos, cached_sin, position_ids_1d):
        cos = cached_cos[position_ids_1d, None, :]
        sin = cached_sin[position_ids_1d, None, :]
        q_embed = q_states * cos + _rotate_half(q_states) * sin
        k_embed = k_states * cos + _rotate_half(k_states) * sin
        yield q_embed, k_embed

    @pytest.mark.parametrize('dtype',
                             [torch.bfloat16, torch.float16, torch.float32],
                             indirect=True)
    @pytest.mark.parametrize(('num_heads_q', 'num_heads_k'), [(8, 8), (8, 4)],
                             indirect=True)
    def test_apply_rotary(self, q_states, k_states, cached_cos, cached_sin,
                          position_ids_1d, gt):
        q_embed, k_embed = apply_rotary_pos_emb(q_states, k_states,
                                                cached_cos, cached_sin, None,
                                                position_ids_1d)
        q_gt, k_gt = gt

        rtol = None
        atol = None
        if q_states.dtype == torch.float16:
            rtol = 1e-5
            atol = 1e-3
        torch.testing.assert_close(q_embed, q_gt, rtol=rtol, atol=atol)
        torch.testing.assert_close(k_embed, k_gt, rtol=rtol, atol=atol)

tests/pytorch/kernel/test_fill_kv_cache.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.kernels.fill_kv_cache import fill_kv_cache


def _div_up(a, b):
    return (a + b - 1) // b


class TestFillKVCache:

    @pytest.fixture
    def num_heads(self):
        yield 4

    @pytest.fixture
    def head_dim(self):
        yield 32

    @pytest.fixture
    def block_size(self):
        yield 16

    @pytest.fixture
    def seq_lens(self, request):
        yield request.param

    @pytest.fixture
    def history_lens(self, request):
        yield request.param

    @pytest.fixture
    def batch_size(self, seq_lens):
        yield len(seq_lens)

    @pytest.fixture
    def kv_lens(self, seq_lens, history_lens):
        yield [s + h for s, h in zip(seq_lens, history_lens)]

    @pytest.fixture
    def max_q_seq_length(self, seq_lens):
        yield max(seq_lens)

    @pytest.fixture
    def num_tokens(self, seq_lens):
        yield sum(seq_lens)

    @pytest.fixture
    def num_blocks_per_input(self, kv_lens, block_size):
        yield [_div_up(kv_len, block_size) for kv_len in kv_lens]

    @pytest.fixture
    def max_num_blocks(self, num_blocks_per_input):
        yield max(num_blocks_per_input)

    @pytest.fixture
    def q_seq_length(self, seq_lens):
        yield torch.tensor(seq_lens).cuda()

    @pytest.fixture
    def q_start_loc(self, q_seq_length):
        cum_seq_length = q_seq_length.cumsum(0)
        yield cum_seq_length - q_seq_length

    @pytest.fixture
    def kv_seq_length(self, kv_lens):
        yield torch.tensor(kv_lens).cuda()

    @pytest.fixture
    def k_states(self, num_tokens, num_heads, head_dim):
        yield torch.rand(num_tokens, num_heads, head_dim).cuda()

    @pytest.fixture
    def v_states(self, k_states):
        yield torch.rand_like(k_states)

    @pytest.fixture
    def k_caches(self, batch_size, max_num_blocks, block_size, num_heads,
                 head_dim):
        shape = (batch_size * max_num_blocks, block_size, num_heads, head_dim)
        yield torch.full(shape, 0.0).cuda()

    @pytest.fixture
    def v_caches(self, k_caches):
        yield torch.rand_like(k_caches)

    @pytest.fixture
    def block_offsets(self, num_blocks_per_input):
        batch_size = len(num_blocks_per_input)
        max_num_blocks = max(num_blocks_per_input)
        batch_ids = torch.arange(batch_size)
        ret = torch.arange(max_num_blocks)
        ret = batch_ids[:, None] + ret[None, :] * batch_size
        yield ret.cuda()

    @pytest.fixture
    def gt(self, k_states, v_states, k_caches, v_caches, seq_lens,
           history_lens, block_offsets, block_size):
        batch_size = len(seq_lens)
        k_caches = k_caches.clone()
        v_caches = v_caches.clone()
        splited_k_states = k_states.split(seq_lens)
        splited_v_states = v_states.split(seq_lens)
        for bidx in range(batch_size):
            k_state = splited_k_states[bidx]
            v_state = splited_v_states[bidx]
            h_len = history_lens[bidx]
            b_offs = block_offsets[bidx]
            block_id = _div_up(h_len + 1, block_size) - 1
            fill_start = h_len % block_size
            fill_size = min(block_size - fill_start, k_state.size(0))
            while True:
                boff = b_offs[block_id]
                tmp_ks = k_state[:fill_size]
                tmp_vs = v_state[:fill_size]
                fill_end = fill_start + fill_size
                k_caches[boff, fill_start:fill_end] = tmp_ks
                v_caches[boff, fill_start:fill_end] = tmp_vs
                k_state = k_state[fill_size:]
                v_state = v_state[fill_size:]
                block_id += 1
                fill_start = 0
                fill_size = min(block_size, k_state.size(0))
                if fill_size == 0:
                    break

        yield k_caches, v_caches

    @pytest.mark.parametrize(['seq_lens', 'history_lens'], [
        ((1, 1, 1, 1), (1, 16, 31, 24)),
        ((1, 8, 16, 24), (1, 16, 31, 24)),
    ],
                             indirect=True)
    def test_fill_kv_cache(self, k_states, v_states, k_caches, v_caches,
                           block_offsets, q_start_loc, q_seq_length,
                           kv_seq_length, max_q_seq_length, gt):
        fill_kv_cache(k_states, v_states, k_caches, v_caches, q_start_loc,
                      q_seq_length, kv_seq_length, max_q_seq_length,
                      block_offsets)

        torch.testing.assert_close(k_caches, gt[0])
        torch.testing.assert_close(v_caches, gt[1])

tests/pytorch/kernel/test_fused_rotary_emb.py  (new file, mode 100644)

import pytest
import torch
from torch import nn

from lmdeploy.pytorch.kernels.fused_rotary_emb import fused_rotary_emb


class DummyRotaryEmbedding(nn.Module):

    def __init__(self, dim, max_position_embeddings=2048, base=10000,
                 device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base**(torch.arange(
            0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer('inv_freq', inv_freq, persistent=False)

    def forward(self, x, position_ids, seq_len=None):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(
            position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()
        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos().to(dtype=x.dtype)
        sin = emb.sin().to(dtype=x.dtype)
        # backwards compatibility
        return cos, sin


class DummyLinearScalingRotaryEmbedding(DummyRotaryEmbedding):

    def __init__(self,
                 dim,
                 max_position_embeddings=2048,
                 base=10000,
                 device=None,
                 scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def forward(self, x, position_ids, seq_len=None):
        position_ids = position_ids.float() / self.scaling_factor
        cos, sin = super().forward(x, position_ids, seq_len)
        return cos, sin


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., :x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=2):
    """Applies Rotary Position Embedding to the query and key tensors."""
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


class TestFusedRotaryEmb:

    @pytest.fixture
    def dtype(self):
        yield torch.float16

    @pytest.fixture
    def batch_size(self):
        yield 2

    @pytest.fixture
    def head_dim(self):
        yield 64

    @pytest.fixture
    def q_num_heads(self):
        yield 4

    @pytest.fixture
    def k_num_heads(self):
        yield 2

    @pytest.fixture
    def seq_len(self):
        yield 100

    @pytest.fixture
    def q(self, batch_size, seq_len, q_num_heads, head_dim, dtype):
        yield torch.rand(batch_size, seq_len, q_num_heads, head_dim,
                         dtype=dtype).to('cuda')

    @pytest.fixture
    def k(self, batch_size, seq_len, k_num_heads, head_dim, dtype):
        yield torch.rand(batch_size, seq_len, k_num_heads, head_dim,
                         dtype=dtype).to('cuda')

    @pytest.fixture
    def position_ids(self, batch_size, seq_len):
        yield torch.randint(0, seq_len + 100, (batch_size, seq_len)).cuda()

    @pytest.fixture
    def rotary_emb(self, head_dim):
        yield DummyLinearScalingRotaryEmbedding(head_dim,
                                                scaling_factor=1.0).to('cuda')

    @pytest.fixture
    def gt(self, q, k, position_ids, rotary_emb):
        with torch.inference_mode():
            cos, sin = rotary_emb(q, position_ids)
            yield apply_rotary_pos_emb(q, k, cos, sin,
                                       position_ids=position_ids)

    def test_fused_rotary_emb(self, q, k, position_ids, rotary_emb, gt):
        inv_freq = rotary_emb.inv_freq
        scaling_factor = rotary_emb.scaling_factor
        with torch.inference_mode():
            outq, outk = fused_rotary_emb(q,
                                          k,
                                          position_ids,
                                          inv_freq,
                                          scaling_factor=scaling_factor)
        gtq, gtk = gt
        torch.testing.assert_close(outq, gtq, atol=1e-3, rtol=1e-5)
        torch.testing.assert_close(outk, gtk, atol=1e-3, rtol=1e-5)

tests/pytorch/kernel/test_mbgmm.py  (new file, mode 100644)

import pytest
import torch
from torch.nn.utils.rnn import pad_sequence

from lmdeploy.pytorch.kernels.mbgmm import mbgmm_a, mbgmm_b


class TestMBGMM:

    @pytest.fixture
    def dtype(self):
        yield torch.float16

    @pytest.fixture
    def head_size(self):
        yield 32

    @pytest.fixture
    def out_head_size(self):
        yield 16

    @pytest.fixture
    def seq_lens(self):
        yield torch.tensor([2, 4, 6, 8]).cuda()

    @pytest.fixture
    def ranks(self):
        yield torch.tensor([2, 4]).cuda()

    @pytest.fixture
    def page_start(self, ranks):
        yield torch.zeros_like(ranks)

    @pytest.fixture
    def start_loc(self, seq_lens):
        yield seq_lens.cumsum(0) - seq_lens

    @pytest.fixture
    def input(self, seq_lens, head_size, dtype):
        total_len = seq_lens.sum()
        yield torch.rand(total_len, head_size, dtype=dtype).cuda()

    @pytest.fixture
    def adapter_ids(self, seq_lens, ranks):
        num_ranks = len(ranks)
        num_seqs = len(seq_lens)
        ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
        yield ret

    @pytest.fixture
    def scaling(self, adapter_ids):
        yield torch.ones(adapter_ids.size(0)).cuda()

    @pytest.fixture
    def lora_a(self, ranks, head_size, dtype):
        out = []
        for rank in ranks:
            w = torch.rand(head_size, rank, dtype=dtype).cuda()
            out.append(w)
        yield out

    @pytest.fixture
    def lora_b(self, ranks, out_head_size, dtype):
        out = []
        for rank in ranks:
            w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
            out.append(w)
        yield out

    @pytest.fixture
    def page_table(self, ranks):
        total_ranks = sum(ranks)
        index = torch.randperm(total_ranks)
        index = index.split(ranks.tolist())
        yield pad_sequence(index, batch_first=True).cuda()

    @pytest.fixture
    def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
        num_pages = sum(ranks)
        cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
        for index, r, w in zip(page_table, ranks, lora_a):
            cache[index[:r]] = w.t()
        yield cache

    @pytest.fixture
    def paged_lora_b(self, lora_b, ranks, page_table, head_size,
                     out_head_size, dtype):
        num_pages = sum(ranks)
        cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
        for index, r, w in zip(page_table, ranks, lora_b):
            cache[index[:r], :out_head_size] = w
        yield cache

    @pytest.fixture
    def gt(self, input, start_loc, seq_lens, adapter_ids, lora_a, lora_b):
        out = []
        for loc, s_len, r_id in zip(start_loc, seq_lens, adapter_ids):
            inp = input[loc:loc + s_len]
            l_a = lora_a[r_id]
            l_b = lora_b[r_id]
            out.append(inp @ l_a @ l_b)
        yield torch.cat(out)

    def test_mbgmm(self, input, paged_lora_a, paged_lora_b, out_head_size,
                   start_loc, seq_lens, adapter_ids, scaling, page_table,
                   ranks, page_start, gt):
        max_seq_len = max(seq_lens).item()
        max_rank = page_table.size(-1)
        xa = mbgmm_a(input,
                     paged_lora_a,
                     q_start_loc=start_loc,
                     q_seqlens=seq_lens,
                     adapter_ids=adapter_ids,
                     rank_page_table=page_table,
                     rank_page_start=page_start,
                     ranks=ranks,
                     max_seq_len=max_seq_len,
                     max_rank=max_rank)
        output = mbgmm_b(xa,
                         paged_lora_b[..., :out_head_size],
                         q_start_loc=start_loc,
                         q_seqlens=seq_lens,
                         adapter_ids=adapter_ids,
                         scaling=scaling,
                         rank_page_table=page_table,
                         rank_page_start=page_start,
                         ranks=ranks,
                         max_seq_len=max_seq_len,
                         max_rank=max_rank)
        torch.testing.assert_close(gt, output)

tests/pytorch/kernel/test_mbgmv.py  (new file, mode 100644)

import pytest
import torch
from torch.nn.utils.rnn import pad_sequence

from lmdeploy.pytorch.kernels.mbgmv import mbgmv_a, mbgmv_b


class TestMBGMV:

    @pytest.fixture
    def dtype(self):
        yield torch.float16

    @pytest.fixture
    def head_size(self):
        yield 64

    @pytest.fixture
    def out_head_size(self):
        yield 32

    @pytest.fixture
    def batch_size(self):
        yield 8

    @pytest.fixture
    def ranks(self):
        yield torch.tensor([2, 4]).cuda()

    @pytest.fixture
    def page_start(self, ranks):
        yield torch.zeros_like(ranks)

    @pytest.fixture
    def input(self, batch_size, head_size, dtype):
        x = torch.rand(batch_size, head_size, dtype=dtype).cuda()
        x -= 0.5
        yield x

    @pytest.fixture
    def adapter_ids(self, batch_size, ranks):
        num_ranks = len(ranks)
        ret = torch.randint(0, num_ranks, (batch_size, )).cuda()
        yield ret

    @pytest.fixture
    def scaling(self, adapter_ids):
        yield torch.ones(adapter_ids.size(0)).cuda()

    @pytest.fixture
    def lora_a(self, ranks, head_size, dtype):
        out = []
        for rank in ranks:
            w = torch.rand(head_size, rank, dtype=dtype).cuda()
            w -= 0.5
            out.append(w)
        yield out

    @pytest.fixture
    def lora_b(self, ranks, out_head_size, dtype):
        out = []
        for rank in ranks:
            w = torch.rand(rank, out_head_size, dtype=dtype).cuda()
            w -= 0.5
            out.append(w)
        yield out

    @pytest.fixture
    def page_table(self, ranks):
        total_ranks = sum(ranks)
        index = torch.randperm(total_ranks)
        index = index.split(ranks.tolist())
        yield pad_sequence(index, batch_first=True).cuda()

    @pytest.fixture
    def paged_lora_a(self, lora_a, ranks, page_table, head_size, dtype):
        num_pages = sum(ranks)
        cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
        for index, r, w in zip(page_table, ranks, lora_a):
            cache[index[:r]] = w.t()
        yield cache

    @pytest.fixture
    def paged_lora_b(self, lora_b, ranks, page_table, head_size,
                     out_head_size, dtype):
        num_pages = sum(ranks)
        cache = torch.empty(num_pages, head_size, dtype=dtype).cuda()
        for index, r, w in zip(page_table, ranks, lora_b):
            cache[index[:r], :out_head_size] = w
        yield cache

    @pytest.fixture
    def gt(self, input, adapter_ids, lora_a, lora_b):
        out = []
        for inp, r_id in zip(input, adapter_ids):
            inp = inp.unsqueeze(0)
            l_a = lora_a[r_id]
            l_b = lora_b[r_id]
            out.append(inp @ l_a @ l_b)
        yield torch.cat(out)

    def test_mbgmv(self, input, paged_lora_a, paged_lora_b, out_head_size,
                   adapter_ids, scaling, page_table, ranks, page_start, gt):
        max_rank = page_table.size(-1)
        xa = mbgmv_a(input,
                     paged_lora_a,
                     adapter_ids=adapter_ids,
                     rank_page_table=page_table,
                     rank_page_start=page_start,
                     ranks=ranks,
                     max_rank=max_rank)
        output = mbgmv_b(xa,
                         paged_lora_b[..., :out_head_size],
                         adapter_ids=adapter_ids,
                         scaling=scaling,
                         rank_page_table=page_table,
                         rank_page_start=page_start,
                         ranks=ranks,
                         max_rank=max_rank)
        torch.testing.assert_close(gt, output, atol=2e-3, rtol=1e-5)

tests/pytorch/kernel/test_multinomial_sampling.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.kernels import multinomial_sampling


class TestMultinomialSampling:

    @pytest.fixture
    def num_tokens(self, request):
        yield request.param

    @pytest.fixture
    def select_ids(self, request):
        yield request.param

    @pytest.fixture
    def batch_size(self, select_ids):
        yield len(select_ids)

    @pytest.fixture
    def dtype(self, request):
        yield request.param

    @pytest.fixture
    def scores(self, num_tokens, batch_size, select_ids, dtype):
        ret = torch.zeros(batch_size, num_tokens).cuda()
        batch_ids = torch.arange(batch_size).cuda()
        ret[batch_ids, select_ids] = 1
        ret = ret.to(dtype)
        yield ret

    @pytest.fixture
    def seeds(self, batch_size):
        yield torch.randint(1000, 2000, (batch_size, )).cuda()

    @pytest.fixture
    def offsets(self, batch_size):
        yield torch.randint(1000, 2000, (batch_size, )).cuda()

    @pytest.fixture
    def indices(self, scores):
        num_tokens = scores.size(1)
        ret = [torch.randperm(num_tokens) for _ in scores]
        ret = torch.stack(ret, 0).cuda()
        yield ret

    @pytest.fixture
    def gt(self, batch_size, select_ids, indices):
        batch_ids = torch.arange(batch_size).cuda()
        yield indices[batch_ids, select_ids]

    @pytest.mark.parametrize('dtype',
                             [torch.float32, torch.half, torch.bfloat16])
    @pytest.mark.parametrize(['num_tokens', 'select_ids'], [
        (8, (4, 2) * 30),
        (200, (50, 150)),
    ],
                             indirect=True)
    def test_multinomial_sampling(self, scores, seeds, offsets, indices, gt):
        output = multinomial_sampling(scores, seeds, offsets, indices)
        torch.testing.assert_close(output, gt)

tests/pytorch/kernel/test_paged_attention.py  (new file, mode 100644)

import math

import pytest
import torch


def _conti_input(data, seq_lens):
    data = [x[:l] for x, l in zip(data, seq_lens)]
    data = torch.cat(data, dim=0)
    return data


def _make_bias(seq_lens, history_lens, neg_val):
    full_seq_lens = seq_lens + history_lens
    max_seq_len = seq_lens.max().item()
    max_full_len = full_seq_lens.max().item()
    seq_ranges = [torch.arange(max_seq_len) for _ in seq_lens]
    for r, l in zip(seq_ranges, seq_lens):
        r[l:] = -max_full_len
    seq_ranges = torch.stack(seq_ranges, dim=0).cuda()
    kv_ranges = [torch.arange(max_full_len) for _ in full_seq_lens]
    kv_ranges = torch.stack(kv_ranges, 0).cuda()
    mask = (kv_ranges[:, None, :] - seq_ranges[:, :, None] >
            history_lens[:, None, None])
    return mask.float() * neg_val


def _make_blocked_cache(batched_k, batched_v, seq_lens, history_lens,
                        block_offsets, block_size, num_heads_k, feat_dim):
    max_blocks_nums = block_offsets.max() + 1
    full_seq_lens = seq_lens + history_lens
    blocked_k = batched_k.new_zeros(max_blocks_nums, block_size, num_heads_k,
                                    feat_dim)
    blocked_v = batched_v.new_zeros(max_blocks_nums, block_size, num_heads_k,
                                    feat_dim)
    for batch_id, offset in enumerate(block_offsets):
        ori_k = batched_k[batch_id]
        ori_v = batched_v[batch_id]
        seq_len = full_seq_lens[batch_id]
        for block_id, block_start in enumerate(range(0, seq_len, block_size)):
            block_off = offset[block_id]
            tmp_k = ori_k[block_start:block_start + block_size]
            tmp_v = ori_v[block_start:block_start + block_size]
            size = tmp_k.size(0)
            blocked_k[block_off, :size] = tmp_k
            blocked_v[block_off, :size] = tmp_v

    return blocked_k, blocked_v


def _naive_attention(batched_q, batched_kv, bias):
    batched_k, batched_v = batched_kv
    num_heads_q = batched_q.shape[2]
    num_heads_k = batched_k.shape[2]
    head_dim = batched_q.shape[-1]
    group = num_heads_q // num_heads_k

    q = batched_q.transpose(1, 2)
    k = batched_k.permute(0, 2, 3, 1)
    v = batched_v.transpose(1, 2)

    # expand group
    k = k.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)
    v = v.unsqueeze(2).expand(-1, -1, group, -1, -1).flatten(1, 2)

    qk = torch.matmul(q, k) / math.sqrt(head_dim)
    attn_weight = qk + bias[:, None]
    attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32)
    attn_weight = attn_weight.to(q.dtype)
    attn_output = torch.matmul(attn_weight, v)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output


def _naive_window_attention(q, k, v, seqlens_q, seqlens_k, window_size):
    from flash_attn import flash_attn_varlen_func

    def _make_cu_seqlens(seqlens):
        cu_seqlens = seqlens.cumsum(0)
        cu_zero = cu_seqlens.new_zeros(1)
        cu_seqlens = torch.cat([cu_zero, cu_seqlens])
        return cu_seqlens

    max_seqlen_q = seqlens_q.max().item()
    max_seqlen_k = seqlens_k.max().item()
    cu_seqlens_q = _make_cu_seqlens(seqlens_q).int()
    cu_seqlens_k = _make_cu_seqlens(seqlens_k).int()
    output = flash_attn_varlen_func(q,
                                    k,
                                    v,
                                    cu_seqlens_q,
                                    cu_seqlens_k,
                                    max_seqlen_q=max_seqlen_q,
                                    max_seqlen_k=max_seqlen_k,
                                    causal=True,
                                    window_size=window_size)
    return output


class TestPagedAttention:

    @pytest.fixture
    def dtype(self):
        yield torch.float16

    @pytest.fixture
    def feat_dim(self):
        yield 16

    @pytest.fixture
    def num_heads_q(self, request):
        yield request.param

    @pytest.fixture
    def num_heads_k(self, request):
        yield request.param

    @pytest.fixture
    def block_size(self, request):
        yield request.param

    @pytest.fixture
    def seq_lens(self, request):
        yield torch.tensor(request.param, device='cuda')

    @pytest.fixture
    def start_loc(self, seq_lens):
        seq_sum = seq_lens.cumsum(0)
        start_loc = torch.cat([seq_sum.new_zeros(1), seq_sum[:-1]], dim=0)
        yield start_loc

    @pytest.fixture
    def history_lens(self, request):
        yield torch.tensor(request.param, device='cuda')

    @pytest.fixture
    def batched_q(self, seq_lens, num_heads_q, feat_dim, dtype):
        torch.manual_seed(123)
        batch_size = len(seq_lens)
        max_seq_len = seq_lens.max().item()
        inputs = torch.rand(batch_size,
                            max_seq_len,
                            num_heads_q,
                            feat_dim,
                            dtype=dtype,
                            device='cuda')
        yield inputs

    @pytest.fixture
    def batched_kv(self, seq_lens, history_lens, num_heads_k, feat_dim,
                   dtype):
        torch.manual_seed(123)
        batch_size = len(seq_lens)
        full_seq_lens = seq_lens + history_lens
        max_seq_len = full_seq_lens.max().item()
        k = torch.rand(batch_size,
                       max_seq_len,
                       num_heads_k,
                       feat_dim,
                       dtype=dtype,
                       device='cuda')
        v = torch.rand(batch_size,
                       max_seq_len,
                       num_heads_k,
                       feat_dim,
                       dtype=dtype,
                       device='cuda')
        yield k, v

    @pytest.fixture
    def conti_q(self, seq_lens, batched_q):
        yield _conti_input(batched_q, seq_lens)

    @pytest.fixture
    def block_offsets(self, seq_lens, history_lens, block_size):
        full_seq_lens = seq_lens + history_lens
        batch_size = full_seq_lens.size(0)
        num_blocks = (full_seq_lens + block_size - 1) // block_size
        offset = [
            torch.arange(size) * batch_size + idx
            for idx, size in enumerate(num_blocks)
        ]
        max_len = max(len(o) for o in offset)
        new_offset = offset[0].new_zeros(batch_size, max_len)
        for o, no in zip(offset, new_offset):
            len_o = o.size(0)
            no[:len_o] = o
        yield new_offset.cuda()

    @pytest.fixture
    def conti_kv(self, batched_kv, seq_lens, history_lens):
        full_seq_lens = seq_lens + history_lens
        conti_k = _conti_input(batched_kv[0], full_seq_lens)
        conti_v = _conti_input(batched_kv[1], full_seq_lens)
        yield (conti_k, conti_v)

    @pytest.fixture
    def blocked_kv(self, batched_kv, seq_lens, history_lens, block_offsets,
                   block_size, num_heads_k, feat_dim):
        batched_k, batched_v = batched_kv
        yield _make_blocked_cache(batched_k, batched_v, seq_lens,
                                  history_lens, block_offsets, block_size,
                                  num_heads_k, feat_dim)

    @pytest.fixture
    def mask(self, seq_lens, history_lens):
        neg_val = -1e30
        yield _make_bias(seq_lens, history_lens, neg_val)

    @pytest.fixture
    def gt(self, batched_q, batched_kv, mask):
        yield _naive_attention(batched_q, batched_kv, mask)

    @pytest.fixture
    def conti_gt(self, gt, seq_lens):
        yield _conti_input(gt, seq_lens)

    @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
                             indirect=True)
    @pytest.mark.parametrize(['seq_lens', 'history_lens'],
                             [([30, 50, 70, 90], [50, 40, 30, 20]),
                              ([1, 1, 1, 1], [50, 40, 30, 20])],
                             indirect=True)
    @pytest.mark.parametrize('block_size', [2, 16], indirect=True)
    def test_paged_attention(self, conti_q, blocked_kv, block_offsets,
                             start_loc, seq_lens, history_lens, conti_gt):
        from lmdeploy.pytorch.kernels import paged_attention_fwd
        kv_seq_lens = seq_lens + history_lens
        max_seq_len = seq_lens.max().item()

        blocked_k, blocked_v = blocked_kv
        out = torch.empty_like(conti_q)
        paged_attention_fwd(conti_q,
                            blocked_k,
                            blocked_v,
                            out,
                            block_offsets=block_offsets,
                            q_start_loc=start_loc,
                            q_seqlens=seq_lens,
                            kv_seqlens=kv_seq_lens,
                            max_seqlen=max_seq_len)

        torch.testing.assert_close(out, conti_gt, atol=1e-3, rtol=1e-5)

    @pytest.fixture
    def win_size(self, request):
        yield request.param

    @pytest.fixture
    def window_gt(self, conti_q, conti_kv, seq_lens, history_lens, win_size):
        kv_lens = seq_lens + history_lens
        yield _naive_window_attention(conti_q,
                                      conti_kv[0],
                                      conti_kv[1],
                                      seq_lens,
                                      kv_lens,
                                      window_size=(win_size, win_size))

    @pytest.mark.parametrize(['num_heads_q', 'num_heads_k'], [(4, 2)],
                             indirect=True)
    @pytest.mark.parametrize(['seq_lens', 'history_lens'], [
        ([30, 50, 70, 90], [50, 40, 30, 20]),
        ([1, 1, 1, 1], [50, 40, 30, 20]),
    ],
                             indirect=True)
    @pytest.mark.parametrize('win_size', (32, ), indirect=True)
    @pytest.mark.parametrize('block_size', [16], indirect=True)
    def test_window_attention(self, conti_q, blocked_kv, block_offsets,
                              start_loc, seq_lens, history_lens, win_size,
                              window_gt):
        from lmdeploy.pytorch.kernels import paged_attention_fwd
        kv_seq_lens = seq_lens + history_lens
        max_seq_len = seq_lens.max().item()

        blocked_k, blocked_v = blocked_kv
        out = torch.empty_like(conti_q)
        paged_attention_fwd(conti_q,
                            blocked_k,
                            blocked_v,
                            out,
                            block_offsets=block_offsets,
                            q_start_loc=start_loc,
                            q_seqlens=seq_lens,
                            kv_seqlens=kv_seq_lens,
                            max_seqlen=max_seq_len,
                            window_size=win_size)
        torch.testing.assert_close(out, window_gt, atol=1e-3, rtol=1e-5)

tests/pytorch/kernel/test_rearange_all_gather.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.kernels.rearange_all_gather import rearange_all_gather


class TestRearangeAllGather:

    @pytest.fixture
    def seq_lens(self, request):
        yield torch.tensor(request.param, device='cuda')

    @pytest.fixture
    def start_loc(self, seq_lens):
        yield seq_lens.cumsum(0) - seq_lens

    @pytest.fixture
    def ranks(self):
        yield torch.tensor([4, 8]).cuda()

    @pytest.fixture
    def adapter_ids(self, seq_lens, ranks):
        num_ranks = len(ranks)
        num_seqs = len(seq_lens)
        ret = torch.randint(0, num_ranks, (num_seqs, )).cuda()
        yield ret

    @pytest.fixture
    def world_size(self):
        yield 2

    @pytest.fixture
    def input(self, seq_lens, ranks):
        max_rank = max(ranks)
        total_len = seq_lens.sum()
        yield torch.rand(total_len, max_rank).cuda()

    @pytest.fixture
    def rank_per_input(self, seq_lens, ranks, adapter_ids):
        token_adapter_ids = [
            torch.full((slen, ), ada_id)
            for slen, ada_id in zip(seq_lens, adapter_ids)
        ]
        token_adapter_ids = torch.cat(token_adapter_ids).cuda()
        yield ranks[token_adapter_ids]

    @pytest.fixture
    def valid_mask(self, rank_per_input, seq_lens, ranks):
        max_rank = max(ranks)
        total_len = seq_lens.sum()
        mask = torch.zeros(total_len, max_rank).to(bool)
        for r, m in zip(rank_per_input, mask):
            m[:r] = True
        yield mask.cuda()

    @pytest.fixture
    def gt(self, input, rank_per_input, ranks, world_size):
        max_rank = max(ranks)
        pranks = rank_per_input // world_size
        pmax_rank = max_rank // world_size
        output = torch.empty_like(input)
        for pr, inp, out in zip(pranks, input, output):
            pindex = torch.arange(pr).cuda()
            index = [pindex + ws * pmax_rank for ws in range(world_size)]
            index = torch.cat(index)
            out[:index.size(0)] = inp[index]
        yield output

    @pytest.mark.parametrize('seq_lens', [[30, 50, 70, 90], [1, 1, 1, 1]],
                             indirect=True)
    def test_gather(self, input, start_loc, seq_lens, adapter_ids, ranks,
                    world_size, gt, valid_mask):
        max_seq_len = max(seq_lens)
        output = rearange_all_gather(input,
                                     start_loc,
                                     seq_lens,
                                     adapter_ids,
                                     ranks,
                                     world_size,
                                     max_seq_len=max_seq_len)
        output = output.where(valid_mask, output.new_tensor(0))
        gt = gt.where(valid_mask, gt.new_tensor(0))
        torch.testing.assert_close(output, gt)

tests/pytorch/kernel/test_rms_norm.py  (new file, mode 100644)

import pytest
import torch


class TestRMSNorm:

    @pytest.fixture(scope='class')
    def dtype(self, request):
        yield request.param

    @pytest.fixture(scope='class')
    def input(self, dtype):
        yield torch.rand(4, 8, dtype=dtype, device='cuda')

    @pytest.fixture(scope='class')
    def weight(self, dtype):
        yield torch.rand(8, dtype=dtype, device='cuda')

    @pytest.fixture(scope='class')
    def eps(self):
        yield 1e-6

    @pytest.fixture(scope='class')
    def gt(self, input, weight, eps):
        input_dtype = input.dtype
        input = input.to(torch.float32)
        variance = input.pow(2).mean(-1, keepdim=True)
        input = input * torch.rsqrt(variance + eps)
        return weight * input.to(input_dtype)

    @pytest.mark.parametrize('dtype',
                             [torch.bfloat16, torch.float16, torch.float32],
                             indirect=True)
    def test_rms_norm(self, input, weight, eps, gt):
        from lmdeploy.pytorch.kernels import rms_norm
        out = rms_norm(input, weight, eps)
        torch.testing.assert_close(out, gt)

tests/pytorch/paging/test_block_manager.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.messages import SchedulerSession
from lmdeploy.pytorch.paging.block_manager import (DefaultBlockManager,
                                                   WindowBlockManager)
from lmdeploy.pytorch.paging.block_manager.base_block_manager import \
    LogicalAllocator  # noqa: E501


class TestAllocator:

    @pytest.fixture
    def num_gpu_blocks(self):
        yield 16

    @pytest.fixture
    def num_cpu_blocks(self):
        yield 4

    @pytest.fixture
    def allocator(self, num_cpu_blocks, num_gpu_blocks):
        yield LogicalAllocator(num_cpu_blocks, num_gpu_blocks)

    def test_alloc(self, allocator, num_cpu_blocks, num_gpu_blocks):
        # initialize
        num_blocks = num_cpu_blocks + num_gpu_blocks
        gpu_allocator = allocator.get_phy_allocator('gpu')
        cpu_allocator = allocator.get_phy_allocator('cpu')
        assert allocator.get_num_free_blocks() == num_blocks
        assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks
        assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks

        # test allocate
        block_size = 4
        blocks = allocator.allocate(block_size, 'gpu')
        assert len(blocks) == block_size
        assert allocator.get_num_free_blocks() == num_blocks - block_size
        assert gpu_allocator.get_num_free_blocks(
        ) == num_gpu_blocks - block_size

        # test free
        allocator.add_ref_count(blocks, 1)
        allocator.free(blocks)
        assert allocator.get_num_free_blocks() == num_blocks - block_size
        allocator.free(blocks)
        assert allocator.get_num_free_blocks() == num_blocks
        assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
        assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks

    def test_full(self, allocator, num_cpu_blocks, num_gpu_blocks):
        num_blocks = num_cpu_blocks + num_gpu_blocks
        gpu_allocator = allocator.get_phy_allocator('gpu')
        cpu_allocator = allocator.get_phy_allocator('cpu')

        # no free blocks
        gpu_block_size = num_gpu_blocks
        gpu_blocks = allocator.allocate(gpu_block_size, 'gpu')
        cpu_block_size = num_cpu_blocks
        cpu_blocks = allocator.allocate(cpu_block_size, 'cpu')
        assert cpu_allocator.get_num_free_blocks() == 0
        assert gpu_allocator.get_num_free_blocks() == 0
        with pytest.raises(MemoryError):
            allocator.allocate(1, 'gpu')
        allocator.free(gpu_blocks)
        allocator.free(cpu_blocks)
        assert allocator.get_num_free_blocks() == num_blocks
        assert gpu_allocator.get_num_free_blocks() == num_gpu_blocks
        assert cpu_allocator.get_num_free_blocks() == num_cpu_blocks


class TestDefaultBlockManager:

    @pytest.fixture
    def block_size(self):
        yield 16

    @pytest.fixture
    def num_cpu_blocks(self):
        yield 4

    @pytest.fixture
    def num_gpu_blocks(self):
        yield 4

    @pytest.fixture
    def block_mgr(self, num_cpu_blocks, num_gpu_blocks):
        yield DefaultBlockManager(num_cpu_blocks, num_gpu_blocks)

    def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
        sess = SchedulerSession(0, block_size)

        # test alloc
        token_ids = torch.tensor([1])
        msg = sess.add_sequence(token_ids)
        assert block_mgr.can_allocate(msg)
        block_mgr.allocate(msg)
        block_table = block_mgr.get_block_table(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
        assert block_table is not None
        assert len(block_table) == 1

        # test free
        block_mgr.free(msg)
        block_table = block_mgr.get_block_table(msg)
        assert block_table is None or len(block_table) == 0
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks

        # alloc over limit
        token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
                                dtype=torch.int64)
        msg = sess.add_sequence(token_ids)
        assert not block_mgr.can_allocate(msg)

    def test_append_slot(self, block_mgr, block_size, num_gpu_blocks):
        sess = SchedulerSession(0, block_size)

        # test append
        token_ids = torch.tensor([1])
        msg = sess.add_sequence(token_ids)
        block_mgr.allocate(msg)
        block_table = block_mgr.get_block_table(msg)
        assert len(block_table) == 1

        # no new logical block
        msg.update_token_ids(torch.tensor([1] * (block_size - 1)))
        assert block_mgr.can_append_slot(msg)
        block_mgr.append_slot(msg)
        block_table = block_mgr.get_block_table(msg)
        assert len(block_table) == 1
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1

        # with new logical block
        msg.update_token_ids(torch.tensor([1]))
        block_mgr.append_slot(msg)
        block_table = block_mgr.get_block_table(msg)
        assert len(block_table) == 2
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2

    def test_fork(self, block_mgr, block_size, num_gpu_blocks):
        sess = SchedulerSession(0, block_size)

        token_ids = torch.tensor([1] * (block_size * 2 + 1))
        from_msg = sess.add_sequence(token_ids)
        block_mgr.allocate(from_msg)
        from_block_table = block_mgr.get_block_table(from_msg)
        assert len(from_block_table) == 3

        to_msg = sess.fork_sequence(torch.tensor([1]), from_msg)

        # fork
        assert block_mgr.can_fork(from_msg)
        copy_map = block_mgr.fork(from_msg, to_msg)
        block_table = block_mgr.get_block_table(to_msg)
        assert len(block_table) == 3
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 4
        assert block_table[0] == from_block_table[0]
        assert block_table[1] == from_block_table[1]
        assert block_table[2] != from_block_table[2]
        assert len(copy_map) == 1
        assert copy_map[from_block_table[2]] == block_table[2]

        # can not fork
        assert not block_mgr.can_fork(from_msg)

    def test_swap(self, block_mgr, block_size, num_gpu_blocks):
        sess = SchedulerSession(0, block_size)

        token_ids = torch.tensor([1] * (block_size + 1))
        msg = sess.add_sequence(token_ids)
        block_mgr.allocate(msg)

        old_phy_blocks = block_mgr.get_block_table(msg)
        success, swap_map = block_mgr.try_swap_out(msg)
        new_phy_blocks = block_mgr.get_block_table(msg)
        assert success
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks
        assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks - 2
        assert len(swap_map) == 2
        for block_id in old_phy_blocks:
            assert block_id in swap_map
        for block_id in new_phy_blocks:
            assert block_id - num_gpu_blocks in swap_map.values()

        old_phy_blocks = block_mgr.get_block_table(msg)
        success, swap_map = block_mgr.try_swap_in(msg)
        new_phy_blocks = block_mgr.get_block_table(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
        assert block_mgr.get_num_free_cpu_blocks() == num_gpu_blocks
        assert len(swap_map) == 2
        for block_id in old_phy_blocks:
            assert block_id - num_gpu_blocks in swap_map
        for block_id in new_phy_blocks:
            assert block_id in swap_map.values()

        success, swap_map = block_mgr.try_swap_out(msg)
        assert success
        token_ids = torch.tensor([1] * (block_size * 4))
        msg_full = sess.add_sequence(token_ids)
        block_mgr.allocate(msg_full)
        success, swap_map = block_mgr.try_swap_out(msg)
        assert not success


class TestWindowBlockManager:

    @pytest.fixture
    def window_size(self):
        yield 32

    @pytest.fixture
    def block_size(self):
        yield 16

    @pytest.fixture
    def num_cpu_blocks(self):
        yield 4

    @pytest.fixture
    def num_gpu_blocks(self):
        yield 4

    @pytest.fixture
    def block_mgr(self, num_cpu_blocks, num_gpu_blocks, window_size):
        yield WindowBlockManager(num_cpu_blocks, num_gpu_blocks, window_size)

    def test_alloc(self, block_mgr, block_size, num_gpu_blocks):
        sess = SchedulerSession(0, block_size)

        # test alloc
        token_ids = torch.tensor([1])
        msg = sess.add_sequence(token_ids)
        assert block_mgr.can_allocate(msg)
        block_mgr.allocate(msg)
        block_table = block_mgr.get_block_table(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 1
        assert block_table is not None
        assert len(block_table) == 1

        # test free
        block_mgr.free(msg)
        block_table = block_mgr.get_block_table(msg)
        assert block_table is None or len(block_table) == 0
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks

        # alloc over limit
        token_ids = torch.zeros((num_gpu_blocks * block_size + 1, ),
                                dtype=torch.int64)
        msg = sess.add_sequence(token_ids)
        assert not block_mgr.can_allocate(msg)

    def test_win_alloc(self, block_mgr, block_size, num_gpu_blocks,
                       window_size):
        sess = SchedulerSession(0, block_size)

        # 2 win block
        token_ids = torch.tensor([1] * window_size)
        msg = sess.add_sequence(token_ids)
        block_mgr.allocate(msg)
        msg.update_token_ids(torch.tensor([1]))
        block_mgr.allocate(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
        block_table = block_mgr.get_block_table(msg)
        assert block_table is None or len(block_table) == 3
        block_mgr.free(msg)

        # 3 win block
        token_ids = torch.tensor([1] * (window_size + 2))
        msg = sess.add_sequence(token_ids)
        block_mgr.allocate(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
        msg.update_token_ids(torch.tensor([1]))
        block_mgr.allocate(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 3
        block_table = block_mgr.get_block_table(msg)
        assert block_table is None or len(block_table) == 3
        block_mgr.free(msg)

        # not full win
        token_ids = torch.tensor([1] * (window_size - 2))
        msg = sess.add_sequence(token_ids)
        block_mgr.allocate(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
        msg.update_token_ids(torch.tensor([1]))
        block_mgr.allocate(msg)
        assert block_mgr.get_num_free_gpu_blocks() == num_gpu_blocks - 2
        block_table = block_mgr.get_block_table(msg)
        assert block_table is None or len(block_table) == 2
        block_mgr.free(msg)

tests/pytorch/paging/test_scheduler.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
from lmdeploy.pytorch.messages import MessageStatus
from lmdeploy.pytorch.paging.scheduler import Scheduler


class TestScheduler:

    @pytest.fixture
    def block_size(self):
        yield 16

    @pytest.fixture
    def num_cpu_blocks(self):
        yield 4

    @pytest.fixture
    def num_gpu_blocks(self):
        yield 4

    @pytest.fixture
    def cache_config(self, block_size, num_cpu_blocks, num_gpu_blocks):
        yield CacheConfig(block_size=block_size,
                          num_cpu_blocks=num_cpu_blocks,
                          num_gpu_blocks=num_gpu_blocks)

    @pytest.fixture
    def scheduler_config(self):
        yield SchedulerConfig(max_batches=4,
                              max_session_len=128,
                              max_request_output_len=64,
                              eviction_type='copy')

    @pytest.fixture
    def scheduler(self, cache_config, scheduler_config):
        yield Scheduler(scheduler_config=scheduler_config,
                        cache_config=cache_config)

    def test_schedule_base(self, scheduler, block_size, num_gpu_blocks):
        block_manager = scheduler.block_manager
        session_id = 0
        session = scheduler.add_session(session_id)
        assert session_id in scheduler.sessions
        assert scheduler.sessions[session_id] == session

        num_blocks = 2
        token_ids = torch.tensor([0] * block_size * num_blocks)
        seq = session.add_sequence(token_ids)
        scheduler.add_sequence(seq)
        assert seq.status == MessageStatus.WAITING
        assert seq in scheduler.waiting

        output = scheduler.schedule(is_prefill=True)
        block_tables = scheduler.get_block_tables(output.running)
        assert seq.status == MessageStatus.RUNNING
        assert seq in output.running
        assert len(block_tables) == 1
        assert len(block_tables[0]) == num_blocks
        assert block_manager.get_num_free_gpu_blocks(
        ) == num_gpu_blocks - num_blocks
        assert scheduler.has_unfinished()

    def test_update(self, scheduler, block_size, num_gpu_blocks):
        block_manager = scheduler.block_manager
        session_id1 = 0
        session1 = scheduler.add_session(session_id1)
        token_ids1 = torch.tensor([0] * block_size * 1)
        seq1 = session1.add_sequence(token_ids1)
        scheduler.add_sequence(seq1)
        session_id2 = 1
        session2 = scheduler.add_session(session_id2)
        token_ids2 = torch.tensor([0] * block_size * 2)
        seq2 = session2.add_sequence(token_ids2)
        scheduler.add_sequence(seq2)
        token_ids3 = torch.tensor([0] * block_size * 3)
        seq3 = session2.add_sequence(token_ids3)
        scheduler.add_sequence(seq3)

        scheduler.schedule(is_prefill=True)
        assert seq1.status == MessageStatus.RUNNING
        assert seq2.status == MessageStatus.RUNNING
        assert seq3.status == MessageStatus.WAITING

        # stop seq
        seq1.status = MessageStatus.STOPPED
        scheduler.update()
        assert len(scheduler.running) == 1
        assert seq1 in scheduler.hanging

        # end seq
        seq1.status = MessageStatus.ENDED
        scheduler.update()
        assert session_id1 in scheduler.sessions
        assert seq1 not in scheduler.running
        assert seq1 not in scheduler.hanging
        assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 2

        # stop session
        scheduler.stop_session(session_id2)
        scheduler.update()
        assert len(scheduler.running) == 0
        assert len(scheduler.waiting) == 0
        assert len(scheduler.hanging) == 2

        # end session
        scheduler.end_session(session_id2)
        scheduler.update()
        assert seq2.status == MessageStatus.ENDED
        assert seq3.status == MessageStatus.ENDED
        assert session_id2 not in scheduler.sessions
        assert len(scheduler.hanging) == 0
        assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks

    def test_swap(self, scheduler, block_size, num_gpu_blocks,
                  num_cpu_blocks):
        block_manager = scheduler.block_manager
        session_id = 0
        session = scheduler.add_session(session_id)

        # test: add 3 seq
        token_ids1 = torch.tensor([0] * block_size * 1)
        seq1 = session.add_sequence(token_ids1)
        scheduler.add_sequence(seq1)
        token_ids2 = torch.tensor([0] * block_size * 2)
        seq2 = session.add_sequence(token_ids2)
        scheduler.add_sequence(seq2)
        token_ids3 = torch.tensor([0] * block_size * 3)
        seq3 = session.add_sequence(token_ids3)
        scheduler.add_sequence(seq3)

        scheduler.schedule(is_prefill=True)
        # seq1: 1 running gpu
        # seq2: 2 running gpu
        # seq3: 3 waiting empty
        assert seq1.status == MessageStatus.RUNNING
        assert seq2.status == MessageStatus.RUNNING
        assert seq3.status == MessageStatus.WAITING
        assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - 3

        # test: waiting alloc
        seq2.status = MessageStatus.STOPPED
        scheduler.update()
        assert len(scheduler.running) == 1
        assert len(scheduler.waiting) == 1
        assert len(scheduler.hanging) == 1
        output = scheduler.schedule(is_prefill=True)
        # seq1: 1 running gpu
        # seq2: 2 hanging cpu
        # seq3: 3 waiting gpu
        assert seq1.status == MessageStatus.RUNNING
        assert seq2.status == MessageStatus.STOPPED
        assert seq3.status == MessageStatus.RUNNING
        assert block_manager.get_num_free_gpu_blocks() == 0
        assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 2
        assert len(output.swap_out_map) == 2

        # test: waiting append token
        seq2.status = MessageStatus.WAITING
        seq3.status = MessageStatus.ENDED
        seq2.update_token_ids(torch.tensor([1] * block_size))
        scheduler.update()
        assert len(scheduler.running) == 1
        assert len(scheduler.waiting) == 1
        assert len(scheduler.hanging) == 0
        output = scheduler.schedule(is_prefill=True)
        # seq1: 1 running gpu
        # seq2: 3 running gpu
        # seq3: 3 nan
        assert seq1.status == MessageStatus.RUNNING
        assert seq2.status == MessageStatus.RUNNING
        assert block_manager.get_num_free_gpu_blocks() == 0
        assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks
        assert len(output.swap_in_map) == 2

        # test running append
        seq1.update_token_ids(torch.tensor([1] * block_size))
        seq2.update_token_ids(torch.tensor([1] * block_size))
        scheduler.update()
        assert len(scheduler.running) == 2
        output = scheduler.schedule(is_prefill=False)
        # seq1: 1 waiting cpu
        # seq2: 4 running gpu
        # seq3: 3 nan
        assert seq1.status == MessageStatus.WAITING
        assert seq2.status == MessageStatus.RUNNING
        assert block_manager.get_num_free_gpu_blocks() == 0
        assert block_manager.get_num_free_cpu_blocks() == num_cpu_blocks - 1
        assert len(output.swap_out_map) == 1

tests/pytorch/tools/test_layout_convert.py  (new file, mode 100644)

import pytest
import torch

from lmdeploy.pytorch.tools.layout_convert import (batch_tensor,
                                                   continuous_tensor)


class TestContinuous:

    @pytest.fixture
    def batched_tensor(self):
        yield torch.tensor([[1, 2, 3, 0, 0], [4, 5, 6, 7, 8],
                            [9, 10, 0, 0, 0]])

    @pytest.fixture
    def seq_len(self):
        yield torch.tensor([3, 5, 2])

    @pytest.fixture
    def conti_tensor(self):
        yield torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])

    def test_conti_tensor(self, batched_tensor, seq_len, conti_tensor):
        conti_out = continuous_tensor(batched_tensor, seq_len)
        torch.testing.assert_close(conti_out, conti_tensor)

        batched_out = batch_tensor(conti_tensor, seq_len)
        torch.testing.assert_close(batched_out, batched_tensor)

tests/pytorch/tools/test_make_inputs.py
0 → 100644
View file @
fe851fbc
import pytest
import torch

from lmdeploy.pytorch.tools.make_inputs import (make_model_inputs,
                                                make_step_context)


class TestMakeInputs:

    @pytest.fixture
    def seq_length(self):
        yield torch.tensor([2, 4, 3])

    @pytest.fixture
    def history_length(self):
        yield [10, 12, 6]

    @pytest.fixture
    def input_ids(self, seq_length):
        batch_size = len(seq_length)
        max_seq_len = max(seq_length)
        yield torch.randint(0, 100, (batch_size, max_seq_len))

    @pytest.fixture
    def block_size(self):
        yield 4

    @pytest.fixture
    def num_key_value_heads(self):
        yield 1

    @pytest.fixture
    def head_size(self):
        yield 4

    @pytest.fixture
    def kv_cache_dtype(self):
        yield torch.float16

    @pytest.fixture
    def past_key_values(self, history_length, num_key_value_heads, head_size):
        max_len = max(history_length)
        batch_size = len(history_length)
        k_cache = torch.rand(batch_size, num_key_value_heads, max_len,
                             head_size)
        v_cache = k_cache + 1
        yield [(k_cache, v_cache)]

    def test_make_inputs(self, input_ids, seq_length, history_length):
        model_inputs = make_model_inputs(input_ids,
                                         seq_length=seq_length,
                                         block_offsets=None,
                                         history_length=history_length)
        position_ids = torch.tensor([
            [10, 11, 11, 11],
            [12, 13, 14, 15],
            [6, 7, 8, 8],
        ])
        q_start_loc = torch.tensor([0, 2, 6])
        torch.testing.assert_close(model_inputs.position_ids, position_ids)
        torch.testing.assert_close(model_inputs.q_start_loc, q_start_loc)

    def test_make_step_context(self, input_ids, seq_length, history_length,
                               past_key_values, block_size,
                               num_key_value_heads, head_size,
                               kv_cache_dtype):
        step_ctx = make_step_context(input_ids,
                                     seq_length=seq_length,
                                     history_length=history_length,
                                     past_key_values=past_key_values,
                                     world_size=1,
                                     device='cuda',
                                     block_size=block_size,
                                     num_key_value_heads=num_key_value_heads,
                                     head_size=head_size,
                                     kv_cache_dtype=kv_cache_dtype)
        block_offsets = step_ctx.block_offsets
        assert block_offsets[0][3] == 0
        assert block_offsets[1][3] != 0
        assert block_offsets[2][3] == 0
        kv_caches = step_ctx.kv_caches
        assert len(kv_caches) == len(past_key_values)
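The expected position_ids and q_start_loc in test_make_inputs follow directly from the fixtures: the query tokens of sequence i continue counting from history_length[i] (with the padded tail repeating the last position), and the start locations are the exclusive prefix sum of seq_length. A minimal derivation, independent of lmdeploy's implementation:

import torch

seq_length = torch.tensor([2, 4, 3])
history_length = [10, 12, 6]

max_len = int(seq_length.max())
# positions h, h+1, ..., h+n-1, then the last position repeated as padding
position_ids = torch.stack([
    torch.arange(max_len).clamp_max(int(n) - 1) + h
    for n, h in zip(seq_length, history_length)
])
# exclusive prefix sum of the query lengths
q_start_loc = torch.cumsum(seq_length, 0) - seq_length

assert position_ids.tolist() == [[10, 11, 11, 11], [12, 13, 14, 15],
                                 [6, 7, 8, 8]]
assert q_start_loc.tolist() == [0, 2, 6]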
tests/test_lmdeploy/test_async_engine.py
0 → 100644
View file @
fe851fbc
import pytest

from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.async_engine import deduce_a_name


@pytest.mark.parametrize(
    'backend_config',
    [TurbomindEngineConfig('internlm'),
     PytorchEngineConfig(None), None])
@pytest.mark.parametrize(
    'chat_template_config',
    [ChatTemplateConfig('internlm'),
     ChatTemplateConfig(None), None])
@pytest.mark.parametrize('model_name', ['internlm', None])
@pytest.mark.parametrize('model_path', ['/path/to/internlm-chat-7b'])
def test_deduce_a_name(model_path, model_name, chat_template_config,
                       backend_config):
    name = deduce_a_name(model_path, model_name, chat_template_config,
                         backend_config)
    assert name == 'internlm'
tests/test_lmdeploy/test_auto_backend.py
0 → 100644
View file @
fe851fbc
import os
import tempfile

import numpy as np
import pytest


class TestAutoBackend:

    @pytest.fixture
    def turbomind_workspace(self):
        workspace = tempfile.TemporaryDirectory(
            'internlm-chat-7b-turbomind').name
        os.makedirs(os.path.join(workspace, 'triton_models'), exist_ok=True)
        return workspace

    @pytest.fixture
    def models(self):
        # example models to test
        # format: (model_path, is_pytorch_supported, is_turbomind_supported)
        models = [
            ('baichuan-inc/Baichuan-7B', False, True),
            ('baichuan-inc/Baichuan2-7B-Chat', True, True),
            ('baichuan-inc/Baichuan-13B-Chat', False, False),
            ('baichuan-inc/Baichuan2-13B-Chat', True, False),
            ('internlm/internlm-chat-7b', True, True),
            ('internlm/internlm2-chat-7b', True, True),
            ('internlm/internlm-xcomposer2-7b', False, False),
            ('internlm/internlm-xcomposer-7b', False, True),
            ('THUDM/chatglm2-6b', True, False),
            ('THUDM/chatglm3-6b', True, False),
            ('deepseek-ai/deepseek-moe-16b-chat', True, False),
            ('tiiuae/falcon-7b-instruct', True, False),
            ('01-ai/Yi-34B-Chat', True, True),
            ('codellama/CodeLlama-7b-Instruct-hf', True, True),
            ('mistralai/Mistral-7B-Instruct-v0.1', True, False),
            ('mistralai/Mixtral-8x7B-Instruct-v0.1', True, False),
            ('Qwen/Qwen-7B-Chat', False, True),
            ('Qwen/Qwen-VL-Chat', False, True),
            ('Qwen/Qwen1.5-4B-Chat', True, False),
        ]
        return models

    def test_pytorch_is_supported(self, turbomind_workspace, models):
        from lmdeploy.pytorch.supported_models import is_supported
        assert is_supported(turbomind_workspace) is False
        for m, flag, _ in models:
            assert is_supported(m) is flag

    def test_turbomind_is_supported(self, turbomind_workspace, models):
        from lmdeploy.turbomind.supported_models import is_supported
        assert is_supported(turbomind_workspace) is True
        for m, _, flag in models:
            assert is_supported(m) is flag

    def test_autoget_backend(self, turbomind_workspace, models):
        from lmdeploy.archs import autoget_backend
        assert autoget_backend(turbomind_workspace) == 'turbomind'
        n = len(models)
        choices = np.random.choice(n, n // 2, replace=False)
        for i in choices:
            model, is_support_pytorch, is_support_turbomind = models[i]
            target = 'turbomind' if is_support_turbomind else 'pytorch'
            backend = autoget_backend(model)
            assert backend == target

    def test_autoget_backend_config(self, turbomind_workspace):
        from lmdeploy.archs import autoget_backend_config
        from lmdeploy.messages import (PytorchEngineConfig,
                                       TurbomindEngineConfig)
        assert type(autoget_backend_config(
            turbomind_workspace)) is TurbomindEngineConfig
        assert type(autoget_backend_config(
            'internlm/internlm-chat-7b')) is TurbomindEngineConfig
        assert type(autoget_backend_config(
            'mistralai/Mistral-7B-Instruct-v0.1')) is PytorchEngineConfig
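Read together, the fixture table and the two is_supported tests describe the selection rule that test_autoget_backend verifies: turbomind is preferred whenever it supports the model, and the pytorch engine is chosen otherwise. A small usage sketch, with model ids taken from the fixture table (running it requires access to the model configs):

from lmdeploy.archs import autoget_backend

# Per the fixture table: Mixtral is pytorch-only, Qwen-VL is turbomind-only.
assert autoget_backend('mistralai/Mixtral-8x7B-Instruct-v0.1') == 'pytorch'
assert autoget_backend('Qwen/Qwen-VL-Chat') == 'turbomind'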
tests/test_lmdeploy/test_get_model.py
0 → 100644
View file @
fe851fbc
import os

import pytest

from lmdeploy.turbomind.utils import get_model_from_config


@pytest.mark.parametrize('item',
                         [('baichuan-inc/Baichuan-7B', 'baichuan'),
                          ('baichuan-inc/Baichuan2-7B-Base', 'baichuan2'),
                          ('internlm/internlm2-7b', 'internlm2'),
                          ('internlm/internlm2-chat-7b', 'internlm2'),
                          ('internlm/internlm2-math-20b', 'internlm2'),
                          ('internlm/internlm-20b', 'llama'),
                          ('NousResearch/Llama-2-7b-chat-hf', 'llama'),
                          ('Qwen/Qwen-7B-Chat', 'qwen'),
                          ('Qwen/Qwen-14B', 'qwen'),
                          ('NousResearch/Nous-Hermes-2-SOLAR-10.7B', 'llama'),
                          ('01-ai/Yi-34B-Chat', 'llama')])
def test_get_model_from_config(item):
    from transformers.utils import cached_file
    model_id, result = item
    local_file = cached_file(model_id, 'config.json')
    local_dir = os.path.dirname(local_file)
    print(get_model_from_config(local_dir))
    assert get_model_from_config(local_dir) == result
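get_model_from_config resolves a local model directory to one of the turbomind model-family names parametrized above. The sketch below shows one plausible shape for such a lookup, reading the Hugging Face config.json and mapping its architectures entry; the two-entry table and the fallback are hypothetical illustrations, not lmdeploy's actual implementation.

import json
import os

# Hypothetical architecture-to-family table; only two entries, for illustration.
_ARCH_TO_MODEL = {
    'LlamaForCausalLM': 'llama',
    'InternLM2ForCausalLM': 'internlm2',
}


def get_model_from_config_sketch(model_dir: str) -> str:
    # read the Hugging Face config and map its first `architectures` entry
    with open(os.path.join(model_dir, 'config.json')) as f:
        cfg = json.load(f)
    arch = cfg.get('architectures', ['LlamaForCausalLM'])[0]
    return _ARCH_TO_MODEL.get(arch, 'llama')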
tests/test_lmdeploy/test_messages.py
0 → 100644
View file @
fe851fbc
from typing import List

from lmdeploy import EngineGenerationConfig, GenerationConfig, Tokenizer


def test_engine_generation_config():
    tokenizer = Tokenizer('internlm/internlm-chat-7b')

    config = GenerationConfig(n=3, stop_words=['<eoa>'])
    _config = EngineGenerationConfig.From(config, tokenizer)
    assert _config.n == config.n == 3 and \
        _config.max_new_tokens == config.max_new_tokens and \
        _config.temperature == config.temperature
    assert isinstance(_config.stop_words, List) and \
        isinstance(_config.stop_words[0], int)
tests/test_lmdeploy/test_vl_template.py
0 → 100644
View file @
fe851fbc
import PIL.Image

from lmdeploy.model import MODELS
from lmdeploy.vl.constants import IMAGE_TOKEN
from lmdeploy.vl.templates import VLChatTemplateWrapper


def test_prompt_to_messages():
    model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
    out = template.prompt_to_messages('hi')
    assert isinstance(out, list) and isinstance(out[0], dict)
    im = PIL.Image.new(mode='RGB', size=(200, 200))
    out = template.prompt_to_messages(('hi', [im]))
    assert isinstance(out, list) and isinstance(out[0], dict)


def test_messages2prompt():
    model = MODELS.get('vicuna')()
    template = VLChatTemplateWrapper(model)
    messages = [{
        'role':
        'user',
        'content': [{
            'type': 'text',
            'text': 'hi'
        }, {
            'type': 'image_url',
            'image_url': {
                'url': 'xxx'
            }
        }]
    }]
    prompt = template.messages2prompt(messages)
    assert isinstance(prompt, str)
    assert prompt.count(IMAGE_TOKEN) == 1