Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
694e4192
Unverified
Commit
694e4192
authored
Jan 08, 2025
by
JJJJOHNSON
Committed by
GitHub
Jan 07, 2025
Browse files
[eagle2] fix end check when target model verify (#2723)
parent
b22f3f64
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
60 additions
and
19 deletions
+60
-19
python/sglang/srt/speculative/eagle_utils.py
python/sglang/srt/speculative/eagle_utils.py
+31
-19
test/srt/test_eagle_infer.py
test/srt/test_eagle_infer.py
+29
-0
No files found.
python/sglang/srt/speculative/eagle_utils.py
View file @
694e4192
...
@@ -550,8 +550,37 @@ class EagleVerifyInput(SpecInfo):
...
@@ -550,8 +550,37 @@ class EagleVerifyInput(SpecInfo):
triton
.
next_power_of_2
(
max_draft_len
),
triton
.
next_power_of_2
(
max_draft_len
),
)
)
accept_index
=
accept_index
[
accept_index
!=
-
1
]
draft_input
=
EAGLEDraftInput
()
new_accept_index
=
[]
unfinished_index
=
[]
finished_extend_len
=
{}
# {rid:accept_length + 1}
accept_index_cpu
=
accept_index
.
tolist
()
predict_cpu
=
predict
.
tolist
()
# iterate every accepted token and check if req has finished after append the token
# should be checked BEFORE free kv cache slots
for
i
,
(
req
,
accept_index_row
)
in
enumerate
(
zip
(
batch
.
reqs
,
accept_index_cpu
)):
new_accept_index_
=
[]
for
j
,
idx
in
enumerate
(
accept_index_row
):
if
idx
==
-
1
:
break
id
=
predict_cpu
[
idx
]
# if not found_finished:
req
.
output_ids
.
append
(
id
)
finished_extend_len
[
req
.
rid
]
=
j
+
1
req
.
check_finished
()
if
req
.
finished
():
draft_input
.
has_finished
=
True
# set all tokens after finished token to -1 and break
accept_index
[
i
,
j
+
1
:]
=
-
1
break
else
:
new_accept_index_
.
append
(
idx
)
if
not
req
.
finished
():
new_accept_index
.
extend
(
new_accept_index_
)
unfinished_index
.
append
(
i
)
accept_length
=
(
accept_index
!=
-
1
).
sum
(
dim
=
1
)
-
1
accept_index
=
accept_index
[
accept_index
!=
-
1
]
accept_length_cpu
=
accept_length
.
tolist
()
accept_length_cpu
=
accept_length
.
tolist
()
verified_id
=
predict
[
accept_index
]
verified_id
=
predict
[
accept_index
]
verified_id_cpu
=
verified_id
.
tolist
()
verified_id_cpu
=
verified_id
.
tolist
()
...
@@ -570,26 +599,9 @@ class EagleVerifyInput(SpecInfo):
...
@@ -570,26 +599,9 @@ class EagleVerifyInput(SpecInfo):
triton
.
next_power_of_2
(
bs
),
triton
.
next_power_of_2
(
bs
),
)
)
batch
.
seq_lens
.
add_
(
accept_length
+
1
)
batch
.
seq_lens
.
add_
(
accept_length
+
1
)
new_accept_index
=
[]
unfinished_index
=
[]
finished_extend_len
=
{}
# {rid:accept_length + 1}
# retracted_reqs, new_token_ratio = batch.retract_decode()
low
=
0
draft_input
=
EAGLEDraftInput
()
for
i
,
(
req
,
verified_len
)
in
enumerate
(
zip
(
batch
.
reqs
,
accept_length_cpu
)):
req
.
output_ids
.
extend
(
verified_id_cpu
[
low
:
low
+
verified_len
+
1
])
req
.
check_finished
()
if
req
.
finished
():
draft_input
.
has_finished
=
True
else
:
new_accept_index
.
append
(
accept_index
[
low
:
low
+
verified_len
+
1
])
unfinished_index
.
append
(
i
)
low
+=
verified_len
+
1
finished_extend_len
[
req
.
rid
]
=
verified_len
+
1
if
len
(
new_accept_index
)
>
0
:
if
len
(
new_accept_index
)
>
0
:
new_accept_index
=
torch
.
cat
(
new_accept_index
,
d
im
=
0
)
new_accept_index
=
torch
.
tensor
(
new_accept_index
,
d
evice
=
"cuda"
)
draft_input
.
verified_id
=
predict
[
new_accept_index
]
draft_input
.
verified_id
=
predict
[
new_accept_index
]
draft_input
.
hidden_states
=
batch
.
spec_info
.
hidden_states
[
new_accept_index
]
draft_input
.
hidden_states
=
batch
.
spec_info
.
hidden_states
[
new_accept_index
]
draft_input
.
accept_length
=
accept_length
[
unfinished_index
]
draft_input
.
accept_length
=
accept_length
[
unfinished_index
]
...
...
test/srt/test_eagle_infer.py
View file @
694e4192
import
unittest
import
unittest
from
transformers
import
AutoConfig
,
AutoTokenizer
import
sglang
as
sgl
import
sglang
as
sgl
...
@@ -34,6 +36,33 @@ class TestEAGLEEngine(unittest.TestCase):
...
@@ -34,6 +36,33 @@ class TestEAGLEEngine(unittest.TestCase):
print
(
out2
)
print
(
out2
)
self
.
assertEqual
(
out1
,
out2
)
self
.
assertEqual
(
out1
,
out2
)
def
test_eagle_end_check
(
self
):
prompt
=
"[INST] <<SYS>>
\\
nYou are a helpful assistant.
\\
n<</SYS>>
\\
nToday is a sunny day and I like [/INST]"
target_model_path
=
"meta-llama/Llama-2-7b-chat-hf"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
target_model_path
)
speculative_draft_model_path
=
"lmzheng/sglang-EAGLE-llama2-chat-7B"
sampling_params
=
{
"temperature"
:
0
,
"max_new_tokens"
:
1024
,
"skip_special_tokens"
:
False
,
}
engine
=
sgl
.
Engine
(
model_path
=
target_model_path
,
speculative_draft_model_path
=
speculative_draft_model_path
,
speculative_algorithm
=
"EAGLE"
,
speculative_num_steps
=
3
,
speculative_eagle_topk
=
4
,
speculative_num_draft_tokens
=
16
,
)
out1
=
engine
.
generate
(
prompt
,
sampling_params
)[
"text"
]
engine
.
shutdown
()
print
(
"==== Answer 1 ===="
)
print
(
repr
(
out1
))
tokens
=
tokenizer
.
encode
(
out1
,
truncation
=
False
)
assert
tokenizer
.
eos_token_id
not
in
tokens
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment