Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
79cb018e
Unverified
Commit
79cb018e
authored
Feb 01, 2024
by
Liangsheng Yin
Committed by
GitHub
Feb 01, 2024
Browse files
Add city doc benchmark mode (#129)
parent
c7af9f73
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
267 additions
and
21 deletions
+267
-21
benchmark/json_fast_forward/README.md
benchmark/json_fast_forward/README.md
+39
-11
benchmark/json_fast_forward/bench_other.py
benchmark/json_fast_forward/bench_other.py
+114
-5
benchmark/json_fast_forward/bench_sglang.py
benchmark/json_fast_forward/bench_sglang.py
+56
-5
benchmark/json_fast_forward/build_dataset.py
benchmark/json_fast_forward/build_dataset.py
+58
-0
No files found.
benchmark/json_fast_forward/README.md
View file @
79cb018e
...
@@ -3,44 +3,72 @@
...
@@ -3,44 +3,72 @@
### Dependencies
### Dependencies
```
```
llama_cpp_python 0.2.32
llama_cpp_python 0.2.38
guidance 0.1.10
guidance 0.1.10
vllm 0.2.7
vllm 0.2.7
outlines 0.0.24
outlines 0.0.25
```
### Build dataset
When benchmarking long document information retrieval, run the following command to build the dataset:
```
bash
pip
install
wikipedia
python3 build_dataset.py
```
```
### Benchmark sglang
### Benchmark sglang
Run Llama-7B
Run Llama-7B
```
```
bash
python3
-m
sglang.launch_server
--model-path
meta-llama/Llama-2-7b-chat-hf
--port
30000
python3
-m
sglang.launch_server
--model-path
meta-llama/Llama-2-7b-chat-hf
--port
30000
```
```
Benchmark
Benchmark
Character Generation
```
bash
python3 bench_sglang.py
--mode
character
```
```
python3 bench_sglang.py
Benchmark City Information Retrieval
```
bash
python3 bench_sglang.py
--mode
city
```
```
### Benchmark vllm
### Benchmark vllm
Run Llama-7B
Run Llama-7B
```
```
bash
python3
-m
outlines.serve.serve
--tokenizer-mode
auto
--model
meta-llama/Llama-2-7b-chat-hf
--disable-log-requests
--port
21000
python3
-m
outlines.serve.serve
--tokenizer-mode
auto
--model
meta-llama/Llama-2-7b-chat-hf
--disable-log-requests
--port
21000
```
```
Benchmark
Benchmark
Character Generation
```
bash
python3 bench_other.py
--mode
character
--backend
vllm
```
```
python3 bench_other.py --backend vllm
Benchmark City Information Retrieval
```
bash
python3 bench_other.py
--mode
city
--backend
vllm
```
```
### Benchmark guidance
(seems not supported)
### Benchmark guidance
Run Llama-7B and benchmark
Run Llama-7B and benchmark
character generation
```
bash
python3 bench_other.py
--mode
character
--backend
guidance
--parallel
1
```
```
python3 bench_other.py --backend guidance --parallel 1
Run Llama-7B and benchmark city information retrieval
```
bash
python3 bench_other.py
--mode
city
--backend
guidance
--parallel
1
```
```
benchmark/json_fast_forward/bench_other.py
View file @
79cb018e
...
@@ -9,7 +9,7 @@ from sglang.test.test_utils import (
...
@@ -9,7 +9,7 @@ from sglang.test.test_utils import (
add_common_other_args_and_parse
,
add_common_other_args_and_parse
,
call_generate_outlines
,
call_generate_outlines
,
)
)
from
sglang.utils
import
dump_state_text
from
sglang.utils
import
dump_state_text
,
read_jsonl
from
tqdm
import
tqdm
from
tqdm
import
tqdm
# there are some FSM bugs with json regex converted from pydantic model
# there are some FSM bugs with json regex converted from pydantic model
...
@@ -32,6 +32,16 @@ character_regex = (
...
@@ -32,6 +32,16 @@ character_regex = (
+
r
"""\}"""
+
r
"""\}"""
)
)
# Regex that constrains the model's output to the city JSON schema:
# name, country, latitude, population, and three landmarks.
# NOTE(review): each field line's leading whitespace is taken as a single
# space from the extracted source — confirm against the original file.
city_regex = (
    r"""\{\n"""
    r""" "name": "[\w\d\s]{1,16}",\n"""
    r""" "country": "[\w\d\s]{1,16}",\n"""
    r""" "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
    r""" "population": [-+]?[0-9]{1,9},\n"""
    r""" "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
    r"""\}"""
)
# fmt: off
# fmt: off
def
character_gen
(
name
,
generate
):
def
character_gen
(
name
,
generate
):
s
=
name
+
" is a character in Harry Potter. Please fill in the following information about this character.
\n
"
s
=
name
+
" is a character in Harry Potter. Please fill in the following information about this character.
\n
"
...
@@ -39,6 +49,15 @@ def character_gen(name, generate):
...
@@ -39,6 +49,15 @@ def character_gen(name, generate):
return
s
return
s
# fmt: on
# fmt: on
# fmt: off
def city_gen(document, generate):
    """Build the city-extraction prompt for one wikipedia page and append the
    regex-constrained completion produced by `generate`.

    Returns the full prompt text with the generated JSON appended.
    """
    prompt = "Please extract the information of a city from the following wikipedia page.\n"
    prompt += "Page begin.\n" + document + "Page end.\n"
    prompt += "Here is the name, country, and symbol of the city in JSON format.\n"
    # NOTE(review): the prompt says "symbol" while city_regex requests
    # latitude/population/landmarks — wording looks stale; confirm upstream.
    return prompt + generate(prompt, max_tokens=256, regex=city_regex)
# fmt: on
@
guidance
@
guidance
def
character_maker
(
lm
,
name
):
def
character_maker
(
lm
,
name
):
...
@@ -65,7 +84,31 @@ def character_maker(lm, name):
...
@@ -65,7 +84,31 @@ def character_maker(lm, name):
return
lm
return
lm
def
main
(
args
):
@guidance
def city_maker(lm, document):
    """guidance program: extract city info from a wikipedia page.

    Fills named slots (name, country, latitude, population, landmark1-3) with
    regex-constrained generations inside a fixed JSON template.
    """
    # Field-level regexes: unquoted free text, and a float with a decimal point.
    regex_str_no_quote = r"[\w\d\s]+"
    regex_float = r"[0-9]+\.[0-9]+"
    # NOTE(review): whitespace inside this f-string is part of the prompt; the
    # extraction collapsed indentation, so field-line leading spaces below are
    # a best-effort reconstruction — confirm against the original file.
    lm += f"""\
Please extract the information of a city from the following wikipedia page.
Page begin.
{document}
Page end.
Here is the name, country, and symbol of the city in JSON format.
{{
 "name": "{guidance.gen("name", max_tokens=16, regex=regex_str_no_quote)}",
 "country": "{guidance.gen("country", max_tokens=16, regex=regex_str_no_quote)}",
 "latitude": {guidance.gen("latitude", max_tokens=10, regex=regex_float)},
 "population": {guidance.gen("population", max_tokens=10, regex=r"[0-9]+")},
 "top 3 landmarks": ["{guidance.gen("landmark1", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark2", max_tokens=16, regex=regex_str_no_quote)}", "{guidance.gen("landmark3", max_tokens=16, regex=regex_str_no_quote)}"]
}}
"""
    return lm
def
bench_character
(
args
):
arguments
=
[]
arguments
=
[]
with
open
(
args
.
data_path
,
"r"
)
as
f
:
with
open
(
args
.
data_path
,
"r"
)
as
f
:
for
line
in
f
:
for
line
in
f
:
...
@@ -85,7 +128,7 @@ def main(args):
...
@@ -85,7 +128,7 @@ def main(args):
get_one_answer
=
func
get_one_answer
=
func
elif
args
.
backend
==
"guidance"
:
elif
args
.
backend
==
"guidance"
:
model
=
guidance
.
models
.
LlamaCpp
(
model
=
guidance
.
models
.
LlamaCpp
(
"/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf"
,
args
.
llama_cpp_model_path
,
n_gpu_layers
=-
1
,
n_gpu_layers
=-
1
,
n_ctx
=
4096
,
n_ctx
=
4096
,
)
)
...
@@ -110,11 +153,69 @@ def main(args):
...
@@ -110,11 +153,69 @@ def main(args):
latency
=
time
.
time
()
-
tic
latency
=
time
.
time
()
-
tic
return
states
,
latency
def bench_city_doc(args):
    """Benchmark city-info extraction over long documents (non-sglang backends).

    Reads documents from args.data_path (jsonl), runs the first args.num_jsons
    of them through the selected backend, and returns (states, latency) where
    states[i] holds the completed output for document i and latency is the
    wall-clock time for the whole batch in seconds.
    """
    arguments = []
    for line in read_jsonl(args.data_path):
        arguments.append({"document": line["document"]})
    arguments = arguments[: args.num_jsons]

    # Filled in place by the per-request closures below.
    states = [None] * len(arguments)

    # Select backend
    if args.backend == "vllm":
        # HTTP endpoint served by outlines.serve; temperature 0 for determinism.
        url = f"{args.host}:{args.port}/generate"
        generate = partial(call_generate_outlines, url=url, temperature=0)

        def func(i):
            states[i] = city_gen(**arguments[i], generate=generate)

        get_one_answer = func
    elif args.backend == "guidance":
        # Local llama.cpp weights; n_gpu_layers=-1 offloads all layers to GPU.
        model = guidance.models.LlamaCpp(
            args.llama_cpp_model_path,
            n_gpu_layers=-1,
            n_ctx=4096,
        )

        def func(i):
            lm = model + city_maker(**arguments[i])
            states[i] = lm

        get_one_answer = func
    else:
        raise ValueError(f"Invalid backend: {args.backend}")

    tic = time.time()
    if args.parallel == 1:
        # Sequential path with a progress bar.
        for i in tqdm(range(len(arguments))):
            get_one_answer(i)
    else:
        # Fan out over a thread pool; drain the map iterator so any worker
        # exception is re-raised here rather than silently dropped.
        with ThreadPoolExecutor(args.parallel) as executor:
            rets = executor.map(get_one_answer, list(range(len(arguments))))
            for _ in rets:
                pass
    latency = time.time() - tic

    return states, latency
def
main
(
args
):
if
args
.
mode
==
"character"
:
args
.
data_path
=
"dataset.txt"
states
,
latency
=
bench_character
(
args
)
elif
args
.
mode
==
"city"
:
args
.
data_path
=
"questions.jsonl"
states
,
latency
=
bench_city_doc
(
args
)
# Compute accuracy
# Compute accuracy
print
(
f
"Latency:
{
latency
:.
3
f
}
"
)
print
(
f
"Latency:
{
latency
:.
3
f
}
"
)
# Write results
# Write results
dump_state_text
(
f
"tmp_output_
{
args
.
backend
}
.txt"
,
states
)
dump_state_text
(
f
"tmp_output_
{
args
.
backend
}
_
{
args
.
mode
}
.txt"
,
states
)
with
open
(
args
.
result_file
,
"a"
)
as
fout
:
with
open
(
args
.
result_file
,
"a"
)
as
fout
:
value
=
{
value
=
{
...
@@ -129,7 +230,15 @@ def main(args):
...
@@ -129,7 +230,15 @@ def main(args):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE(review): --data-path has no default, and main() appears to
    # overwrite args.data_path based on --mode ("dataset.txt" /
    # "questions.jsonl"), so a user-supplied value may be ignored — confirm.
    parser.add_argument("--data-path", type=str)
    parser.add_argument("--num-jsons", type=int, default=50)
    # Benchmark mode: short character generation vs. long-document city
    # information retrieval.
    parser.add_argument("--mode", type=str, default="character", choices=["character", "city"])
    # Local GGUF weights used by the guidance/llama.cpp backend.
    parser.add_argument(
        "--llama-cpp-model-path",
        type=str,
        default="/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
    )
    args = add_common_other_args_and_parse(parser)
    main(args)
benchmark/json_fast_forward/bench_sglang.py
View file @
79cb018e
...
@@ -7,7 +7,7 @@ from sglang.test.test_utils import (
...
@@ -7,7 +7,7 @@ from sglang.test.test_utils import (
add_common_sglang_args_and_parse
,
add_common_sglang_args_and_parse
,
select_sglang_backend
,
select_sglang_backend
,
)
)
from
sglang.utils
import
dump_state_text
from
sglang.utils
import
dump_state_text
,
read_jsonl
# there are some FSM bugs with json regex converted from pydantic model
# there are some FSM bugs with json regex converted from pydantic model
# here use a string regex instead
# here use a string regex instead
...
@@ -29,6 +29,16 @@ character_regex = (
...
@@ -29,6 +29,16 @@ character_regex = (
+
r
"""\}"""
+
r
"""\}"""
)
)
# Output schema for the city-extraction task, expressed as a regex so the
# backend can constrain decoding: name, country, latitude, population, and
# exactly three landmarks.
# NOTE(review): leading whitespace on field lines is taken as one space from
# the extracted source — confirm against the original file.
city_regex = (
    r"""\{\n"""
    r""" "name": "[\w\d\s]{1,16}",\n"""
    r""" "country": "[\w\d\s]{1,16}",\n"""
    r""" "latitude": [-+]?[0-9]*\.?[0-9]{0,2},\n"""
    r""" "population": [-+]?[0-9]{1,9},\n"""
    r""" "top 3 landmarks": \["[\w\d\s]{1,16}", "[\w\d\s]{1,16}", "[\w\d\s]{1,16}"\]\n"""
    r"""\}"""
)
# fmt: off
# fmt: off
@
sgl
.
function
@
sgl
.
function
def
character_gen
(
s
,
name
):
def
character_gen
(
s
,
name
):
...
@@ -36,6 +46,38 @@ def character_gen(s, name):
...
@@ -36,6 +46,38 @@ def character_gen(s, name):
s
+=
sgl
.
gen
(
"json_output"
,
max_tokens
=
256
,
regex
=
character_regex
)
s
+=
sgl
.
gen
(
"json_output"
,
max_tokens
=
256
,
regex
=
character_regex
)
# fmt: on
# fmt: on
# fmt: off
@sgl.function
def city_gen(s, document):
    """sglang program: prompt with a wikipedia page, then generate the city
    JSON constrained by `city_regex` into the "json_output" slot."""
    s += "Please extract the information of a city from the following wikipedia page.\n"
    s += "Page begin.\n" + document + "Page end.\n"
    s += "Here is the name, country, and symbol of the city in JSON format.\n"
    # NOTE(review): the prompt says "symbol" while city_regex requests
    # latitude/population/landmarks — wording looks stale; confirm upstream.
    s += sgl.gen("json_output", max_tokens=256, regex=city_regex)
# fmt: on
def bench_city_doc(args):
    """Benchmark long-document city-info extraction through sglang.

    Returns (states, latency): the batch of finished program states and the
    wall-clock seconds spent running them.
    """
    # Load the documents and keep only the first `num_jsons` of them.
    arguments = [
        {"document": record["document"]} for record in read_jsonl(args.data_path)
    ][: args.num_jsons]

    # Select backend
    sgl.set_default_backend(select_sglang_backend(args))

    # Run requests
    tic = time.time()
    states = city_gen.run_batch(
        arguments,
        temperature=0,
        num_threads=args.parallel,
        # Progress bar only makes sense for the sequential case.
        progress_bar=(args.parallel == 1),
    )
    latency = time.time() - tic

    return states, latency
def
bench_character
(
args
):
def
bench_character
(
args
):
arguments
=
[]
arguments
=
[]
...
@@ -62,14 +104,19 @@ def bench_character(args):
...
@@ -62,14 +104,19 @@ def bench_character(args):
def
main
(
args
):
def
main
(
args
):
states
,
latency
=
bench_character
(
args
)
if
args
.
mode
==
"character"
:
args
.
data_path
=
"dataset.txt"
states
,
latency
=
bench_character
(
args
)
elif
args
.
mode
==
"city"
:
args
.
data_path
=
"questions.jsonl"
states
,
latency
=
bench_city_doc
(
args
)
# Compute accuracy
# Compute accuracy
print
(
f
"Latency:
{
latency
:.
3
f
}
"
)
print
(
f
"Latency:
{
latency
:.
3
f
}
"
)
# Write results
# Write results
dump_state_text
(
f
"tmp_output_
{
args
.
backend
}
.txt"
,
states
)
dump_state_text
(
f
"tmp_output_
{
args
.
backend
}
_
{
args
.
mode
}
.txt"
,
states
)
with
open
(
f
"
{
args
.
backend
}
.json"
,
"w"
)
as
fout
:
with
open
(
f
"
{
args
.
backend
}
_
{
args
.
mode
}
.json"
,
"w"
)
as
fout
:
for
state
in
states
:
for
state
in
states
:
fout
.
write
(
state
[
"json_output"
]
+
"
\n
"
)
fout
.
write
(
state
[
"json_output"
]
+
"
\n
"
)
...
@@ -79,6 +126,7 @@ def main(args):
...
@@ -79,6 +126,7 @@ def main(args):
"backend"
:
args
.
backend
,
"backend"
:
args
.
backend
,
"latency"
:
round
(
latency
,
3
),
"latency"
:
round
(
latency
,
3
),
"num_jsons"
:
args
.
num_jsons
,
"num_jsons"
:
args
.
num_jsons
,
"mode"
:
args
.
mode
,
"parallel"
:
args
.
parallel
,
"parallel"
:
args
.
parallel
,
}
}
fout
.
write
(
json
.
dumps
(
value
)
+
"
\n
"
)
fout
.
write
(
json
.
dumps
(
value
)
+
"
\n
"
)
...
@@ -86,7 +134,10 @@ def main(args):
...
@@ -86,7 +134,10 @@ def main(args):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE(review): --data-path has no default, and main() appears to
    # overwrite args.data_path based on --mode ("dataset.txt" /
    # "questions.jsonl"), so a user-supplied value may be ignored — confirm.
    parser.add_argument("--data-path", type=str)
    parser.add_argument("--num-jsons", type=int, default=50)
    # Benchmark mode: short character generation vs. long-document city
    # information retrieval.
    parser.add_argument("--mode", type=str, default="character", choices=["character", "city"])
    args = add_common_sglang_args_and_parse(parser)
    main(args)
benchmark/json_fast_forward/build_dataset.py
0 → 100644
View file @
79cb018e
import
json
import
transformers
import
wikipedia
# HF model whose tokenizer is used to measure document length in tokens.
model_path = "meta-llama/Llama-2-7b-chat-hf"
# Loaded at import time; downloads the tokenizer from the Hub on first use.
t = transformers.AutoTokenizer.from_pretrained(model_path)
# Cities whose Wikipedia pages form the long-document retrieval dataset.
city_names = [
    "los angeles",  # fixed typo: was "los angles"
    "london",
    "tokyo",
    "beijing",
    "singapore",
    "paris",
    "dubai",
    "sydney",
    "moscow",
    "rome",
    "toronto",
    "rio de janeiro",
    "istanbul",
    "berlin",
    "auckland",
    "buenos aires",
    "mexico city",
    "mumbai",
    "seoul",
    "bangkok",
    "cairo",
    "athens",
    "jerusalem",
]
def get_content(city_name):
    """Fetch a city's Wikipedia page and truncate it to roughly 3000 tokens.

    The cutoff is estimated proportionally in characters from the full page's
    token count, then re-measured for the log line. Returns the truncated text.
    """
    expected_tokens = 3000

    page_text = str(wikipedia.page(city_name).content).replace("\n\n", "\n")
    full_tokens = t.encode(page_text)

    # Character cutoff estimated from the token budget (same expression order
    # as before to keep the float arithmetic identical).
    cutoff = int((expected_tokens / len(full_tokens)) * len(page_text))
    truncated = page_text[:cutoff]

    # Count token
    print(
        f"city_name: {city_name}, #tokens: {len(full_tokens)}, "
        f"#truncate tokens: {len(t.encode(truncated))}"
    )

    return truncated
if __name__ == "__main__":
    # Emit one JSON record per city: {"document": <truncated page text>}.
    with open("questions.jsonl", "w") as fout:
        for name in city_names:
            record = {"document": get_content(name)}
            fout.write(json.dumps(record) + "\n")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment