sglang commit 14522e6a (unverified)
Authored May 05, 2024 by Liangsheng Yin; committed by GitHub on May 05, 2024

Organize Benchmark (#381)

parent 183df472

Changes: 36
Showing 20 changed files with 270 additions and 405 deletions (+270 -405)
benchmark/dspy/README.md  +6 -0
benchmark/generative_agents/README.md  +7 -1
benchmark/generative_agents/bench_other.py  +15 -43
benchmark/gsm8k/README.md  +1 -1
benchmark/gsm8k/bench_other.py  +9 -56
benchmark/hellaswag/README.md  +1 -1
benchmark/hellaswag/bench_other.py  +11 -49
benchmark/json_decode_regex/README.md  +3 -3
benchmark/json_decode_regex/bench_other.py  +9 -37
benchmark/json_jump_forward/README.md  +19 -5
benchmark/json_jump_forward/bench_other.py  +78 -36
benchmark/llm_judge/README.md  +7 -1
benchmark/llm_judge/bench_other.py  +62 -43
benchmark/long_json_decode/README.md  +1 -1
benchmark/long_json_decode/bench_other.py  +10 -38
benchmark/mmlu/README.md  +1 -1
benchmark/mmlu/bench_other.py  +6 -68
benchmark/mtbench/README.md  +6 -0
benchmark/mtbench/bench_other.py  +11 -20
benchmark/multi_chain_reasoning/README.md  +7 -1
benchmark/dspy/README.md
...
...
@@ -9,6 +9,12 @@ Turn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa2
cache_turn_on = False
```
or set the environment variable
```
export DSP_CACHEBOOL=false
```
## Benchmark SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
...
...
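If the benchmark is driven from a Python script rather than a shell, the same cache switch can be applied through the environment before dspy is imported. A minimal sketch, assuming only that dspy reads `DSP_CACHEBOOL` at import time as the README implies:

```python
import os

# Mirror `export DSP_CACHEBOOL=false` from the README; set it before dspy
# loads its configuration (assumption).
os.environ["DSP_CACHEBOOL"] = "false"

import dspy  # noqa: E402  (imported after the cache switch on purpose)
```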
benchmark/generative_agents/README.md

...
...
@@ -28,5 +28,11 @@ python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
 ### Benchmark guidance
 ```
-python3 bench_other.py --num-events 1000 --backend guidance --parallel 1
+python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```

+### Benchmark lmql
+```
+python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
+```
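The `--n-ctx` and `--model-path` flags added to the guidance commands replace the GGUF paths that used to be hard-coded in each script. A hedged sketch of how such flags are typically wired into a llama.cpp-backed guidance model (the flag parsing here is illustrative; the real scripts receive these arguments via `add_common_other_args_and_parse`):

```python
import argparse

from guidance import gen, models

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, required=True)  # local GGUF file
parser.add_argument("--n-ctx", type=int, default=4096)        # llama.cpp context window
args = parser.parse_args()

# Load the GGUF model fully on GPU (n_gpu_layers=-1) with the requested context size,
# matching the constructor calls visible in the diffs below.
model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)

# One greedy completion, in the same shape as the benchmarks' call_generate helpers.
out = model + "Q: What is 2 + 2?\nA:" + gen(name="answer", max_tokens=8, temperature=0)
print(out["answer"])
```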
benchmark/generative_agents/bench_other.py

 import argparse
 import json
 import time
-from functools import partial
-from pathlib import Path

 from agent_functions import (
     action_location_object_prompt,
...
...
@@ -13,12 +11,7 @@ from agent_functions import (
 )
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl
...
...
@@ -36,48 +29,27 @@ def main(args):
     states = []

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_generate(prompt, temperature, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="result",
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    stop=stop,
-                )
-            )
-            return out["result"]
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

     def get_one_answer(arg):
         answer = call_generate(**arg, temperature=0)
         states.append(answer)

+    async def get_one_answer_async(arg):
+        answer = await call_generate(**arg, temperature=0)
+        states.append(answer)
+
     tic = time.time()
     # we always sequentially execute agent calls to maintain its dependency
-    for arg in tqdm(arguments):
-        get_one_answer(arg)
+    if args.backend != "lmql":
+        for arg in tqdm(arguments):
+            get_one_answer(arg)
+    else:
+        import asyncio
+
+        loop = asyncio.get_event_loop()
+        for arg in tqdm(arguments):
+            loop.run_until_complete(get_one_answer_async(arg))

     latency = time.time() - tic

     print(f"Latency: {latency:.3f}")
...
...
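Every `bench_other.py` in this commit replaces its hand-rolled backend `if/elif` chain with `get_call_generate(args)` from `sglang.test.test_utils`. The helper itself is not shown on this page; a rough sketch of the dispatcher shape it stands for, using a dummy backend so the snippet stays self-contained:

```python
from types import SimpleNamespace


def get_call_generate(args):
    """Return call_generate(prompt, temperature, max_tokens, stop) for args.backend.

    Illustrative sketch only: the real helper in sglang.test.test_utils covers the
    lightllm / vllm / srt-raw / guidance / lmql backends removed from the scripts.
    """
    if args.backend == "dummy":
        # Stand-in backend that just echoes the tail of the prompt.
        def call_generate(prompt, temperature, max_tokens, stop=None):
            return prompt[-max_tokens:]

        return call_generate
    raise ValueError(f"Invalid backend: {args.backend}")


call_generate = get_call_generate(SimpleNamespace(backend="dummy"))
print(call_generate("benchmark prompt", temperature=0, max_tokens=6))
```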
benchmark/gsm8k/README.md

...
...
@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
 ### Benchmark guidance
 ```
-python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
+python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```
...
...
benchmark/gsm8k/bench_other.py

...
...
@@ -5,17 +5,11 @@ import json
 import re
 import time
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial

 import numpy as np
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl

 INVALID = -9999999
...
...
@@ -63,54 +57,7 @@ def main(args):
     states = [None] * len(labels)

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_generate(prompt, temperature, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="answer",
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    stop=stop,
-                )
-            )
-            return out["answer"]
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
-
-        @lmql.query(model=model)
-        async def program(question):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 257 and STOPS_AT(ANSWER, "Question")
-            return ANSWER
-            '''
-
-        async def call_generate(prompt, temperature, max_tokens, stop):
-            return await program(question=prompt, temperature=0)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

     # Run requests
     if args.backend != "lmql":
...
...
@@ -130,7 +77,13 @@ def main(args):
                 get_one_answer(i)
         else:
             with ThreadPoolExecutor(args.parallel) as executor:
-                executor.map(get_one_answer, list(range(len(questions))))
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )

     else:
         # Use asyncio
         async def batched_call(batch_size):
...
...
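The recurring change in these scripts is to wrap `executor.map` in `list(tqdm(...))` so the thread pool both drives a progress bar and is fully drained before timing stops. A standalone sketch of the pattern with a dummy workload (names here are illustrative, not from the benchmark):

```python
import time
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

results = [None] * 32


def get_one_answer(i):
    # Stand-in for a blocking backend call such as call_generate(...).
    time.sleep(0.05)
    results[i] = i * i


with ThreadPoolExecutor(8) as executor:
    # list(...) forces the lazy executor.map iterator, and tqdm(..., total=...)
    # renders one progress tick per completed item.
    list(
        tqdm(
            executor.map(get_one_answer, range(len(results))),
            total=len(results),
        )
    )

print(results[:5])
```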
benchmark/hellaswag/README.md

...
...
@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
 ### Benchmark guidance
 ```
-CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
+CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```
...
...
benchmark/hellaswag/bench_other.py

...
...
@@ -3,15 +3,11 @@ import asyncio
 import json
 import time
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial

 import numpy as np
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_select_lightllm,
-    call_select_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
 from sglang.utils import read_jsonl
...
...
@@ -47,47 +43,7 @@ def main(args):
     preds = [None] * len(labels)

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_select = partial(call_select_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_select = partial(call_select_vllm, url=url)
-    elif args.backend == "guidance":
-        from guidance import models, select
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_select(context, choices):
-            out = model + context + select(choices, name="answer")
-            return choices.index(out["answer"])
-
-        call_select("Hello,", ["world", "earth"])
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model("meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}")
-
-        @lmql.query(model=model)
-        async def program(ctx, choices):
-            '''lmql
-            """{ctx}[ANSWER]""" where ANSWER in set(choices)
-            return ANSWER
-            '''
-
-        async def call_select(context, choices):
-            answer = await program(ctx=context, choices=choices, temperature=0)
-            return choices.index(answer)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_select = get_call_select(args)

     # Run requests
     if args.backend != "lmql":
...
...
@@ -99,11 +55,17 @@ def main(args):
         tic = time.time()
         if args.parallel == 1:
-            for i in range(len(questions)):
+            for i in tqdm(range(len(questions))):
                 get_one_answer(i)
         else:
             with ThreadPoolExecutor(args.parallel) as executor:
-                executor.map(get_one_answer, list(range(len(questions))))
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )

     else:
         # Use asyncio
         async def batched_call(batch_size):
...
...
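Unlike the generation benchmarks, HellaSwag needs a `call_select(context, choices)` primitive that returns the index of the most likely continuation; the removed guidance branch above shows one concrete shape of it. A self-contained sketch of the same idea against a generic scoring function (the scorer here is a placeholder, not the sglang implementation):

```python
from typing import Callable, List


def make_call_select(score: Callable[[str], float]) -> Callable[[str, List[str]], int]:
    """Build a call_select(context, choices) that picks the highest-scoring continuation.

    `score` is a placeholder for a backend-specific likelihood of a full string;
    the real benchmark delegates this to get_call_select(args) instead.
    """

    def call_select(context: str, choices: List[str]) -> int:
        scores = [score(context + choice) for choice in choices]
        return scores.index(max(scores))

    return call_select


# Toy usage: "score" by string length just to exercise the plumbing.
call_select = make_call_select(score=len)
print(call_select("Hello, ", ["world", "wonderful world"]))  # -> 1
```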
benchmark/json_decode_regex/README.md

...
...
@@ -36,7 +36,7 @@ python3 bench_sglang.py --num-questions 10
 ```

-### Benchmark vllm
+### Benchmark Outlines + vLLM

 Run Llama-7B
...
...
@@ -47,7 +47,7 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
 Benchmark
 ```
-python3 bench_other.py --backend vllm --num-questions 10
+python3 bench_other.py --backend outlines --num-questions 10
 ```
...
...
@@ -56,5 +56,5 @@ python3 bench_other.py --backend vllm --num-questions 10
 Run Llama-7B and benchmark
 ```
-python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 10 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```
benchmark/json_decode_regex/bench_other.py

...
...
@@ -7,10 +7,7 @@ from functools import partial
 from tqdm import tqdm

 from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_outlines,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl

 REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]"
...
...
@@ -50,41 +47,11 @@ def main(args):
     states = [None] * len(arguments)

     # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def generate(prompt, max_tokens, stop=None, regex=None):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="answer",
-                    max_tokens=max_tokens,
-                    temperature=0,
-                    stop=stop,
-                    regex=regex,
-                )
-            )
-            return out["answer"]
-
-        # warmup
-        for _ in range(3):
-            generate("Hello!" * 10, max_tokens=64, stop=None)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

     # Run requests
     def get_one_answer(i):
-        states[i] = json_decode(generate=generate, **arguments[i])
+        states[i] = json_decode(generate=call_generate, **arguments[i])

     tic = time.time()
     if args.parallel == 1:
...
...
@@ -92,7 +59,12 @@ def main(args):
             get_one_answer(i)
     else:
         with ThreadPoolExecutor(args.parallel) as executor:
-            rets = executor.map(get_one_answer, list(range(len(arguments))))
+            rets = list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )
             for _ in rets:
                 pass
...
...
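`REGEX_LIST` composes the library's string regex into a pattern for a JSON-style list of strings. A quick sanity check of the same composition, using a simplified stand-in for `REGEX_STRING` (the real pattern in `sglang.lang.ir` may differ):

```python
import re

# Simplified stand-in for sglang.lang.ir.REGEX_STRING (a double-quoted string).
REGEX_STRING = r'"[^"]*"'

# Same composition as in bench_other.py: one or more strings, comma-separated, in brackets.
REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]"

assert re.fullmatch(REGEX_LIST, '["apple", "banana"]')
assert not re.fullmatch(REGEX_LIST, '[apple, banana]')  # unquoted items do not match
```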
benchmark/json_jump_forward/README.md

...
...
@@ -39,7 +39,7 @@ python3 bench_sglang.py --mode city
 ```

-### Benchmark vllm
+### Benchmark Outlines + vLLM

 Run Llama-7B
...
...
@@ -50,13 +50,13 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
 Benchmark Character Generation
 ```bash
-python3 bench_other.py --mode character --backend vllm
+python3 bench_other.py --mode character --backend outlines
 ```

 Benchmark City Information Retrieval
 ```bash
-python3 bench_other.py --mode city --backend vllm
+python3 bench_other.py --mode city --backend outlines
 ```

 ### Benchmark guidance
...
...
@@ -64,11 +64,25 @@ python3 bench_other.py --mode city --backend vllm
 Run Llama-7B and benchmark character generation
 ```bash
-python3 bench_other.py --mode character --backend guidance --parallel 1
+python3 bench_other.py --mode character --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```

 Run Llama-7B and benchmark city information retrieval
 ```bash
-python3 bench_other.py --mode city --backend guidance --parallel 1
+python3 bench_other.py --mode city --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```

+### Benchmark lmql
+Run Llama-7B and benchmark character generation
+```
+python3 bench_other.py --mode character --backend lmql --parallel 1
+```
+Run Llama-7B and benchmark city information retrieval
+```
+python3 bench_other.py --mode city --backend lmql --parallel 1
+```
benchmark/json_jump_forward/bench_other.py

...
...
@@ -7,10 +7,7 @@ from functools import partial
 import guidance
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_outlines,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl

 # there are some FSM bugs with json regex converted from pydantic model
...
...
@@ -85,6 +82,29 @@ def character_maker(lm, name):
     return lm


+async def call_generate_lmql(
+    prompt, temperature, max_tokens, regex, max_len=4096, model=None, **kwargs
+):
+    assert model is not None
+    import lmql
+
+    @lmql.query(model=model)
+    async def program(question, max_tokens, regex):
+        '''lmql
+        """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and REGEX(ANSWER, regex)
+        return ANSWER
+        '''
+
+    return await program(
+        question=prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_len=max_len,
+        regex=regex,
+        **kwargs,
+    )
+
+
 @guidance
 def city_maker(lm, document):
     regex_str_no_quote = r"[\w\d\s]+"
...
...
@@ -119,38 +139,68 @@ def bench_character(args):
     states = [None] * len(arguments)

     # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)

-        def func(i):
-            states[i] = character_gen(**arguments[i], generate=generate)
+        def get_one_answer(i):
+            states[i] = character_gen(**arguments[i], generate=call_generate)

-        get_one_answer = func
     elif args.backend == "guidance":
         model = guidance.models.LlamaCpp(
-            args.llama_cpp_model_path,
+            args.model_path,
             n_gpu_layers=-1,
-            n_ctx=4096,
+            n_ctx=args.n_ctx,
         )

-        def func(i):
+        def get_one_answer(i):
             lm = model + character_maker(**arguments[i])
             states[i] = lm

-        get_one_answer = func
+    elif args.backend == "lmql":
+        import asyncio
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        call_generate = partial(
+            call_generate_lmql,
+            model=model,
+            max_tokens=256,
+            regex=character_regex,
+        )
+
+        async def get_one_answer_async(i):
+            states[i] = await call_generate(prompt=arguments[i]["name"], temperature=0)
+
     else:
         raise ValueError(f"Invalid backend: {args.backend}")

     tic = time.time()
-    if args.parallel == 1:
-        for i in tqdm(range(len(arguments))):
-            get_one_answer(i)
+    if args.backend != "lmql":
+        if args.parallel == 1:
+            for i in tqdm(range(len(arguments))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                rets = list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(arguments)))),
+                        total=len(arguments),
+                    )
+                )
+                for _ in rets:
+                    pass
     else:
-        with ThreadPoolExecutor(args.parallel) as executor:
-            rets = executor.map(get_one_answer, list(range(len(arguments))))
-            for _ in rets:
-                pass
+        batches = []
+        for i in range(0, len(arguments), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(arguments)))))
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )

     latency = time.time() - tic
...
...
@@ -166,26 +216,23 @@ def bench_city_doc(args):
     states = [None] * len(arguments)

     # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)

-        def func(i):
-            states[i] = city_gen(**arguments[i], generate=generate)
+        def get_one_answer(i):
+            states[i] = city_gen(**arguments[i], generate=call_generate)

-        get_one_answer = func
     elif args.backend == "guidance":
         model = guidance.models.LlamaCpp(
-            args.llama_cpp_model_path,
+            args.model_path,
             n_gpu_layers=-1,
-            n_ctx=4096,
+            n_ctx=args.n_ctx,
         )

-        def func(i):
+        def get_one_answer(i):
             lm = model + city_maker(**arguments[i])
             states[i] = lm

-        get_one_answer = func
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
...
...
@@ -237,10 +284,5 @@ if __name__ == "__main__":
     parser.add_argument(
         "--mode", type=str, default="character", choices=["character", "city"]
     )
-    parser.add_argument(
-        "--llama-cpp-model-path",
-        type=str,
-        default="/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
-    )
     args = add_common_other_args_and_parse(parser)
     main(args)
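For the lmql backend the script cannot use a thread pool, so the new code slices the work into index batches and drives each batch with `asyncio.gather` on one event loop. A minimal sketch of that batching pattern in isolation (the coroutine body is a stand-in for the real `get_one_answer_async`):

```python
import asyncio

from tqdm import tqdm

arguments = list(range(10))   # stand-in workload
states = [None] * len(arguments)
parallel = 4                  # like args.parallel


async def get_one_answer_async(i):
    # Placeholder for an awaitable backend call (e.g. an lmql program).
    await asyncio.sleep(0.01)
    states[i] = arguments[i] * 2


# Slice indices into batches of size `parallel`, then await each batch as a group.
batches = [
    list(range(i, min(i + parallel, len(arguments))))
    for i in range(0, len(arguments), parallel)
]
loop = asyncio.new_event_loop()
for bt in tqdm(batches):
    loop.run_until_complete(asyncio.gather(*[get_one_answer_async(i) for i in bt]))
loop.close()

print(states)
```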
benchmark/llm_judge/README.md

...
...
@@ -23,5 +23,11 @@ python3 bench_other.py --backend vllm --num-questions 25
 ### Benchmark guidance
 ```
-python3 bench_other.py --backend guidance --num-questions 25 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```

+### Benchmark lmql
+```
+python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
+```
\ No newline at end of file
benchmark/llm_judge/bench_other.py

...
...
@@ -6,12 +6,7 @@ from functools import partial
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl

 system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
...
...
@@ -54,53 +49,77 @@ def multi_dimension_judge(article, generate):
     return s


+async def multi_dimension_judge_async(article, generate):
+    s = system_prompt
+    s += "\n```\n" + article + "\n```\n\n"
+
+    judges = []
+    for i in range(len(dimension_prompts)):
+        comp = await generate(
+            s
+            + "USER: Please judge the quality based on the following metric. "
+            + dimension_prompts[i]
+            + " Please provide a single-paragraph judgement. "
+            + "Focus on the provided metric and do not say other things. "
+            'End your judgement paragraph with the word "END"\nJUDGE:',
+            max_tokens=256,
+            stop="END",
+        )
+        judges.append(comp)
+
+    s += "I will judge the quality based on the following metrics.\n"
+    for i in range(len(dimension_prompts)):
+        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
+
+    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
+    s += await generate(s, max_tokens=2, stop=None)
+
+    return s
+
+
 def main(args):
     lines = read_jsonl(args.data_path)[: args.num_questions]
     states = [None] * len(lines)

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_lightllm, url=url, temperature=0)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_vllm, url=url, temperature=0)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_srt_raw, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def generate(prompt, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
-            )
-            return out["answer"]
-
-        # warmup
-        generate("Hello!", max_tokens=8, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

-    # Run requests
-    def get_one_answer(i):
-        states[i] = multi_dimension_judge(lines[i], generate)
+    # Run requests
+    tic = time.time()

-    tic = time.time()
-    if args.parallel == 1:
-        for i in tqdm(range(len(lines))):
-            get_one_answer(i)
-    else:
-        with ThreadPoolExecutor(args.parallel) as executor:
-            executor.map(get_one_answer, list(range(len(lines))))
+    if args.backend != "lmql":
+
+        def get_one_answer(i):
+            states[i] = multi_dimension_judge(lines[i], call_generate)
+
+        if args.parallel == 1:
+            for i in tqdm(range(len(lines))):
+                get_one_answer(i)
+        else:
+            with ThreadPoolExecutor(args.parallel) as executor:
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(lines)))),
+                        total=len(lines),
+                    )
+                )
+    else:
+        import asyncio
+
+        async def get_one_answer_async(i):
+            states[i] = await multi_dimension_judge_async(lines[i], call_generate)
+
+        batches = []
+        for i in range(0, len(lines), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(lines)))))
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )

     latency = time.time() - tic

     # Compute accuracy
...
...
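Both judge variants end by appending "In summary, on a scale of 1 to 10, I would give the article a score of" and asking for two more tokens, so downstream code has to pull the number back out of the transcript. A small, illustrative way to do that (not taken from the repository):

```python
import re
from typing import Optional


def extract_score(state: str) -> Optional[int]:
    """Pull the 1-10 score out of a judge transcript; None if it is missing."""
    m = re.search(r"a score of\s*(\d{1,2})", state)
    if m is None:
        return None
    score = int(m.group(1))
    return score if 1 <= score <= 10 else None


print(extract_score("... I would give the article a score of 7"))  # 7
print(extract_score("... I would give the article a score of"))    # None
```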
benchmark/long_json_decode/README.md

...
...
@@ -22,7 +22,7 @@ python3 bench_other.py --backend vllm --num-questions 5
 ### Benchmark guidance
 ```
-python3 bench_other.py --backend guidance --num-questions 5 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 5 --parallel 1 --n-ctx 11000 --model-path path/to/code-llama/gguf
 ```
...
...
benchmark/long_json_decode/bench_other.py

...
...
@@ -6,12 +6,7 @@ from functools import partial
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
 from sglang.utils import dump_state_text, read_jsonl
...
...
@@ -44,40 +39,11 @@ def main(args):
     states = [None] * len(arguments)

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_lightllm, url=url, temperature=0)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_vllm, url=url, temperature=0)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_srt_raw, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
-            n_gpu_layers=-1,
-            n_ctx=11000,
-        )
-
-        def generate(prompt, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
-            )
-            return out["answer"]
-
-        # warmup
-        generate("Hello!", max_tokens=8, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

     # Run requests
     def get_one_answer(i):
-        states[i] = json_decode(generate=generate, **arguments[i])
+        states[i] = json_decode(generate=call_generate, **arguments[i])

     tic = time.time()
     if args.parallel == 1:
...
...
@@ -85,7 +51,13 @@ def main(args):
             get_one_answer(i)
     else:
         with ThreadPoolExecutor(args.parallel) as executor:
-            executor.map(get_one_answer, list(range(len(arguments))))
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )

     latency = time.time() - tic

     # Compute accuracy
...
...
benchmark/mmlu/README.md

...
...
@@ -46,7 +46,7 @@ python3 bench_other.py --nsub 10 --backend lightllm
 ### Benchmark guidance
 ```
-python3 bench_other.py --nsub 10 --backend guidance --parallel 1
+python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```
...
...
benchmark/mmlu/bench_other.py

...
...
@@ -4,19 +4,13 @@ import json
 import os
 import time
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial

 import numpy as np
 import pandas as pd
 import tiktoken
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate

 choices = ["A", "B", "C", "D"]
...
...
@@ -53,10 +47,7 @@ def gen_prompt(train_df, subject, k=-1):
     return prompt


-model_initialized = None
-
-
-def evaluate(args, subject, dev_df, test_df):
+def evaluate(args, subject, dev_df, test_df, call_generate):
     prompts = []
     labels = []
...
...
@@ -78,62 +69,6 @@ def evaluate(args, subject, dev_df, test_df):
     preds = [None] * len(prompts)
     max_tokens = 1

-    # Select backend
-    global model_initialized
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url, stop=None)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url, stop=None)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url, stop=None)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        if model_initialized is None:
-            model = models.LlamaCpp(
-                "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-                n_gpu_layers=-1,
-                n_ctx=4096,
-            )
-            model_initialized = model
-        else:
-            model = model_initialized
-
-        def call_generate(prompt, temperature, max_tokens):
-            out = (
-                model + prompt + gen(name="answer", max_tokens=max_tokens, temperature=0)
-            )
-            return out["answer"]
-
-        # warmup
-        call_generate("Hello,", temperature=1.0, max_tokens=8)
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model("meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}")
-
-        @lmql.query(model=model)
-        async def program(question):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
-            return ANSWER
-            '''
-
-        async def call_generate(prompt, temperature, max_tokens):
-            return await program(question=prompt, temperature=temperature)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
-
     # Run requests
     if args.backend != "lmql":
         # Use thread pool
...
...
@@ -190,6 +125,9 @@ def main(args):
     all_latencies = []
     num_requests = 0

+    # Select backend
+    call_generate = get_call_generate(args)
+
     for subject in tqdm(subjects[: args.nsub]):
         dev_df = pd.read_csv(
             os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
...
...
@@ -198,7 +136,7 @@ def main(args):
             os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
         )
-        cors, acc, latency = evaluate(args, subject, dev_df, test_df)
+        cors, acc, latency = evaluate(args, subject, dev_df, test_df, call_generate)
         all_cors.append(cors)
         all_latencies.append(latency)
         num_requests += len(test_df)
...
...
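The mmlu change moves backend construction out of `evaluate` (which used a module-level `model_initialized` cache) and into `main`, which builds one callable and passes it to every per-subject evaluation. A small sketch of that shape with a dummy backend (all names here are illustrative):

```python
from typing import Callable, Sequence


def evaluate(subject: str, prompts: Sequence[str], call_generate: Callable[[str], str]) -> int:
    # The evaluation only sees a callable; it no longer knows which backend is behind it.
    return sum(call_generate(p).startswith("A") for p in prompts)


def main() -> None:
    # Build the backend exactly once (the real code uses get_call_generate(args)).
    def call_generate(prompt: str) -> str:
        return "A"  # dummy backend that always answers "A"

    for subject in ["abstract_algebra", "anatomy"]:
        correct = evaluate(subject, [f"{subject} question {i}" for i in range(3)], call_generate)
        print(subject, correct)


main()
```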
benchmark/mtbench/README.md

+## Download Dataset
+```sh
+wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl
+```
+
 ## Run benchmark

 ### Benchmark sglang
...
...
benchmark/mtbench/bench_other.py

...
...
@@ -4,16 +4,11 @@ import os
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial

 from fastchat.model import get_conversation_template
 from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate


 def load_questions(filename):
...
...
@@ -50,17 +45,7 @@ def main(args):
     conv_main = get_conversation_template(model_id)

     # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url, stop=None)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url, stop=None)
-    elif args.backend == "srt":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt, url=url, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

     answers = [None] * len(questions)
...
...
@@ -83,11 +68,17 @@ def main(args):
     # Run requests
     tic = time.time()
     if args.parallel == 1:
-        for i in range(len(questions)):
+        for i in tqdm(range(len(questions))):
             get_answer(i)
     else:
         with ThreadPoolExecutor(args.parallel) as executor:
-            executor.map(get_answer, list(range(len(questions))))
+            list(
+                tqdm(
+                    executor.map(get_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )

     latency = time.time() - tic

     print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
...
...
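The MT-Bench script builds its prompts with FastChat's conversation templates (the `get_conversation_template(model_id)` call kept as context above). A hedged sketch of how such a template is typically turned into a prompt for `call_generate`; the exact flow inside bench_other.py may differ:

```python
from fastchat.model import get_conversation_template

# "llama-2" is an illustrative model id; the benchmark derives its own model_id.
conv = get_conversation_template("llama-2")
conv.append_message(conv.roles[0], "Compose a haiku about GPUs.")
conv.append_message(conv.roles[1], None)  # leave the assistant slot open

prompt = conv.get_prompt()
# prompt would then be handed to call_generate(prompt, ...) for the selected backend.
print(prompt)
```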
benchmark/multi_chain_reasoning/README.md

...
...
@@ -39,5 +39,11 @@ python3 bench_other.py --num-questions 64 --backend lightllm
 ### Benchmark guidance
 ```
-python3 bench_other.py --num-questions 8 --backend guidance --parallel 1
+python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
 ```

+### Benchmark lmql
+```
+python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
+```