Unverified Commit 14522e6a authored by Liangsheng Yin, committed by GitHub

Organize Benchmark (#381)

parent 183df472
@@ -9,6 +9,12 @@ Turn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa2
cache_turn_on = False
```
+or set the environment variable
+```
+export DSP_CACHEBOOL=false
+```

## Benchmark SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
...
@@ -28,5 +28,11 @@ python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
### Benchmark guidance
```
-python3 bench_other.py --num-events 1000 --backend guidance --parallel 1
+python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+```
+python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
```
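The guidance commands above now pass `--n-ctx` and `--model-path`, which presumably become shared CLI options registered by `add_common_other_args_and_parse`. A minimal sketch of that argparse wiring, under the assumption that these flags live next to the existing common options (defaults and help text here are illustrative, not taken from the commit):

```python
# Hypothetical sketch: how --model-path and --n-ctx could be registered as common
# benchmark arguments. The real add_common_other_args_and_parse in sglang may differ.
import argparse


def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
    parser.add_argument("--backend", type=str, default="vllm")
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=21000)
    parser.add_argument("--parallel", type=int, default=64)
    # Options used by the guidance/lmql backends in the commands above (assumed names).
    parser.add_argument("--model-path", type=str, default=None,
                        help="Local gguf / HF model path for guidance or lmql")
    parser.add_argument("--n-ctx", type=int, default=4096,
                        help="llama.cpp context window used by the guidance backend")
    return parser.parse_args()
```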
import argparse
import json
import time
-from functools import partial
-from pathlib import Path

from agent_functions import (
    action_location_object_prompt,
@@ -13,12 +11,7 @@ from agent_functions import (
)
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
@@ -36,48 +29,27 @@ def main(args):
    states = []

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_generate(prompt, temperature, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="result",
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    stop=stop,
-                )
-            )
-            return out["result"]
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

    def get_one_answer(arg):
        answer = call_generate(**arg, temperature=0)
        states.append(answer)

+    async def get_one_answer_async(arg):
+        answer = await call_generate(**arg, temperature=0)
+        states.append(answer)

    tic = time.time()
    # we always sequentially execute agent calls to maintain its dependency
+    if args.backend != "lmql":
        for arg in tqdm(arguments):
            get_one_answer(arg)
+    else:
+        import asyncio
+
+        loop = asyncio.get_event_loop()
+        for arg in tqdm(arguments):
+            loop.run_until_complete(get_one_answer_async(arg))

    latency = time.time() - tic
    print(f"Latency: {latency:.3f}")
...
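Across these benchmark scripts, the commit replaces the per-backend dispatch that used to live in each `main()` with a single call to `get_call_generate(args)` from `sglang.test.test_utils`. The helper's real implementation is not shown in this diff; based on the branches removed above, a rough sketch of what it presumably does for the HTTP backends (the guidance and lmql branches are omitted because their wiring lives elsewhere):

```python
# Hedged sketch, not the actual sglang implementation: a get_call_generate-style
# dispatcher reconstructed from the backend branches this commit removes.
# call_generate_lightllm / call_generate_vllm / call_generate_srt_raw are the
# existing helpers that the old scripts imported from sglang.test.test_utils.
from functools import partial

from sglang.test.test_utils import (
    call_generate_lightllm,
    call_generate_srt_raw,
    call_generate_vllm,
)


def get_call_generate(args):
    # Return a callable(prompt, temperature, max_tokens, stop, ...) bound to the
    # selected backend's generate endpoint.
    if args.backend == "lightllm":
        url = f"{args.host}:{args.port}/generate"
        return partial(call_generate_lightllm, url=url)
    elif args.backend == "vllm":
        url = f"{args.host}:{args.port}/generate"
        return partial(call_generate_vllm, url=url)
    elif args.backend == "srt-raw":
        url = f"{args.host}:{args.port}/generate"
        return partial(call_generate_srt_raw, url=url)
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
```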
@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
### Benchmark guidance
```
-python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
+python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
...
@@ -5,17 +5,11 @@ import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
-from functools import partial

import numpy as np
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl

INVALID = -9999999
@@ -63,54 +57,7 @@ def main(args):
    states = [None] * len(labels)

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_generate(prompt, temperature, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="answer",
-                    max_tokens=max_tokens,
-                    temperature=temperature,
-                    stop=stop,
-                )
-            )
-            return out["answer"]
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
-
-        @lmql.query(model=model)
-        async def program(question):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 257 and STOPS_AT(ANSWER, "Question")
-            return ANSWER
-            '''
-
-        async def call_generate(prompt, temperature, max_tokens, stop):
-            return await program(question=prompt, temperature=0)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

    # Run requests
    if args.backend != "lmql":
@@ -130,7 +77,13 @@ def main(args):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
-                executor.map(get_one_answer, list(range(len(questions))))
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
    else:
        # Use asyncio
        async def batched_call(batch_size):
...
@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
### Benchmark guidance
```
-CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
+CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
...
@@ -3,15 +3,11 @@ import asyncio
import json
import time
from concurrent.futures import ThreadPoolExecutor
-from functools import partial

import numpy as np
+from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_select_lightllm,
-    call_select_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
from sglang.utils import read_jsonl
@@ -47,47 +43,7 @@ def main(args):
    preds = [None] * len(labels)

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_select = partial(call_select_lightllm, url=url)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_select = partial(call_select_vllm, url=url)
-    elif args.backend == "guidance":
-        from guidance import models, select
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def call_select(context, choices):
-            out = model + context + select(choices, name="answer")
-            return choices.index(out["answer"])
-
-        call_select("Hello,", ["world", "earth"])
-
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(
-            "meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
-        )
-
-        @lmql.query(model=model)
-        async def program(ctx, choices):
-            '''lmql
-            """{ctx}[ANSWER]""" where ANSWER in set(choices)
-            return ANSWER
-            '''
-
-        async def call_select(context, choices):
-            answer = await program(ctx=context, choices=choices, temperature=0)
-            return choices.index(answer)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_select = get_call_select(args)

    # Run requests
    if args.backend != "lmql":
@@ -99,11 +55,17 @@ def main(args):
        tic = time.time()
        if args.parallel == 1:
-            for i in range(len(questions)):
+            for i in tqdm(range(len(questions))):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
-                executor.map(get_one_answer, list(range(len(questions))))
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(questions)))),
+                        total=len(questions),
+                    )
+                )
    else:
        # Use asyncio
        async def batched_call(batch_size):
...
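This script switches to `get_call_select(args)`, the selection counterpart of `get_call_generate`. Judging from the branches removed above, it presumably returns a callable that takes a context and a list of choices and returns the index of the chosen option. A hedged sketch under that assumption, covering only the HTTP backends whose helpers the old script imported:

```python
# Hedged sketch, not the actual sglang code: a get_call_select-style dispatcher
# mirroring the removed branches. call_select_lightllm / call_select_vllm are the
# helpers previously imported from sglang.test.test_utils.
from functools import partial

from sglang.test.test_utils import call_select_lightllm, call_select_vllm


def get_call_select(args):
    # Return a callable(context, choices) -> index of the selected choice.
    if args.backend == "lightllm":
        url = f"{args.host}:{args.port}/generate"
        return partial(call_select_lightllm, url=url)
    elif args.backend == "vllm":
        url = f"{args.host}:{args.port}/generate"
        return partial(call_select_vllm, url=url)
    else:
        # guidance / lmql need a local model handle; that wiring is not shown in this diff.
        raise ValueError(f"Invalid backend: {args.backend}")
```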
@@ -36,7 +36,7 @@ python3 bench_sglang.py --num-questions 10
```

-### Benchmark vllm
+### Benchmark Outlines + vLLM

Run Llama-7B
@@ -47,7 +47,7 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
Benchmark
```
-python3 bench_other.py --backend vllm --num-questions 10
+python3 bench_other.py --backend outlines --num-questions 10
```
@@ -56,5 +56,5 @@ python3 bench_other.py --backend vllm --num-questions 10
Run Llama-7B and benchmark
```
-python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 10 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
@@ -7,10 +7,7 @@ from functools import partial
from tqdm import tqdm

from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_outlines,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl

REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]"
@@ -50,41 +47,11 @@ def main(args):
    states = [None] * len(arguments)

    # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def generate(prompt, max_tokens, stop=None, regex=None):
-            out = (
-                model
-                + prompt
-                + gen(
-                    name="answer",
-                    max_tokens=max_tokens,
-                    temperature=0,
-                    stop=stop,
-                    regex=regex,
-                )
-            )
-            return out["answer"]
-
-        # warmup
-        for _ in range(3):
-            generate("Hello!" * 10, max_tokens=64, stop=None)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

    # Run requests
    def get_one_answer(i):
-        states[i] = json_decode(generate=generate, **arguments[i])
+        states[i] = json_decode(generate=call_generate, **arguments[i])

    tic = time.time()
    if args.parallel == 1:
@@ -92,7 +59,12 @@ def main(args):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
-            rets = executor.map(get_one_answer, list(range(len(arguments))))
+            rets = list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )
    for _ in rets:
        pass
...
@@ -39,7 +39,7 @@ python3 bench_sglang.py --mode city
```

-### Benchmark vllm
+### Benchmark Outlines + vLLM

Run Llama-7B
@@ -50,13 +50,13 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
Benchmark Character Generation
```bash
-python3 bench_other.py --mode character --backend vllm
+python3 bench_other.py --mode character --backend outlines
```

Benchmark City Information Retrieval
```bash
-python3 bench_other.py --mode city --backend vllm
+python3 bench_other.py --mode city --backend outlines
```

### Benchmark guidance
@@ -64,11 +64,25 @@ python3 bench_other.py --mode city --backend vllm
Run Llama-7B and benchmark character generation
```bash
-python3 bench_other.py --mode character --backend guidance --parallel 1
+python3 bench_other.py --mode character --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```

Run Llama-7B and benchmark city information retrieval
```bash
-python3 bench_other.py --mode city --backend guidance --parallel 1
+python3 bench_other.py --mode city --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+
+Run Llama-7B and benchmark character generation
+```
+python3 bench_other.py --mode character --backend lmql --parallel 1
+```
+
+Run Llama-7B and benchmark city information retrieval
+```
+python3 bench_other.py --mode city --backend lmql --parallel 1
```
@@ -7,10 +7,7 @@ from functools import partial
import guidance
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_outlines,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl

# there are some FSM bugs with json regex converted from pydantic model
@@ -85,6 +82,29 @@ def character_maker(lm, name):
    return lm


+async def call_generate_lmql(
+    prompt, temperature, max_tokens, regex, max_len=4096, model=None, **kwargs
+):
+    assert model is not None
+    import lmql
+
+    @lmql.query(model=model)
+    async def program(question, max_tokens, regex):
+        '''lmql
+        """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and REGEX(ANSWER, regex)
+        return ANSWER
+        '''
+
+    return await program(
+        question=prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        max_len=max_len,
+        regex=regex,
+        **kwargs,
+    )
+
+
@guidance
def city_maker(lm, document):
    regex_str_no_quote = r"[\w\d\s]+"
@@ -119,38 +139,68 @@ def bench_character(args):
    states = [None] * len(arguments)

    # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)

-        def func(i):
-            states[i] = character_gen(**arguments[i], generate=generate)
-
-        get_one_answer = func
+        def get_one_answer(i):
+            states[i] = character_gen(**arguments[i], generate=call_generate)

    elif args.backend == "guidance":
        model = guidance.models.LlamaCpp(
-            args.llama_cpp_model_path,
+            args.model_path,
            n_gpu_layers=-1,
-            n_ctx=4096,
+            n_ctx=args.n_ctx,
        )

-        def func(i):
+        def get_one_answer(i):
            lm = model + character_maker(**arguments[i])
            states[i] = lm

-        get_one_answer = func
+    elif args.backend == "lmql":
+        import asyncio
+        import lmql
+
+        model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
+        call_generate = partial(
+            call_generate_lmql,
+            model=model,
+            max_tokens=256,
+            regex=character_regex,
+        )
+
+        async def get_one_answer_async(i):
+            states[i] = await call_generate(prompt=arguments[i]["name"], temperature=0)
+
    else:
        raise ValueError(f"Invalid backend: {args.backend}")

    tic = time.time()
+    if args.backend != "lmql":
        if args.parallel == 1:
            for i in tqdm(range(len(arguments))):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
-                rets = executor.map(get_one_answer, list(range(len(arguments))))
+                rets = list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(arguments)))),
+                        total=len(arguments),
+                    )
+                )
            for _ in rets:
                pass
+    else:
+        batches = []
+        for i in range(0, len(arguments), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(arguments)))))
+
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )

    latency = time.time() - tic
@@ -166,26 +216,23 @@ def bench_city_doc(args):
    states = [None] * len(arguments)

    # Select backend
-    if args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_outlines, url=url, temperature=0)
+    if args.backend == "outlines":
+        call_generate = partial(get_call_generate(args), temperature=0)

-        def func(i):
-            states[i] = city_gen(**arguments[i], generate=generate)
-
-        get_one_answer = func
+        def get_one_answer(i):
+            states[i] = city_gen(**arguments[i], generate=call_generate)

    elif args.backend == "guidance":
        model = guidance.models.LlamaCpp(
-            args.llama_cpp_model_path,
+            args.model_path,
            n_gpu_layers=-1,
-            n_ctx=4096,
+            n_ctx=args.n_ctx,
        )

-        def func(i):
+        def get_one_answer(i):
            lm = model + city_maker(**arguments[i])
            states[i] = lm

-        get_one_answer = func
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
@@ -237,10 +284,5 @@ if __name__ == "__main__":
    parser.add_argument(
        "--mode", type=str, default="character", choices=["character", "city"]
    )
-    parser.add_argument(
-        "--llama-cpp-model-path",
-        type=str,
-        default="/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
-    )
    args = add_common_other_args_and_parse(parser)
    main(args)
@@ -23,5 +23,11 @@ python3 bench_other.py --backend vllm --num-questions 25
### Benchmark guidance
```
-python3 bench_other.py --backend guidance --num-questions 25 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+```
+python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
```
\ No newline at end of file
@@ -6,12 +6,7 @@ from functools import partial
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl

system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
@@ -54,53 +49,77 @@ def multi_dimension_judge(article, generate):
    return s


+async def multi_dimension_judge_async(article, generate):
+    s = system_prompt
+    s += "\n```\n" + article + "\n```\n\n"
+
+    judges = []
+    for i in range(len(dimension_prompts)):
+        comp = await generate(
+            s
+            + "USER: Please judge the quality based on the following metric. "
+            + dimension_prompts[i]
+            + " Please provide a single-paragraph judgement. "
+            + "Focus on the provided metric and do not say other things. "
+            'End your judgement paragraph with the word "END"\nJUDGE:',
+            max_tokens=256,
+            stop="END",
+        )
+        judges.append(comp)
+
+    s += "I will judge the quality based on the following metrics.\n"
+    for i in range(len(dimension_prompts)):
+        s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
+
+    s += "In summary, on a scale of 1 to 10, I would give the article a score of"
+    s += await generate(s, max_tokens=2, stop=None)
+
+    return s
+
+
def main(args):
    lines = read_jsonl(args.data_path)[: args.num_questions]
    states = [None] * len(lines)

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_lightllm, url=url, temperature=0)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_vllm, url=url, temperature=0)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_srt_raw, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-            n_gpu_layers=-1,
-            n_ctx=4096,
-        )
-
-        def generate(prompt, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
-            )
-            return out["answer"]
-
-        # warmup
-        generate("Hello!", max_tokens=8, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

    # Run requests
+    tic = time.time()

+    if args.backend != "lmql":
+
        def get_one_answer(i):
-            states[i] = multi_dimension_judge(lines[i], generate)
+            states[i] = multi_dimension_judge(lines[i], call_generate)

-    tic = time.time()
        if args.parallel == 1:
            for i in tqdm(range(len(lines))):
                get_one_answer(i)
        else:
            with ThreadPoolExecutor(args.parallel) as executor:
-                executor.map(get_one_answer, list(range(len(lines))))
+                list(
+                    tqdm(
+                        executor.map(get_one_answer, list(range(len(lines)))),
+                        total=len(lines),
+                    )
+                )
+    else:
+        import asyncio
+
+        async def get_one_answer_async(i):
+            states[i] = await multi_dimension_judge_async(lines[i], call_generate)
+
+        batches = []
+        for i in range(0, len(lines), args.parallel):
+            batches.append(list(range(i, min(i + args.parallel, len(lines)))))
+
+        loop = asyncio.get_event_loop()
+        for bt in tqdm(batches):
+            loop.run_until_complete(
+                asyncio.gather(*[get_one_answer_async(i) for i in bt])
+            )

    latency = time.time() - tic

    # Compute accuracy
...
@@ -22,7 +22,7 @@ python3 bench_other.py --backend vllm --num-questions 5
### Benchmark guidance
```
-python3 bench_other.py --backend guidance --num-questions 5 --parallel 1
+python3 bench_other.py --backend guidance --num-questions 5 --parallel 1 --n-ctx 11000 --model-path path/to/code-llama/gguf
```
...
@@ -6,12 +6,7 @@ from functools import partial
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
@@ -44,40 +39,11 @@ def main(args):
    states = [None] * len(arguments)

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_lightllm, url=url, temperature=0)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_vllm, url=url, temperature=0)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        generate = partial(call_generate_srt_raw, url=url, temperature=0)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        model = models.LlamaCpp(
-            "/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
-            n_gpu_layers=-1,
-            n_ctx=11000,
-        )
-
-        def generate(prompt, max_tokens, stop):
-            out = (
-                model
-                + prompt
-                + gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
-            )
-            return out["answer"]
-
-        # warmup
-        generate("Hello!", max_tokens=8, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = partial(get_call_generate(args), temperature=0)

    # Run requests
    def get_one_answer(i):
-        states[i] = json_decode(generate=generate, **arguments[i])
+        states[i] = json_decode(generate=call_generate, **arguments[i])

    tic = time.time()
    if args.parallel == 1:
@@ -85,7 +51,13 @@ def main(args):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
-            executor.map(get_one_answer, list(range(len(arguments))))
+            list(
+                tqdm(
+                    executor.map(get_one_answer, list(range(len(arguments)))),
+                    total=len(arguments),
+                )
+            )

    latency = time.time() - tic

    # Compute accuracy
...
@@ -46,7 +46,7 @@ python3 bench_other.py --nsub 10 --backend lightllm
### Benchmark guidance
```
-python3 bench_other.py --nsub 10 --backend guidance --parallel 1
+python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
...
@@ -4,19 +4,13 @@ import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
-from functools import partial

import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt_raw,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate

choices = ["A", "B", "C", "D"]
@@ -53,10 +47,7 @@ def gen_prompt(train_df, subject, k=-1):
    return prompt


-model_initialized = None
-
-def evaluate(args, subject, dev_df, test_df):
+def evaluate(args, subject, dev_df, test_df, call_generate):
    prompts = []
    labels = []
@@ -78,62 +69,6 @@ def evaluate(args, subject, dev_df, test_df):
    preds = [None] * len(prompts)
    max_tokens = 1

-    # Select backend
-    global model_initialized
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url, stop=None)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url, stop=None)
-    elif args.backend == "srt-raw":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt_raw, url=url, stop=None)
-    elif args.backend == "guidance":
-        from guidance import gen, models
-
-        if model_initialized is None:
-            model = models.LlamaCpp(
-                "/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
-                n_gpu_layers=-1,
-                n_ctx=4096,
-            )
-            model_initialized = model
-        else:
-            model = model_initialized
-
-        def call_generate(prompt, temperature, max_tokens):
-            out = (
-                model
-                + prompt
-                + gen(name="answer", max_tokens=max_tokens, temperature=0)
-            )
-            return out["answer"]
-
-        # warmup
-        call_generate("Hello,", temperature=1.0, max_tokens=8)
-    elif args.backend == "lmql":
-        import lmql
-
-        model = lmql.model(
-            "meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
-        )
-
-        @lmql.query(model=model)
-        async def program(question):
-            '''lmql
-            """{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
-            return ANSWER
-            '''
-
-        async def call_generate(prompt, temperature, max_tokens):
-            return await program(question=prompt, temperature=temperature)
-
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
-
    # Run requests
    if args.backend != "lmql":
        # Use thread pool
@@ -190,6 +125,9 @@ def main(args):
    all_latencies = []
    num_requests = 0

+    # Select backend
+    call_generate = get_call_generate(args)
+
    for subject in tqdm(subjects[: args.nsub]):
        dev_df = pd.read_csv(
            os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
@@ -198,7 +136,7 @@ def main(args):
            os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
        )

-        cors, acc, latency = evaluate(args, subject, dev_df, test_df)
+        cors, acc, latency = evaluate(args, subject, dev_df, test_df, call_generate)
        all_cors.append(cors)
        all_latencies.append(latency)
        num_requests += len(test_df)
...
+## Download Dataset
+```sh
+wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl
+```
+
## Run benchmark

### Benchmark sglang
...
@@ -4,16 +4,11 @@ import os
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
-from functools import partial

from fastchat.model import get_conversation_template
+from tqdm import tqdm

-from sglang.test.test_utils import (
-    add_common_other_args_and_parse,
-    call_generate_lightllm,
-    call_generate_srt,
-    call_generate_vllm,
-)
+from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate


def load_questions(filename):
@@ -50,17 +45,7 @@ def main(args):
    conv_main = get_conversation_template(model_id)

    # Select backend
-    if args.backend == "lightllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_lightllm, url=url, stop=None)
-    elif args.backend == "vllm":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_vllm, url=url, stop=None)
-    elif args.backend == "srt":
-        url = f"{args.host}:{args.port}/generate"
-        call_generate = partial(call_generate_srt, url=url, stop=None)
-    else:
-        raise ValueError(f"Invalid backend: {args.backend}")
+    call_generate = get_call_generate(args)

    answers = [None] * len(questions)
@@ -83,11 +68,17 @@ def main(args):
    # Run requests
    tic = time.time()
    if args.parallel == 1:
-        for i in range(len(questions)):
+        for i in tqdm(range(len(questions))):
            get_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
-            executor.map(get_answer, list(range(len(questions))))
+            list(
+                tqdm(
+                    executor.map(get_answer, list(range(len(questions)))),
+                    total=len(questions),
+                )
+            )

    latency = time.time() - tic
    print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
...
@@ -39,5 +39,11 @@ python3 bench_other.py --num-questions 64 --backend lightllm
### Benchmark guidance
```
-python3 bench_other.py --num-questions 8 --backend guidance --parallel 1
+python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+```
+
+### Benchmark lmql
+```
+python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
```