Unverified commit 14522e6a authored by Liangsheng Yin, committed by GitHub

Organize Benchmark (#381)

parent 183df472
......@@ -9,6 +9,12 @@ Turn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa2
cache_turn_on = False
```
or set the environment variable
```
export DSP_CACHEBOOL=false
```
## Benchmark SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
......
......@@ -28,5 +28,11 @@ python3 bench_other.py --num-events 1000 --backend vllm --parallel 1
### Benchmark guidance
```
python3 bench_other.py --num-events 1000 --backend guidance --parallel 1
python3 bench_other.py --num-events 1000 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --num-events 1000 --backend lmql --parallel 1
```
import argparse
import json
import time
from functools import partial
from pathlib import Path
from agent_functions import (
action_location_object_prompt,
......@@ -13,12 +11,7 @@ from agent_functions import (
)
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
......@@ -36,48 +29,27 @@ def main(args):
states = []
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import gen, models
model = models.LlamaCpp(
str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop):
out = (
model
+ prompt
+ gen(
name="result",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["result"]
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_generate = get_call_generate(args)
def get_one_answer(arg):
answer = call_generate(**arg, temperature=0)
states.append(answer)
async def get_one_answer_async(arg):
answer = await call_generate(**arg, temperature=0)
states.append(answer)
tic = time.time()
# Agent calls are always executed sequentially to preserve their dependencies
for arg in tqdm(arguments):
get_one_answer(arg)
if args.backend != "lmql":
for arg in tqdm(arguments):
get_one_answer(arg)
else:
import asyncio
loop = asyncio.get_event_loop()
for arg in tqdm(arguments):
loop.run_until_complete(get_one_answer_async(arg))
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
......
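The repeated per-backend `if`/`elif` dispatch removed above is now centralized in `get_call_generate` from `sglang.test.test_utils`. Below is a minimal sketch of what that helper plausibly looks like, reconstructed from the removed branches; the real implementation may differ, and the imported per-backend helpers are assumed to still be available in `test_utils`.

```python
# Hypothetical sketch only -- it mirrors the per-backend branches removed above;
# the actual helper in sglang.test.test_utils may differ.
from functools import partial

# Per-backend helpers the benchmark scripts previously imported directly
# (assumed to still be importable after this refactor).
from sglang.test.test_utils import (
    call_generate_lightllm,
    call_generate_srt_raw,
    call_generate_vllm,
)


def get_call_generate(args):
    if args.backend in ("lightllm", "vllm", "srt-raw"):
        # HTTP backends: bind the /generate endpoint once and reuse it.
        url = f"{args.host}:{args.port}/generate"
        http_backends = {
            "lightllm": call_generate_lightllm,
            "vllm": call_generate_vllm,
            "srt-raw": call_generate_srt_raw,
        }
        return partial(http_backends[args.backend], url=url)
    elif args.backend == "guidance":
        from guidance import gen, models

        # Local llama.cpp model, configured via the new --model-path / --n-ctx flags.
        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)

        def call_generate(prompt, temperature, max_tokens, stop=None):
            out = model + prompt + gen(
                name="answer",
                max_tokens=max_tokens,
                temperature=temperature,
                stop=stop,
            )
            return out["answer"]

        return call_generate
    elif args.backend == "lmql":
        # lmql returns an async callable (see the lmql branches removed elsewhere
        # in this diff); the scripts therefore branch on args.backend != "lmql"
        # and drive those calls with asyncio instead of a thread pool.
        raise NotImplementedError("lmql omitted from this sketch")
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
```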
......@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
### Benchmark guidance
```
python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
......
......@@ -5,17 +5,11 @@ import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
......@@ -63,54 +57,7 @@ def main(args):
states = [None] * len(labels)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import gen, models
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop):
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
elif args.backend == "lmql":
import lmql
model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
@lmql.query(model=model)
async def program(question):
'''lmql
"""{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 257 and STOPS_AT(ANSWER, "Question")
return ANSWER
'''
async def call_generate(prompt, temperature, max_tokens, stop):
return await program(question=prompt, temperature=0)
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_generate = get_call_generate(args)
# Run requests
if args.backend != "lmql":
......@@ -130,7 +77,13 @@ def main(args):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(questions))))
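# Wrapping executor.map in list(tqdm(...)) forces the lazy iterator to run to
# completion and shows a progress bar while the requests execute.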
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
......
......@@ -38,7 +38,7 @@ python3 bench_other.py --num-questions 200 --backend lightllm
### Benchmark guidance
```
CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1
CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
......
......@@ -3,15 +3,11 @@ import asyncio
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_select_lightllm,
call_select_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
from sglang.utils import read_jsonl
......@@ -47,47 +43,7 @@ def main(args):
preds = [None] * len(labels)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_select = partial(call_select_lightllm, url=url)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_select = partial(call_select_vllm, url=url)
elif args.backend == "guidance":
from guidance import models, select
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_select(context, choices):
out = model + context + select(choices, name="answer")
return choices.index(out["answer"])
call_select("Hello,", ["world", "earth"])
elif args.backend == "lmql":
import lmql
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(ctx, choices):
'''lmql
"""{ctx}[ANSWER]""" where ANSWER in set(choices)
return ANSWER
'''
async def call_select(context, choices):
answer = await program(ctx=context, choices=choices, temperature=0)
return choices.index(answer)
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_select = get_call_select(args)
# Run requests
if args.backend != "lmql":
......@@ -99,11 +55,17 @@ def main(args):
tic = time.time()
if args.parallel == 1:
for i in range(len(questions)):
for i in tqdm(range(len(questions))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(questions))))
list(
tqdm(
executor.map(get_one_answer, list(range(len(questions)))),
total=len(questions),
)
)
else:
# Use asyncio
async def batched_call(batch_size):
......
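The selection benchmarks follow the same pattern with `get_call_select`. Here is a minimal sketch reconstructed from the branches removed above, again an assumption rather than the actual `test_utils` code.

```python
# Hypothetical sketch mirroring the removed call_select branches; the real
# get_call_select in sglang.test.test_utils may differ.
from functools import partial

from sglang.test.test_utils import call_select_lightllm, call_select_vllm


def get_call_select(args):
    if args.backend in ("lightllm", "vllm"):
        url = f"{args.host}:{args.port}/generate"
        func = call_select_lightllm if args.backend == "lightllm" else call_select_vllm
        return partial(func, url=url)
    elif args.backend == "guidance":
        from guidance import models, select

        model = models.LlamaCpp(args.model_path, n_gpu_layers=-1, n_ctx=args.n_ctx)

        def call_select(context, choices):
            # Constrained decoding: the model must emit one of the given choices.
            out = model + context + select(choices, name="answer")
            return choices.index(out["answer"])

        return call_select
    else:
        # The lmql branch (removed above) is async and handled separately.
        raise ValueError(f"Invalid backend: {args.backend}")
```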
......@@ -36,7 +36,7 @@ python3 bench_sglang.py --num-questions 10
```
### Benchmark vllm
### Benchmark Outlines + vLLM
Run Llama-7B
......@@ -47,7 +47,7 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
Benchmark
```
python3 bench_other.py --backend vllm --num-questions 10
python3 bench_other.py --backend outlines --num-questions 10
```
......@@ -56,5 +56,5 @@ python3 bench_other.py --backend vllm --num-questions 10
Run Llama-7B and benchmark
```
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
......@@ -7,10 +7,7 @@ from functools import partial
from tqdm import tqdm
from sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_STRING
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_outlines,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
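# Regex for a JSON-style list: one or more REGEX_STRING items, comma-separated, in brackets.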
REGEX_LIST = r"\[(" + REGEX_STRING + ", )*" + REGEX_STRING + r"\]"
......@@ -50,41 +47,11 @@ def main(args):
states = [None] * len(arguments)
# Select backend
if args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_outlines, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import gen, models
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def generate(prompt, max_tokens, stop=None, regex=None):
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=0,
stop=stop,
regex=regex,
)
)
return out["answer"]
# warmup
for _ in range(3):
generate("Hello!" * 10, max_tokens=64, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
def get_one_answer(i):
states[i] = json_decode(generate=generate, **arguments[i])
states[i] = json_decode(generate=call_generate, **arguments[i])
tic = time.time()
if args.parallel == 1:
......@@ -92,7 +59,12 @@ def main(args):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
rets = executor.map(get_one_answer, list(range(len(arguments))))
rets = list(
tqdm(
executor.map(get_one_answer, list(range(len(arguments)))),
total=len(arguments),
)
)
for _ in rets:
pass
......
......@@ -39,7 +39,7 @@ python3 bench_sglang.py --mode city
```
### Benchmark vllm
### Benchmark Outlines + vLLM
Run Llama-7B
......@@ -50,13 +50,13 @@ python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2
Benchmark Character Generation
```bash
python3 bench_other.py --mode character --backend vllm
python3 bench_other.py --mode character --backend outlines
```
Benchmark City Information Retrieval
```bash
python3 bench_other.py --mode city --backend vllm
python3 bench_other.py --mode city --backend outlines
```
### Benchmark guidance
......@@ -64,11 +64,25 @@ python3 bench_other.py --mode city --backend vllm
Run Llama-7B and benchmark character generation
```bash
python3 bench_other.py --mode character --backend guidance --parallel 1
python3 bench_other.py --mode character --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
Run Llama-7B and benchmark city information retrieval
```bash
python3 bench_other.py --mode city --backend guidance --parallel 1
python3 bench_other.py --mode city --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
Run Llama-7B and benchmark character generation
```
python3 bench_other.py --mode character --backend lmql --parallel 1
```
Run Llama-7B and benchmark city information retrieval
```
python3 bench_other.py --mode city --backend lmql --parallel 1
```
......@@ -7,10 +7,7 @@ from functools import partial
import guidance
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_outlines,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
# there are some FSM bugs with json regex converted from pydantic model
......@@ -85,6 +82,29 @@ def character_maker(lm, name):
return lm
async def call_generate_lmql(
prompt, temperature, max_tokens, regex, max_len=4096, model=None, **kwargs
):
assert model is not None
import lmql
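# Inline lmql program: constrain the answer to fewer than max_tokens tokens and
# to the given regex.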
@lmql.query(model=model)
async def program(question, max_tokens, regex):
'''lmql
"""{question}[ANSWER]""" where len(TOKENS(ANSWER)) < max_tokens and REGEX(ANSWER, regex)
return ANSWER
'''
return await program(
question=prompt,
temperature=temperature,
max_tokens=max_tokens,
max_len=max_len,
regex=regex,
**kwargs,
)
@guidance
def city_maker(lm, document):
regex_str_no_quote = r"[\w\d\s]+"
......@@ -119,38 +139,68 @@ def bench_character(args):
states = [None] * len(arguments)
# Select backend
if args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_outlines, url=url, temperature=0)
if args.backend == "outlines":
call_generate = partial(get_call_generate(args), temperature=0)
def func(i):
states[i] = character_gen(**arguments[i], generate=generate)
def get_one_answer(i):
states[i] = character_gen(**arguments[i], generate=call_generate)
get_one_answer = func
elif args.backend == "guidance":
model = guidance.models.LlamaCpp(
args.llama_cpp_model_path,
args.model_path,
n_gpu_layers=-1,
n_ctx=4096,
n_ctx=args.n_ctx,
)
def func(i):
def get_one_answer(i):
lm = model + character_maker(**arguments[i])
states[i] = lm
get_one_answer = func
elif args.backend == "lmql":
import asyncio
import lmql
model = lmql.model(args.model_path, endpoint=f"{args.host}:{args.port}")
call_generate = partial(
call_generate_lmql,
model=model,
max_tokens=256,
regex=character_regex,
)
async def get_one_answer_async(i):
states[i] = await call_generate(prompt=arguments[i]["name"], temperature=0)
else:
raise ValueError(f"Invalid backend: {args.backend}")
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
if args.backend != "lmql":
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
rets = list(
tqdm(
executor.map(get_one_answer, list(range(len(arguments)))),
total=len(arguments),
)
)
for _ in rets:
pass
else:
with ThreadPoolExecutor(args.parallel) as executor:
rets = executor.map(get_one_answer, list(range(len(arguments))))
for _ in rets:
pass
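# lmql path: group request indices into batches of size args.parallel and await
# each batch concurrently with asyncio.gather.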
batches = []
for i in range(0, len(arguments), args.parallel):
batches.append(list(range(i, min(i + args.parallel, len(arguments)))))
loop = asyncio.get_event_loop()
for bt in tqdm(batches):
loop.run_until_complete(
asyncio.gather(*[get_one_answer_async(i) for i in bt])
)
latency = time.time() - tic
......@@ -166,26 +216,23 @@ def bench_city_doc(args):
states = [None] * len(arguments)
# Select backend
if args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_outlines, url=url, temperature=0)
if args.backend == "outlines":
call_generate = partial(get_call_generate(args), temperature=0)
def func(i):
states[i] = city_gen(**arguments[i], generate=generate)
def get_one_answer(i):
states[i] = city_gen(**arguments[i], generate=call_generate)
get_one_answer = func
elif args.backend == "guidance":
model = guidance.models.LlamaCpp(
args.llama_cpp_model_path,
args.model_path,
n_gpu_layers=-1,
n_ctx=4096,
n_ctx=args.n_ctx,
)
def func(i):
def get_one_answer(i):
lm = model + city_maker(**arguments[i])
states[i] = lm
get_one_answer = func
else:
raise ValueError(f"Invalid backend: {args.backend}")
......@@ -237,10 +284,5 @@ if __name__ == "__main__":
parser.add_argument(
"--mode", type=str, default="character", choices=["character", "city"]
)
parser.add_argument(
"--llama-cpp-model-path",
type=str,
default="/home/ubuntu/model_weights/Llama-2-7b-chat-hf/ggml-model-f16.gguf",
)
args = add_common_other_args_and_parse(parser)
main(args)
......@@ -23,5 +23,11 @@ python3 bench_other.py --backend vllm --num-questions 25
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 25 --parallel 1
python3 bench_other.py --backend guidance --num-questions 25 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --backend lmql --num-questions 25 --parallel 1
```
\ No newline at end of file
......@@ -6,12 +6,7 @@ from functools import partial
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
system_prompt = "Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
......@@ -54,53 +49,77 @@ def multi_dimension_judge(article, generate):
return s
async def multi_dimension_judge_async(article, generate):
s = system_prompt
s += "\n```\n" + article + "\n```\n\n"
judges = []
for i in range(len(dimension_prompts)):
comp = await generate(
s
+ "USER: Please judge the quality based on the following metric. "
+ dimension_prompts[i]
+ " Please provide a single-paragraph judgement. "
+ "Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:',
max_tokens=256,
stop="END",
)
judges.append(comp)
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += await generate(s, max_tokens=2, stop=None)
return s
def main(args):
lines = read_jsonl(args.data_path)[: args.num_questions]
states = [None] * len(lines)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import gen, models
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
call_generate = partial(get_call_generate(args), temperature=0)
def generate(prompt, max_tokens, stop):
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# Run requests
tic = time.time()
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
if args.backend != "lmql":
# Run requests
def get_one_answer(i):
states[i] = multi_dimension_judge(lines[i], generate)
def get_one_answer(i):
states[i] = multi_dimension_judge(lines[i], call_generate)
if args.parallel == 1:
for i in tqdm(range(len(lines))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(lines)))),
total=len(lines),
)
)
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(lines))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(lines))))
import asyncio
async def get_one_answer_async(i):
states[i] = await multi_dimension_judge_async(lines[i], call_generate)
batches = []
for i in range(0, len(lines), args.parallel):
batches.append(list(range(i, min(i + args.parallel, len(lines)))))
loop = asyncio.get_event_loop()
for bt in tqdm(batches):
loop.run_until_complete(
asyncio.gather(*[get_one_answer_async(i) for i in bt])
)
latency = time.time() - tic
# Compute accuracy
......
......@@ -22,7 +22,7 @@ python3 bench_other.py --backend vllm --num-questions 5
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 5 --parallel 1
python3 bench_other.py --backend guidance --num-questions 5 --parallel 1 --n-ctx 11000 --model-path path/to/code-llama/gguf
```
......
......@@ -6,12 +6,7 @@ from functools import partial
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
......@@ -44,40 +39,11 @@ def main(args):
states = [None] * len(arguments)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import gen, models
model = models.LlamaCpp(
"/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
n_gpu_layers=-1,
n_ctx=11000,
)
def generate(prompt, max_tokens, stop):
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
def get_one_answer(i):
states[i] = json_decode(generate=generate, **arguments[i])
states[i] = json_decode(generate=call_generate, **arguments[i])
tic = time.time()
if args.parallel == 1:
......@@ -85,7 +51,13 @@ def main(args):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(arguments))))
list(
tqdm(
executor.map(get_one_answer, list(range(len(arguments)))),
total=len(arguments),
)
)
latency = time.time() - tic
# Compute accuracy
......
......@@ -46,7 +46,7 @@ python3 bench_other.py --nsub 10 --backend lightllm
### Benchmark guidance
```
python3 bench_other.py --nsub 10 --backend guidance --parallel 1
python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
......
......@@ -4,19 +4,13 @@ import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
choices = ["A", "B", "C", "D"]
......@@ -53,10 +47,7 @@ def gen_prompt(train_df, subject, k=-1):
return prompt
model_initialized = None
def evaluate(args, subject, dev_df, test_df):
def evaluate(args, subject, dev_df, test_df, call_generate):
prompts = []
labels = []
......@@ -78,62 +69,6 @@ def evaluate(args, subject, dev_df, test_df):
preds = [None] * len(prompts)
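# Each MMLU answer is a single choice letter (A/B/C/D), so one generated token is enough.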
max_tokens = 1
# Select backend
global model_initialized
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url, stop=None)
elif args.backend == "guidance":
from guidance import gen, models
if model_initialized is None:
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
model_initialized = model
else:
model = model_initialized
def call_generate(prompt, temperature, max_tokens):
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0)
)
return out["answer"]
# warmup
call_generate("Hello,", temperature=1.0, max_tokens=8)
elif args.backend == "lmql":
import lmql
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(question):
'''lmql
"""{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
return ANSWER
'''
async def call_generate(prompt, temperature, max_tokens):
return await program(question=prompt, temperature=temperature)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
if args.backend != "lmql":
# Use thread pool
......@@ -190,6 +125,9 @@ def main(args):
all_latencies = []
num_requests = 0
# Select backend
call_generate = get_call_generate(args)
for subject in tqdm(subjects[: args.nsub]):
dev_df = pd.read_csv(
os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
......@@ -198,7 +136,7 @@ def main(args):
os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
cors, acc, latency = evaluate(args, subject, dev_df, test_df, call_generate)
all_cors.append(cors)
all_latencies.append(latency)
num_requests += len(test_df)
......
## Download Dataset
```sh
wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl
```
## Run benchmark
### Benchmark sglang
......
......@@ -4,16 +4,11 @@ import os
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from fastchat.model import get_conversation_template
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt,
call_generate_vllm,
)
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
def load_questions(filename):
......@@ -50,17 +45,7 @@ def main(args):
conv_main = get_conversation_template(model_id)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt, url=url, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
call_generate = get_call_generate(args)
answers = [None] * len(questions)
......@@ -83,11 +68,17 @@ def main(args):
# Run requests
tic = time.time()
if args.parallel == 1:
for i in range(len(questions)):
for i in tqdm(range(len(questions))):
get_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_answer, list(range(len(questions))))
list(
tqdm(
executor.map(get_answer, list(range(len(questions)))),
total=len(questions),
)
)
latency = time.time() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
......
......@@ -39,5 +39,11 @@ python3 bench_other.py --num-questions 64 --backend lightllm
### Benchmark guidance
```
python3 bench_other.py --num-questions 8 --backend guidance --parallel 1
python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
```