"src/include/blockwise_2d_tensor_op.hip.hpp" did not exist on "3bd51021ab676dcc17fe0674d6a39411acbfce5a"
"""
Generate line data for line retrieval task.
Usage:
python3 gen_data.py --number 1000
"""
import argparse
from collections import defaultdict
import json
from tqdm import tqdm
import numpy as np
def generate_lines(random_words, num_lines, redirect_ratio):
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
# Raw lines
visited_indices = set([None])
visited_values = set([None])
lines = []
redirects = []
indices = []
values = []
for i in tqdm(range(num_lines)):
line_index = None
while line_index in visited_indices:
line_index = "-".join(np.random.choice(random_words, size=(2,)))
visited_indices.add(line_index)
line_value = np.random.randint(low=0, high=999999)
line_value = f"{line_value:06}"
line = f"Line {line_index}: The REGISTER_CONTENT is {line_value}."
lines.append(line)
redirects.append(None)
indices.append(line_index)
values.append(line_value)
# Add redirect
if redirect_ratio > 0:
num_redirect_lines = int(len(lines) * redirect_ratio)
redirect_indices = np.random.choice(np.arange(len(lines)),
size=(num_redirect_lines,), replace=False)
for i in redirect_indices:
target_idx = int(np.random.choice(min(i * 2 + 100, num_lines)))  # cast to a plain int so the dumped links stay JSON-serializable
lines[i] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
redirects[i] = target_idx
# Build redirect chains and resolve each line's final value
links = [[] for _ in range(num_lines)]
contains_ring = set()
for i in range(num_lines):
if redirects[i] is None:
continue
tmp_link = []
cur = i
visited = set()
while redirects[cur] is not None:
visited.add(cur)
tmp_link.append(redirects[cur])
cur = redirects[cur]
if cur in visited:
contains_ring.add(i)
tmp_link = None
break
values[i] = values[cur]
links[i] = tmp_link
# Group by num_links
group_by_num_hoops = defaultdict(list)
for i in range(num_lines):
if i in contains_ring:
continue
group_by_num_hoops[len(links[i]) + 1].append(i)
keys = sorted(list(group_by_num_hoops.keys()))
for num_links in keys:
print(f"#links: {num_links}, #lines: {len(group_by_num_hoops[num_links])}")
# Fill in the few-shot examples in the suffix
hoop1_candidates = list(group_by_num_hoops[1])
hoop1_candidate_keys = {c: max([c] + links[c]) for c in hoop1_candidates}
hoop1_candidates.sort(key=lambda c: hoop1_candidate_keys[c])
hoop2_candidates = list(group_by_num_hoops[2])
hoop2_candidate_keys = {c: max([c] + links[c]) for c in hoop2_candidates}
hoop2_candidates.sort(key=lambda c: hoop2_candidate_keys[c])
i = hoop1_candidates[5]
suffix = suffix.replace("__idx0__", indices[i]).replace("__val0__", values[i])
if len(hoop2_candidates):
i = hoop2_candidates[0]
suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
i = hoop2_candidates[1]
suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
else:
i = hoop1_candidates[1]
suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
i = hoop1_candidates[10]
suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
obj = {
"prefix": prefix,
"suffix": suffix,
"lines": lines,
"indices": indices,
"values": values,
"links": links,
"group_by_num_hoops": group_by_num_hoops,
"contains_ring": sorted(list(contains_ring)),
}
return obj
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--number", type=int)
parser.add_argument("--redirect-ratio", type=float, default=0.0)
args = parser.parse_args()
num_lines = args.number
random_words_filename = "random_words.json"
random_words = json.load(open(random_words_filename, "r"))
np.random.seed(42)
obj = generate_lines(random_words, num_lines, args.redirect_ratio)
fout = f"lines_{num_lines}_{args.redirect_ratio:.1f}.json"
with open(fout, "w") as fout:
json.dump(obj, fout, indent=2)
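# Illustration of the generated data (hypothetical indices and values, not actual output):
#   Line alpha-bravo: The REGISTER_CONTENT is 012345.
#   Line charlie-delta: The REGISTER_CONTENT is the same as Line alpha-bravo.
# With --redirect-ratio > 0, redirected lines like the second one are resolved by
# following the chain until a raw line is reached, so the final REGISTER_CONTENT of
# "charlie-delta" is 012345. The resolved value is stored in values[i]; chains that
# loop back onto themselves are recorded in contains_ring and skipped when grouping
# lines by the number of hops.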
## Download benchmark images
```
python3 download_images.py
```
Image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
### Other Dependencies
```
pip3 install "torch>=2.1.2" "transformers>=4.36" pillow
```
## Run benchmark
### Benchmark sglang
Launch a server
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
Run benchmark
```
# Run with local models
python3 bench_sglang.py --num-questions 60
# Run with OpenAI models
python3 bench_sglang.py --num-questions 60 --backend gpt-4-vision-preview
```
### Benchmark the original LLaVA code
```
git clone git@github.com:haotian-liu/LLaVA.git
cd LLaVA
git reset --hard 9a26bd1435b4ac42c282757f2c16d34226575e96
pip3 install -e .
cd ~/sglang/benchmark/llava_bench
CUDA_VISIBLE_DEVICES=0 bash bench_hf_llava_bench.sh
```
### Benchmark llama.cpp
```
# Install
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
pip install sse_starlette starlette_context pydantic_settings
# Download weights
mkdir -p ~/model_weights/llava-v1.5-7b/
wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/ggml-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf
wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/mmproj-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf
```
```
# Launch the server
python3 -m llama_cpp.server --model ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf --clip_model_path ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf --chat_format llava-1-5 --port 23000

# Run the benchmark
OPENAI_BASE_URL=http://localhost:23000/v1 python3 bench_sglang.py --backend gpt-4-vision-preview --num-q 1
```
#!/bin/bash
python -m llava.eval.model_vqa \
--model-path liuhaotian/llava-v1.5-7b \
--question-file ./questions.jsonl \
--image-folder ./images \
--answers-file ./answers_hf.jsonl \
--temperature 0 \
--conv-mode vicuna_v1
#!/bin/bash
python -m llava.eval.model_vqa_loader \
--model-path liuhaotian/llava-v1.5-7b \
--question-file ./mme_pack/llava_mme_bench_replace.jsonl \
--image-folder ./mme_pack/MME_Benchmark_release_version \
--answers-file ./answers_hf_mme.jsonl \
--temperature 0 \
--conv-mode vicuna_v1
import argparse
import json
import time
import os
import sglang as sgl
import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from PIL import Image
@sgl.function
def image_qa(s, image_file, question):
s += sgl.user(sgl.image(image_file) + question)
s += sgl.assistant(sgl.gen("answer", max_tokens=args.max_tokens))
def main(args):
lines = read_jsonl(args.question_file)[:args.num_questions]
arguments = [
{"image_file":
os.path.abspath(args.image_folder + "/" + l["image"]),
"question": l["text"]} for l in lines
]
#arguments = [
# {"image_file":
# Image.open(os.path.abspath(args.image_folder + "/" + l["image"])),
# "question": l["text"]} for l in lines
#]
states = [None] * len(lines)
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
if args.parallel == 1:
for i in tqdm.tqdm(range(len(lines))):
image_file = arguments[i]["image_file"]
question = arguments[i]["question"]
ret = image_qa.run(
image_file=image_file,
question=question,
temperature=0)
states[i] = ret
else:
states = image_qa.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
print(f"Write output to {args.answer_file}")
with open(args.answer_file, "w") as fout:
for i in range(len(lines)):
value = {
"question_id": lines[i]["question_id"],
"prompt": lines[i]["text"],
"text": states[i]["answer"].strip(),
"model_id": backend.model_info["model_path"],
"answer_id": i,
"metadata": {},
}
fout.write(json.dumps(value) + "\n")
with open(args.result_file, "a") as fout:
value = {
"task": "llava_bench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": len(lines),
"parallel": args.parallel,
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="questions.jsonl")
parser.add_argument("--answer-file", type=str, default="answers.jsonl")
parser.add_argument("--image-folder", type=str, default="./images")
parser.add_argument("--temperature", type=float, default=0.0)
parser.add_argument("--num-questions", type=int, default=None)
parser.add_argument("--max-tokens", type=int, default=768)
args = add_common_sglang_args_and_parse(parser)
main(args)
MME_FOLDER=./mme_pack
python3 bench_sglang.py --num-questions 5000 --question-file $MME_FOLDER/llava_mme_bench_replace.jsonl --answer-file answer_mme.jsonl --image-folder $MME_FOLDER/MME_Benchmark_release_version --max-tokens 4
import os
# Create the 'images' directory if it doesn't exist
if not os.path.exists('images'):
os.makedirs('images')
# Base URL
base_url = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/"
# Loop through image numbers
for i in range(1, 25):
# Format the image number with leading zeros
image_number = str(i).zfill(3)
image_url = base_url + image_number + ".jpg"
image_path = "images/" + image_number + ".jpg"
# Download the image using wget
os.system(f"wget -O {image_path} {image_url}")
print("Download complete.")
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 25 --parallel 8
python3 bench_sglang.py --num-questions 16 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --backend vllm --num-questions 25
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 25 --parallel 1
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
)
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
def multi_dimension_judge(article, generate):
s = system_prompt
s += "\n```\n" + article + "\n```\n\n"
judges = []
for i in range(len(dimension_prompts)):
comp = generate(s +
"USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:',
max_tokens=256, stop="END")
judges.append(comp)
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += generate(s, max_tokens=2, stop=None)
return s
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
states = [None] * len(lines)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
return out["answer"]
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
def get_one_answer(i):
states[i] = multi_dimension_judge(lines[i], generate)
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(lines))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(lines))))
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "llm_judge",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="articles.jsonl")
parser.add_argument("--num-questions", type=int, default=20)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
)
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
@sgl.function
def multi_dimension_judge(s, article):
s += system_prompt
s += "\n```\n" + article + "\n```\n\n"
forks = s.fork(len(dimension_prompts))
for i in range(len(dimension_prompts)):
forks[i] += ("USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:')
forks[i] += sgl.gen("judgement", max_tokens=256, stop="END")
forks.join()
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + forks[i]["judgement"].strip() + "\n"
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += sgl.gen("score", max_tokens=2)
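# Note: s.fork() creates one branch per metric so the per-dimension judgements can be
# generated in parallel; each branch continues from the shared prefix built so far
# (system prompt + article), and forks.join() waits for all branches before the final
# summary score is generated on the main state.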
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{"article": l} for l in lines]
# Select backend
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
states = multi_dimension_judge.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "llm_judge",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="articles.jsonl")
parser.add_argument("--num-questions", type=int, default=20)
args = add_common_sglang_args_and_parse(parser)
main(args)
## Run benchmark
### Benchmark sglang
```
python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-instruct-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 5 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model codellama/CodeLlama-7b-instruct-hf --disable-log-requests --port 21000 --gpu 0.97
```
```
python3 bench_other.py --backend vllm --num-questions 5
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 5 --parallel 1
```
### Build dataset
```
pip install wikipedia
python3 build_dataset.py
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
def json_decode(document, generate):
s = "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += ' "name": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "country": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "airport code": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "'
s += generate(s, max_tokens=24, stop='"') + '",\n'
s += '}\n'
return s
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
states = [None] * len(arguments)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
model = models.LlamaCpp("/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf", n_gpu_layers=-1, n_ctx=11000)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
return out["answer"]
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
def get_one_answer(i):
states[i] = json_decode(generate=generate, **arguments[i])
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(arguments))))
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "long_json_decode",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="questions.jsonl")
parser.add_argument("--num-questions", type=int, default=100)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
@sgl.function
def json_decode(s, document):
s += "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += ' "name": "' + sgl.gen("name", max_tokens=8, stop='"') + '",\n'
s += ' "country": "' + sgl.gen("country", max_tokens=8, stop='"') + '",\n'
s += ' "airport code": "' + sgl.gen("airport code", max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "' + sgl.gen("landmarks", max_tokens=24, stop='"') + '",\n'
s += '}\n'
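# For illustration, the completed text for one document ends with something like the
# following (the field values are whatever the model generates; these are hypothetical):
# {
#   "name": "London",
#   "country": "United Kingdom",
#   "airport code": "LHR",
#   "top 3 landmarks": "Big Ben, Tower Bridge, London Eye",
# }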
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel)
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "long_json_decode",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="questions.jsonl")
parser.add_argument("--num-questions", type=int, default=10)
args = add_common_sglang_args_and_parse(parser)
main(args)
import json
import transformers
import wikipedia
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
city_names = ["los angeles", "london", "tokyo", "beijing", "singapore"]
for city_name in city_names:
content = str(wikipedia.page(city_name).content)
content = content.replace("\n\n", "\n")
tokens = t.encode(content)
truncate_len = int((10000 / len(tokens)) * len(content))
truncate_content = content[:truncate_len]
truncate_tokens = t.encode(truncate_content)
# Count token
print(f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}")
with open("questions.jsonl", "a") as fout:
fout.write(json.dumps({"document": truncate_content}) + "\n")
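# Note on the truncation above: the character length is scaled so the truncated page
# is roughly 10,000 tokens. For example, if a page tokenizes to 20,000 tokens over
# 60,000 characters, truncate_len = int((10000 / 20000) * 60000) = 30,000 characters,
# and the re-tokenized length printed above should come out near 10,000.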
## Download data
```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --nsub 10
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --nsub 10 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
# V100
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 4500 --port 22000
```
```
python3 bench_other.py --nsub 10 --backend lightllm
```
### Benchmark guidance
```
python3 bench_other.py --nsub 10 --backend guidance --parallel 1
```
### Benchmark lmql
```
CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --nsub 10 --backend lmql --parallel 2
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
import json
from functools import partial
import os
import time
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
choices = ["A", "B", "C", "D"]
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
def format_subject(subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += format_example(train_df, i)
return prompt
model_initialized = None
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
# Construct prompts
k = args.ntrain
train_prompt = gen_prompt(dev_df, subject, k)
while len(tokenizer.encode(train_prompt)) > 1536:
k -= 1
train_prompt = gen_prompt(dev_df, subject, k)
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, include_answer=False)
prompt = train_prompt + prompt_end
prompts.append(prompt)
label = test_df.iloc[i, test_df.shape[1]-1]
labels.append(label)
preds = [None] * len(prompts)
max_tokens = 1
# Select backend
global model_initialized
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url, stop=None)
elif args.backend == "guidance":
from guidance import models, gen
if model_initialized is None:
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model_initialized = model
else:
model = model_initialized
def call_generate(prompt, temperature, max_tokens):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0)
return out["answer"]
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
@lmql.query(model=model)
async def program(question):
'''lmql
"""{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
return ANSWER
'''
async def call_generate(prompt, temperature, max_tokens):
return await program(question=prompt, temperature=temperature)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
pred = call_generate(prompts[i], temperature=0,
max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
if args.parallel == 1:
for i in range(len(prompts)):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(prompts))))
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(prompts), batch_size):
tasks = []
for p in prompts[i:i+batch_size]:
tasks.append(call_generate(p,
temperature=0, max_tokens=max_tokens))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i+j] = rets[j].strip()[0]
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
all_latencies.append(latency)
num_requests += len(test_df)
total_latency = np.sum(all_latencies)
print("Total latency: {:.3f}".format(total_latency))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "mmlu",
"backend": args.backend,
"num_gpus": 1,
"latency": round(total_latency, 3),
"accuracy": round(weighted_acc, 3),
"num_requests": num_requests,
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ntrain", type=int, default=5)
parser.add_argument("--data_dir", type=str, default="data")
parser.add_argument("--nsub", type=int, default=60)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import os
import time
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
choices = ["A", "B", "C", "D"]
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
def format_subject(subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += format_example(train_df, i)
return prompt
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
k = args.ntrain
few_shot_examples = gen_prompt(dev_df, subject, k)
while len(tokenizer.encode(few_shot_examples)) > 1536:
k -= 1
few_shot_examples = gen_prompt(dev_df, subject, k)
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, include_answer=False)
prompts.append(prompt_end)
label = test_df.iloc[i, test_df.shape[1]-1]
labels.append(label)
arguments = [{"question": p} for p in prompts]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_mmlu(s, examples, question):
s += examples + question + sgl.gen("answer")
#####################################
########## SGL Program End ##########
#####################################
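# few_shot_mmlu.bind(examples=few_shot_examples) fixes the shared few-shot prefix once,
# so every request in run_batch below differs only in the question; with the sglang
# backend this common prefix can typically be reused across requests.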
# Select backend
backend = select_sglang_backend(args)
tic = time.time()
states = few_shot_mmlu.bind(examples=few_shot_examples).run_batch(
arguments, temperature=0, max_new_tokens=1,
backend=backend, num_threads=args.parallel)
preds = [s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
for s in states]
latency = time.time() - tic
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
all_latencies.append(latency)
num_requests += len(test_df)
total_latency = np.sum(all_latencies)
print("Total latency: {:.3f}".format(total_latency))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "mmlu",
"backend": args.backend,
"num_gpus": 1,
"latency": round(total_latency, 3),
"accuracy": round(weighted_acc, 3),
"num_requests": num_requests,
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--save_dir", "-s", type=str, default="results")
parser.add_argument("--nsub", type=int, default=60)
args = add_common_sglang_args_and_parse(parser)
main(args)
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 80
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 80 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 80 --backend lightllm
```
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import os
import time
import uuid
from fastchat.model import get_conversation_template
import requests
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt
def load_questions(filename):
questions = []
with open(filename, "r") as fin:
for line in fin:
obj = json.loads(line)
questions.append(obj)
return questions
def write_answers(filename, model_id, questions, answers):
with open(os.path.expanduser(filename), "w") as fout:
for i in range(len(answers)):
ans_json = {
"question_id": questions[i]["question_id"],
"answer_id": uuid.uuid4().hex,
"model_id": model_id,
"choices": {
"index": 0,
"turns": [answers[i][0], answers[i][1]],
},
"tstamp": time.time(),
}
fout.write(json.dumps(ans_json) + "\n")
def main(args):
questions = load_questions(args.question_file)
questions = (questions * 10)[:args.num_questions]
max_tokens = 256
model_id = "llama-2-chat"
conv_main = get_conversation_template(model_id)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt, url=url, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
answers = [None] * len(questions)
def get_answer(i):
conv = conv_main.copy()
cur_answers = []
for j in range(2):
q = questions[i]["turns"][j]
conv.append_message(conv.roles[0], q)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
output = call_generate(prompt,
temperature=0, max_tokens=max_tokens).strip()
cur_answers.append(output)
conv.update_last_message(output)
answers[i] = cur_answers
# Run requests
tic = time.time()
if args.parallel == 1:
for i in range(len(questions)):
get_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_answer, list(range(len(questions))))
latency = time.time() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
# Write results
answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
write_answers(answer_file, model_id, questions, answers)
with open(args.result_file, "a") as fout:
value = {
"task": "mtbench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="question.jsonl")
parser.add_argument("--answer-file", type=str, default=None)
parser.add_argument("--num-questions", type=int, default=80)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import os
import time
import uuid
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
def load_questions(filename):
questions = []
with open(filename, "r") as fin:
for line in fin:
obj = json.loads(line)
questions.append(obj)
return questions
def write_answers(filename, model_id, questions, answers):
with open(os.path.expanduser(filename), "w") as fout:
for i in range(len(answers)):
ans_json = {
"question_id": questions[i]["question_id"],
"answer_id": uuid.uuid4().hex,
"model_id": model_id,
"choices": {
"index": 0,
"turns": [answers[i][0], answers[i][1]],
},
"tstamp": time.time(),
}
fout.write(json.dumps(ans_json) + "\n")
@sgl.function
def answer_mt_bench(s, question_1, question_2):
s += sgl.system()
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1"))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2"))
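# Both turns run in a single sgl state, so "answer_2" is generated with the first
# question and its answer already in the conversation context.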
def main(args):
# Construct prompts
questions = load_questions(args.question_file)[:args.num_questions]
arguments = [
{"question_1": q["turns"][0], "question_2": q["turns"][1]}
for q in questions
]
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
max_new_tokens=256,
num_threads=args.parallel)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
# Write results
model_id = backend.model_info["model_path"]
answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
write_answers(answer_file, model_id, questions, answers)
with open(args.result_file, "a") as fout:
value = {
"task": "mtbench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="question.jsonl")
parser.add_argument("--answer-file", type=str, default=None)
parser.add_argument("--num-questions", type=int, default=80)
args = add_common_sglang_args_and_parse(parser)
main(args)