"src/include/blockwise_2d_tensor_op.hip.hpp" did not exist on "3bd51021ab676dcc17fe0674d6a39411acbfce5a"
"""
Generate line data for line retrieval task.
Usage:
python3 gen_data.py --number 1000
"""
import argparse
from collections import defaultdict
import json
from tqdm import tqdm
import numpy as np
def generate_lines(random_words, num_lines, redirect_ratio):
prefix = "Here is a list of lines, each with its corresponding REGISTER_CONTENT value. Please memorize them. Be prepared to provide the REGISTER_CONTENT value for a specific line index when I ask."
suffix = "The list has ended. Please give the final REGISTER_CONTENT value for a specific line after resolving the redirections and references. For example, the REGISTER_CONTENT of Line __idx0__ is __val0__. The REGISTER_CONTENT of Line __idx1__ is __val1__. The REGISTER_CONTENT of Line __idx2__ is __val2__. The REGISTER_CONTENT of Line ??? is"
# Raw lines
visited_indices = set([None])
visited_values = set([None])
lines = []
redirects = []
indices = []
values = []
for i in tqdm(range(num_lines)):
line_index = None
while line_index in visited_indices:
line_index = "-".join(np.random.choice(random_words, size=(2,)))
visited_indices.add(line_index)
line_value = np.random.randint(low=0, high=999999)
line_value = f"{line_value:06}"
line = f"Line {line_index}: The REGISTER_CONTENT is {line_value}."
lines.append(line)
redirects.append(None)
indices.append(line_index)
values.append(line_value)
# Add redirect
if redirect_ratio > 0:
num_redirect_lines = int(len(lines) * redirect_ratio)
redirect_indices = np.random.choice(np.arange(len(lines)),
size=(num_redirect_lines,), replace=False)
for i in redirect_indices:
target_idx = int(np.random.choice(min(i * 2 + 100, num_lines)))  # cast to a plain int so the dumped links stay JSON-serializable
lines[i] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
redirects[i] = target_idx
# Build redirect chains and resolve each line's final value
links = [[] for _ in range(num_lines)]
contains_ring = set()
for i in range(num_lines):
if redirects[i] is None:
continue
tmp_link = []
cur = i
visited = set()
while redirects[cur] is not None:
visited.add(cur)
tmp_link.append(redirects[cur])
cur = redirects[cur]
if cur in visited:
contains_ring.add(i)
tmp_link = None
break
values[i] = values[cur]
links[i] = tmp_link
# Group by num_links
group_by_num_hoops = defaultdict(list)
for i in range(num_lines):
if i in contains_ring:
continue
group_by_num_hoops[len(links[i]) + 1].append(i)
keys = sorted(list(group_by_num_hoops.keys()))
for num_links in keys:
print(f"#links: {num_links}, #lines: {len(group_by_num_hoops[num_links])}")
# Fill in the few-shot examples in the suffix
hoop1_candidates = list(group_by_num_hoops[1])
hoop1_candidate_keys = {c: max([c] + links[c]) for c in hoop1_candidates}
hoop1_candidates.sort(key=lambda c: hoop1_candidate_keys[c])
hoop2_candidates = list(group_by_num_hoops[2])
hoop2_candidate_keys = {c: max([c] + links[c]) for c in hoop2_candidates}
hoop2_candidates.sort(key=lambda c: hoop2_candidate_keys[c])
i = hoop1_candidates[5]
suffix = suffix.replace("__idx0__", indices[i]).replace("__val0__", values[i])
if len(hoop2_candidates):
i = hoop2_candidates[0]
suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
i = hoop2_candidates[1]
suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
else:
i = hoop1_candidates[1]
suffix = suffix.replace("__idx1__", indices[i]).replace("__val1__", values[i])
i = hoop1_candidates[10]
suffix = suffix.replace("__idx2__", indices[i]).replace("__val2__", values[i])
obj = {
"prefix": prefix,
"suffix": suffix,
"lines": lines,
"indices": indices,
"values": values,
"links": links,
"group_by_num_hoops": group_by_num_hoops,
"contains_ring": sorted(list(contains_ring)),
}
return obj
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--number", type=int)
parser.add_argument("--redirect-ratio", type=float, default=0.0)
args = parser.parse_args()
num_lines = args.number
random_words_filename = "random_words.json"
random_words = json.load(open(random_words_filename, "r"))
np.random.seed(42)
obj = generate_lines(random_words, num_lines, args.redirect_ratio)
fout = f"lines_{num_lines}_{args.redirect_ratio:.1f}.json"
with open(fout, "w") as fout:
json.dump(obj, fout, indent=2)
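# Illustration of the generated data (hypothetical indices and values, not actual output):
#   Line alpha-bravo: The REGISTER_CONTENT is 012345.
#   Line charlie-delta: The REGISTER_CONTENT is the same as Line alpha-bravo.
# With --redirect-ratio > 0, redirected lines like the second one are resolved by
# following the chain until a raw line is reached, so the final REGISTER_CONTENT of
# "charlie-delta" is 012345. The resolved value is stored in values[i]; chains that
# loop back onto themselves are recorded in contains_ring and skipped when grouping
# lines by the number of hops.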
## Download benchmark images
```
python3 download_images.py
```
Image benchmark source: https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
### Other Dependencies
```
pip3 install "torch>=2.1.2" "transformers>=4.36" pillow
```
## Run benchmark
### Benchmark sglang
Launch a server
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
Run benchmark
```
# Run with local models
python3 bench_sglang.py --num-questions 60
# Run with OpenAI models
python3 bench_sglang.py --num-questions 60 --backend gpt-4-vision-preview
```
### Benchmark the original LLaVA code
```
git clone git@github.com:haotian-liu/LLaVA.git
cd LLaVA
git reset --hard 9a26bd1435b4ac42c282757f2c16d34226575e96
pip3 install -e .
cd ~/sglang/benchmark/llava_bench
CUDA_VISIBLE_DEVICES=0 bash bench_hf_llava_bench.sh
```
### Benchmark llama.cpp
```
# Install
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
pip install sse_starlette starlette_context pydantic_settings
# Download weights
mkdir -p ~/model_weights/llava-v1.5-7b/
wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/ggml-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf
wget https://huggingface.co/mys/ggml_llava-v1.5-7b/resolve/main/mmproj-model-f16.gguf -O ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf
```
```
# Launch the server
python3 -m llama_cpp.server --model ~/model_weights/llava-v1.5-7b/ggml-model-f16.gguf --clip_model_path ~/model_weights/llava-v1.5-7b/mmproj-model-f16.gguf --chat_format llava-1-5 --port 23000

# Run the benchmark
OPENAI_BASE_URL=http://localhost:23000/v1 python3 bench_sglang.py --backend gpt-4-vision-preview --num-q 1
```
#!/bin/bash
python -m llava.eval.model_vqa \
--model-path liuhaotian/llava-v1.5-7b \
--question-file ./questions.jsonl \
--image-folder ./images \
--answers-file ./answers_hf.jsonl \
--temperature 0 \
--conv-mode vicuna_v1
#!/bin/bash
python -m llava.eval.model_vqa_loader \
--model-path liuhaotian/llava-v1.5-7b \
--question-file ./mme_pack/llava_mme_bench_replace.jsonl \
--image-folder ./mme_pack/MME_Benchmark_release_version \
--answers-file ./answers_hf_mme.jsonl \
--temperature 0 \
--conv-mode vicuna_v1
import argparse
import json
import time
import os
import sglang as sgl
import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from PIL import Image
@sgl.function
def image_qa(s, image_file, question):
s += sgl.user(sgl.image(image_file) + question)
s += sgl.assistant(sgl.gen("answer", max_tokens=args.max_tokens))
def main(args):
lines = read_jsonl(args.question_file)[:args.num_questions]
arguments = [
{"image_file":
os.path.abspath(args.image_folder + "/" + l["image"]),
"question": l["text"]} for l in lines
]
#arguments = [
# {"image_file":
# Image.open(os.path.abspath(args.image_folder + "/" + l["image"])),
# "question": l["text"]} for l in lines
#]
states = [None] * len(lines)
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
if args.parallel == 1:
for i in tqdm.tqdm(range(len(lines))):
image_file = arguments[i]["image_file"]
question = arguments[i]["question"]
ret = image_qa.run(
image_file=image_file,
question=question,
temperature=0)
states[i] = ret
else:
states = image_qa.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
print(f"Write output to {args.answer_file}")
with open(args.answer_file, "w") as fout:
for i in range(len(lines)):
value = {
"question_id": lines[i]["question_id"],
"prompt": lines[i]["text"],
"text": states[i]["answer"].strip(),
"model_id": backend.model_info["model_path"],
"answer_id": i,
"metadata": {},
}
fout.write(json.dumps(value) + "\n")
with open(args.result_file, "a") as fout:
value = {
"task": "llava_bench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": len(lines),
"parallel": args.parallel,
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="questions.jsonl")
parser.add_argument("--answer-file", type=str, default="answers.jsonl")
parser.add_argument("--image-folder", type=str, default="./images")
parser.add_argument("--temperature", type=float, default=0.0)
parser.add_argument("--num-questions", type=int, default=None)
parser.add_argument("--max-tokens", type=int, default=768)
args = add_common_sglang_args_and_parse(parser)
main(args)
MME_FOLDER=./mme_pack
python3 bench_sglang.py --num-questions 5000 --question-file $MME_FOLDER/llava_mme_bench_replace.jsonl --answer-file answer_mme.jsonl --image-folder $MME_FOLDER/MME_Benchmark_release_version --max-tokens 4
import os
# Create the 'images' directory if it doesn't exist
if not os.path.exists('images'):
os.makedirs('images')
# Base URL
base_url = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/"
# Loop through image numbers
for i in range(1, 25):
# Format the image number with leading zeros
image_number = str(i).zfill(3)
image_url = base_url + image_number + ".jpg"
image_path = "images/" + image_number + ".jpg"
# Download the image using wget
os.system(f"wget -O {image_path} {image_url}")
print("Download complete.")
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 25 --parallel 8
python3 bench_sglang.py --num-questions 16 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --backend vllm --num-questions 25
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 25 --parallel 1
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
)
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
def multi_dimension_judge(article, generate):
s = system_prompt
s += "\n```\n" + article + "\n```\n\n"
judges = []
for i in range(len(dimension_prompts)):
comp = generate(s +
"USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:',
max_tokens=256, stop="END")
judges.append(comp)
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + judges[i].strip() + "\n"
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += generate(s, max_tokens=2, stop=None)
return s
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
states = [None] * len(lines)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
return out["answer"]
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
def get_one_answer(i):
states[i] = multi_dimension_judge(lines[i], generate)
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(lines))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(lines))))
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "llm_judge",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="articles.jsonl")
parser.add_argument("--num-questions", type=int, default=20)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
system_prompt = (
"Please serve as an impartial judge and rigorously evaluate the quality of the following article. Apply the most stringent standards possible, showing no leniency."
)
dimension_prompts = [
"Content: This refers to the essences of the essay. The substance should be well researched, accurate, relevant to the topic and should show a thorough understanding of the subject. The essay should also reflect a clear goal or purpose.",
"Organization and Structure: An essay needs to be properly structured with a clear introduction, body, and conclusion. The essay should flow naturally, with one paragraph leading seamlessly into the next.",
"Argument and Analysis: The argument made in the essay should be logical, coherent and clearly articulated. Each point made should be backed up by solid evidence and thorough analysis.",
"Clarity and Precision: The essay should be written in a clear and concise manner. The points made should be easily understood by the reader. The language used should also be precise and unambiguous.",
"Grammar and Punctuation: Proper use of grammar and punctuation is vital in an academic essay. Errors in grammar and punctuation not only distract the reader but can also negatively impact the meaning and interpretation of the content.",
"Referencing and Citation: An essay should contain proper citations and references for all sources used. This not only prevents accusations of plagiarism but also gives credit to the authors of the works that have contributed to the essay. The citation should adhere to a specific format as required by the academic institution or specified by the professor.",
]
@sgl.function
def multi_dimension_judge(s, article):
s += system_prompt
s += "\n```\n" + article + "\n```\n\n"
forks = s.fork(len(dimension_prompts))
for i in range(len(dimension_prompts)):
forks[i] += ("USER: Please judge the quality based on the following metric. " +
dimension_prompts[i] + " Please provide a single-paragraph judgement. " +
"Focus on the provided metric and do not say other things. "
'End your judgement paragraph with the word "END"\nJUDGE:')
forks[i] += sgl.gen("judgement", max_tokens=256, stop="END")
forks.join()
s += "I will judge the quality based on the following metrics.\n"
for i in range(len(dimension_prompts)):
s += dimension_prompts[i].split(":")[0] + ": " + forks[i]["judgement"].strip() + "\n"
s += "In summary, on a scale of 1 to 10, I would give the article a score of"
s += sgl.gen("score", max_tokens=2)
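# Note: s.fork() creates one branch per metric so the per-dimension judgements can be
# generated in parallel; each branch continues from the shared prefix built so far
# (system prompt + article), and forks.join() waits for all branches before the final
# summary score is generated on the main state.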
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{"article": l} for l in lines]
# Select backend
backend = select_sglang_backend(args)
# Run requests
tic = time.time()
states = multi_dimension_judge.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "llm_judge",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="articles.jsonl")
parser.add_argument("--num-questions", type=int, default=20)
args = add_common_sglang_args_and_parse(parser)
main(args)
## Run benchmark
### Benchmark sglang
```
python3 -m sglang.launch_server --model-path codellama/CodeLlama-7b-instruct-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 5 --parallel 1
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model codellama/CodeLlama-7b-instruct-hf --disable-log-requests --port 21000 --gpu 0.97
```
```
python3 bench_other.py --backend vllm --num-questions 5
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 5 --parallel 1
```
### Build dataset
```
pip install wikipedia
python3 build_dataset.py
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
def json_decode(document, generate):
s = "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += ' "name": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "country": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "airport code": "'
s += generate(s, max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "'
s += generate(s, max_tokens=24, stop='"') + '",\n'
s += '}\n'
return s
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
states = [None] * len(arguments)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_lightllm, url=url, temperature=0)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_vllm, url=url, temperature=0)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
model = models.LlamaCpp("/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf", n_gpu_layers=-1, n_ctx=11000)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
return out["answer"]
# warmup
generate("Hello!", max_tokens=8, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
def get_one_answer(i):
states[i] = json_decode(generate=generate, **arguments[i])
tic = time.time()
if args.parallel == 1:
for i in tqdm(range(len(arguments))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(arguments))))
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "long_json_decode",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="questions.jsonl")
parser.add_argument("--num-questions", type=int, default=100)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
@sgl.function
def json_decode(s, document):
s += "Please extract the information of a city from the following wikipedia page.\n"
s += "Page begin.\n" + document + "Page end.\n"
s += "Here is the name, country, and symbol of the city in JSON format.\n"
s += '{\n'
s += ' "name": "' + sgl.gen("name", max_tokens=8, stop='"') + '",\n'
s += ' "country": "' + sgl.gen("country", max_tokens=8, stop='"') + '",\n'
s += ' "airport code": "' + sgl.gen("airport code", max_tokens=8, stop='"') + '",\n'
s += ' "top 3 landmarks": "' + sgl.gen("landmarks", max_tokens=24, stop='"') + '",\n'
s += '}\n'
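# For illustration, the completed text for one document ends with something like the
# following (the field values are whatever the model generates; these are hypothetical):
# {
#   "name": "London",
#   "country": "United Kingdom",
#   "airport code": "LHR",
#   "top 3 landmarks": "Big Ben, Tower Bridge, London Eye",
# }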
def main(args):
lines = read_jsonl(args.data_path)
arguments = []
for i in range(len(lines[:args.num_questions])):
arguments.append({
"document": lines[i]["document"],
})
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
states = json_decode.run_batch(
arguments, temperature=0, num_threads=args.parallel)
latency = time.time() - tic
# Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "long_json_decode",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="questions.jsonl")
parser.add_argument("--num-questions", type=int, default=10)
args = add_common_sglang_args_and_parse(parser)
main(args)
import json
import transformers
import wikipedia
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
city_names = ["los angeles", "london", "tokyo", "beijing", "singapore"]
for city_name in city_names:
content = str(wikipedia.page(city_name).content)
content = content.replace("\n\n", "\n")
tokens = t.encode(content)
truncate_len = int((10000 / len(tokens)) * len(content))
truncate_content = content[:truncate_len]
truncate_tokens = t.encode(truncate_content)
# Count token
print(f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}")
with open("questions.jsonl", "a") as fout:
fout.write(json.dumps({"document": truncate_content}) + "\n")
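# Note on the truncation above: the character length is scaled so the truncated page
# is roughly 10,000 tokens. For example, if a page tokenizes to 20,000 tokens over
# 60,000 characters, truncate_len = int((10000 / 20000) * 60000) = 30,000 characters,
# and the re-tokenized length printed above should come out near 10,000.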
## Download data
```
wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
tar xf data.tar
```
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --nsub 10
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --nsub 10 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
# V100
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 4500 --port 22000
```
```
python3 bench_other.py --nsub 10 --backend lightllm
```
### Benchmark guidance
```
python3 bench_other.py --nsub 10 --backend guidance --parallel 1
```
### Benchmark lmql
```
CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
```
```
python3 bench_other.py --nsub 10 --backend lmql --parallel 2
```
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
import json
from functools import partial
import os
import time
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
choices = ["A", "B", "C", "D"]
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
def format_subject(subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += format_example(train_df, i)
return prompt
model_initialized = None
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
# Construct prompts
k = args.ntrain
train_prompt = gen_prompt(dev_df, subject, k)
while len(tokenizer.encode(train_prompt)) > 1536:
k -= 1
train_prompt = gen_prompt(dev_df, subject, k)
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, include_answer=False)
prompt = train_prompt + prompt_end
prompts.append(prompt)
label = test_df.iloc[i, test_df.shape[1]-1]
labels.append(label)
preds = [None] * len(prompts)
max_tokens = 1
# Select backend
global model_initialized
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt-raw":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url, stop=None)
elif args.backend == "guidance":
from guidance import models, gen
if model_initialized is None:
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model_initialized = model
else:
model = model_initialized
def call_generate(prompt, temperature, max_tokens):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0)
return out["answer"]
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
@lmql.query(model=model)
async def program(question):
'''lmql
"""{question}[ANSWER]""" where len(TOKENS(ANSWER)) < 2
return ANSWER
'''
async def call_generate(prompt, temperature, max_tokens):
return await program(question=prompt, temperature=temperature)
else:
raise ValueError(f"Invalid backend: {args.backend}")
# Run requests
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
pred = call_generate(prompts[i], temperature=0,
max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
if args.parallel == 1:
for i in range(len(prompts)):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_one_answer, list(range(len(prompts))))
else:
# Use asyncio
async def batched_call(batch_size):
for i in range(0, len(prompts), batch_size):
tasks = []
for p in prompts[i:i+batch_size]:
tasks.append(call_generate(p,
temperature=0, max_tokens=max_tokens))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i+j] = rets[j].strip()[0]
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
latency = time.time() - tic
# Compute accuracy
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
all_latencies.append(latency)
num_requests += len(test_df)
total_latency = np.sum(all_latencies)
print("Total latency: {:.3f}".format(total_latency))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "mmlu",
"backend": args.backend,
"num_gpus": 1,
"latency": round(total_latency, 3),
"accuracy": round(weighted_acc, 3),
"num_requests": num_requests,
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ntrain", type=int, default=5)
parser.add_argument("--data_dir", type=str, default="data")
parser.add_argument("--nsub", type=int, default=60)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import os
import time
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
choices = ["A", "B", "C", "D"]
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
def format_subject(subject):
l = subject.split("_")
s = ""
for entry in l:
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += format_example(train_df, i)
return prompt
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
k = args.ntrain
few_shot_examples = gen_prompt(dev_df, subject, k)
while len(tokenizer.encode(few_shot_examples)) > 1536:
k -= 1
few_shot_examples = gen_prompt(dev_df, subject, k)
for i in range(test_df.shape[0]):
prompt_end = format_example(test_df, i, include_answer=False)
prompts.append(prompt_end)
label = test_df.iloc[i, test_df.shape[1]-1]
labels.append(label)
arguments = [{"question": p} for p in prompts]
#####################################
######### SGL Program Begin #########
#####################################
import sglang as sgl
@sgl.function
def few_shot_mmlu(s, examples, question):
s += examples + question + sgl.gen("answer")
#####################################
########## SGL Program End ##########
#####################################
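# few_shot_mmlu.bind(examples=few_shot_examples) fixes the shared few-shot prefix once,
# so every request in run_batch below differs only in the question; with the sglang
# backend this common prefix can typically be reused across requests.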
# Select backend
backend = select_sglang_backend(args)
tic = time.time()
states = few_shot_mmlu.bind(examples=few_shot_examples).run_batch(
arguments, temperature=0, max_new_tokens=1,
backend=backend, num_threads=args.parallel)
preds = [s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
for s in states]
latency = time.time() - tic
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
all_latencies.append(latency)
num_requests += len(test_df)
total_latency = np.sum(all_latencies)
print("Total latency: {:.3f}".format(total_latency))
weighted_acc = np.mean(np.concatenate(all_cors))
print("Average accuracy: {:.3f}".format(weighted_acc))
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "mmlu",
"backend": args.backend,
"num_gpus": 1,
"latency": round(total_latency, 3),
"accuracy": round(weighted_acc, 3),
"num_requests": num_requests,
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--ntrain", "-k", type=int, default=5)
parser.add_argument("--data_dir", "-d", type=str, default="data")
parser.add_argument("--save_dir", "-s", type=str, default="results")
parser.add_argument("--nsub", type=int, default=60)
args = add_common_sglang_args_and_parse(parser)
main(args)
## Run benchmark
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 80
```
### Benchmark vllm
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 80 --backend vllm
```
### Benchmark lightllm
```
# A10G
python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
```
```
python3 bench_other.py --num-questions 80 --backend lightllm
```
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import os
import time
import uuid
from fastchat.model import get_conversation_template
import requests
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt
def load_questions(filename):
questions = []
with open(filename, "r") as fin:
for line in fin:
obj = json.loads(line)
questions.append(obj)
return questions
def write_answers(filename, model_id, questions, answers):
with open(os.path.expanduser(filename), "w") as fout:
for i in range(len(answers)):
ans_json = {
"question_id": questions[i]["question_id"],
"answer_id": uuid.uuid4().hex,
"model_id": model_id,
"choices": {
"index": 0,
"turns": [answers[i][0], answers[i][1]],
},
"tstamp": time.time(),
}
fout.write(json.dumps(ans_json) + "\n")
def main(args):
questions = load_questions(args.question_file)
questions = (questions * 10)[:args.num_questions]
max_tokens = 256
model_id = "llama-2-chat"
conv_main = get_conversation_template(model_id)
# Select backend
if args.backend == "lightllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_lightllm, url=url, stop=None)
elif args.backend == "vllm":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_vllm, url=url, stop=None)
elif args.backend == "srt":
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt, url=url, stop=None)
else:
raise ValueError(f"Invalid backend: {args.backend}")
answers = [None] * len(questions)
def get_answer(i):
conv = conv_main.copy()
cur_answers = []
for j in range(2):
q = questions[i]["turns"][j]
conv.append_message(conv.roles[0], q)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
output = call_generate(prompt,
temperature=0, max_tokens=max_tokens).strip()
cur_answers.append(output)
conv.update_last_message(output)
answers[i] = cur_answers
# Run requests
tic = time.time()
if args.parallel == 1:
for i in range(len(questions)):
get_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
executor.map(get_answer, list(range(len(questions))))
latency = time.time() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
# Write results
answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
write_answers(answer_file, model_id, questions, answers)
with open(args.result_file, "a") as fout:
value = {
"task": "mtbench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="question.jsonl")
parser.add_argument("--answer-file", type=str, default=None)
parser.add_argument("--num-questions", type=int, default=80)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import os
import time
import uuid
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
def load_questions(filename):
questions = []
with open(filename, "r") as fin:
for line in fin:
obj = json.loads(line)
questions.append(obj)
return questions
def write_answers(filename, model_id, questions, answers):
with open(os.path.expanduser(filename), "w") as fout:
for i in range(len(answers)):
ans_json = {
"question_id": questions[i]["question_id"],
"answer_id": uuid.uuid4().hex,
"model_id": model_id,
"choices": {
"index": 0,
"turns": [answers[i][0], answers[i][1]],
},
"tstamp": time.time(),
}
fout.write(json.dumps(ans_json) + "\n")
@sgl.function
def answer_mt_bench(s, question_1, question_2):
s += sgl.system()
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1"))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2"))
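# Both turns run in a single sgl state, so "answer_2" is generated with the first
# question and its answer already in the conversation context.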
def main(args):
# Construct prompts
questions = load_questions(args.question_file)[:args.num_questions]
arguments = [
{"question_1": q["turns"][0], "question_2": q["turns"][1]}
for q in questions
]
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
# Run requests
tic = time.time()
rets = answer_mt_bench.run_batch(
arguments,
temperature=0,
max_new_tokens=256,
num_threads=args.parallel)
answers = [[s["answer_1"], s["answer_2"]] for s in rets]
latency = time.time() - tic
print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
# Write results
model_id = backend.model_info["model_path"]
answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
write_answers(answer_file, model_id, questions, answers)
with open(args.result_file, "a") as fout:
value = {
"task": "mtbench",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--question-file", type=str, default="question.jsonl")
parser.add_argument("--answer-file", type=str, default=None)
parser.add_argument("--num-questions", type=int, default=80)
args = add_common_sglang_args_and_parse(parser)
main(args)