Unverified commit 95c4e0df authored by Liangsheng Yin, committed by GitHub

Format Benchmark Code (#399)

parent 19818b9c
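
The changes below are purely mechanical reformatting. Judging from the output style (imports split one per line and sorted alphabetically, long calls wrapped at roughly 88 columns), the code appears to have been run through tools such as isort and black, although the commit itself does not name a formatter. A minimal sketch of the before/after pattern, built from an import line and a print call that also appear in this diff (the numeric values are placeholders for illustration only):

    # Before: one long import line and one long format() call
    from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw

    print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(0.532, 12.34, 100, "abstract_algebra"))

    # After: the import is split one name per line and sorted; the call is
    # wrapped once it exceeds the line-length limit (placeholder values again)
    from sglang.test.test_utils import (
        add_common_other_args_and_parse,
        call_generate_lightllm,
        call_generate_srt_raw,
        call_generate_vllm,
    )

    print(
        "Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
            0.532, 12.34, 100, "abstract_algebra"
        )
    )
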
......@@ -3,7 +3,6 @@ import json
import transformers
import wikipedia
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
city_names = ["los angles", "london", "tokyo", "beijing", "singapore"]
......@@ -20,7 +19,9 @@ for city_name in city_names:
truncate_tokens = t.encode(truncate_content)
# Count token
print(f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}")
print(
f"city_name: {city_name}, #tokens: {len(tokens)}, #truncate tokens: {len(truncate_tokens)}"
)
with open("questions.jsonl", "a") as fout:
fout.write(json.dumps({"document": truncate_content}) + "\n")
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
import json
from functools import partial
import os
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
choices = ["A", "B", "C", "D"]
......@@ -25,18 +30,22 @@ def format_subject(subject):
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
format_subject(subject)
)
if k == -1:
k = train_df.shape[0]
for i in range(k):
......@@ -63,7 +72,7 @@ def evaluate(args, subject, dev_df, test_df):
prompt = train_prompt + prompt_end
prompts.append(prompt)
label = test_df.iloc[i, test_df.shape[1]-1]
label = test_df.iloc[i, test_df.shape[1] - 1]
labels.append(label)
preds = [None] * len(prompts)
......@@ -82,17 +91,24 @@ def evaluate(args, subject, dev_df, test_df):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url, stop=None)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
if model_initialized is None:
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
model_initialized = model
else:
model = model_initialized
def call_generate(prompt, temperature, max_tokens):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0)
)
return out["answer"]
# warmup
......@@ -100,8 +116,10 @@ def evaluate(args, subject, dev_df, test_df):
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(question):
......@@ -112,6 +130,7 @@ def evaluate(args, subject, dev_df, test_df):
async def call_generate(prompt, temperature, max_tokens):
return await program(question=prompt, temperature=temperature)
else:
raise ValueError(f"Invalid backend: {args.backend}")
......@@ -119,8 +138,7 @@ def evaluate(args, subject, dev_df, test_df):
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
pred = call_generate(prompts[i], temperature=0,
max_tokens=max_tokens)
pred = call_generate(prompts[i], temperature=0, max_tokens=max_tokens)
preds[i] = pred.strip()[0]
tic = time.time()
......@@ -135,12 +153,11 @@ def evaluate(args, subject, dev_df, test_df):
async def batched_call(batch_size):
for i in range(0, len(prompts), batch_size):
tasks = []
for p in prompts[i:i+batch_size]:
tasks.append(call_generate(p,
temperature=0, max_tokens=max_tokens))
for p in prompts[i : i + batch_size]:
tasks.append(call_generate(p, temperature=0, max_tokens=max_tokens))
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
preds[i+j] = rets[j].strip()[0]
preds[i + j] = rets[j].strip()[0]
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
......@@ -151,22 +168,35 @@ def evaluate(args, subject, dev_df, test_df):
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
print(
"Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject
)
)
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
subjects = sorted(
[
f.split("_test.csv")[0]
for f in os.listdir(os.path.join(args.data_dir, "test"))
if "_test.csv" in f
]
)
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
for subject in tqdm(subjects[: args.nsub]):
dev_df = pd.read_csv(
os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
)[: args.ntrain]
test_df = pd.read_csv(
os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
......@@ -191,7 +221,7 @@ def main(args):
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -7,8 +7,11 @@ import numpy as np
import pandas as pd
import tiktoken
from tqdm import tqdm
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
choices = ["A", "B", "C", "D"]
......@@ -22,24 +25,29 @@ def format_subject(subject):
s += " " + entry
return s
def format_example(df, idx, include_answer=True):
prompt = df.iloc[idx, 0]
k = df.shape[1] - 2
for j in range(k):
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j+1])
prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
prompt += "\nAnswer:"
if include_answer:
prompt += " {}\n\n".format(df.iloc[idx, k + 1])
return prompt
def gen_prompt(train_df, subject, k=-1):
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(format_subject(subject))
prompt = "The following are multiple choice questions (with answers) about{}.\n\n".format(
format_subject(subject)
)
if k == -1:
k = train_df.shape[0]
for i in range(k):
prompt += format_example(train_df, i)
return prompt
def evaluate(args, subject, dev_df, test_df):
prompts = []
labels = []
......@@ -54,7 +62,7 @@ def evaluate(args, subject, dev_df, test_df):
prompt_end = format_example(test_df, i, include_answer=False)
prompts.append(prompt_end)
label = test_df.iloc[i, test_df.shape[1]-1]
label = test_df.iloc[i, test_df.shape[1] - 1]
labels.append(label)
arguments = [{"question": p} for p in prompts]
......@@ -66,11 +74,14 @@ def evaluate(args, subject, dev_df, test_df):
import sglang as sgl
if args.backend.startswith("gpt-"):
@sgl.function
def few_shot_mmlu(s, examples, question):
s += sgl.user(examples + question)
s += sgl.assistant(sgl.gen("answer"))
else:
@sgl.function
def few_shot_mmlu(s, examples, question):
s += examples + question + sgl.gen("answer")
......@@ -84,32 +95,50 @@ def evaluate(args, subject, dev_df, test_df):
tic = time.time()
states = few_shot_mmlu.bind(examples=few_shot_examples).run_batch(
arguments, temperature=0, max_new_tokens=1,
backend=backend, num_threads=args.parallel)
preds = [s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else ""
for s in states]
arguments,
temperature=0,
max_new_tokens=1,
backend=backend,
num_threads=args.parallel,
)
preds = [
s["answer"].strip()[0] if len(s["answer"].strip()) > 0 else "" for s in states
]
latency = time.time() - tic
cors = [pred == label for pred, label in zip(preds, labels)]
acc = np.mean(cors)
cors = np.array(cors)
print("Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject))
print(
"Average accuracy {:.3f}, latency {:.2f}, #q: {} - {}".format(
acc, latency, len(prompts), subject
)
)
return cors, acc, latency
def main(args):
subjects = sorted([f.split("_test.csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test")) if "_test.csv" in f])
subjects = sorted(
[
f.split("_test.csv")[0]
for f in os.listdir(os.path.join(args.data_dir, "test"))
if "_test.csv" in f
]
)
all_cors = []
all_latencies = []
num_requests = 0
for subject in tqdm(subjects[:args.nsub]):
dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
for subject in tqdm(subjects[: args.nsub]):
dev_df = pd.read_csv(
os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
)[: args.ntrain]
test_df = pd.read_csv(
os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
)
cors, acc, latency = evaluate(args, subject, dev_df, test_df)
all_cors.append(cors)
......@@ -134,7 +163,7 @@ def main(args):
"other": {
"nsub": args.nsub,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import os
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from fastchat.model import get_conversation_template
import requests
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt,
call_generate_vllm,
)
def load_questions(filename):
......@@ -38,7 +43,7 @@ def write_answers(filename, model_id, questions, answers):
def main(args):
questions = load_questions(args.question_file)
questions = (questions * 10)[:args.num_questions]
questions = (questions * 10)[: args.num_questions]
max_tokens = 256
model_id = "llama-2-chat"
......@@ -67,9 +72,8 @@ def main(args):
conv.append_message(conv.roles[0], q)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
output = call_generate(prompt,
temperature=0, max_tokens=max_tokens).strip()
prompt = conv.get_prompt()
output = call_generate(prompt, temperature=0, max_tokens=max_tokens).strip()
cur_answers.append(output)
conv.update_last_message(output)
......@@ -102,7 +106,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -5,7 +5,10 @@ import time
import uuid
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
def load_questions(filename):
......@@ -44,10 +47,9 @@ def answer_mt_bench(s, question_1, question_2):
def main(args):
# Construct prompts
questions = load_questions(args.question_file)[:args.num_questions]
questions = load_questions(args.question_file)[: args.num_questions]
arguments = [
{"question_1": q["turns"][0], "question_2": q["turns"][1]}
for q in questions
{"question_1": q["turns"][0], "question_2": q["turns"][1]} for q in questions
]
# Select backend
......@@ -83,7 +85,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import ast
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -44,14 +49,20 @@ def multi_chain_gsm8k(question, num_chains, call_generate):
comps = []
for i in range(num_chains):
comps.append(call_generate(s + "Answer: " + prompt_lib[i % num_chains],
max_tokens=256, temperature=0.3, stop="Question"))
comps.append(
call_generate(
s + "Answer: " + prompt_lib[i % num_chains],
max_tokens=256,
temperature=0.3,
stop="Question",
)
)
s += "Answer: To answer this question, here are some possible solutions. "
s += "After considering all of them, I will do a majority vote.\n\n"
for i in range(num_chains):
s += f"Solution {i+1}: " + comps[i].strip() + "\n\n"
s += f"\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += call_generate(s, max_tokens=16, temperature=0, stop=None)
return s
......@@ -64,7 +75,7 @@ def main(args):
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -82,16 +93,28 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
#def multi_chain_gsm8k(question, num_chains, call_generate):
# def multi_chain_gsm8k(question, num_chains, call_generate):
# s = model + "Question: " + question + "\n"
# comps = []
......@@ -108,8 +131,10 @@ def main(args):
elif args.backend == "lmql":
import lmql
model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
endpoint=f"{args.host}:{args.port}")
model = lmql.model(
"meta-llama/Llama-2-7b-chat-hf", endpoint=f"{args.host}:{args.port}"
)
@lmql.query(model=model)
async def program(question):
......@@ -128,8 +153,7 @@ def main(args):
if args.backend != "lmql":
# Use thread pool
def get_one_answer(i):
answer = multi_chain_gsm8k(questions[i], args.num_chains,
call_generate)
answer = multi_chain_gsm8k(questions[i], args.num_chains, call_generate)
states[i] = answer
tic = time.time()
......@@ -144,12 +168,18 @@ def main(args):
async def batched_call(batch_size):
for i in range(0, len(questions), batch_size):
tasks = []
for q in questions[i:i+batch_size]:
tasks.append(call_generate(few_shot_examples + q,
temperature=0, max_tokens=256, stop="Question"))
for q in questions[i : i + batch_size]:
tasks.append(
call_generate(
few_shot_examples + q,
temperature=0,
max_tokens=256,
stop="Question",
)
)
rets = await asyncio.gather(*tasks)
for j in range(len(rets)):
states[i+j] = get_answer_value(rets[j])
states[i + j] = get_answer_value(rets[j])
tic = time.time()
asyncio.run(batched_call(batch_size=args.parallel))
......@@ -180,7 +210,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -5,16 +5,19 @@ import re
import time
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -37,12 +40,12 @@ def main(args):
lines = read_jsonl(args.data_path)
# Construct prompts
#k = args.num_shot
#few_shot_examples = get_few_shot_examples(lines, k)
# k = args.num_shot
# few_shot_examples = get_few_shot_examples(lines, k)
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -59,21 +62,24 @@ def main(args):
@sgl.function
def multi_chain_gsm8k(s, question):
s += "Question: " + question + "\n"
#s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
# s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
# temperature=0)
#return
# return
forks = s.fork(num_chains)
for i in range(num_chains):
forks[i] += ("Answer: " + prompt_lib[i % num_chains] +
sgl.gen(f"chain", max_tokens=256, temperature=0.3, stop="Question"))
forks[i] += (
"Answer: "
+ prompt_lib[i % num_chains]
+ sgl.gen("chain", max_tokens=256, temperature=0.3, stop="Question")
)
forks.join()
s += "Answer: To answer this question, here are some possible solutions. "
s += "After considering all of them, I will do a majority vote.\n\n"
for i in range(num_chains):
s += f"Solution {i+1}: " + forks[i]["chain"].strip() + "\n\n"
s += f"\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
s += sgl.gen("answer", max_tokens=16)
#####################################
......@@ -86,7 +92,12 @@ def main(args):
# Run requests
tic = time.time()
states = multi_chain_gsm8k.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
preds = []
......@@ -114,7 +125,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
USER_PREFIX = "[INST] "
USER_SUFFIX = " [/INST]"
......@@ -25,7 +28,11 @@ def multi_document_qa(docs, question, generate):
s += "".join(docs)
s += "\nDocuments end."
s += ("\n\nBased on the above documents, please answer this question:\n" + question + "\nAnswer in three words or fewer.")
s += (
"\n\nBased on the above documents, please answer this question:\n"
+ question
+ "\nAnswer in three words or fewer."
)
s += USER_SUFFIX
s += ASSISTANT_PREFIX
answer = generate(s, max_tokens=16, stop=None)
......@@ -42,11 +49,13 @@ def main(args):
if args.backend == "guidance":
num_docs = 7 # due to OOM
for i in range(len(l["questions"][:args.num_questions])):
arguments.append({
"docs": l["documents"][:num_docs],
"question": l["questions"][i],
})
for i in range(len(l["questions"][: args.num_questions])):
arguments.append(
{
"docs": l["documents"][:num_docs],
"question": l["questions"][i],
}
)
labels.append(l["answers"][i])
states = [None] * len(arguments)
......@@ -61,13 +70,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf", n_gpu_layers=-1, n_ctx=11000)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/CodeLlama-7b-instruct-hf.gguf",
n_gpu_layers=-1,
n_ctx=11000,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
......@@ -113,7 +129,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -2,10 +2,12 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
......@@ -19,7 +21,11 @@ def multi_document_qa(s, docs, question):
forks.join("concate_and_append")
s += "\nDocuments end."
s += ("\n\nBased on the above documents, please answer this question:\n" + question + "\nAnswer in three words or fewer.")
s += (
"\n\nBased on the above documents, please answer this question:\n"
+ question
+ "\nAnswer in three words or fewer."
)
s += sgl.user_end()
s += sgl.assistant(sgl.gen("answer", max_tokens=16))
......@@ -29,11 +35,13 @@ def main(args):
l = lines[0]
arguments = []
labels = []
for i in range(len(l["questions"][:args.num_questions])):
arguments.append({
"docs": l["documents"][:10],
"question": l["questions"][i],
})
for i in range(len(l["questions"][: args.num_questions])):
arguments.append(
{
"docs": l["documents"][:10],
"question": l["questions"][i],
}
)
labels.append(l["answers"][i])
# Select backend
......@@ -43,10 +51,11 @@ def main(args):
# Run requests
tic = time.time()
states = multi_document_qa.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy
# Compute accuracy
print([s["answer"] for s in states])
correct = 0
for s, label in zip(states, labels):
......@@ -71,7 +80,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -3,7 +3,8 @@ import json
import transformers
content = "\n".join(
open("llama2.txt", 'r', encoding='utf-8', errors='ignore').readlines())
open("llama2.txt", "r", encoding="utf-8", errors="ignore").readlines()
)
content = content.replace("\n\n", "\n")
# Count token
......@@ -35,30 +36,35 @@ for i, s in enumerate(segments):
# Dump
with open("questions.jsonl", "w") as fout:
fout.write(json.dumps({
"documents": segments[:30],
"questions": [
"What is the name of the fine-tuned LLMs?",
"Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
"What is the number of parameters in the largest Llama 2 model?",
"What is the batch size of fine-tuning?",
"Where can we find the details of potential data contamination?",
"What is the full name of MPT?",
"What is the power consumption of RSC in Watt?",
"How many tokens of data do they train on?",
"Which model's release is delayed due to a lack of time to sufficiently red team?",
"Which activation function is used in Llama?"
],
"answers": [
"Llama 2 Chat",
"1",
"70 B",
"64",
"A 6",
"MosaicML",
"400",
"2 trillion",
"34 B",
"SwiGLU",
],
}) + "\n")
fout.write(
json.dumps(
{
"documents": segments[:30],
"questions": [
"What is the name of the fine-tuned LLMs?",
"Which figure shows the helpfulness human evaluation results for Llama 2-Chat?",
"What is the number of parameters in the largest Llama 2 model?",
"What is the batch size of fine-tuning?",
"Where can we find the details of potential data contamination?",
"What is the full name of MPT?",
"What is the power consumption of RSC in Watt?",
"How many tokens of data do they train on?",
"Which model's release is delayed due to a lack of time to sufficiently red team?",
"Which activation function is used in Llama?",
],
"answers": [
"Llama 2 Chat",
"1",
"70 B",
"64",
"A 6",
"MosaicML",
"400",
"2 trillion",
"34 B",
"SwiGLU",
],
}
)
+ "\n"
)
......@@ -4,12 +4,12 @@ from argparse import ArgumentParser
from concurrent.futures import ThreadPoolExecutor
import requests
from sglang.test.test_utils import add_common_other_args_and_parse
from sglang.utils import dump_state_text
from data_gen import gen_arguments
from tqdm import tqdm
from vllm.transformers_utils.tokenizer import get_tokenizer
from data_gen import gen_arguments
from sglang.test.test_utils import add_common_other_args_and_parse
from sglang.utils import dump_state_text
def get_generate(args):
......@@ -61,7 +61,7 @@ def multi_turns(generate, qas):
s = ""
for qa in qas:
s += qa["prompt"]
s += generate(s, max_tokens=qa["new_tokens"])
s += generate(s, max_tokens=qa["new_tokens"])
return s
......
......@@ -2,22 +2,22 @@ import json
import time
from argparse import ArgumentParser
from data_gen import gen_arguments
from vllm.transformers_utils.tokenizer import get_tokenizer
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
from vllm.transformers_utils.tokenizer import get_tokenizer
from data_gen import gen_arguments
@sgl.function
def multi_turns(s, qas):
for qa in qas:
s += qa["prompt"]
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
def main(args):
......@@ -29,7 +29,11 @@ def main(args):
tic = time.time()
states = multi_turns.run_batch(
multi_qas, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True
multi_qas,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
......
import argparse
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from tqdm import tqdm
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_vllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import read_jsonl, dump_state_text
from sglang.utils import dump_state_text, read_jsonl
def get_prompt(question):
......@@ -83,16 +84,15 @@ Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
""" + question)
"""
+ question
)
return prompt
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{
"question": k,
"triplets": v
} for l in lines for k, v in l.items()]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
states = []
......@@ -107,7 +107,7 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp(
str(Path.home()) + "/model_weights/Llama-2-7b-chat.gguf",
......@@ -116,12 +116,16 @@ def main(args):
)
def call_generate(prompt, temperature, max_tokens, stop):
out = (model + prompt + gen(
name="result",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
))
out = (
model
+ prompt
+ gen(
name="result",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["result"]
# warmup
......@@ -137,15 +141,23 @@ def main(args):
for i in range(1, len(triplets) + 2):
prompt += "Thought " + str(i) + ":"
states.append(prompt)
answer = call_generate(prompt,
max_tokens=200,
temperature=0,
stop="Observation")
answer = call_generate(
prompt, max_tokens=200, temperature=0, stop="Observation"
)
if i > len(triplets):
break
prompt += (triplets[i - 1]["thought"] + "\nAction " + str(i) +
":" + triplets[i - 1]["action"] + "\nObservation " +
str(i) + ":" + triplets[i - 1]["observation"] + "\n")
prompt += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
states.append(answer)
......
......@@ -7,7 +7,7 @@ from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import read_jsonl, dump_state_text
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
......@@ -79,7 +79,9 @@ Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
""" + question)
"""
+ question
)
for i in range(1, len(triplets) + 2):
s += "Thought " + str(i) + ":"
# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
......@@ -90,17 +92,23 @@ Action 3: Finish[yes]
# print(ss[0]["thought_action"])
if i > len(triplets):
break
s += (triplets[i - 1]["thought"] + "\nAction " + str(i) + ":" +
triplets[i - 1]["action"] + "\nObservation " + str(i) + ":" +
triplets[i - 1]["observation"] + "\n")
s += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [{
"question": k,
"triplets": v
} for l in lines for k, v in l.items()]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
# Select backend
backend = select_sglang_backend(args)
......@@ -108,11 +116,12 @@ def main(args):
states = []
tic = time.time()
states = webthink.run_batch(arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
states = webthink.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
# Compute accuracy
......
import argparse
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
import numpy as np
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
number = 5
def expand_tip(topic, tip, generate):
s = (
"""Please expand a tip for a topic into a detailed paragraph.
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
......@@ -30,14 +33,23 @@ Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
return generate(s, max_tokens=128, stop=["\n\n"])
def suggest_tips(topic, generate):
s = "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += ("ASSISTANT: Okay. Here are " + str(number) + " concise tips, each under 8 words:\n")
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
tips = []
for i in range(1, 1 + number):
......@@ -49,12 +61,12 @@ def suggest_tips(topic, generate):
paragraphs = [expand_tip(topic, tip, generate=generate) for tip in tips]
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i-1] + "\n"
s += f"Tip {i}:" + paragraphs[i - 1] + "\n"
return s
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
lines = read_jsonl(args.data_path)[: args.num_questions]
states = [None] * len(lines)
# Select backend
......@@ -68,13 +80,20 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
generate = partial(call_generate_srt_raw, url=url, temperature=0)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def generate(prompt, max_tokens, stop):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=0, stop=stop)
out = (
model
+ prompt
+ gen(name="answer", max_tokens=max_tokens, temperature=0, stop=stop)
)
return out["answer"]
# warmup
......@@ -111,7 +130,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
......@@ -2,11 +2,12 @@ import argparse
import json
import time
import numpy as np
import sglang as sgl
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
number = 5
......@@ -14,7 +15,7 @@ number = 5
@sgl.function
def expand_tip(s, topic, tip):
s += (
"""Please expand a tip for a topic into a detailed paragraph.
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
......@@ -28,7 +29,12 @@ Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
s += sgl.gen("paragraph", max_tokens=128, stop=["\n\n"], temperature=0)
......@@ -36,7 +42,11 @@ Topic: """ + topic + "\nTip: " + tip + "\nParagraph:")
def suggest_tips(s, topic):
s += "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += ("ASSISTANT: Okay. Here are " + str(number) + " concise tips, each under 8 words:\n")
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
paragraphs = []
for i in range(1, 1 + number):
......@@ -44,14 +54,12 @@ def suggest_tips(s, topic):
paragraphs.append(expand_tip(topic=topic, tip=s[f"tip_{i}"]))
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i-1]["paragraph"] + "\n"
s += f"Tip {i}:" + paragraphs[i - 1]["paragraph"] + "\n"
def main(args):
lines = read_jsonl(args.data_path)[:args.num_questions]
arguments = [
{"topic": l["topic"]} for l in lines
]
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"topic": l["topic"]} for l in lines]
# Select backend
sgl.set_default_backend(select_sglang_backend(args))
......@@ -59,7 +67,8 @@ def main(args):
# Run requests
tic = time.time()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True)
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.time() - tic
# Compute accuracy
......@@ -78,7 +87,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import ast
import asyncio
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -47,35 +51,56 @@ temp = 0.001
def propose_plan(s, question, num_branches, call_generate):
s += (USER_PREFIX +
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """ + question + USER_SUFFIX)
s += (
USER_PREFIX
+ """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+ question
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def execute_plan(s, num_branches, call_generate):
s += (USER_PREFIX +
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def reflect_solution(s, num_branches, call_generate):
s += (USER_PREFIX +
"""Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def get_final_answer(s, num_branches, call_generate):
s += (USER_PREFIX +
"""Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
......@@ -107,7 +132,7 @@ def main(args):
num_branches = 2
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -124,20 +149,40 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop, n):
if n == 1:
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
else:
rets = []
for i in range(n):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
rets.append(out["answer"])
return rets
......@@ -146,6 +191,7 @@ def main(args):
# Run requests
states = [None] * len(questions)
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
......@@ -188,7 +234,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import ast
from collections import Counter
import json
import re
import time
from collections import Counter
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
import sglang as sgl
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -40,7 +43,9 @@ temp = 0.001
def propose_plan(s, question, num_branches):
s += sgl.user(
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """ + question)
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+ question
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("plan", max_tokens=256, temperature=temp))
return forks
......@@ -48,7 +53,8 @@ def propose_plan(s, question, num_branches):
def execute_plan(s, num_branches):
s += sgl.user(
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short.""")
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("answer", max_tokens=256, temperature=temp))
return forks
......@@ -56,7 +62,8 @@ def execute_plan(s, num_branches):
def reflect_solution(s, num_branches):
s += sgl.user(
"""Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness.""")
"""Okay. Now, evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("score", max_tokens=256, temperature=temp))
return forks
......@@ -64,13 +71,13 @@ def reflect_solution(s, num_branches):
def get_final_answer(s, num_branches):
s += sgl.user(
"""Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration.""")
"""Based on your reflection, do you change your mind? Now, give me the final answer after careful consideration."""
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("final_answer", max_tokens=256, temperature=temp))
return forks
@sgl.function
def tree_search(s, question, num_branches):
plan_forks = propose_plan(s, question, num_branches)
......@@ -93,6 +100,7 @@ def tree_search(s, question, num_branches):
return solutions
def main(args):
lines = read_jsonl(args.data_path)
......@@ -100,7 +108,7 @@ def main(args):
num_branches = 2
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -112,7 +120,12 @@ def main(args):
# Run requests
tic = time.time()
states = tree_search.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
answers_text = []
for s in states:
......@@ -144,7 +157,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import ast
import asyncio
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import json
import re
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, call_generate_lightllm, call_generate_vllm, call_generate_srt_raw
from sglang.utils import read_jsonl, dump_state_text
from sglang.test.test_utils import (
add_common_other_args_and_parse,
call_generate_lightllm,
call_generate_srt_raw,
call_generate_vllm,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -47,27 +51,43 @@ temp = 0.3
def propose_plan(s, question, num_branches, call_generate):
s += (USER_PREFIX +
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """ + question + USER_SUFFIX)
s += (
USER_PREFIX
+ """Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+ question
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def execute_plan(s, num_branches, call_generate):
s += (USER_PREFIX +
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
def reflect_solution(s, num_branches, call_generate):
s += (USER_PREFIX +
"""Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness.""" + USER_SUFFIX)
s += (
USER_PREFIX
+ """Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
+ USER_SUFFIX
)
s += ASSISTANT_PREFIX
comps = call_generate(s, max_tokens=256, temperature=temp, stop=None, n=num_branches)
comps = call_generate(
s, max_tokens=256, temperature=temp, stop=None, n=num_branches
)
return [s + comp + ASSISTANT_SUFFIX for comp in comps]
......@@ -92,7 +112,7 @@ def main(args):
num_branches = 3
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -109,25 +129,46 @@ def main(args):
url = f"{args.host}:{args.port}/generate"
call_generate = partial(call_generate_srt_raw, url=url)
elif args.backend == "guidance":
from guidance import models, gen
from guidance import gen, models
model = models.LlamaCpp("/home/ubuntu/model_weights/Llama-2-7b-chat.gguf", n_gpu_layers=-1, n_ctx=4096)
model = models.LlamaCpp(
"/home/ubuntu/model_weights/Llama-2-7b-chat.gguf",
n_gpu_layers=-1,
n_ctx=4096,
)
def call_generate(prompt, temperature, max_tokens, stop, n):
if n == 1:
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
return out["answer"]
else:
rets = []
for i in range(n):
out = model + prompt + gen(name="answer",
max_tokens=max_tokens, temperature=temperature, stop=stop)
out = (
model
+ prompt
+ gen(
name="answer",
max_tokens=max_tokens,
temperature=temperature,
stop=stop,
)
)
rets.append(out["answer"])
return rets
# Run requests
states = [None] * len(questions)
def get_one_answer(i):
states[i] = tree_search(**arguments[i], call_generate=call_generate)
......@@ -170,7 +211,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......
import argparse
import ast
from collections import Counter
import json
import re
import time
from collections import Counter
import numpy as np
from sglang.test.test_utils import add_common_sglang_args_and_parse, select_sglang_backend
from sglang.utils import read_jsonl, dump_state_text
import sglang as sgl
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
INVALID = -9999999
def get_answer_value(answer_str):
answer_str = answer_str.replace(",", "")
numbers = re.findall(r'\d+', answer_str)
numbers = re.findall(r"\d+", answer_str)
if len(numbers) < 1:
return INVALID
try:
......@@ -40,7 +43,9 @@ temp = 0.3
def propose_plan(s, question, num_branches):
s += sgl.user(
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """ + question)
"""Please generate a high-level plan for solving the following question. As the first step, just say what method and idea you will use to solve the question. You can reorganize the information in the question. Do not do the actual calculation. Keep your response concise and within 80 words. Question: """
+ question
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("plan", max_tokens=256, temperature=temp))
return forks
......@@ -48,7 +53,8 @@ def propose_plan(s, question, num_branches):
def execute_plan(s, num_branches):
s += sgl.user(
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short.""")
"""The plan looks good! Now, use real numbers and do the calculation. Please solve the question step-by-step according to the high-level plan. Give me the final answer. Make your response short."""
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("answer", max_tokens=256, temperature=temp))
return forks
......@@ -56,7 +62,8 @@ def execute_plan(s, num_branches):
def reflect_solution(s, num_branches):
s += sgl.user(
"""Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness.""")
"""Okay. Now you evaluate your own solution and give it a score on a scale of 1 to 5. Please do rigorous check of the correctness."""
)
forks = s.fork(num_branches)
forks += sgl.assistant(sgl.gen("score", max_tokens=256, temperature=temp))
return forks
......@@ -90,7 +97,7 @@ def main(args):
num_branches = 3
questions = []
labels = []
for i in range(len(lines[:args.num_questions])):
for i in range(len(lines[: args.num_questions])):
questions.append(lines[i]["question"])
labels.append(get_answer_value(lines[i]["answer"]))
assert all(l != INVALID for l in labels)
......@@ -102,7 +109,12 @@ def main(args):
# Run requests
tic = time.time()
states = tree_search.run_batch(
arguments, temperature=0, backend=backend, num_threads=args.parallel, progress_bar=True)
arguments,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic
answers_text = []
for s in states:
......@@ -134,7 +146,7 @@ def main(args):
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
}
},
}
fout.write(json.dumps(value) + "\n")
......