Commit 909abb58 authored by maxiao

adapt to sglang v0.5.2rc1 on dcu

### Benchmark sglang
Run Llama-7B
```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
Run Mixtral-8x7B
(If you see a CUDA out-of-memory error, try reducing `--mem-fraction-static`; see the example after the command below.)
```
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
```
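If you hit an out-of-memory error, the static memory fraction can be lowered explicitly; `0.8` below is only an illustrative value:
```
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8 --mem-fraction-static 0.8
```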
Benchmark (short output)
```
python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf
```
Benchmark (long output)
```
python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
```
### Benchmark vLLM
Run Llama-7B
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
Run Mixtral-8x7B
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
```
Benchmark (short output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
```
Benchmark (long output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm --long
```
### Benchmark guidance
Benchmark Llama-7B (short output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
Benchmark Llama-7B (long output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf --long
```
import json
import time
from argparse import ArgumentParser
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from data_gen import gen_arguments
from tqdm import tqdm
from vllm.transformers_utils.tokenizer import get_tokenizer
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text
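# Replay a multi-turn conversation: each turn appends the next synthetic user
# prompt and then the backend's generated reply to the running context string.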
def multi_turns(generate, qas):
s = ""
for qa in qas:
s += qa["prompt"]
s += generate(s, max_tokens=qa["new_tokens"])
return s
def main(args):
print(args)
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
multi_qas = gen_arguments(args, tokenizer)
states = [None] * args.num_qa
call_generate = partial(get_call_generate(args), temperature=0)
def get_one_answer(i):
states[i] = multi_turns(generate=call_generate, **multi_qas[i])
tic = time.perf_counter()
if args.parallel == 1:
for i in tqdm(range(len(multi_qas))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
rets = list(
tqdm(
executor.map(get_one_answer, list(range(len(multi_qas)))),
total=len(multi_qas),
)
)
for _ in rets:
pass
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "multi_turn_chat",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_qa,
"num_turns": args.turns,
"other": {
"parallel": args.parallel,
"output_mode": "long" if args.long else "short",
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--turns", type=int, default=4)
parser.add_argument("--num-qa", type=int, default=20)
parser.add_argument("--min-len-q", type=int, default=256)
parser.add_argument("--max-len-q", type=int, default=512)
parser.add_argument("--min-len-a", type=int, default=4)
parser.add_argument("--max-len-a", type=int, default=8)
parser.add_argument("--tokenizer", type=str, required=True)
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--long", action="store_true")
args = add_common_other_args_and_parse(parser)
if args.long:
args.min_len_a = 256
args.max_len_a = 512
args.num_qa = 20
main(args)
import json
import time
from argparse import ArgumentParser
from data_gen import gen_arguments
from vllm.transformers_utils.tokenizer import get_tokenizer
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
@sgl.function
def multi_turns(s, qas):
for qa in qas:
s += qa["prompt"]
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
def main(args):
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
multi_qas = gen_arguments(args, tokenizer)
backend = select_sglang_backend(args)
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
backend=backend,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "multi_turn_chat",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_qa,
"num_turns": args.turns,
"other": {
"parallel": args.parallel,
"output_mode": "long" if args.long else "short",
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--turns", type=int, default=4)
parser.add_argument("--num-qa", type=int, default=20)
parser.add_argument("--min-len-q", type=int, default=256)
parser.add_argument("--max-len-q", type=int, default=512)
parser.add_argument("--min-len-a", type=int, default=4)
parser.add_argument("--max-len-a", type=int, default=8)
parser.add_argument("--tokenizer", type=str, required=True)
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--long", action="store_true")
args = add_common_sglang_args_and_parse(parser)
if args.long:
args.min_len_a = 256
args.max_len_a = 512
args.num_qa = 20
print(args)
main(args)
import random
import string
random.seed(42)
def gen_prompt(tokenizer, token_num):
cha_set = string.ascii_letters + string.digits
ret = "".join(random.choices(cha_set, k=token_num))
while len(tokenizer(ret).input_ids) < token_num:
ret += random.choice(cha_set)
return ret
def gen_arguments(args, tokenizer):
multi_qas = [{"qas": []} for _ in range(args.num_qa)]
for i in range(args.num_qa):
qas = multi_qas[i]["qas"]
for _ in range(args.turns):
prompt_len = random.randint(args.min_len_q, args.max_len_q)
new_tokens = random.randint(args.min_len_a, args.max_len_a)
qas.append(
{
"prompt": gen_prompt(tokenizer, prompt_len),
"new_tokens": new_tokens,
}
)
return multi_qas
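# Illustrative shape of the value returned by gen_arguments (values are made up):
# [
#     {"qas": [{"prompt": "aZ3k...", "new_tokens": 6},   # one dict per --turns
#              {"prompt": "Q9fw...", "new_tokens": 5}]},
#     ...                                                 # one entry per --num-qa
# ]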
import json
import random
import time
from argparse import ArgumentParser
from pathlib import Path
from tqdm import tqdm
import sglang as sgl
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
def gen_prompt(tokenizer, token_num):
all_available_tokens = list(tokenizer.get_vocab().values())
selected_tokens = random.choices(all_available_tokens, k=token_num)
ret = tokenizer.decode(selected_tokens)
return ret
def get_cache_path(args):
# Create cache directory under ~/.cache/sglang
cache_dir = Path.home() / ".cache" / "sglang"
# Create a unique cache filename based on the arguments that affect generation
cache_key = f"qa_{args.num_qa}_{args.turns}_{args.system_prompt_len}_{args.len_q}_{args.len_a}_{args.tokenizer.replace('/', '_')}.json"
return cache_dir / cache_key
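# With the default arguments below, the cache path resolves to, e.g.,
# ~/.cache/sglang/qa_128_8_2048_32_128_meta-llama_Meta-Llama-3-8B-Instruct.json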
def gen_arguments(args, tokenizer):
cache_path = get_cache_path(args)
# Try to load from cache first
if cache_path.exists():
print(f"Loading cached arguments from {cache_path}")
with open(cache_path, "r") as f:
return json.load(f)
print("Generating new arguments...")
# First progress bar for system prompts
multi_qas = []
for _ in tqdm(range(args.num_qa), desc="Generating system prompts"):
multi_qas.append(
{"system_prompt": gen_prompt(tokenizer, args.system_prompt_len), "qas": []}
)
# Nested progress bars for QA pairs
for i in tqdm(range(args.num_qa), desc="Generating QA pairs"):
qas = multi_qas[i]["qas"]
for j in range(args.turns):
qas.append(
{
"prompt": gen_prompt(tokenizer, args.len_q),
"new_tokens": args.len_a,
}
)
# Save to cache
cache_path.parent.mkdir(parents=True, exist_ok=True)
with open(cache_path, "w") as f:
json.dump(multi_qas, f)
print(f"Cached arguments saved to {cache_path}")
return multi_qas
@sgl.function
def multi_turns(s, system_prompt, qas):
s += system_prompt
for i, qa in enumerate(qas):
s += qa["prompt"]
s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
def main(args):
tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
multi_qas = gen_arguments(args, tokenizer)
backend = select_sglang_backend(args)
tic = time.perf_counter()
states = multi_turns.run_batch(
multi_qas,
temperature=0,
backend=backend,
num_threads="auto",
progress_bar=True,
)
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "multi_turn_system_prompt_chat",
"backend": args.backend,
"latency": round(latency, 3),
"num_requests": args.num_qa,
"num_turns": args.turns,
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--turns", type=int, default=8)
parser.add_argument("--num-qa", type=int, default=128)
parser.add_argument("--system-prompt-len", type=int, default=2048)
parser.add_argument("--len-q", type=int, default=32)
parser.add_argument("--len-a", type=int, default=128)
parser.add_argument(
"--tokenizer", type=str, default="meta-llama/Meta-Llama-3-8B-Instruct"
)
parser.add_argument("--trust-remote-code", action="store_true")
args = add_common_sglang_args_and_parse(parser)
print(args)
main(args)
## Run benchmark
NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 100
```
### Benchmark vLLM
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --num-questions 100 --backend vllm
```
### Benchmark guidance
```
python3 bench_other.py --num-questions 100 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --num-questions 100 --backend lmql --parallel 1
```
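The replayed traces are loaded with `read_jsonl(args.data_path)` (default `hotpotqa_100.jsonl`). Based on how the benchmark scripts consume the file, each JSONL line maps a question to its list of pre-recorded ReAct triplets, roughly like this (content abbreviated and purely illustrative):
```
{"Which magazine was started first ...?": [{"thought": "I need to search ...", "action": "Search[...]", "observation": "..."}, {"thought": "...", "action": "Finish[...]", "observation": "..."}]}
```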
import argparse
import json
import time
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
def get_prompt(question):
prompt = (
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action 1: Search[Colorado orogeny]
Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
Action 2: Lookup[eastern sector]
Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
Action 3: Search[High Plains]
Observation 3: High Plains refers to one of two distinct land regions:
Thought 4: I need to instead search High Plains (United States).
Action 4: Search[High Plains (United States)]
Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]
Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
Action 5: Finish[1,800 to 7,000 ft]
Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.
Action 1: Search[Milhouse]
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
Action 2: Lookup[named after]
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action 3: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.
Action 1: Search[Adam Clayton Powell]
Observation 1: Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].
Thought 2: To find the documentary, I can search Adam Clayton Powell (film).
Action 2: Search[Adam Clayton Powell (film)]
Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.
The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.
Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
Action 3: Finish[The Saimaa Gesture]
Question: What profession does Nicholas Ray and Elia Kazan have in common?
Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.
Action 1: Search[Nicholas Ray]
Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.
Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.
Action 2: Search[Elia Kazan]
Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.
Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
Action 3: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
Action 1: Search[Arthur's Magazine]
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
Action 2: Search[First for Women]
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action 3: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.
Action 1: Search[Pavel Urysohn]
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
"""
+ question
)
return prompt
def main(args):
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
states = []
# Select backend
call_generate = get_call_generate(args)
def run_single_agent(argument):
question = argument["question"]
triplets = argument["triplets"]
prompt = get_prompt(question)
for i in range(1, len(triplets) + 2):
prompt += "Thought " + str(i) + ":"
states.append(prompt)
answer = call_generate(
prompt, max_tokens=200, temperature=0, stop="Observation"
)
if i > len(triplets):
break
prompt += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
states.append(answer)
async def run_single_agent_async(argument):
question = argument["question"]
triplets = argument["triplets"]
prompt = get_prompt(question)
for i in range(1, len(triplets) + 2):
prompt += "Thought " + str(i) + ":"
states.append(prompt)
answer = await call_generate(
prompt, max_tokens=200, temperature=0, stop="Observation", max_len=4096
)
if i > len(triplets):
break
prompt += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
states.append(answer)
tic = time.perf_counter()
if args.backend != "lmql":
if args.parallel == 1:
for arg in tqdm(arguments):
run_single_agent(arg)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(run_single_agent, arguments), total=len(arguments)
)
)
else:
import asyncio
loop = asyncio.get_event_loop()
batches = [
[] for _ in range((len(arguments) + args.parallel - 1) // args.parallel)
]
for i, arg in enumerate(arguments):
batches[i // args.parallel].append(arg)
for bt in tqdm(batches):
tasks = [run_single_agent_async(arg) for arg in bt]
loop.run_until_complete(asyncio.gather(*tasks))
latency = time.perf_counter() - tic
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "ReAct Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": len(arguments),
"other": {
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="hotpotqa_100.jsonl")
parser.add_argument("--num-questions", type=int, default=10)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
@sgl.function
def webthink(s, question, triplets):
s += (
"""Solve a question answering task with interleaving Thought, Action, Observation steps. Thought can reason about the current situation, and Action can be three types:
(1) Search[entity], which searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.
(2) Lookup[keyword], which returns the next sentence containing keyword in the current passage.
(3) Finish[answer], which returns the answer and finishes the task.
Here are some examples.
Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.
Action 1: Search[Colorado orogeny]
Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.
Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.
Action 2: Lookup[eastern sector]
Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.
Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.
Action 3: Search[High Plains]
Observation 3: High Plains refers to one of two distinct land regions:
Thought 4: I need to instead search High Plains (United States).
Action 4: Search[High Plains (United States)]
Observation 4: The High Plains are a subregion of the Great Plains. From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]
Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
Action 5: Finish[1,800 to 7,000 ft]
Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.
Action 1: Search[Milhouse]
Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.
Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after".
Action 2: Lookup[named after]
Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous.
Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.
Action 3: Finish[Richard Nixon]
Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.
Action 1: Search[Adam Clayton Powell]
Observation 1: Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].
Thought 2: To find the documentary, I can search Adam Clayton Powell (film).
Action 2: Search[Adam Clayton Powell (film)]
Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.
The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.
Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.
Action 3: Finish[The Saimaa Gesture]
Question: What profession does Nicholas Ray and Elia Kazan have in common?
Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.
Action 1: Search[Nicholas Ray]
Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.
Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.
Action 2: Search[Elia Kazan]
Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.
Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.
Action 3: Finish[director, screenwriter, actor]
Question: Which magazine was started first Arthur's Magazine or First for Women?
Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.
Action 1: Search[Arthur's Magazine]
Observation 1: Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century.
Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.
Action 2: Search[First for Women]
Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989.
Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.
Action 3: Finish[Arthur's Magazine]
Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?
Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.
Action 1: Search[Pavel Urysohn]
Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.
Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.
Action 2: Search[Leonid Levin]
Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist.
Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work.
Action 3: Finish[yes]
"""
+ question
)
for i in range(1, len(triplets) + 2):
s += "Thought " + str(i) + ":"
# NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
ss = s.fork(1)
ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
ss.join()
# to verify the correctness of output, this should be collected
# print(ss[0]["thought_action"])
if i > len(triplets):
break
s += (
triplets[i - 1]["thought"]
+ "\nAction "
+ str(i)
+ ":"
+ triplets[i - 1]["action"]
+ "\nObservation "
+ str(i)
+ ":"
+ triplets[i - 1]["observation"]
+ "\n"
)
def main(args):
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"question": k, "triplets": v} for l in lines for k, v in l.items()]
# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)
states = []
tic = time.perf_counter()
states = webthink.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.perf_counter() - tic
# Compute accuracy
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "ReAct Agents",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": len(arguments),
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="hotpotqa_100.jsonl")
parser.add_argument("--num-questions", type=int, default=10)
args = add_common_sglang_args_and_parse(parser)
main(args)
# Run benchmark
This benchmark is primarily intended to be used with reasoning models like `DeepSeek-R1` and its distilled models like `DeepSeek-R1-Distill-Qwen-1.5B`. Please use
```bash
pip install antlr4-python3-runtime
```
for `parse_latex`, which we use for the symbolic equality check.
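As a quick sanity check that the LaTeX parser is available after installing the runtime (illustrative only; the benchmark itself calls it through `eval_utils.math_equal`):
```python
from sympy import simplify
from sympy.parsing.latex import parse_latex  # requires antlr4-python3-runtime

# \frac{1}{2} and 0.5 should compare as equal in the symbolic check
assert simplify(parse_latex(r"\frac{1}{2}") - 0.5) == 0
```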
## Benchmark sglang
1. Launch the Server
```bash
python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000
```
Note that, depending on the GPU, this benchmark can take quite some time. To use data parallelism, launch with the router instead:
```bash
python3 -m sglang_router.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000 --dp-size 4
```
2. Benchmarking
We use the [suggested](https://github.com/deepseek-ai/DeepSeek-R1) parameters `temperature=0.6`, `top_p=0.95`, and `max_new_tokens=32768`. The command-line argument `--num-tries` evaluates the model multiple times on the same question; we use the suggested `64` from the repo for AIME 2024 and `8` for LIMO, due to the size of that dataset.
By default, the benchmark evaluates on the LIMO dataset.
```bash
python3 bench_sglang.py --parallel 256 --num-tries 64 --port 30000
```
Evaluate on the AIME 2024 dataset.
```bash
python3 bench_sglang.py --parallel 256 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries 64
```
Evaluate on the [AIME 2025 I dataset](https://huggingface.co/datasets/opencompass/AIME2025). For reference results, see [matharena.ai](https://matharena.ai/).
```bash
python3 bench_sglang.py --parallel 256 --port 30000 --data-path opencompass/AIME2025 --question-key question --answer-key answer --num-tries 64
```
## Results
### Evaluation Results
| Dataset | Num Tries | Accuracy | Reference | Standard Error |
|------------|-----------|----------|-----------|-----------|
| LIMO | 8 | 47.7% | ? | ? |
| AIME 2024 | 64 | 33.2% | 28.9% | 3.4% |
| AIME 2025 I| 64 | 29.9% | 25.0% | ? |
### Statistical Analysis Results
Set up the SGLang engine for the statistical analysis; for efficiency we use `--dp-size 8` for data parallelism:
```bash
python3 -m sglang_router.launch_server --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --port 30000 --dp-size 8
```
**Experiment 1**:
We fixed the number of attempts (num_tries) and conducted multiple runs to assess the consistency of the model's performance. The results show that all recorded accuracies lie within one standard error of the mean. This suggests that **our metric serves as an effective upper bound for the deviation of reported accuracy**.
To collect the accuracy, run the following command 30 times:
```bash
python3 bench_sglang.py --parallel 64 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries 64
```
![acc_hist](figure/Acc_histplot.png)
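The standard error reported by `bench_sglang.py` is computed per question as the sample standard deviation of the per-try 0/1 outcomes divided by `sqrt(num_tries)`, then averaged over questions. A minimal sketch of that computation (with made-up outcomes):
```python
import numpy as np

num_tries = 64
num_questions = 30
# one 0/1 correctness flag per (question, try), flattened like in the benchmark
outcomes = np.random.randint(0, 2, size=num_questions * num_tries)

per_question = outcomes.reshape(-1, num_tries)
se_per_question = per_question.std(axis=1, ddof=1) / np.sqrt(num_tries)
print("mean standard error:", se_per_question.mean())
```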
**Experiment 2**: We explored the relationship between the number of attempts (num_tries) and the standard error (SE) by varying num_tries across a range (e.g., 8, 16, 32, ..., 256) and performing a single run for each value. The results demonstrate that as the number of attempts increases, the standard error decreases, leading to **greater stability in answer accuracy**.
To reveal the relationship, run the command six times, adjusting `--num-tries` for each run:
```bash
python3 bench_sglang.py --parallel 64 --port 30000 --data-path Maxwell-Jia/AIME_2024 --question-key Problem --answer-key Answer --num-tries <num_tries>
```
![SE_num_tries](figure/SE_numtries.png)
# Adapted from https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/data_processing/answer_extraction.py
import re
import regex
def _fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if len(substr) > 0 and substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def _fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
if "sqrt" not in a:
a = int(a)
if "sqrt" not in b:
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except:
return string
def _fix_sqrt(string):
_string = re.sub(r"\\sqrt(-?[0-9.a-zA-Z]+)", r"\\sqrt{\1}", string)
_string = re.sub(r"\\sqrt\s+(\w+)$", r"\\sqrt{\1}", _string)
return _string
def _fix_tan(string):
_string = re.sub(r"\\tan(-?[0-9.a-zA-Z]+)", r"\\tan{\1}", string)
_string = re.sub(r"\\tan\s+(\w+)$", r"\\tan{\1}", _string)
return _string
def strip_string(string):
string = str(string).strip()
# linebreaks
string = string.replace("\n", "")
# right "."
string = string.rstrip(".")
# remove inverse spaces
string = string.replace("\\!", "")
# string = string.replace("\\ ", "")
# replace \\ with \
# string = string.replace("\\\\", "\\")
# string = string.replace("\\\\", "\\")
if string.startswith("\\text{") and string.endswith("}"):
string = string.split("{", 1)[1][:-1]
# replace tfrac and dfrac with frac
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
string = string.replace("cfrac", "frac")
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\right", "")
# Remove unit: miles, dollars if after is not none
_string = re.sub(r"\\text{.*?}$", "", string).strip()
if _string != "" and _string != string:
# print("Warning: unit not removed: '{}' -> '{}'".format(string, _string))
string = _string
# Remove circ (degrees)
string = string.replace("^{\\circ}", "").strip()
string = string.replace("^\\circ", "").strip()
string = regex.sub(r"\{(c|m)?m\}(\^(2|3))?", "", string).strip()
string = regex.sub(r"p\.m\.$", "", string).strip()
string = regex.sub(r"(\d)\s*t$", r"\1", string).strip()
# remove dollar signs
string = string.replace("\\$", "")
string = string.replace("$", "")
# string = string.replace("\\text", "")
string = string.replace("x\\in", "")
# remove percentage
string = string.replace("\\%", "%")
string = string.replace("\%", "%")
# string = string.replace("%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# cdot
string = string.replace("\\cdot", "")
# inf
string = string.replace("infinity", "\\infty")
if "\\infty" not in string:
string = string.replace("inf", "\\infty")
string = string.replace("+\\inity", "\\infty")
# and
# string = string.replace("and", "")
string = string.replace("\\mathbf", "")
string = string.replace("\\mathrm", "")
# use regex to remove \mbox{...}
string = re.sub(r"\\mbox{.*?}", "", string)
# quote
string.replace("'", "")
string.replace('"', "")
# i, j
if "j" in string and "i" not in string:
string = string.replace("j", "i")
# replace a.000b where b is not number or b is end, with ab, use regex
string = re.sub(r"(\d+)\.0+([^\d])", r"\1\2", string)
string = re.sub(r"(\d+)\.0+$", r"\1", string)
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
# if len(string.split("=")) == 2:
# if len(string.split("=")[0]) <= 2:
# string = string.split("=")[1]
string = _fix_sqrt(string)
string = _fix_tan(string)
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = _fix_fracs(string)
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = _fix_a_slash_b(string)
string = regex.sub(r"(\\|,|\.)+$", "", string)
return string
def extract_boxed_answers(text):
answers = []
for piece in text.split("boxed{")[1:]:
n = 0
for i in range(len(piece)):
if piece[i] == "{":
n += 1
elif piece[i] == "}":
n -= 1
if n < 0:
if i + 1 < len(piece) and piece[i + 1] == "%":
answers.append(piece[: i + 1])
else:
answers.append(piece[:i])
break
return answers
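# Example: extract_boxed_answers(r"... so the answer is \boxed{42}.") -> ["42"]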
def extract_program_output(pred_str):
"""
extract output between the last ```output\n...\n```
"""
if "```output" not in pred_str:
return ""
if "```output" in pred_str:
pred_str = pred_str.split("```output")[-1]
if "```" in pred_str:
pred_str = pred_str.split("```")[0]
output = pred_str.strip()
return output
def extract_answer(pred_str, exhaust=False):
pred = []
if "final answer is $" in pred_str and "$. I hope" in pred_str:
tmp = pred_str.split("final answer is $", 1)[1]
pred = [tmp.split("$. I hope", 1)[0].strip()]
elif "boxed" in pred_str:
pred = extract_boxed_answers(pred_str)
elif "he answer is" in pred_str:
pred = [pred_str.split("he answer is")[-1].strip()]
else:
program_output = extract_program_output(pred_str)
if program_output != "":
# fall back to program
pred.append(program_output)
else: # use the last number
pattern = "-?\d*\.?\d+"
ans = re.findall(pattern, pred_str.replace(",", ""))
if len(ans) >= 1:
ans = ans[-1]
else:
ans = ""
if ans:
pred.append(ans)
# multiple line
_pred = []
for ans in pred:
ans = ans.strip().split("\n")[0]
ans = ans.lstrip(":")
ans = ans.rstrip(".")
ans = ans.rstrip("/")
ans = strip_string(ans)
_pred.append(ans)
if exhaust:
return _pred
else:
return _pred[-1] if _pred else ""
def extract_math_answer(question, reasoning, task):
answer = []
for ans in extract_answer(reasoning, exhaust=True):
if "separated by commas" in question and all(ch not in ans for ch in "()[]"):
answer.extend([a.strip() for a in ans.split(",")])
elif regex.search(r"\\text\{\s*and\s*\}", ans):
answer.extend(
[
a.strip()
for a in regex.sub(r"\\text\{\s*and\s*\}", "[SEP]", ans).split(
"[SEP]"
)
]
)
else:
answer.append(ans.strip())
return answer
import argparse
import json
import time
import answer_extraction
import eval_utils
import numpy as np
from datasets import load_dataset
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text
@sgl.function
def reasoning_gen(s, question: str):
s += sgl.user(
question
+ "\nPlease reason step by step, and put your final answer within \boxed{}."
)
s += sgl.assistant(
sgl.gen(
"answer",
)
)
def convert_dataset(path: str, question_key: str, answer_key: str, num_tries: int):
raw_dataset = load_dataset(path)
questions = []
answers = []
for data in raw_dataset["train"]:
question = data[question_key]
answer = data[answer_key]
for _ in range(num_tries):
questions.append({"question": question})
answers.append({"answer": answer})
return questions, answers
def main(args):
# Select backend
sgl.set_default_backend(select_sglang_backend(args))
# Get dataset
questions, answers = convert_dataset(
args.data_path, args.question_key, args.answer_key, args.num_tries
)
# Run requests
tic = time.perf_counter()
states = reasoning_gen.run_batch(
questions,
num_threads=args.parallel,
progress_bar=True,
temperature=0.6,
max_new_tokens=32768,
top_p=0.95,
)
latency = time.perf_counter() - tic
# Extract results and record outcomes in a list.
outcomes = []
for i, state in enumerate(states):
try:
pred_answer = answer_extraction.extract_math_answer(
questions[i]["question"], state["answer"], "limo"
)
gt_answer = str(answers[i]["answer"])
pred_answer = (
pred_answer[-1] if isinstance(pred_answer, list) else pred_answer
)
is_correct = 1 if eval_utils.math_equal(pred_answer, gt_answer) else 0
except Exception as e:
print(f"Error extracting answer: {e}")
is_correct = 0
outcomes.append(is_correct)
# Calculate overall accuracy using numpy
overall_accuracy = np.mean(outcomes)
print(f"Overall Accuracy: {overall_accuracy}")
# Calculate mean standard error over questions if num_tries >= 2
if args.num_tries > 1:
outcomes_np = np.array(outcomes).reshape(-1, args.num_tries)
# Using sample standard deviation with ddof=1
std_per_question = np.std(outcomes_np, axis=1, ddof=1)
# Compute the standard error for each question: std / sqrt(num_tries)
se_per_question = std_per_question / np.sqrt(args.num_tries)
mean_se = se_per_question.mean()
print(f"Mean Standard Error of Accuracy across questions: {mean_se}")
else:
mean_se = None
print("Not enough samples per question to compute standard error.")
# Calculate output throughput
num_output_tokens = sum(
s.get_meta_info("answer")["completion_tokens"] for s in states
)
output_throughput = num_output_tokens / latency
print(f"Output throughput: {output_throughput} token/s")
# Dump results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
# Write results
with open(args.result_file, "a") as fout:
value = {
"task": "limo",
"backend": args.backend,
"latency": round(latency, 3),
"overall_accuracy": round(overall_accuracy, 3),
"mean_se_accuracy": round(mean_se, 3) if mean_se is not None else None,
"num_requests": len(questions),
"other": {
"num_questions": len(questions),
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="GAIR/LIMO")
parser.add_argument("--question-key", type=str, default="question")
parser.add_argument("--answer-key", type=str, default="answer")
parser.add_argument("--num-tries", type=int, default=1)
add_common_sglang_args_and_parse(parser)
args = parser.parse_args()
main(args)
# Adapted from https://github.com/deepseek-ai/DeepSeek-Math/blob/main/evaluation/eval/eval_utils.py
from math import isclose
import regex
from sympy import N, simplify
from sympy.parsing.latex import parse_latex
from sympy.parsing.sympy_parser import parse_expr
def parse_digits(num):
# format: 234.23 || 23%
num = regex.sub(",", "", str(num))
try:
return float(num)
except:
if num.endswith("%"):
num = num[:-1]
if num.endswith("\\"):
num = num[:-1]
try:
return float(num) / 100
except:
pass
return None
def is_digit(num):
# paired with parse_digits
return parse_digits(num) is not None
def symbolic_equal(a, b):
def _parse(s):
for f in [parse_latex, parse_expr]:
try:
return f(s)
except:
pass
return s
a = _parse(a)
b = _parse(b)
try:
if simplify(a - b) == 0:
return True
except:
pass
try:
if isclose(N(a), N(b), abs_tol=1e-3):
return True
except:
pass
return False
def math_equal(prediction, reference, include_percentage=True, is_close=True):
"""
Exact match of math if and only if:
1. numerical equal: both can convert to float and are equal
2. symbolic equal: both can convert to sympy expression and are equal
"""
if str(prediction) == str(reference):
return True
try: # 1. numerical equal
if is_digit(prediction) and is_digit(reference):
prediction = parse_digits(prediction)
reference = parse_digits(reference)
# number questions
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if isclose(item, prediction, abs_tol=1e-3):
return True
else:
if item == prediction:
return True
except Exception:
continue
return False
except:
pass
if not prediction and prediction not in [0, False]:
return False
# 2. symbolic equal
reference = str(reference).strip()
prediction = str(prediction).strip()
if (
regex.match(r"(\(|\[).+(\)|\])", prediction) is not None
and regex.match(r"(\(|\[).+(\)|\])", reference) is not None
):
pred_parts = prediction[1:-1].split(",")
ref_parts = reference[1:-1].split(",")
if len(pred_parts) == len(ref_parts):
if all(
[
math_equal(
pred_parts[i], ref_parts[i], include_percentage, is_close
)
for i in range(len(pred_parts))
]
):
return True
# Add back matrix comparison
if (
(
prediction.startswith("\\begin{pmatrix}")
or prediction.startswith("\\begin{bmatrix}")
)
and (
prediction.endswith("\\end{pmatrix}")
or prediction.endswith("\\end{bmatrix}")
)
and (
reference.startswith("\\begin{pmatrix}")
or reference.startswith("\\begin{bmatrix}")
)
and (
reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")
)
):
pred_lines = [
line.strip()
for line in prediction[
len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
].split("\\\\")
if line.strip()
]
ref_lines = [
line.strip()
for line in reference[
len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
].split("\\\\")
if line.strip()
]
matched = True
if len(pred_lines) == len(ref_lines):
for pred_line, ref_line in zip(pred_lines, ref_lines):
pred_parts = pred_line.split("&")
ref_parts = ref_line.split("&")
if len(pred_parts) == len(ref_parts):
if not all(
[
math_equal(
pred_parts[i],
ref_parts[i],
include_percentage,
is_close,
)
for i in range(len(pred_parts))
]
):
matched = False
break
else:
matched = False
if not matched:
break
else:
matched = False
if matched:
return True
# Add back equation comparison
if prediction.count("=") == 1 and reference.count("=") == 1:
pred = prediction.split("=")
pred = f"{pred[0].strip()} - ({pred[1].strip()})"
ref = reference.split("=")
ref = f"{ref[0].strip()} - ({ref[1].strip()})"
if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
return True
elif (
prediction.count("=") == 1
and len(prediction.split("=")[0].strip()) <= 2
and "=" not in reference
):
if math_equal(
prediction.split("=")[1], reference, include_percentage, is_close
):
return True
elif (
reference.count("=") == 1
and len(reference.split("=")[0].strip()) <= 2
and "=" not in prediction
):
if math_equal(
prediction, reference.split("=")[1], include_percentage, is_close
):
return True
# symbolic equal with sympy
if symbolic_equal(prediction, reference):
return True
return False
"""
SGLang Scoring Benchmark Script
This script benchmarks SGLang's scoring API performance using HTTP requests.
Current Features:
- HTTP-only implementation (open source compatible)
- Uses /v1/score API endpoint directly
- Single item scoring with batching support
- Configurable RPS, duration, and batch sizes
- Progress tracking and detailed metrics
- Poisson and constant request distributions
Usage:
- Update configuration variables at the top of the file
- Ensure SGLang server is running on the configured HTTP_URL
- Run: python bench_score.py
- Each request will contain ITEM_COUNT_VALUES items for batch scoring
"""
import asyncio
import concurrent.futures # For parallel prompt generation
import json
import os
import random
from statistics import mean
import aiohttp
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
###############################################################################
# CONFIG
###############################################################################
# Server Configuration
SERVER_TYPE = "HTTP" # Fixed to HTTP for open source
# HTTP Configuration
HTTP_URL = "http://localhost:30000/v1/score" # Use score API directly
# Score API Config
# ITEM_COUNT_VALUES determines number of items per score request (batch size)
SCORE_QUERY_TOKENS = 120
SCORE_ITEM_TOKENS = 180
SCORE_MODEL_PATH = "Qwen/Qwen3-0.6B"
SCORE_LABEL_TOKEN_IDS = [9454, 2753] # Yes/No token IDs
# Array of RPS values to test
RPS_VALUES = [70]
# Array of duration values to test
DURATION_SECS_VALUES = [60] # Duration values in seconds
# Array of item count values to test
ITEM_COUNT_VALUES = [10] # Number of items per request
# Number of unique requests to generate (will be reused)
NUM_UNIQUE_REQUESTS = 100
DISTRIBUTION = "POISSON" # Options: "CONSTANT", "POISSON"
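# (Typically, Poisson arrivals mean exponentially distributed inter-arrival times
# with mean 1/RPS, while CONSTANT means a fixed 1/RPS gap between requests.)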
# Profiling Configuration
PROFILE = False # Enable profiling with START_PROFILE/STOP_PROFILE prompts
# Directory for profiler output
SGLANG_TORCH_PROFILER_DIR = "/shared/user/sglang-oss-trace/remove-decode"
if PROFILE:
os.environ["SGLANG_TORCH_PROFILER_DIR"] = SGLANG_TORCH_PROFILER_DIR
# Special token to replicate for precise token counting
SPECIAL_REPLICATED_TOKEN = "<|im_start|>"
###############################################################################
# REQUEST GENERATION (in parallel)
###############################################################################
def prepare_all_requests_parallel(num_requests, item_count):
"""
Generates unique requests in parallel, then reuses them to create the
    full request list. Returns a list of score-request dicts for the HTTP /v1/score API.
"""
# Load tokenizer once here to verify special token and get precise counts
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(SCORE_MODEL_PATH)
# Verify that our special token produces exactly 1 token
special_token_count = len(
tokenizer.encode(SPECIAL_REPLICATED_TOKEN, add_special_tokens=False)
)
print(
f"Special token '{SPECIAL_REPLICATED_TOKEN}' produces "
f"{special_token_count} token(s)"
)
def generate_text_with_token_count(num_toks):
"""Generate text with precise token count using replicated token."""
if special_token_count == 1:
# Simple case: token maps to exactly 1 token
return SPECIAL_REPLICATED_TOKEN * num_toks
else:
print(
f"Special token '{SPECIAL_REPLICATED_TOKEN}' produces more than 1 token!!!"
)
# Handle case where special token produces multiple tokens
# Repeat the token enough times to get at least num_toks tokens
repetitions = (num_toks + special_token_count - 1) // special_token_count
text = SPECIAL_REPLICATED_TOKEN * repetitions
# Verify we got the expected token count (approximately)
actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
if actual_tokens < num_toks:
print(
f"Warning: Generated {actual_tokens} tokens, "
f"expected {num_toks}"
)
return text
def build_request(index):
"""Build a single request using the shared tokenizer."""
try:
# Generate query and items for score API
query = generate_text_with_token_count(SCORE_QUERY_TOKENS)
items = [
generate_text_with_token_count(SCORE_ITEM_TOKENS)
for _ in range(item_count)
]
# Return as dict for score API format
score_data = {
"query": query,
"items": items,
"label_token_ids": SCORE_LABEL_TOKEN_IDS,
"model": SCORE_MODEL_PATH,
}
return (index, score_data)
except Exception as e:
print(f"Error building request {index}: {e}")
return (index, None)
# Generate only the unique requests
unique_requests = [None] * NUM_UNIQUE_REQUESTS
# Use ThreadPoolExecutor instead of ProcessPoolExecutor to avoid
# tokenizer loading issues across processes
max_workers = min(8, os.cpu_count() or 1) # Limit to 8 threads max
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for i in tqdm(
range(NUM_UNIQUE_REQUESTS), desc="Submitting prompt generation tasks"
):
future = executor.submit(build_request, i)
futures.append(future)
# Collect results as they complete
for f in tqdm(
concurrent.futures.as_completed(futures),
desc="Building unique requests",
total=NUM_UNIQUE_REQUESTS,
):
try:
index, req_data = f.result()
if req_data is not None:
unique_requests[index] = req_data
else:
print(f"Failed to build request {index}")
except Exception as e:
print(f"Error processing request result: {e}")
# Check if we have any valid requests
valid_requests = [req for req in unique_requests if req is not None]
if not valid_requests:
raise RuntimeError("Failed to generate any valid requests")
print(
f"Successfully generated {len(valid_requests)} out of "
f"{NUM_UNIQUE_REQUESTS} unique requests"
)
# Create the full request list by cycling through unique requests
print(
f"Reusing {len(valid_requests)} unique requests to create "
f"{num_requests} total requests..."
)
all_requests = []
for i in tqdm(range(num_requests), desc="Reusing requests"):
unique_index = i % len(valid_requests)
all_requests.append(valid_requests[unique_index])
print("All prompts/requests prepared.\n")
return all_requests
###############################################################################
# PROFILING HELPERS
###############################################################################
async def send_profile_request(profile_text, item_count, session=None):
"""Send a profile request and wait for completion."""
try:
if session:
print(f"Sending {profile_text} request via HTTP...")
# Determine the correct endpoint
base_url = HTTP_URL.rsplit("/", 2)[0] # Remove /v1/score
if profile_text == "START_PROFILE":
endpoint_url = f"{base_url}/start_profile"
elif profile_text == "STOP_PROFILE":
endpoint_url = f"{base_url}/stop_profile"
else:
print(f"Unknown profile request: {profile_text}")
return
headers = {"Content-Type": "application/json"}
async with session.post(endpoint_url, headers=headers) as resp:
resp_text = await resp.text()
if resp.status == 200:
print(f"{profile_text} request completed")
else:
print(
f"{profile_text} request failed with status "
f"{resp.status}: {resp_text}"
)
else:
print(f"Cannot send {profile_text} request - missing session")
except Exception as e:
print(f"Error sending {profile_text} request: {e}")
###############################################################################
# HTTP CALLS
###############################################################################
def build_http_request_json(score_data):
"""Build HTTP request JSON for /v1/score endpoint.
Score API format:
{
"query": "Generated query text with SCORE_QUERY_TOKENS tokens",
"items": ["item1", "item2", ...], # Items to score with SCORE_ITEM_TOKENS each
"label_token_ids": [token_id1, token_id2], # Target token IDs
"model": "/path/to/model"
}
Args:
score_data: A dict containing query, items, label_token_ids, and model
"""
# score_data is already in the correct format from build_request
return json.dumps(score_data)
async def make_http_call(session, score_data, request_id, results_queue):
"""HTTP call to /v1/score endpoint."""
try:
start_time = asyncio.get_event_loop().time()
request_json = build_http_request_json(score_data)
headers = {"Content-Type": "application/json"}
async with session.post(HTTP_URL, data=request_json, headers=headers) as resp:
resp_text = await resp.text()
if resp.status != 200:
print(
f"[HTTP] Request {request_id} failed with status "
f"{resp.status}: {resp_text}"
)
completion_time = asyncio.get_event_loop().time()
await results_queue.put((request_id, 0, False, completion_time))
return
# Parse score API response
try:
response_data = json.loads(resp_text)
# Score API returns scores for each item
# For now, just verify we got a valid response
if "scores" in response_data or "logprobs" in response_data:
success = True
else:
print(
f"[HTTP] Request {request_id} missing expected fields in response"
)
success = False
except json.JSONDecodeError:
print(f"[HTTP] Request {request_id} failed to parse JSON response")
success = False
completion_time = asyncio.get_event_loop().time()
elapsed_time = (completion_time - start_time) * 1000
await results_queue.put((request_id, elapsed_time, success, completion_time))
except Exception as e:
print(f"[HTTP] Error for request {request_id}: {e}")
completion_time = asyncio.get_event_loop().time()
await results_queue.put((request_id, 0, False, completion_time))
###############################################################################
# RESULTS
###############################################################################
async def process_results(
results_queue,
num_requests,
send_duration,
total_duration,
rps,
duration_secs,
item_count,
test_start_time,
):
"""Processes results and groups them by minute intervals.
Returns a list of dictionaries, one for each minute."""
all_results = []
# Collect all results
for _ in range(num_requests):
result = await results_queue.get()
request_id, elapsed_time, success, completion_time = result
all_results.append(
{
"request_id": request_id,
"elapsed_time": elapsed_time,
"success": success,
"completion_time": completion_time,
}
)
# Group results by minute intervals
minute_results = []
num_minutes = int(duration_secs // 60) + (1 if duration_secs % 60 > 0 else 0)
for minute in range(num_minutes):
minute_start = test_start_time + (minute * 60)
minute_end = test_start_time + ((minute + 1) * 60)
# Filter results that completed in this minute
minute_data = [
r for r in all_results if minute_start <= r["completion_time"] < minute_end
]
response_times = [r["elapsed_time"] for r in minute_data if r["success"]]
successful_requests = len([r for r in minute_data if r["success"]])
failed_requests = len([r for r in minute_data if not r["success"]])
avg_response_time = mean(response_times) if response_times else 0
# Calculate percentiles using numpy
if response_times:
p50 = np.percentile(response_times, 50)
p90 = np.percentile(response_times, 90)
p99 = np.percentile(response_times, 99)
else:
p50 = p90 = p99 = 0
minute_result = {
"test_duration_secs": duration_secs,
"minute_interval": minute + 1,
"target_rps": rps,
"item_count": item_count,
"server_type": SERVER_TYPE,
"distribution": DISTRIBUTION,
"unique_requests": NUM_UNIQUE_REQUESTS,
"total_requests": len(minute_data),
"successful_requests": successful_requests,
"failed_requests": failed_requests,
"send_duration_secs": send_duration,
"total_duration_secs": total_duration,
"avg_response_time_ms": avg_response_time,
"p50_response_time_ms": p50,
"p90_response_time_ms": p90,
"p99_response_time_ms": p99,
}
minute_results.append(minute_result)
print(
f"\nMinute {minute + 1} Summary for RPS {rps}, "
f"Duration {duration_secs}s, Item Count {item_count}:"
)
print(f" Requests completed in minute: {len(minute_data)}")
print(f" Successful requests: {successful_requests}")
print(f" Failed requests: {failed_requests}")
print(f" Average response time: {avg_response_time:.2f} ms")
print(f" P50 response time: {p50:.2f} ms")
print(f" P90 response time: {p90:.2f} ms")
print(f" P99 response time: {p99:.2f} ms")
# Also print overall summary
all_response_times = [r["elapsed_time"] for r in all_results if r["success"]]
total_successful = len([r for r in all_results if r["success"]])
total_failed = len([r for r in all_results if not r["success"]])
overall_avg = mean(all_response_times) if all_response_times else 0
if all_response_times:
overall_p50 = np.percentile(all_response_times, 50)
overall_p90 = np.percentile(all_response_times, 90)
overall_p99 = np.percentile(all_response_times, 99)
else:
overall_p50 = overall_p90 = overall_p99 = 0
print(
f"\nOverall Summary for RPS {rps}, Duration {duration_secs}s, "
f"Item Count {item_count}:"
)
print(f" Test duration: {duration_secs} seconds")
print(f" Server type: {SERVER_TYPE}")
print(f" HTTP mode: SINGLE_ITEM_SCORING")
print(f" Target RPS: {rps}")
print(f" Item count: {item_count}")
print(f" Distribution: {DISTRIBUTION}")
print(f" Unique requests generated: {NUM_UNIQUE_REQUESTS}")
print(f" Total requests sent: {num_requests}")
print(f" Successful requests: {total_successful}")
print(f" Failed requests: {total_failed}")
print(f" Time to send all requests: {send_duration:.2f} seconds")
print(f" Time for all requests to complete: {total_duration:.2f} seconds")
print(f" Average response time: {overall_avg:.2f} ms")
print(f" P50 response time: {overall_p50:.2f} ms")
print(f" P90 response time: {overall_p90:.2f} ms")
print(f" P99 response time: {overall_p99:.2f} ms\n")
return minute_results
###############################################################################
# MAIN
###############################################################################
async def run_benchmark(rps, duration_secs, item_count):
"""Run a single benchmark with the given RPS value."""
num_requests = int(rps * duration_secs)
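    # e.g. rps=10 over duration_secs=60 -> 600 requests for this run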
print(
f"Starting benchmark with RPS={rps}, Duration={duration_secs}s, "
f"Item Count={item_count}, num_requests={num_requests}"
)
print(f"Server Type: {SERVER_TYPE}")
print(f"HTTP Mode: SINGLE_ITEM_SCORING")
print(f"Profiling Enabled: {PROFILE}")
# Build requests in parallel (unmeasured)
all_requests = prepare_all_requests_parallel(num_requests, item_count)
results_queue = asyncio.Queue()
tasks = []
# Track timing for sending requests
send_start_time = asyncio.get_event_loop().time()
# HTTP implementation (open source only supports HTTP with /v1/score API)
async with aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=300)
) as session:
# Send START_PROFILE if profiling is enabled
if PROFILE:
await send_profile_request("START_PROFILE", item_count, session=session)
# Add progress bar for sending requests
with tqdm(
total=len(all_requests),
desc=f"Sending HTTP score requests at {rps} RPS",
unit="req",
) as pbar:
for i, score_data in enumerate(all_requests):
request_id = i + 1
tasks.append(
asyncio.create_task(
make_http_call(session, score_data, request_id, results_queue)
)
)
# Update progress bar
pbar.update(1)
# Throttle based on distribution
if i < len(all_requests) - 1:
if DISTRIBUTION == "CONSTANT":
interval = 1 / rps
await asyncio.sleep(interval)
elif DISTRIBUTION == "POISSON":
# For Poisson process, inter-arrival times follow
# exponential distribution
interval = random.expovariate(rps)
await asyncio.sleep(interval)
else:
raise ValueError(
f"Unknown distribution: {DISTRIBUTION}. "
f"Use 'CONSTANT' or 'POISSON'."
)
send_end_time = asyncio.get_event_loop().time()
send_duration = send_end_time - send_start_time
# Wait for all requests to complete with progress tracking
print(f"Waiting for {len(tasks)} HTTP score requests to complete...")
with tqdm(
total=len(tasks), desc="Completing HTTP score requests", unit="req"
) as completion_pbar:
            # Await each request task as it finishes and advance the progress bar.
            for task in asyncio.as_completed(tasks):
                await task
                completion_pbar.update(1)
# Send STOP_PROFILE if profiling is enabled
if PROFILE:
await send_profile_request("STOP_PROFILE", item_count, session=session)
completion_end_time = asyncio.get_event_loop().time()
total_duration = completion_end_time - send_start_time
return await process_results(
results_queue,
num_requests,
send_duration,
total_duration,
rps,
duration_secs,
item_count,
send_start_time,
)
async def main():
"""Main function that runs benchmarks for all RPS values."""
total_combinations = (
len(DURATION_SECS_VALUES) * len(RPS_VALUES) * len(ITEM_COUNT_VALUES)
)
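    # e.g. 2 duration values x 3 RPS values x 2 item counts -> 12 benchmark runs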
print(
f"Running benchmarks for {len(DURATION_SECS_VALUES)} duration "
f"values, {len(RPS_VALUES)} RPS values, and "
f"{len(ITEM_COUNT_VALUES)} item count values = "
f"{total_combinations} total combinations"
)
print(f"Server Type: {SERVER_TYPE}")
print(f"HTTP Mode: SINGLE_ITEM_SCORING")
print(f"Score API URL: {HTTP_URL}")
print(f"Query tokens per request: {SCORE_QUERY_TOKENS}")
print(f"Item tokens per item: {SCORE_ITEM_TOKENS}")
print(f"Items per request (batch size): {ITEM_COUNT_VALUES}")
print(f"Profiling Enabled: {PROFILE}")
print(f"Duration values: {DURATION_SECS_VALUES}")
print(f"RPS values: {RPS_VALUES}")
print(f"Item count values: {ITEM_COUNT_VALUES}")
print("=" * 80)
all_results = []
for duration_secs in DURATION_SECS_VALUES:
for rps in RPS_VALUES:
for item_count in ITEM_COUNT_VALUES:
result = await run_benchmark(rps, duration_secs, item_count)
all_results.extend(result) # Extend with minute results
# Print CSV header and results
print("\n" + "=" * 80)
print("FINAL CSV RESULTS:")
print("=" * 80)
# CSV Header
headers = [
"test_duration_secs",
"minute_interval",
"target_rps",
"item_count",
"server_type",
"distribution",
"unique_requests",
"total_requests",
"successful_requests",
"failed_requests",
"send_duration_secs",
"total_duration_secs",
"avg_response_time_ms",
"p50_response_time_ms",
"p90_response_time_ms",
"p99_response_time_ms",
]
print(",".join(headers))
# CSV Data
for result in all_results:
row = [
result["test_duration_secs"],
result["minute_interval"],
result["target_rps"],
result["item_count"],
result["server_type"],
result["distribution"],
result["unique_requests"],
result["total_requests"],
result["successful_requests"],
result["failed_requests"],
f"{result['send_duration_secs']:.2f}",
f"{result['total_duration_secs']:.2f}",
f"{result['avg_response_time_ms']:.2f}",
f"{result['p50_response_time_ms']:.2f}",
f"{result['p90_response_time_ms']:.2f}",
f"{result['p99_response_time_ms']:.2f}",
]
print(",".join(map(str, row)))
if __name__ == "__main__":
asyncio.run(main())
## Run benchmark
### Benchmark sglang
```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
python3 bench_sglang.py --num-questions 64
python3 bench_sglang.py --num-questions 32 --parallel 1
```
### Benchmark vLLM
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
```
python3 bench_other.py --backend vllm --num-questions 64
```
### Benchmark guidance
```
python3 bench_other.py --backend guidance --num-questions 32 --parallel 1 --n-ctx 4096 --model-path path/to/gguf
```
### Benchmark lmql
```
python3 bench_other.py --backend lmql --num-questions 32 --parallel 1
```
import argparse
import json
import time
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from tqdm import tqdm
from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
from sglang.utils import dump_state_text, read_jsonl
number = 5
def expand_tip(topic, tip, generate):
s = (
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
Paragraph: Incorporate physical activity into your daily routine. This doesn't necessarily mean intense gym workouts; it can be as simple as walking, cycling, or yoga. Regular exercise helps in maintaining a healthy weight, improves cardiovascular health, boosts mental health, and can enhance cognitive function, which is crucial for fields that require intense intellectual engagement.
Topic: building a campfire
Tip: Choose the Right Location
Paragraph: Always build your campfire in a safe spot. This means selecting a location that's away from trees, bushes, and other flammable materials. Ideally, use a fire ring if available. If you're building a fire pit, it should be on bare soil or on a bed of stones, not on grass or near roots which can catch fire underground. Make sure the area above is clear of low-hanging branches.
Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
return generate(s, max_tokens=128, stop=["\n\n"])
def suggest_tips(topic, generate):
s = "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
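    # Two-phase flow: first generate the short tips one at a time (each new tip is
    # appended to the prompt, so later tips condition on earlier ones), then expand
    # every tip into a full paragraph and stitch the paragraphs into the transcript.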
tips = []
for i in range(1, 1 + number):
s += f"{i}."
tip = generate(s, max_tokens=24, stop=[".", "\n"])
s += tip + ".\n"
tips.append(tip)
paragraphs = [expand_tip(topic, tip, generate=generate) for tip in tips]
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i - 1] + "\n"
return s
def main(args):
lines = read_jsonl(args.data_path)[: args.num_questions]
states = [None] * len(lines)
# Select backend
call_generate = partial(get_call_generate(args), temperature=0)
# Run requests
tic = time.perf_counter()
if args.backend != "lmql":
def get_one_answer(i):
states[i] = suggest_tips(lines[i]["topic"], call_generate)
if args.parallel == 1:
for i in tqdm(range(len(lines))):
get_one_answer(i)
else:
with ThreadPoolExecutor(args.parallel) as executor:
list(
tqdm(
executor.map(get_one_answer, list(range(len(lines)))),
total=len(lines),
)
)
else:
import asyncio
from lmql_funcs import suggest_tips_async
async def get_one_answer_async(i):
states[i] = await suggest_tips_async(lines[i]["topic"], call_generate)
batches = []
for i in range(0, len(lines), args.parallel):
batches.append(list(range(i, min(i + args.parallel, len(lines)))))
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
for batch in tqdm(batches):
loop.run_until_complete(
asyncio.gather(*[get_one_answer_async(i) for i in batch])
)
latency = time.perf_counter() - tic
    # Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "tip_suggestion",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="topic.jsonl")
parser.add_argument("--num-questions", type=int, default=100)
args = add_common_other_args_and_parse(parser)
main(args)
import argparse
import json
import time
import sglang as sgl
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text, read_jsonl
number = 5
@sgl.function
def expand_tip(s, topic, tip):
s += (
"""Please expand a tip for a topic into a detailed paragraph.
Topic: staying healthy
Tip: Regular Exercise
Paragraph: Incorporate physical activity into your daily routine. This doesn't necessarily mean intense gym workouts; it can be as simple as walking, cycling, or yoga. Regular exercise helps in maintaining a healthy weight, improves cardiovascular health, boosts mental health, and can enhance cognitive function, which is crucial for fields that require intense intellectual engagement.
Topic: building a campfire
Tip: Choose the Right Location
Paragraph: Always build your campfire in a safe spot. This means selecting a location that's away from trees, bushes, and other flammable materials. Ideally, use a fire ring if available. If you're building a fire pit, it should be on bare soil or on a bed of stones, not on grass or near roots which can catch fire underground. Make sure the area above is clear of low-hanging branches.
Topic: writing a blog post
Tip: structure your content effectively
Paragraph: A well-structured post is easier to read and more enjoyable. Start with an engaging introduction that hooks the reader and clearly states the purpose of your post. Use headings and subheadings to break up the text and guide readers through your content. Bullet points and numbered lists can make information more digestible. Ensure each paragraph flows logically into the next, and conclude with a summary or call-to-action that encourages reader engagement.
Topic: """
+ topic
+ "\nTip: "
+ tip
+ "\nParagraph:"
)
s += sgl.gen("paragraph", max_tokens=128, stop=["\n\n"], temperature=0)
@sgl.function
def suggest_tips(s, topic):
s += "Please act as a helpful assistant. Your job is to provide users with useful tips on a specific topic.\n"
s += "USER: Give some tips for " + topic + ".\n"
s += (
"ASSISTANT: Okay. Here are "
+ str(number)
+ " concise tips, each under 8 words:\n"
)
paragraphs = []
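    # Same two-phase flow as the non-sglang version: generate each short tip, expand it
    # with a separate expand_tip call, then stitch the expanded paragraphs (read back via
    # ["paragraph"]) into the transcript.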
for i in range(1, 1 + number):
s += f"{i}." + sgl.gen(f"tip_{i}", max_tokens=24, stop=[".", "\n"]) + ".\n"
paragraphs.append(expand_tip(topic=topic, tip=s[f"tip_{i}"]))
for i in range(1, 1 + number):
s += f"Tip {i}:" + paragraphs[i - 1]["paragraph"] + "\n"
def main(args):
lines = read_jsonl(args.data_path)[: args.num_questions]
arguments = [{"topic": l["topic"]} for l in lines]
# Select backend
sgl.set_default_backend(select_sglang_backend(args))
# Run requests
tic = time.perf_counter()
states = suggest_tips.run_batch(
arguments, temperature=0, num_threads=args.parallel, progress_bar=True
)
latency = time.perf_counter() - tic
    # Report latency
print(f"Latency: {latency:.3f}")
# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(args.result_file, "a") as fout:
value = {
"task": "tip_suggestion",
"backend": args.backend,
"num_gpus": 1,
"latency": round(latency, 3),
"num_requests": args.num_questions,
"other": {
"num_questions": args.num_questions,
"parallel": args.parallel,
},
}
fout.write(json.dumps(value) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="topic.jsonl")
parser.add_argument("--num-questions", type=int, default=100)
args = add_common_sglang_args_and_parse(parser)
main(args)