Unverified commit 2e341cd4 authored by zhyncs, committed by GitHub

misc: add pre-commit config (#637)

parent a8552cb1
+repos:
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+  - repo: https://github.com/psf/black
+    rev: stable
+    hooks:
+      - id: black
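Note: the file added above is the repository's .pre-commit-config.yaml (the filename pre-commit looks for), registering isort and black as commit-time hooks. Contributors enable them with "pre-commit install" and can reformat the whole tree with "pre-commit run --all-files", which is presumably how the sweeping reformat in the hunks below was produced. One caveat: "stable" is a mutable ref that newer pre-commit releases warn about, so pinning a released black tag is the usual practice.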
@@ -312,8 +312,8 @@ def main(args: argparse.Namespace):
         np.sum([output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time
     )
-    #latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
-    #print(latencies)
+    # latencies = [round(latency, 2) for _, _, latency in REQUEST_LATENCY]
+    # print(latencies)
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
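The hunk above is black's comment normalization: black enforces a space after the # marker, so commented-out code like #latencies reads as a comment instead of running into the hash.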
@@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio):
     )
     for i in redirect_indices:
         target_idx = np.random.choice(min(i * 2 + 100, num_lines))
-        lines[
-            i
-        ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
+        lines[i] = (
+            f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
+        )
         redirects[i] = target_idx
     # Build links and find sources
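Here black prefers wrapping the right-hand side: rather than splitting the lines[i] subscript across three lines, it keeps the assignment target intact and parenthesizes the long f-string value.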
@@ -3,6 +3,7 @@ Usage:
 export ANTHROPIC_API_KEY=sk-******
 python3 anthropic_example_chat.py
 """
+
 import sglang as sgl
@@ -30,7 +31,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -39,13 +40,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
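The run_batch rewrite above, and the stream=True to stream=True, one-liners, both follow from black's magic trailing comma: once an argument list or collection ends with a trailing comma, black keeps it exploded with one element per line. A minimal, hypothetical sketch (plain print standing in for any call):

# Without a trailing comma, black collapses a call that fits on one line.
print("What is the capital of France?", "Paris")

# With a magic trailing comma after the last argument, black keeps the
# call exploded, one argument per line, even though it would fit.
print(
    "What is the capital of France?",
    "Paris",
)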
@@ -9,15 +9,14 @@ import sglang as sgl
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-        """
+    s += """
 \n\nHuman: What is the capital of France?
 \n\nAssistant: Paris
 \n\nHuman: What is the capital of Germany?
 \n\nAssistant: Berlin
 \n\nHuman: What is the capital of Italy?
 \n\nAssistant: Rome
-""")
+"""
     s += "\n\nHuman: " + question + "\n"
     s += "\n\nAssistant:" + sgl.gen("answer", temperature=0)
@@ -33,8 +32,8 @@ def single():
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
     for s in states:
         print(s["answer"])
@@ -3,9 +3,11 @@ Usage:
 export AZURE_OPENAI_API_KEY=sk-******
 python3 openai_example_chat.py
 """
-import sglang as sgl
+
 import os
+
+import sglang as sgl


 @sgl.function
 def multi_turn_question(s, question_1, question_2):
@@ -32,7 +34,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -41,13 +43,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
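For readers running these hunks outside the diff: multi_turn_question is defined earlier in each example file. A sketch of what that definition looks like, assuming sglang's documented chat primitives (the real files may differ in system prompt or token limits):

import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    # Assumed body, not copied from this commit: builds a two-turn chat
    # and generates one answer per user turn.
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))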
@@ -3,6 +3,7 @@ Usage:
 export GCP_PROJECT_ID=******
 python3 gemini_example_chat.py
 """
+
 import sglang as sgl
@@ -30,7 +31,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -39,13 +40,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
@@ -9,15 +9,14 @@ import sglang as sgl
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-        """The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
@@ -33,8 +32,8 @@ def single():
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
     for s in states:
         print(s["answer"])
@@ -3,6 +3,7 @@ Usage:
 export GCP_PROJECT_ID=******
 python3 gemini_example_multimodal_chat.py
 """
+
 import sglang as sgl
@@ -19,7 +20,7 @@ if __name__ == "__main__":
         image_file1="./images/cat.jpeg",
         image_file2="./images/dog.jpeg",
         question="Describe difference of the two images in one sentence.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter("answer"):
@@ -3,6 +3,7 @@ Usage:
 export OPENAI_API_KEY=sk-******
 python3 openai_example_chat.py
 """
+
 import sglang as sgl
@@ -31,7 +32,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -40,13 +41,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
@@ -9,15 +9,14 @@ import sglang as sgl
 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-        """The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
@@ -33,8 +32,8 @@ def single():
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -42,10 +41,12 @@ def stream():
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
     for s in states:
         print(s["answer"])
@@ -3,9 +3,11 @@ Usage:
 export OPENROUTER_API_KEY=sk-******
 python3 together_example_chat.py
 """
-import sglang as sgl
+
 import os
+
+import sglang as sgl


 @sgl.function
 def multi_turn_question(s, question_1, question_2):
@@ -2,6 +2,7 @@
 Usage:
 python3 srt_example_chat.py
 """
+
 import sglang as sgl
@@ -29,7 +30,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -38,13 +39,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
@@ -2,20 +2,20 @@
 Usage:
 python3 srt_example_complete.py
 """
+
 import sglang as sgl


 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-        """The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
@@ -31,8 +31,8 @@ def single():
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -40,10 +40,12 @@ def stream():
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
     for s in states:
         print(s["answer"])
"""
Usage: python3 srt_example_llava.py
"""
import sglang as sgl
@@ -12,9 +13,8 @@ def image_qa(s, image_path, question):
 def single():
     state = image_qa.run(
-        image_path="images/cat.jpeg",
-        question="What is this?",
-        max_new_tokens=128)
+        image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128
+    )
     print(state["answer"], "\n")
@@ -23,7 +23,8 @@ def stream():
         image_path="images/cat.jpeg",
         question="What is this?",
         max_new_tokens=64,
-        stream=True)
+        stream=True,
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -33,8 +34,8 @@ def stream():
 def batch():
     states = image_qa.run_batch(
         [
-            {"image_path": "images/cat.jpeg", "question":"What is this?"},
-            {"image_path": "images/dog.jpeg", "question":"What is this?"},
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
         ],
         max_new_tokens=128,
     )
@@ -43,8 +44,10 @@ def batch():
 if __name__ == "__main__":
-    runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b",
-                          tokenizer_path="llava-hf/llava-1.5-7b-hf")
+    runtime = sgl.Runtime(
+        model_path="liuhaotian/llava-v1.6-vicuna-7b",
+        tokenizer_path="llava-hf/llava-1.5-7b-hf",
+    )
     sgl.set_default_backend(runtime)
     print(f"chat template: {runtime.endpoint.chat_template.name}")
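Unlike the API-backed examples above, this script boots a local sgl.Runtime over the LLaVA weights and registers it as the default backend; the reformat simply explodes the two-keyword constructor call, one argument per line with a trailing comma.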
@@ -3,6 +3,7 @@ Usage: python3 srt_example_yi_vl.py
 Requirements: transformers==4.38
 """
+
 import sglang as sgl
@@ -17,7 +18,8 @@ def single():
         image_path="images/cat.jpeg",
         question="What is this?",
         max_new_tokens=64,
-        stop="###")
+        stop="###",
+    )
     print(state["answer"], "\n")
@@ -27,7 +29,8 @@ def stream():
         question="What is this?",
         max_new_tokens=64,
         stream=True,
-        stop="###")
+        stop="###",
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -37,11 +40,11 @@ def stream():
 def batch():
     states = image_qa.run_batch(
         [
-            {"image_path": "images/cat.jpeg", "question":"What is this?"},
-            {"image_path": "images/dog.jpeg", "question":"What is this?"},
+            {"image_path": "images/cat.jpeg", "question": "What is this?"},
+            {"image_path": "images/dog.jpeg", "question": "What is this?"},
         ],
         max_new_tokens=64,
-        stop="###"
+        stop="###",
     )
     for s in states:
         print(s["answer"], "\n")
@@ -3,9 +3,11 @@ Usage:
 export TOGETHER_API_KEY=sk-******
 python3 together_example_chat.py
 """
-import sglang as sgl
+
 import os
+
+import sglang as sgl


 @sgl.function
 def multi_turn_question(s, question_1, question_2):
@@ -32,7 +34,7 @@ def stream():
     state = multi_turn_question.run(
         question_1="What is the capital of the United States?",
         question_2="List two local attractions.",
-        stream=True
+        stream=True,
     )
     for out in state.text_iter():
@@ -41,13 +43,18 @@ def stream():
 def batch():
-    states = multi_turn_question.run_batch([
-        {"question_1": "What is the capital of the United States?",
-         "question_2": "List two local attractions."},
-        {"question_1": "What is the capital of France?",
-         "question_2": "What is the population of this city?"},
-    ])
+    states = multi_turn_question.run_batch(
+        [
+            {
+                "question_1": "What is the capital of the United States?",
+                "question_2": "List two local attractions.",
+            },
+            {
+                "question_1": "What is the capital of France?",
+                "question_2": "What is the population of this city?",
+            },
+        ]
+    )
     for s in states:
         print(s.messages())
@@ -4,21 +4,21 @@ export TOGETHER_API_KEY=sk-******
 python3 together_example_complete.py
 """
-import sglang as sgl
+
 import os
+
+import sglang as sgl


 @sgl.function
 def few_shot_qa(s, question):
-    s += (
-        """The following are questions with answers.
+    s += """The following are questions with answers.
 Q: What is the capital of France?
 A: Paris
 Q: What is the capital of Germany?
 A: Berlin
 Q: What is the capital of Italy?
 A: Rome
-""")
+"""
     s += "Q: " + question + "\n"
     s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
@@ -34,8 +34,8 @@ def single():
 def stream():
     state = few_shot_qa.run(
-        question="What is the capital of the United States?",
-        stream=True)
+        question="What is the capital of the United States?", stream=True
+    )
     for out in state.text_iter("answer"):
         print(out, end="", flush=True)
@@ -43,10 +43,12 @@ def stream():
 def batch():
-    states = few_shot_qa.run_batch([
-        {"question": "What is the capital of the United States?"},
-        {"question": "What is the capital of China?"},
-    ])
+    states = few_shot_qa.run_batch(
+        [
+            {"question": "What is the capital of the United States?"},
+            {"question": "What is the capital of China?"},
+        ]
+    )
     for s in states:
         print(s["answer"])
@@ -2,7 +2,9 @@
 Usage:
 python3 async_io.py
 """
+
 import asyncio
+
 from sglang import Runtime
@@ -14,7 +16,10 @@ async def generate(
     tokenizer = engine.get_tokenizer()
     messages = [
-        {"role": "system", "content": "You will be given question answer tasks.",},
+        {
+            "role": "system",
+            "content": "You will be given question answer tasks.",
+        },
         {"role": "user", "content": prompt},
     ]
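The stray comma inside the one-line dict is the magic trailing comma again: because the original literal ended with ,}, black expands it to one key per line rather than re-collapsing it.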
@@ -36,5 +41,5 @@ if __name__ == "__main__":
     prompt = "Who is Alan Turing?"
     sampling_params = {"max_new_tokens": 128}
     asyncio.run(generate(runtime, prompt, sampling_params))
     runtime.shutdown()
@@ -33,8 +33,7 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
     )
     logprobs = step_0.get_meta_info("get_top_k")["decode_top_logprobs"][0]
-    print("Decoding step 0:",
-          ", ".join(pformat(token[2]) for token in logprobs))
+    print("Decoding step 0:", ", ".join(pformat(token[2]) for token in logprobs))
     for idx, (f, token) in enumerate(zip(forks, logprobs)):
         logprob, token_id, text = token
         f += text
@@ -56,17 +55,9 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
         )
         # calculate probability disparity between the top and secondary tokens
-        x1s = [
-            exp(xt[0][0])
-            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
-        x2s = [
-            exp(xt[1][0])
-            for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
-        tokens = [
-            xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]
-        ]
+        x1s = [exp(xt[0][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
+        x2s = [exp(xt[1][0]) for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
+        tokens = [xt[0][2] for xt in f.get_meta_info("answer")["decode_top_logprobs"]]
         delta = (sum(x1s) - sum(x2s)) / len(x1s)
         # extract the answer span (without the '<|end_of_text|>' token)
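These comprehensions compute the CoT-decoding confidence score: for each decoded answer token, the probability gap between the top candidate and the runner-up, averaged over the span. A runnable toy illustration, with hypothetical logprob tuples in the same (logprob, token_id, text) layout the code reads:

from math import exp

# Hypothetical decode_top_logprobs: one inner list per decoded token,
# candidates ranked by probability.
decode_top_logprobs = [
    [(-0.1, 101, " 7"), (-2.6, 102, " 6")],
    [(-0.2, 103, " dozen"), (-2.1, 104, " dozens")],
]
x1s = [exp(step[0][0]) for step in decode_top_logprobs]  # top-1 probabilities
x2s = [exp(step[1][0]) for step in decode_top_logprobs]  # runner-up probabilities
delta = (sum(x1s) - sum(x2s)) / len(x1s)
print(f"delta = {delta:.3f}")  # larger gap => more confident decoding path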
@@ -79,42 +70,45 @@ def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
             top_logprobs_num=2,
             return_text_in_logprobs=True,
         )
-        answer = answer_forks[idx]['answer_span'].replace('\n', ' ').strip(':')
+        answer = answer_forks[idx]["answer_span"].replace("\n", " ").strip(":")
         print(
             f"{YELLOW}Path #{idx} {pformat(text)}[{exp(logprob):.3f}] (score={delta}, answer={answer}){CLEAR}"
         )
-        generated_text = str(answer_forks[idx])[len("ProgramState("):-1]
+        generated_text = str(answer_forks[idx])[len("ProgramState(") : -1]
         print(f"{BLUE}{pformat(generated_text)}{CLEAR}")
         if verbose:
             answer_tokens = [
-                xt[0][2] for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                xt[0][2]
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
             ]
             answer_x1s = [
-                exp(xt[0][0]) for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                exp(xt[0][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
            ]
             answer_x2s = [
-                exp(xt[1][0]) for xt in answer_forks[idx].get_meta_info(
-                    "answer_span")["decode_top_logprobs"]
+                exp(xt[1][0])
+                for xt in answer_forks[idx].get_meta_info("answer_span")[
+                    "decode_top_logprobs"
+                ]
             ]
             for token, x1, x2 in zip(tokens, x1s, x2s):
-                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
-                      end="")
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
             print("\n===========")
             for token, x1, x2 in zip(answer_tokens, answer_x1s, answer_x2s):
-                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})",
-                      end="")
+                print(f" {GREEN}{pformat(token)}{CLEAR}({x1:.3f}-{x2:.3f})", end="")
             print()


 sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

 state = cot_decoding.run(
-    question=
-    r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
+    question=r"Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
     get_top_k=10,
     is_chat_model=True,
     verbose=False,
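(For reference, the sample question works out to 3 eggs × 28 days = 84 eggs, i.e. 7 dozen.)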