Commit 909abb58 authored by maxiao's avatar maxiao
Browse files

adapt to sglang v0.5.2rc1 on dcu

parents
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.Anthropic("claude-3-haiku-20240307"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_complete.py
"""
import sglang as sgl
@sgl.function
def few_shot_qa(s, question):
s += """
\n\nHuman: What is the capital of France?
\n\nAssistant: Paris
\n\nHuman: What is the capital of Germany?
\n\nAssistant: Berlin
\n\nHuman: What is the capital of Italy?
\n\nAssistant: Rome
"""
s += "\n\nHuman: " + question + "\n"
s += "\n\nAssistant:" + sgl.gen("answer", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?", stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch(
[
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
]
)
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.Anthropic("claude-3-haiku-20240307"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export AZURE_OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import os
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="azure-gpt-4",
api_version="2023-07-01-preview",
azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
api_key=os.environ["AZURE_OPENAI_API_KEY"],
is_azure=True,
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_complete.py
"""
import sglang as sgl
@sgl.function
def few_shot_qa(s, question):
s += """The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
"""
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?", stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch(
[
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
]
)
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_multimodal_chat.py
"""
import sglang as sgl
@sgl.function
def image_qa(s, image_file1, image_file2, question):
s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
s += sgl.assistant(sgl.gen("answer", max_tokens=256))
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
state = image_qa.run(
image_file1="./images/cat.jpeg",
image_file2="./images/dog.jpeg",
question="Describe difference of the two images in one sentence.",
stream=True,
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
print(state["answer"])
"""
Usage:
python3 local_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()
"""
Usage:
python3 local_example_complete.py
"""
import sglang as sgl
@sgl.function
def few_shot_qa(s, question):
s += """The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
"""
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?", stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch(
[
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
]
)
for s in states:
print(s["answer"])
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()
"""
Usage: python3 local_example_llava_next.py
"""
import sglang as sgl
from sglang.lang.chat_template import get_chat_template
@sgl.function
def image_qa(s, image_path, question):
s += sgl.user(sgl.image(image_path) + question)
s += sgl.assistant(sgl.gen("answer"))
def single():
state = image_qa.run(
image_path="images/cat.jpeg", question="What is this?", max_new_tokens=128
)
print(state["answer"], "\n")
def stream():
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64,
stream=True,
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = image_qa.run_batch(
[
{"image_path": "images/cat.jpeg", "question": "What is this?"},
{"image_path": "images/dog.jpeg", "question": "What is this?"},
],
max_new_tokens=128,
)
for s in states:
print(s["answer"], "\n")
if __name__ == "__main__":
import multiprocessing as mp
mp.set_start_method("spawn", force=True)
runtime = sgl.Runtime(model_path="lmms-lab/llama3-llava-next-8b")
runtime.endpoint.chat_template = get_chat_template("llama-3-instruct-llava")
# Or you can use the 72B model
# runtime = sgl.Runtime(model_path="lmms-lab/llava-next-72b", tp_size=8)
# runtime.endpoint.chat_template = get_chat_template("chatml-llava")
sgl.set_default_backend(runtime)
print(f"chat template: {runtime.endpoint.chat_template.name}")
# Or you can use API models
# sgl.set_default_backend(sgl.OpenAI("gpt-4-vision-preview"))
# sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_complete.py
"""
import sglang as sgl
@sgl.function
def few_shot_qa(s, question):
s += """The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
"""
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?", stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch(
[
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
]
)
for s in states:
print(s["answer"])
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=1024, n=2))
s += sgl.user(question_2)
s += sgl.assistant(
sgl.gen(
"answer_2",
max_tokens=1024,
)
)
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
print("\n-- answer_2 --\n", state["answer_2"])
assert isinstance(state["answer_1"], list)
assert len(state["answer_1"]) == 2
assert isinstance(state["answer_2"], str)
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
print("\n-- answer_1 --\n", s["answer_1"])
print("\n-- answer_2 --\n", s["answer_2"])
assert isinstance(s["answer_1"], list)
assert len(s["answer_1"]) == 2
assert isinstance(s["answer_2"], str)
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("o1"))
# Run a single request
print("\n========== single ==========\n")
single()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=100))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2"))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.OpenAI("o1"))
# Run a single request
print("\n========== single ==========\n")
single()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export OPENROUTER_API_KEY=sk-******
python3 together_example_chat.py
"""
import os
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="google/gemma-7b-it:free",
base_url="https://openrouter.ai/api/v1",
api_key=os.environ.get("OPENROUTER_API_KEY"),
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_chat.py
"""
import os
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True,
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch(
[
{
"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions.",
},
{
"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?",
},
]
)
for s in states:
print(s.messages())
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_complete.py
"""
import os
import sglang as sgl
@sgl.function
def few_shot_qa(s, question):
s += """The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
"""
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?", stream=True
)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch(
[
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
]
)
for s in states:
print(s["answer"])
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
is_chat_model=False,
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
import sglang as sgl

# Regex constraining the generation to a fixed-schema JSON object
# (Harry Potter character card with Chinese field names/values).
character_regex = (
    r"""\{\n"""
    + r"""    "姓名": "[^"]{1,32}",\n"""
    + r"""    "学院": "(格兰芬多|赫奇帕奇|拉文克劳|斯莱特林)",\n"""
    + r"""    "血型": "(纯血|混血|麻瓜)",\n"""
    + r"""    "职业": "(学生|教师|傲罗|魔法部|食死徒|凤凰社成员)",\n"""
    + r"""    "魔杖": \{\n"""
    + r"""        "材质": "[^"]{1,32}",\n"""
    + r"""        "杖芯": "[^"]{1,32}",\n"""
    + r"""        "长度": [0-9]{1,2}\.[0-9]{0,2}\n"""
    + r"""    \},\n"""
    + r"""    "存活": "(存活|死亡)",\n"""
    + r"""    "博格特": "[^"]{1,32}"\n"""
    + r"""\}"""
)


@sgl.function
def character_gen(s, name):
    """Generate a JSON character card for *name*, constrained by character_regex."""
    s += name + " 是一名哈利波特系列小说中的角色。请填写以下关于这个角色的信息。"
    # One-shot example showing the expected JSON shape.
    s += """\
这是一个例子
{
    "姓名": "哈利波特",
    "学院": "格兰芬多",
    "血型": "混血",
    "职业": "学生",
    "魔杖": {
        "材质": "冬青木",
        "杖芯": "凤凰尾羽",
        "长度": 11.0
    },
    "存活": "存活",
    "守护神": "麋鹿",
    "博格特": "摄魂怪"
}
"""
    s += f"现在请你填写{name}的信息:\n"
    s += sgl.gen("json_output", max_tokens=256, regex=character_regex)


def main():
    """Connect to a local sglang server and generate one character card."""
    backend = sgl.RuntimeEndpoint("http://localhost:30000")
    sgl.set_default_backend(backend)

    ret = character_gen.run(name="赫敏格兰杰", temperature=0)
    print(ret.text())


if __name__ == "__main__":
    main()
"""
Usage:
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
python choices_logprob.py
"""
import sglang as sgl
@sgl.function
def tool_use(s, question):
s += "To answer this question: " + question + ", "
s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"])
def main():
# Run one case
question = "What is 5 + 5?"
state = tool_use.run(question)
print("questions:", question)
print("choice:", state["tool"])
meta_info = state.get_meta_info("tool")
print("logprobs of choice 1", meta_info["input_token_logprobs"][0])
print("logprobs of choice 2", meta_info["input_token_logprobs"][1])
print("-" * 50)
# Run a batch
questions = [
"What is 5 + 6?",
"Who is Michael Jordan?",
]
states = tool_use.run_batch([{"question": q} for q in questions])
for question, state in zip(questions, states):
print("questions:", question)
print("choice:", state["tool"])
meta_info = state.get_meta_info("tool")
print("logprobs of choice 1", meta_info["input_token_logprobs"][0])
print("logprobs of choice 2", meta_info["input_token_logprobs"][1])
print("-" * 50)
if __name__ == "__main__":
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment