Unverified Commit bb824da4 authored by Lianmin Zheng, committed by GitHub

Add Together and AzureOpenAI examples (#184)

parent 93121324
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
"""
Usage:
export AZURE_OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
import os
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="azure-gpt-4",
api_version="2023-07-01-preview",
azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
api_key=os.environ["AZURE_OPENAI_API_KEY"],
is_azure=True,
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
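For reference, sgl.OpenAI(..., is_azure=True) forwards its extra keyword
arguments to openai.AzureOpenAI (see the backend diff below), so the example
above is roughly equivalent to constructing the raw client yourself. A minimal
sketch reusing the deployment name and endpoint from the example; the variable
names are illustrative:

    # Raw openai-python equivalent of the backend setup above.
    # "azure-gpt-4" is the Azure *deployment* name, not a base model id.
    import os
    import openai

    client = openai.AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
    )
    resp = client.chat.completions.create(
        model="azure-gpt-4",
        messages=[{"role": "user", "content": "What is the capital of the United States?"}],
        max_tokens=256,
    )
    print(resp.choices[0].message.content)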
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
@@ -24,7 +24,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
@@ -22,7 +22,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_chat.py
"""
import sglang as sgl
import os
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("\n-- answer_1 --\n", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
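Together's endpoint is OpenAI-compatible, which is why the only changes
relative to the Azure example are base_url and the API key; no Together-specific
SDK is needed. A minimal sketch of the same request through the raw client,
using the endpoint and model name from the example above:

    # Direct openai-python call against Together's OpenAI-compatible API.
    import os
    import openai

    client = openai.OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=os.environ.get("TOGETHER_API_KEY"),
    )
    resp = client.chat.completions.create(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of the United States?"},
        ],
        max_tokens=256,
    )
    print(resp.choices[0].message.content)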
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_complete.py
"""
import sglang as sgl
import os
@sgl.function
def few_shot_qa(s, question):
s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()
assert "washington" in answer, f"answer: {state['answer']}"
print(state.text())
def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])
for s in states:
print(s["answer"])
if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
is_chat_model=False,
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
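With is_chat_model=False the backend sends the accumulated program text to the
completions endpoint as one flat string instead of a message list. A sketch of
the prompt few_shot_qa builds for single(); the model continues after "A:" and
stop="\n" cuts generation at the end of the answer line:

    # The flat completion prompt assembled for
    # question="What is the capital of the United States?".
    prompt = (
        "The following are questions with answers.\n"
        "Q: What is the capital of France?\n"
        "A: Paris\n"
        "Q: What is the capital of Germany?\n"
        "A: Berlin\n"
        "Q: What is the capital of Italy?\n"
        "A: Rome\n"
        "Q: What is the capital of the United States?\n"
        "A:"
    )
    # temperature=0 makes the continuation greedy, which is why single()
    # can assert that "washington" appears in the answer.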
@@ -4,7 +4,7 @@ from typing import Callable, List, Optional, Union
 import numpy as np
 
 from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
+from sglang.lang.chat_template import get_chat_template_by_model_path, ChatTemplate
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
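This import swap is what lets the backend pick a chat template from the model
name instead of always using the default. A sketch of the lookup, assuming
get_chat_template_by_model_path falls back to the default template when no
registered pattern matches the given path:

    # Illustrative lookup: a non-OpenAI model path resolves to whichever
    # registered template matches it, else the default template.
    from sglang.lang.chat_template import get_chat_template_by_model_path

    template = get_chat_template_by_model_path("mistralai/Mixtral-8x7B-Instruct-v0.1")
    print(template.name)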
@@ -41,23 +41,39 @@ INSTRUCT_MODEL_NAMES = [
 
 class OpenAI(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
+    def __init__(self, model_name: str,
+                 is_chat_model: Optional[bool] = None,
+                 chat_template: Optional[ChatTemplate] = None,
+                 is_azure: bool = False,
+                 *args, **kwargs):
         super().__init__()
 
         if isinstance(openai, Exception):
             raise openai
 
-        self.client = openai.OpenAI(*args, **kwargs)
+        if is_azure:
+            self.client = openai.AzureOpenAI(*args, **kwargs)
+        else:
+            self.client = openai.OpenAI(*args, **kwargs)
+
         self.model_name = model_name
-        self.tokenizer = tiktoken.encoding_for_model(model_name)
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
         self.logit_bias_int = create_logit_bias_int(self.tokenizer)
 
-        if model_name in INSTRUCT_MODEL_NAMES:
-            self.is_chat_model = False
+        self.chat_template = chat_template or get_chat_template_by_model_path(model_name)
+
+        if is_chat_model is not None:
+            self.is_chat_model = is_chat_model
         else:
-            self.is_chat_model = True
+            if model_name in INSTRUCT_MODEL_NAMES:
+                self.is_chat_model = False
+            else:
+                self.is_chat_model = True
 
-        self.chat_template = get_chat_template("default")
+        self.chat_begin_str = self.chat_template.role_prefix_and_suffix["assistant"][0]
 
     def get_chat_template(self):
         return self.chat_template
 
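Two of the __init__ changes above are what make non-OpenAI endpoints usable:
is_chat_model can now be forced explicitly (the Together completion example
passes is_chat_model=False), and the tokenizer lookup no longer crashes on
model names tiktoken does not know. A sketch of the fallback behavior:

    # tiktoken only recognizes OpenAI model names; anything else (e.g. the
    # Mixtral id used in the Together examples) raises KeyError, and the
    # backend falls back to the cl100k_base encoding. The encoding is only
    # used for logit-bias bookkeeping, so an approximate one is acceptable.
    import tiktoken

    def tokenizer_for(model_name: str):
        try:
            return tiktoken.encoding_for_model(model_name)
        except KeyError:
            return tiktoken.get_encoding("cl100k_base")

    print(tokenizer_for("gpt-4").name)                                 # cl100k_base
    print(tokenizer_for("mistralai/Mixtral-8x7B-Instruct-v0.1").name)  # cl100k_base (fallback)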
@@ -69,7 +85,7 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith("ASSISTANT:"):
+                if not s.text_.endswith(self.chat_begin_str):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -122,7 +138,11 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                assert s.text_.endswith("ASSISTANT:")
+                if not s.text_.endswith(self.chat_begin_str):
+                    raise RuntimeError(
+                        "This use case is not supported. "
+                        "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
+                    )
                 prompt = s.messages_
             else:
                 prompt = s.text_
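Both hunks above replace a hard-coded "ASSISTANT:" check with chat_begin_str,
the assistant role prefix of whatever chat template is active, so the
"gen must directly follow assistant" rule now works for any template. An
illustration of what the check accepts and rejects; the second program is a
hypothetical example, not taken from this commit:

    # Supported: gen() is the entire assistant message body, so the
    # accumulated text ends with the assistant prefix when gen() runs.
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))

    # Rejected for chat models: extra text after the assistant prefix means
    # s.text_ no longer ends with chat_begin_str, raising the RuntimeError.
    s += sgl.assistant("Sure. " + sgl.gen("answer_1", max_tokens=256))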
@@ -241,7 +261,10 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
             messages=prompt, stream=True, **kwargs
         )
         for ret in generator:
-            content = ret.choices[0].delta.content
+            try:
+                content = ret.choices[0].delta.content
+            except IndexError:
+                content = None
             yield content or "", {}
     else:
         generator = client.completions.create(
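The try/except in the streaming path guards against chunks whose choices list
is empty, which some OpenAI-compatible streams (notably Azure, e.g. content
filter metadata chunks) can emit. A sketch of an equivalent defensive reader;
iter_stream_text is an illustrative name, not part of this commit:

    # Equivalent to the guard above, written as an explicit emptiness check:
    # chunks with no choices and deltas with content=None both become "".
    def iter_stream_text(generator):
        for chunk in generator:
            if not chunk.choices:
                continue
            yield chunk.choices[0].delta.content or ""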