Commit 22085081 authored by Lianmin Zheng (parent f6d40df0)
## SRT Unit Tests
### Low-level API
```
cd sglang/test/srt/model
python3 test_llama_low_api.py
python3 test_llama_extend.py
python3 test_llava_low_api.py
python3 bench_llama_low_api.py
```
### High-level API
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 test_srt_backend.py
```
### Performance
#### MMLU
```
cd benchmark/mmlu
```
Follow README.md to download the data.
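If you only need the files, here is a minimal sketch of the download step (the URL is an assumption based on the standard MMLU tarball; `benchmark/mmlu/README.md` is authoritative):
```python
# Minimal sketch: fetch and unpack the MMLU data tarball.
# The URL is an assumption; follow benchmark/mmlu/README.md if it differs.
import tarfile
import urllib.request

url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
urllib.request.urlretrieve(url, "data.tar")
with tarfile.open("data.tar") as f:
    f.extractall()
```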
```
python3 bench_sglang.py --nsub 3
# Expected performance on A10G
# Total latency: 8.200
# Average accuracy: 0.413
```
### More Models
#### LLaVA
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
```
cd benchmark/llava_bench
python3 bench_sglang.py
```
## SGLang Unit Tests
```
export ANTHROPIC_API_KEY=
export OPENAI_API_KEY=
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 run_all.py
```
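If `run_all.py` fails to connect, a quick smoke test can confirm the server launched above is reachable (a minimal sketch using the `RuntimeEndpoint` backend; the port must match the one passed to `launch_server`):
```python
# Minimal smoke test against the local server (assumed at localhost:30000).
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def ping(s):
    s += "Say hello." + sgl.gen("out", max_tokens=8)

print(ping.run()["out"])
```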
```python
# Multi-turn chat with the Anthropic backend.
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])
```
```python
# Few-shot QA with the Anthropic completions API; the prompt uses the
# "\n\nHuman:" / "\n\nAssistant:" turn markers that claude-2 expects.
from sglang import function, gen, set_default_backend, Anthropic

@function
def few_shot_qa(s, question):
    s += (
"""
\n\nHuman: What is the capital of France?
\n\nAssistant: Paris
\n\nHuman: What is the capital of Germany?
\n\nAssistant: Berlin
\n\nHuman: What is the capital of Italy?
\n\nAssistant: Rome
""")
    s += "\n\nHuman: " + question + "\n"
    s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0)

set_default_backend(Anthropic("claude-2"))

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())
```
```python
# Streaming variant of the multi-turn chat example: print tokens as
# they arrive instead of waiting for the full response.
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
```python
# Stream a single generated variable, synchronously or with asyncio.
import asyncio

import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

def stream_a_variable():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    # Stream only the tokens generated for "answer_2".
    for out in state.text_iter(var_name="answer_2"):
        print(out, end="", flush=True)
    print()

async def async_stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    async for out in state.text_async_iter(var_name="answer_2"):
        print(out, end="", flush=True)
    print()

if __name__ == "__main__":
    # stream_a_variable()
    asyncio.run(async_stream())
```
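Besides single runs, sglang programs can also be executed over a batch of inputs with `run_batch` (a sketch, assuming the same function and backend as above):
```python
# Batched execution (sketch): run the program once per argument dict.
states = multi_turn_question.run_batch(
    [
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "List two local attractions."},
    ]
)
for state in states:
    print(state["answer_2"])
```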
```python
# Multi-turn chat with the OpenAI backend.
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])
```
```python
# Few-shot QA with an OpenAI completion model.
from sglang import function, gen, set_default_backend, OpenAI

@function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + gen("answer", stop="\n", temperature=0)

set_default_backend(OpenAI("gpt-3.5-turbo-instruct"))

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())
```
```python
# Streaming multi-turn chat with the OpenAI backend.
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
```python
# Multi-turn chat on a local SRT runtime (loads the model in-process).
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
# runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

runtime.shutdown()
```
```python
# Few-shot QA on a local SRT runtime.
from sglang import function, gen, set_default_backend, Runtime

@function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + gen("answer", stop="\n", temperature=0)

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())

runtime.shutdown()
```
```python
# Regex-constrained generation: decoding is restricted so the answer
# must match an IPv4 address pattern.
from sglang import function, gen, set_default_backend, Runtime

@function
def regex_gen(s):
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + gen(
        "answer",
        temperature=0,
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = regex_gen.run()
print(state.text())

runtime.shutdown()
```
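As a sanity check, the constrained output can be validated against the same pattern (a minimal sketch reusing `state` from above):
```python
import re

# The same IPv4 pattern used above; fullmatch verifies the constrained
# decoding produced a well-formed address.
ip_pattern = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
assert re.fullmatch(ip_pattern, state["answer"]), state["answer"]
```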
```python
# Streaming multi-turn chat on a local SRT runtime.
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    temperature=0,
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
print()

runtime.shutdown()
```
Format the code with isort and black:
```
isort python
black python
isort test
black test
```
Launch a Text Generation Inference (TGI) server with Docker:
```
# Assuming the model is downloaded at /home/ubuntu/model_weights/Llama-2-7b-chat-hf
docker run --name tgi --rm -ti --gpus all --network host \
  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
  ghcr.io/huggingface/text-generation-inference:1.1.0 \
  --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
  --max-input-length 2048 --max-total-tokens 4096 \
  --port 24000
```
A small playground script for inspecting a tokenizer interactively:
```python
import code

import transformers

# Load the Llama-2 chat tokenizer and drop into an interactive shell
# with it bound to `t`.
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
code.interact(local=locals())
```
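Inside the interactive session, a few things worth trying (a sketch; `apply_chat_template` requires a transformers version with chat-template support):
```python
# Round-trip a string through the tokenizer, then render a chat turn.
ids = t.encode("Hello, world!")
print(ids)
print(t.decode(ids))
print(t.apply_chat_template(
    [{"role": "user", "content": "Hi"}], tokenize=False
))
```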
```toml
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "sglang"
version = "0.1.0"
description = "A structured generation language for LLMs."
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE"}
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
]
dependencies = [
    "requests",
]

[project.optional-dependencies]
srt = ["fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn", "zmq", "vllm>=0.2.5",
       "interegular", "lark"]
openai = ["openai>=1.0"]
anthropic = ["anthropic"]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]

[tool.setuptools.packages.find]
exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]

[tool.wheel]
exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
```
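With these extras defined, a development install that pulls in everything is `pip install -e ".[all]"`.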
```python
# Package entry point: re-export the public API.
from sglang.api import *
from sglang.global_config import global_config
```
"""Public API"""
import re
from typing import Callable, List, Optional, Union
from sglang.backend.anthropic import Anthropic
from sglang.backend.base_backend import BaseBackend
from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.global_config import global_config
from sglang.lang.ir import (
SglExpr,
SglExprList,
SglFunction,
SglGen,
SglImage,
SglRoleBegin,
SglRoleEnd,
SglSelect,
)
from sglang.srt.server import Runtime
def function(func: Callable):
return SglFunction(func)
def set_default_backend(backend: BaseBackend):
global_config.default_backend = backend
def gen(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
dtype: Optional[type] = None,
choices: Optional[List[str]] = None,
regex: Optional[str] = None,
):
if choices:
return SglSelect(name, choices, temperature)
# check regex is valid
if regex is not None:
try:
re.compile(regex)
except re.error as e:
raise e
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
dtype,
regex,
)
def gen_int(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
):
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
int,
None,
)
def gen_string(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
):
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
str,
None,
)
def image(expr: SglExpr):
return SglImage(expr)
def select(
name: Optional[str] = None,
choices: List[str] = None,
temperature: float = 0.0,
):
assert choices is not None
return SglSelect(name, choices, temperature)
def _role_common(name: str, expr: Optional[SglExpr] = None):
if expr is None:
return SglExprList([SglRoleBegin(name), SglRoleEnd(name)])
else:
return SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
def system(expr: Optional[SglExpr] = None):
return _role_common("system", expr)
def user(expr: Optional[SglExpr] = None):
return _role_common("user", expr)
def assistant(expr: Optional[SglExpr] = None):
return _role_common("assistant", expr)
def user_begin():
return SglRoleBegin("user")
def user_end():
return SglRoleEnd("user")
def assistant_begin():
return SglRoleBegin("assistant")
def assistant_end():
return SglRoleEnd("assistant")
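To illustrate how these primitives compose, a minimal sketch (the function name and the assumption of an SRT server on localhost:30000 are illustrative; any backend works):
```python
# Sketch: combine select() and gen_int() in one program.
from sglang import function, gen_int, select, set_default_backend, RuntimeEndpoint

@function
def pick_and_count(s, question):
    s += "Question: " + question + "\n"
    s += "Difficulty: " + select("difficulty", choices=["easy", "hard"]) + "\n"
    s += "Answer: " + gen_int("answer")

set_default_backend(RuntimeEndpoint("http://localhost:30000"))
state = pick_and_count.run(question="What is 3 + 4?")
print(state["difficulty"], state["answer"])
```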
```python
from typing import List, Optional, Union

import numpy as np

from sglang.backend.base_backend import BaseBackend
from sglang.lang.chat_template import get_chat_template
from sglang.lang.interpreter import StreamExecutor
from sglang.lang.ir import SamplingParams

try:
    import anthropic
except ImportError as e:
    # Defer the import error until the backend is actually instantiated.
    anthropic = e


class Anthropic(BaseBackend):
    def __init__(self, model_name):
        super().__init__()

        if isinstance(anthropic, Exception):
            raise anthropic

        self.model_name = model_name
        self.chat_template = get_chat_template("claude")

    def get_chat_template(self):
        return self.chat_template

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SamplingParams,
    ):
        prompt = s.text_
        ret = anthropic.Anthropic().completions.create(
            model=self.model_name,
            prompt=prompt,
            **sampling_params.to_anthropic_kwargs(),
        )
        comp = ret.completion

        return comp, {}

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SamplingParams,
    ):
        prompt = s.text_
        generator = anthropic.Anthropic().completions.create(
            model=self.model_name,
            prompt=prompt,
            stream=True,
            **sampling_params.to_anthropic_kwargs(),
        )
        for ret in generator:
            yield ret.completion, {}
```
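Note that this backend targets Anthropic's legacy text-completions endpoint (a raw prompt string in, a completion out), which is why the Anthropic few-shot example earlier formats its prompt with explicit `\n\nHuman:` / `\n\nAssistant:` turn markers.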