Commit 22085081 authored by Lianmin Zheng (parent f6d40df0)
## SRT Unit Tests
### Low-level API
```
cd sglang/test/srt/model
python3 test_llama_low_api.py
python3 test_llama_extend.py
python3 test_llava_low_api.py
python3 bench_llama_low_api.py
```
### High-level API
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 test_srt_backend.py
```
### Performance
#### MMLU
```
cd benchmark/mmlu
```
Follow README.md to download the data.
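If you only need the files, here is a minimal sketch of the download step (the URL is an assumption based on the standard MMLU tarball; `benchmark/mmlu/README.md` is authoritative):
```python
# Minimal sketch: fetch and unpack the MMLU data tarball.
# The URL is an assumption; follow benchmark/mmlu/README.md if it differs.
import tarfile
import urllib.request

url = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
urllib.request.urlretrieve(url, "data.tar")
with tarfile.open("data.tar") as f:
    f.extractall()
```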
```
python3 bench_sglang.py --nsub 3
# Expected performance on A10G
# Total latency: 8.200
# Average accuracy: 0.413
```
### More Models
#### LLaVA
```
python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
```
```
cd benchmark/llava_bench
python3 bench_sglang.py
```
## SGLang Unit Tests
```
export ANTHROPIC_API_KEY=
export OPENAI_API_KEY=
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
cd test/lang
python3 run_all.py
```
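If `run_all.py` fails to connect, a quick smoke test can confirm the server launched above is reachable (a minimal sketch using the `RuntimeEndpoint` backend; the port must match the one passed to `launch_server`):
```python
# Minimal smoke test against the local server (assumed at localhost:30000).
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def ping(s):
    s += "Say hello." + sgl.gen("out", max_tokens=8)

print(ping.run()["out"])
```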
```python
# Multi-turn chat with the Anthropic backend.
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])
```
```python
# Few-shot QA with the Anthropic completions API; the prompt uses the
# "\n\nHuman:" / "\n\nAssistant:" turn markers that claude-2 expects.
from sglang import function, gen, set_default_backend, Anthropic

@function
def few_shot_qa(s, question):
    s += (
"""
\n\nHuman: What is the capital of France?
\n\nAssistant: Paris
\n\nHuman: What is the capital of Germany?
\n\nAssistant: Berlin
\n\nHuman: What is the capital of Italy?
\n\nAssistant: Rome
""")
    s += "\n\nHuman: " + question + "\n"
    s += "\n\nAssistant:" + gen("answer", stop="\n", temperature=0)

set_default_backend(Anthropic("claude-2"))

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())
```
```python
# Streaming variant of the multi-turn chat example: print tokens as
# they arrive instead of waiting for the full response.
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
```python
# Stream a single generated variable, synchronously or with asyncio.
import asyncio

import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

def stream_a_variable():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    # Stream only the tokens generated for "answer_2".
    for out in state.text_iter(var_name="answer_2"):
        print(out, end="", flush=True)
    print()

async def async_stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    async for out in state.text_async_iter(var_name="answer_2"):
        print(out, end="", flush=True)
    print()

if __name__ == "__main__":
    # stream_a_variable()
    asyncio.run(async_stream())
```
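Besides single runs, sglang programs can also be executed over a batch of inputs with `run_batch` (a sketch, assuming the same function and backend as above):
```python
# Batched execution (sketch): run the program once per argument dict.
states = multi_turn_question.run_batch(
    [
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "List two local attractions."},
    ]
)
for state in states:
    print(state["answer_2"])
```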
```python
# Multi-turn chat with the OpenAI backend.
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])
```
```python
# Few-shot QA with an OpenAI completion model.
from sglang import function, gen, set_default_backend, OpenAI

@function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + gen("answer", stop="\n", temperature=0)

set_default_backend(OpenAI("gpt-3.5-turbo-instruct"))

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())
```
```python
# Streaming multi-turn chat with the OpenAI backend.
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
```python
# Multi-turn chat on a local SRT runtime (loads the model in-process).
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
# runtime = Runtime(model_path="mistralai/Mixtral-8x7B-Instruct-v0.1")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

runtime.shutdown()
```
```python
# Few-shot QA on a local SRT runtime.
from sglang import function, gen, set_default_backend, Runtime

@function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + gen("answer", stop="\n", temperature=0)

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())

runtime.shutdown()
```
```python
# Regex-constrained generation: decoding is restricted so the answer
# must match an IPv4 address pattern.
from sglang import function, gen, set_default_backend, Runtime

@function
def regex_gen(s):
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + gen(
        "answer",
        temperature=0,
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )

runtime = Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = regex_gen.run()
print(state.text())

runtime.shutdown()
```
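As a sanity check, the constrained output can be validated against the same pattern (a minimal sketch reusing `state` from above):
```python
import re

# The same IPv4 pattern used above; fullmatch verifies the constrained
# decoding produced a well-formed address.
ip_pattern = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
assert re.fullmatch(ip_pattern, state["answer"]), state["answer"]
```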
```python
# Streaming multi-turn chat on a local SRT runtime.
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    temperature=0,
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
print()

runtime.shutdown()
```
Format the code with isort and black:
```
isort python
black python
isort test
black test
```
Launch a Text Generation Inference (TGI) server with Docker:
```
# Assuming the model is downloaded at /home/ubuntu/model_weights/Llama-2-7b-chat-hf
docker run --name tgi --rm -ti --gpus all --network host \
  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
  ghcr.io/huggingface/text-generation-inference:1.1.0 \
  --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
  --max-input-length 2048 --max-total-tokens 4096 \
  --port 24000
```
A small playground script for inspecting a tokenizer interactively:
```python
import code

import transformers

# Load the Llama-2 chat tokenizer and drop into an interactive shell
# with it bound to `t`.
name = "meta-llama/Llama-2-7b-chat-hf"
t = transformers.AutoTokenizer.from_pretrained(name)
code.interact(local=locals())
```
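Inside the interactive session, a few things worth trying (a sketch; `apply_chat_template` requires a transformers version with chat-template support):
```python
# Round-trip a string through the tokenizer, then render a chat turn.
ids = t.encode("Hello, world!")
print(ids)
print(t.decode(ids))
print(t.apply_chat_template(
    [{"role": "user", "content": "Hi"}], tokenize=False
))
```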
```toml
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "sglang"
version = "0.1.0"
description = "A structured generation language for LLMs."
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE"}
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
]
dependencies = [
    "requests",
]

[project.optional-dependencies]
srt = ["fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn", "zmq", "vllm>=0.2.5",
       "interegular", "lark"]
openai = ["openai>=1.0"]
anthropic = ["anthropic"]
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]

[tool.setuptools.packages.find]
exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]

[tool.wheel]
exclude = ["assets*", "benchmark*", "docs*", "dist*", "playground*", "scripts*", "tests*"]
```
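With these extras defined, a development install that pulls in everything is `pip install -e ".[all]"`.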
```python
# Package entry point: re-export the public API.
from sglang.api import *
from sglang.global_config import global_config
```
"""Public API"""
import re
from typing import Callable, List, Optional, Union
from sglang.backend.anthropic import Anthropic
from sglang.backend.base_backend import BaseBackend
from sglang.backend.openai import OpenAI
from sglang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.global_config import global_config
from sglang.lang.ir import (
SglExpr,
SglExprList,
SglFunction,
SglGen,
SglImage,
SglRoleBegin,
SglRoleEnd,
SglSelect,
)
from sglang.srt.server import Runtime
def function(func: Callable):
return SglFunction(func)
def set_default_backend(backend: BaseBackend):
global_config.default_backend = backend
def gen(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
dtype: Optional[type] = None,
choices: Optional[List[str]] = None,
regex: Optional[str] = None,
):
if choices:
return SglSelect(name, choices, temperature)
# check regex is valid
if regex is not None:
try:
re.compile(regex)
except re.error as e:
raise e
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
dtype,
regex,
)
def gen_int(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
):
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
int,
None,
)
def gen_string(
name: Optional[str] = None,
max_tokens: Optional[int] = None,
stop: Optional[Union[str, List[str]]] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
):
return SglGen(
name,
max_tokens,
stop,
temperature,
top_p,
top_k,
frequency_penalty,
presence_penalty,
str,
None,
)
def image(expr: SglExpr):
return SglImage(expr)
def select(
name: Optional[str] = None,
choices: List[str] = None,
temperature: float = 0.0,
):
assert choices is not None
return SglSelect(name, choices, temperature)
def _role_common(name: str, expr: Optional[SglExpr] = None):
if expr is None:
return SglExprList([SglRoleBegin(name), SglRoleEnd(name)])
else:
return SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
def system(expr: Optional[SglExpr] = None):
return _role_common("system", expr)
def user(expr: Optional[SglExpr] = None):
return _role_common("user", expr)
def assistant(expr: Optional[SglExpr] = None):
return _role_common("assistant", expr)
def user_begin():
return SglRoleBegin("user")
def user_end():
return SglRoleEnd("user")
def assistant_begin():
return SglRoleBegin("assistant")
def assistant_end():
return SglRoleEnd("assistant")
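To illustrate how these primitives compose, a minimal sketch (the function name and the assumption of an SRT server on localhost:30000 are illustrative; any backend works):
```python
# Sketch: combine select() and gen_int() in one program.
from sglang import function, gen_int, select, set_default_backend, RuntimeEndpoint

@function
def pick_and_count(s, question):
    s += "Question: " + question + "\n"
    s += "Difficulty: " + select("difficulty", choices=["easy", "hard"]) + "\n"
    s += "Answer: " + gen_int("answer")

set_default_backend(RuntimeEndpoint("http://localhost:30000"))
state = pick_and_count.run(question="What is 3 + 4?")
print(state["difficulty"], state["answer"])
```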
```python
from typing import List, Optional, Union

import numpy as np

from sglang.backend.base_backend import BaseBackend
from sglang.lang.chat_template import get_chat_template
from sglang.lang.interpreter import StreamExecutor
from sglang.lang.ir import SamplingParams

try:
    import anthropic
except ImportError as e:
    # Defer the import error until the backend is actually instantiated.
    anthropic = e


class Anthropic(BaseBackend):
    def __init__(self, model_name):
        super().__init__()

        if isinstance(anthropic, Exception):
            raise anthropic

        self.model_name = model_name
        self.chat_template = get_chat_template("claude")

    def get_chat_template(self):
        return self.chat_template

    def generate(
        self,
        s: StreamExecutor,
        sampling_params: SamplingParams,
    ):
        prompt = s.text_
        ret = anthropic.Anthropic().completions.create(
            model=self.model_name,
            prompt=prompt,
            **sampling_params.to_anthropic_kwargs(),
        )
        comp = ret.completion

        return comp, {}

    def generate_stream(
        self,
        s: StreamExecutor,
        sampling_params: SamplingParams,
    ):
        prompt = s.text_
        generator = anthropic.Anthropic().completions.create(
            model=self.model_name,
            prompt=prompt,
            stream=True,
            **sampling_params.to_anthropic_kwargs(),
        )
        for ret in generator:
            yield ret.completion, {}
```
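Note that this backend targets Anthropic's legacy text-completions endpoint (a raw prompt string in, a completion out), which is why the Anthropic few-shot example earlier formats its prompt with explicit `\n\nHuman:` / `\n\nAssistant:` turn markers.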