Unverified commit 06175286, authored by Lianmin Zheng, committed by GitHub

Update quick start examples (#120)

parent 4ea92f83
@@ -39,18 +39,20 @@ pip install -e "python[all]"
- For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
## Quick Start
The example below shows how to use sglang to answer a multi-turn question.
### Using Local Models
First, launch a server with
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
Then, connect to the server and answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
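The same program can also stream tokens as they are generated; a minimal variant (mirroring the streaming examples included in this commit), assuming the backend above is already set:
```python
state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

# Print the output token by token as it arrives.
for out in state.text_iter():
    print(out, end="", flush=True)
```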
### Using OpenAI Models
Set the OpenAI API Key
```
export OPENAI_API_KEY=sk-******
```
Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
@@ -120,7 +121,7 @@ import sglang as sgl
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, parallelism, and batching for you.
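For a taste of the other primitives, here is a hedged sketch (adapted, not taken verbatim from this commit) in which `select` constrains the model to a fixed set of choices and `fork` expands a prompt into parallel branches; the prompt strings and variable names are illustrative:
```python
import sglang as sgl

@sgl.function
def tool_use(s, question):
    s += "To answer this question: " + question + ", "
    # select() forces the generation to be one of the given choices.
    s += "I need to use a " + sgl.select("tool", choices=["calculator", "search engine"]) + ". "

@sgl.function
def tip_suggestion(s, topic):
    s += "Here are two tips for " + topic + ".\n"
    # fork() copies the prompt state into branches that generate in parallel.
    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Tip {i + 1}:" + sgl.gen(f"tip_{i}", max_tokens=64, stop="\n")
```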
### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
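Since the body of an `sgl.function` is ordinary Python, a plain loop can drive a dialogue of arbitrary length; a small sketch (the function name and variables are illustrative):
```python
import sglang as sgl

@sgl.function
def multi_question(s, questions):
    s += sgl.system("You are a helpful assistant.")
    # Ordinary Python control flow decides how the prompt is built.
    for i, q in enumerate(questions):
        s += sgl.user(q)
        s += sgl.assistant(sgl.gen(f"answer_{i}", max_tokens=128))
```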
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_chat.py
"""

import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    sgl.set_default_backend(sgl.Anthropic("claude-2"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_complete.py
"""

import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""
@@ -13,14 +19,49 @@ def few_shot_qa(s, question):
\n\nAssistant: Rome
""")

    s += "\n\nHuman: " + question + "\n"
    s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")

    answer = state["answer"].strip().lower()
    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.Anthropic("claude-2"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic


@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("answer_1", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_complete.py
"""

import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
A: Rome
""")

    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")

    answer = state["answer"].strip().lower()
    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_multimodal_chat.py
"""

import sglang as sgl


@sgl.function
def image_qa(s, image_file1, image_file2, question):
    s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))


if __name__ == "__main__":
    sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))

    state = image_qa.run(
        image_file1="./images/cat.jpeg",
        image_file2="./images/dog.jpeg",
        question="Describe the difference between the two images in one sentence.",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()

    print(state["answer"])
from sglang import function, user, assistant, gen, set_default_backend, VertexAI


@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(VertexAI("gemini-pro"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""

import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_complete.py
"""

import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
@@ -13,14 +19,49 @@ Q: What is the capital of Italy?
A: Rome
""")

    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")

    answer = state["answer"].strip().lower()
    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
"""
Usage:
python3 srt_example_chat.py
"""

import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
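Note that `sgl.Runtime` above launches the model server inside the current process. As the README quick start shows, you can instead start the server separately and attach to it; a minimal sketch:
```python
# Start the server in another terminal first:
#   python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
# multi_turn_question.run(...) now talks to the external server;
# no runtime.shutdown() is needed in this process.
```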
"""
Usage:
python3 srt_example_complete.py
"""

import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
@@ -13,16 +17,52 @@ Q: What is the capital of Italy?
A: Rome
""")

    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")

    answer = state["answer"].strip().lower()
    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
@@ -10,29 +10,53 @@ def image_qa(s, image_path, question):
s += sgl.assistant(sgl.gen("answer"))
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
tokenizer_path="llava-hf/llava-1.5-7b-hf")
sgl.set_default_backend(runtime)
# Single
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64)
print(state["answer"], "\n")
# Batch
states = image_qa.run_batch(
[
{"image_path": "images/cat.jpeg", "question":"What is this?"},
{"image_path": "images/dog.jpeg", "question":"What is this?"},
],
max_new_tokens=64,
)
for s in states:
print(s["answer"], "\n")
runtime.shutdown()
def single():
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64)
print(state["answer"], "\n")
def stream():
state = image_qa.run(
image_path="images/cat.jpeg",
question="What is this?",
max_new_tokens=64,
stream=True)
for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()
def batch():
states = image_qa.run_batch(
[
{"image_path": "images/cat.jpeg", "question":"What is this?"},
{"image_path": "images/dog.jpeg", "question":"What is this?"},
],
max_new_tokens=64,
)
for s in states:
print(s["answer"], "\n")
if __name__ == "__main__":
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
tokenizer_path="llava-hf/llava-1.5-7b-hf")
sgl.set_default_backend(runtime)
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
runtime.shutdown()
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    temperature=0,
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
print()

runtime.shutdown()
@@ -651,7 +651,7 @@ class ProgramState:
    def sync(self):
        return self.stream_executor.sync()

    def text_iter(self, var_name: Optional[str] = None):
        if self.stream_executor.stream:
            prev = 0
            if var_name is None:
@@ -682,7 +682,9 @@ class ProgramState:
            else:
                yield self.get_var(name)

    async def text_async_iter(
        self, var_name: Optional[str] = None, return_meta_data: bool = False
    ):
        loop = asyncio.get_running_loop()
        if self.stream_executor.stream:
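For reference, a hedged sketch of how the async variant might be consumed (the `qa` function and backend setup are illustrative; `text_async_iter` is an async generator, so it is driven with `async for`):
```python
import asyncio
import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

async def main():
    # Assumes a backend was configured, e.g. sgl.set_default_backend(...).
    state = qa.run(question="What is the capital of France?", stream=True)
    async for out in state.text_async_iter("answer"):
        print(out, end="", flush=True)
    print()

asyncio.run(main())
```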
@@ -74,7 +74,9 @@ class SglSamplingParams:
        )

        return {
            "max_tokens_to_sample": self.max_new_tokens,
            "stop_sequences": self.stop
            if isinstance(self.stop, (list, tuple))
            else [self.stop],
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
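The edit normalizes `stop` so that a bare string is wrapped in a list, since Anthropic's `stop_sequences` field expects a list; a standalone sketch of the same normalization (the helper name is illustrative):
```python
def as_stop_sequences(stop):
    # Wrap a single stop string in a list; pass lists/tuples through unchanged.
    return stop if isinstance(stop, (list, tuple)) else [stop]

assert as_stop_sequences("\n") == ["\n"]
assert as_stop_sequences(["\n", "Human:"]) == ["\n", "Human:"]
```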
@@ -8,7 +8,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
@@ -30,6 +29,8 @@ from vllm.model_executor.weight_utils import (
    hf_model_weights_iterator,
)

Qwen2Config = None  # placeholder for the removed `transformers` import


class Qwen2MLP(nn.Module):
    def __init__(
@@ -445,18 +445,26 @@ class Runtime:
        pipe_reader, pipe_writer = mp.Pipe(duplex=False)
        proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
        proc.start()
        pipe_writer.close()
        self.pid = proc.pid

        try:
            init_state = pipe_reader.recv()
        except EOFError:
            init_state = ""
        if init_state != "init ok":
            self.shutdown()
            raise RuntimeError("Launch failed. Please see the error messages above.")

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
        if self.pid is not None:
            try:
                parent = psutil.Process(self.pid)
            except psutil.NoSuchProcess:
                return
            children = parent.children(recursive=True)
            for child in children:
                child.kill()
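Two details above are easy to miss: the parent closes its copy of `pipe_writer` so that `recv()` raises `EOFError` (instead of blocking forever) if the child exits before reporting, and `shutdown()` tolerates an already-dead process. A standalone sketch of the handshake pattern (independent of sglang):
```python
import multiprocessing as mp

def child_main(pipe_writer):
    # A real server would initialize here before reporting readiness.
    pipe_writer.send("init ok")
    pipe_writer.close()

if __name__ == "__main__":
    pipe_reader, pipe_writer = mp.Pipe(duplex=False)
    proc = mp.Process(target=child_main, args=(pipe_writer,))
    proc.start()
    pipe_writer.close()  # keep only the child's write end open
    try:
        init_state = pipe_reader.recv()
    except EOFError:  # child died before sending anything
        init_state = ""
    print("launch ok" if init_state == "init ok" else "launch failed")
    proc.join()
```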