Unverified commit 06175286, authored by Lianmin Zheng, committed by GitHub

Update quick start examples (#120)

parent 4ea92f83
README.md (@@ -39,18 +39,20 @@; the two quick start subsections swap so that local models come first):

- For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`

## Quick Start
The example below shows how to use sglang to answer a multi-turn question.

### Using Local Models
First, launch a server with
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```

Then, connect to the server and answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
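As a quick sanity check that the server is up (illustrative, not part of this commit), the runtime's native `/generate` endpoint can be hit directly; a minimal sketch assuming the default port above:
```
curl http://localhost:30000/generate \
  -H "Content-Type: application/json" \
  -d '{"text": "The capital of France is", "sampling_params": {"max_new_tokens": 16, "temperature": 0}}'
```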
### Using OpenAI Models
Set the OpenAI API key
```
export OPENAI_API_KEY=sk-******
```

Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```

@@ -120,7 +121,7 @@
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, parallelism, and batching for you.

### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
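For instance (a minimal sketch, not part of this diff; the `choices=` form of `gen` and the branch names are illustrative), plain Python branching can steer the prompt:
```python
import sglang as sgl

@sgl.function
def tool_use(s, question):
    s += "To answer this question: " + question + ". "
    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "

    # Ordinary Python control flow picks the next prompt segment
    # based on the model's constrained choice above.
    if s["tool"] == "calculator":
        s += "The math expression is: " + sgl.gen("expression")
    elif s["tool"] == "search engine":
        s += "The key word to search is: " + sgl.gen("word")
```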
anthropic_example_chat.py (rewritten to use the `import sglang as sgl` namespace, document its usage, and demonstrate single, streaming, and batch execution):
```python
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_chat.py
"""
import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    sgl.set_default_backend(sgl.Anthropic("claude-2"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
```
anthropic_example_complete.py (same reorganization; the middle of the few-shot prompt is elided in the diff view and marked with `...` here):
```python
"""
Usage:
export ANTHROPIC_API_KEY=sk-******
python3 anthropic_example_complete.py
"""
import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""
...
\n\nAssistant: Rome
""")
    s += "\n\nHuman: " + question + "\n"
    s += "\n\nAssistant:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")
    answer = state["answer"].strip().lower()

    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True)

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.Anthropic("claude-2"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
```
Deleted: the standalone Anthropic streaming example; its flow survives as the `stream()` function in anthropic_example_chat.py above.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, Anthropic

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(Anthropic("claude-2"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_chat.py
"""
import sglang as sgl
@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))
def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)
for m in state.messages():
print(m["role"], ":", m["content"])
print("answer_1", state["answer_1"])
def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)
for out in state.text_iter():
print(out, end="", flush=True)
print()
def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},
{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])
for s in states:
print(s.messages())
if __name__ == "__main__":
sgl.set_default_backend(sgl.VertexAI("gemini-pro"))
# Run a single request
print("\n========== single ==========\n")
single()
# Stream output
print("\n========== stream ==========\n")
stream()
# Run a batch of requests
print("\n========== batch ==========\n")
batch()
gemini_example_complete.py (same reorganization; the middle of the few-shot prompt is elided in the diff view and marked with `...` here):
```python
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_complete.py
"""
import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
...
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")
    answer = state["answer"].strip().lower()

    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True)

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.VertexAI("gemini-pro"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
```
gemini_example_multimodal_chat.py (the generation variable is renamed to `answer` and the run is wrapped in a main guard):
```python
"""
Usage:
export GCP_PROJECT_ID=******
python3 gemini_example_multimodal_chat.py
"""
import sglang as sgl


@sgl.function
def image_qa(s, image_file1, image_file2, question):
    s += sgl.user(sgl.image(image_file1) + sgl.image(image_file2) + question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))


if __name__ == "__main__":
    sgl.set_default_backend(sgl.VertexAI("gemini-pro-vision"))

    state = image_qa.run(
        image_file1="./images/cat.jpeg",
        image_file2="./images/dog.jpeg",
        question="Describe the difference between the two images in one sentence.",
        stream=True,
    )

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()

    print(state["answer"])
```
Deleted: the standalone Gemini streaming example; its flow survives as the `stream()` function in gemini_example_chat.py above.
```python
from sglang import function, user, assistant, gen, set_default_backend, VertexAI

@function
def multi_turn_question(s, question_1, question_2):
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(VertexAI("gemini-pro"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
openai_example_chat.py (same reorganization into single/stream/batch demos):
```python
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
```
openai_example_complete.py (same reorganization; the middle of the few-shot prompt is elided in the diff view and marked with `...` here):
```python
"""
Usage:
export OPENAI_API_KEY=sk-******
python3 openai_example_complete.py
"""
import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
...
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")
    answer = state["answer"].strip().lower()

    assert "washington" in answer, f"answer: {state['answer']}"

    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True)

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()
```
Deleted: the standalone OpenAI streaming example; its flow survives as the `stream()` function in openai_example_chat.py above.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
srt_example_chat.py (the system message is dropped, and the script is reorganized into single/stream/batch demos around an in-process `Runtime`):
```python
"""
Usage:
python3 srt_example_chat.py
"""
import sglang as sgl


@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
    )

    for m in state.messages():
        print(m["role"], ":", m["content"])

    print("answer_1", state["answer_1"])


def stream():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )

    for out in state.text_iter():
        print(out, end="", flush=True)
    print()


def batch():
    states = multi_turn_question.run_batch([
        {"question_1": "What is the capital of the United States?",
         "question_2": "List two local attractions."},
        {"question_1": "What is the capital of France?",
         "question_2": "What is the population of this city?"},
    ])

    for s in states:
        print(s.messages())


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
```
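For comparison (not part of this commit), the same decorated functions can target an already-running server instead of an in-process `Runtime`, mirroring the README quick start:
```python
import sglang as sgl

# Assumes a server started separately with:
#   python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

single()  # the functions above work unchanged against the remote endpoint
```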
srt_example_complete.py (same reorganization around an in-process `Runtime`; the middle of the few-shot prompt is elided in the diff view and marked with `...` here):
```python
"""
Usage:
python3 srt_example_complete.py
"""
import sglang as sgl


@sgl.function
def few_shot_qa(s, question):
    s += (
"""The following are questions with answers.
...
Q: What is the capital of Italy?
A: Rome
""")
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
    state = few_shot_qa.run(question="What is the capital of the United States?")
    answer = state["answer"].strip().lower()
    assert "washington" in answer, f"answer: {state['answer']}"
    print(state.text())


def stream():
    state = few_shot_qa.run(
        question="What is the capital of the United States?",
        stream=True)

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = few_shot_qa.run_batch([
        {"question": "What is the capital of the United States?"},
        {"question": "What is the capital of China?"},
    ])

    for s in states:
        print(s["answer"])


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
    sgl.set_default_backend(runtime)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
```
The LLaVA quick start example (@@ -10,29 +10,53 @@; the top of the file, including the `image_qa` prompt definition, is not shown in the diff):
```python
    s += sgl.assistant(sgl.gen("answer"))


def single():
    state = image_qa.run(
        image_path="images/cat.jpeg",
        question="What is this?",
        max_new_tokens=64)
    print(state["answer"], "\n")


def stream():
    state = image_qa.run(
        image_path="images/cat.jpeg",
        question="What is this?",
        max_new_tokens=64,
        stream=True)

    for out in state.text_iter("answer"):
        print(out, end="", flush=True)
    print()


def batch():
    states = image_qa.run_batch(
        [
            {"image_path": "images/cat.jpeg", "question": "What is this?"},
            {"image_path": "images/dog.jpeg", "question": "What is this?"},
        ],
        max_new_tokens=64,
    )
    for s in states:
        print(s["answer"], "\n")


if __name__ == "__main__":
    runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.5-7b",
                          tokenizer_path="llava-hf/llava-1.5-7b-hf")
    sgl.set_default_backend(runtime)

    # Run a single request
    print("\n========== single ==========\n")
    single()

    # Stream output
    print("\n========== stream ==========\n")
    stream()

    # Run a batch of requests
    print("\n========== batch ==========\n")
    batch()

    runtime.shutdown()
```
Deleted: the standalone SRT streaming example; its flow survives as the `stream()` function in srt_example_chat.py above.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, Runtime

@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

runtime = Runtime("meta-llama/Llama-2-7b-chat-hf")
set_default_backend(runtime)

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
    temperature=0,
    stream=True,
)

for out in state.text_iter():
    print(out, end="", flush=True)
print()

runtime.shutdown()
```
`ProgramState` (in sglang's interpreter module, @@ -651,7 +651,7 @@ and @@ -682,7 +682,9 @@): the streaming iterators gain explicit type annotations; unchanged bodies are elided below.
```python
class ProgramState:
    # (assumes `from typing import Optional` at the top of the module)

    def sync(self):
        return self.stream_executor.sync()

    def text_iter(self, var_name: Optional[str] = None):
        if self.stream_executor.stream:
            prev = 0
            if var_name is None:
                ...

    async def text_async_iter(
        self, var_name: Optional[str] = None, return_meta_data: bool = False
    ):
        loop = asyncio.get_running_loop()
        if self.stream_executor.stream:
            ...
```
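For reference (not part of this commit), the async variant is consumed with `async for`; a minimal sketch reusing `multi_turn_question` from the examples above:
```python
import asyncio

async def main():
    state = multi_turn_question.run(
        question_1="What is the capital of the United States?",
        question_2="List two local attractions.",
        stream=True,
    )
    # Restrict the stream to a single generation variable.
    async for chunk in state.text_async_iter("answer_1"):
        print(chunk, end="", flush=True)

asyncio.run(main())
```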
`SglSamplingParams` (@@ -74,7 +74,9 @@): the Anthropic keyword-argument conversion now normalizes `stop` to a list, since Anthropic's `stop_sequences` field expects a list rather than a bare string:
```python
        return {
            "max_tokens_to_sample": self.max_new_tokens,
            "stop_sequences": self.stop
            if isinstance(self.stop, (list, tuple))
            else [self.stop],
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
        }
```
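The practical effect, as an illustrative sketch (these `gen` calls mirror the completion examples above): both spellings now serialize to the same Anthropic request field:
```python
# Each produces stop_sequences=["\n"] in the Anthropic request payload.
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)
s += "A:" + sgl.gen("answer", stop=["\n"], temperature=0)
```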
The Qwen2 model implementation (@@ -8,7 +8,6 @@ and @@ -30,6 +29,8 @@): the top-level `from transformers import Qwen2Config` import is removed in favor of a module-level placeholder, presumably so the file still imports on `transformers` releases that do not ship `Qwen2Config`:
```python
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.managers.router.model_runner import InputMetadata
from torch import nn
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    ...
)

# (further imports elided in the diff)
from vllm.model_executor.weight_utils import (
    hf_model_weights_iterator,
)

Qwen2Config = None


class Qwen2MLP(nn.Module):
    def __init__(
        ...
```
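A more defensive variant of the same idea (a sketch, not what this patch does) would fall back only when the symbol is genuinely unavailable:
```python
try:
    # Qwen2Config is only used for type hints in this module.
    from transformers import Qwen2Config
except ImportError:
    Qwen2Config = None
```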
`Runtime` (in the srt server module, @@ -445,18 +445,26 @@): the parent now closes its copy of the pipe's write end, so `recv()` raises `EOFError` instead of blocking forever if the server process dies before reporting readiness; `shutdown()` also tolerates a process that has already exited:
```python
        pipe_reader, pipe_writer = mp.Pipe(duplex=False)
        proc = mp.Process(target=launch_server, args=(self.server_args, pipe_writer))
        proc.start()
        pipe_writer.close()
        self.pid = proc.pid

        try:
            init_state = pipe_reader.recv()
        except EOFError:
            init_state = ""

        if init_state != "init ok":
            self.shutdown()
            raise RuntimeError("Launch failed. Please see the error messages above.")

        self.endpoint = RuntimeEndpoint(self.url)

    def shutdown(self):
        if self.pid is not None:
            try:
                parent = psutil.Process(self.pid)
            except psutil.NoSuchProcess:
                return
            children = parent.children(recursive=True)
            for child in children:
                child.kill()
```
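A usage sketch (illustrative, not from the patch; it reuses `few_shot_qa` from the examples above) of why the tolerant `shutdown()` matters: it can now sit in a `finally` block and be called even if the server process already died or was shut down once before:
```python
import sglang as sgl

runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
try:
    sgl.set_default_backend(runtime)
    state = few_shot_qa.run(question="What is the capital of France?")
    print(state["answer"])
finally:
    # Safe even if the server process has already exited.
    runtime.shutdown()
```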