Unverified Commit 3bc99e6f authored by Ying Sheng, committed by GitHub

Test openai vision api (#925)

parent ebf69964
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
...
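The vision support called out above is what this commit adds tests for. As a quick illustration (not part of the diff), here is a minimal sketch of a vision request against the OpenAI-compatible endpoint, assuming a vision-capable model such as a LLaVA checkpoint is already being served locally on port 30000; the payload mirrors the new test below.

```python
import openai

# Assumes an sglang server with a vision-capable model is already running
# at http://127.0.0.1:30000; adjust base_url for your setup.
client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")

response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
                    },
                },
            ],
        },
    ],
    temperature=0,
    max_tokens=32,
)
print(response.choices[0].message.content)
```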
@@ -390,8 +390,13 @@ class TokenizerManager:
             obj.return_text_in_logprobs,
         )
 
+        # Log requests
         if self.server_args.log_requests and state.finished:
-            logger.info(f"in={obj.text}, out={out}")
+            if obj.text is None:
+                in_obj = {"text": self.tokenizer.decode(obj.input_ids)}
+            else:
+                in_obj = {"text": obj.text}
+            logger.info(f"in={in_obj}, out={out}")
 
         state.out_list = []
         if state.finished:
...
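The logging change above matters for exactly these vision requests: when a prompt arrives as token ids (or is expanded from an image template), `obj.text` can be `None`, so the manager now decodes the ids before logging. Below is a standalone sketch of the same fallback, using a Hugging Face tokenizer purely for illustration; the model name is an arbitrary example, not what the server uses.

```python
from transformers import AutoTokenizer

# Arbitrary example tokenizer; any HF tokenizer illustrates the point.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v0.4")


def loggable_input(text, input_ids):
    # Mirror the fallback in the diff: requests that carry only token ids
    # are decoded back to text so the log line stays human-readable.
    if text is None:
        return {"text": tokenizer.decode(input_ids)}
    return {"text": text}


print(loggable_input("The capital of France is", None))
print(loggable_input(None, tokenizer.encode("The capital of France is")))
```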
@@ -18,7 +18,7 @@ from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
-MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
...
"""
First run the following command to launch the server.
Note that TinyLlama adopts different chat templates in different versions.
For v0.4, the chat template is chatml.
python3 -m sglang.launch_server --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 \
--port 30000 --chat-template chatml
Output example:
The capital of France is Paris.
The capital of the United States is Washington, D.C.
The capital of Canada is Ottawa.
The capital of Japan is Tokyo
"""
import argparse
import json
import openai
def test_completion(args, echo, logprobs):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
response = client.completions.create(
model="default",
prompt="The capital of France is",
temperature=0,
max_tokens=32,
echo=echo,
logprobs=logprobs,
)
text = response.choices[0].text
print(response.choices[0].text)
if echo:
assert text.startswith("The capital of France is")
if logprobs:
print(response.choices[0].logprobs.top_logprobs)
assert response.choices[0].logprobs
if echo:
            assert response.choices[0].logprobs.token_logprobs[0] is None
        else:
            assert response.choices[0].logprobs.token_logprobs[0] is not None
assert response.id
assert response.created
assert response.usage.prompt_tokens > 0
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
print("=" * 100)
def test_completion_stream(args, echo, logprobs):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
response = client.completions.create(
model="default",
prompt="The capital of France is",
temperature=0,
max_tokens=32,
stream=True,
echo=echo,
logprobs=logprobs,
)
first = True
for r in response:
if first:
if echo:
assert r.choices[0].text.startswith("The capital of France is")
first = False
if logprobs:
print(
f"{r.choices[0].text:12s}\t" f"{r.choices[0].logprobs.token_logprobs}",
flush=True,
)
print(r.choices[0].logprobs.top_logprobs)
else:
print(r.choices[0].text, end="", flush=True)
assert r.id
assert r.usage.prompt_tokens > 0
assert r.usage.completion_tokens > 0
assert r.usage.total_tokens > 0
print("=" * 100)
def test_chat_completion(args):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "What is the capital of France?"},
],
temperature=0,
max_tokens=32,
)
print(response.choices[0].message.content)
assert response.id
assert response.created
assert response.usage.prompt_tokens > 0
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
print("=" * 100)
def test_chat_completion_image(args):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image"},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/sgl-project/sglang/main/assets/mixtral_8x7b.jpg"
},
},
],
},
],
temperature=0,
max_tokens=32,
)
print(response.choices[0].message.content)
assert response.id
assert response.created
assert response.usage.prompt_tokens > 0
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
print("=" * 100)
def test_chat_completion_stream(args):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "List 3 countries and their capitals."},
],
temperature=0,
max_tokens=64,
stream=True,
)
is_first = True
for chunk in response:
if is_first:
is_first = False
assert chunk.choices[0].delta.role == "assistant"
continue
data = chunk.choices[0].delta
if not data.content:
continue
print(data.content, end="", flush=True)
print("=" * 100)
def test_regex(args):
client = openai.Client(api_key="EMPTY", base_url=args.base_url)
regex = (
r"""\{\n"""
+ r""" "name": "[\w]+",\n"""
+ r""" "population": [\d]+\n"""
+ r"""\}"""
)
response = client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Introduce the capital of France."},
],
temperature=0,
max_tokens=128,
extra_body={"regex": regex},
)
text = response.choices[0].message.content
print(json.loads(text))
print("=" * 100)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--base-url", type=str, default="http://127.0.0.1:30000/v1")
parser.add_argument(
"--test-image", action="store_true", help="Enables testing image inputs"
)
args = parser.parse_args()
test_completion(args, echo=False, logprobs=False)
test_completion(args, echo=True, logprobs=False)
test_completion(args, echo=False, logprobs=True)
test_completion(args, echo=True, logprobs=True)
test_completion(args, echo=False, logprobs=3)
test_completion(args, echo=True, logprobs=3)
test_completion_stream(args, echo=False, logprobs=False)
test_completion_stream(args, echo=True, logprobs=False)
test_completion_stream(args, echo=False, logprobs=True)
test_completion_stream(args, echo=True, logprobs=True)
test_completion_stream(args, echo=False, logprobs=3)
test_completion_stream(args, echo=True, logprobs=3)
test_chat_completion(args)
test_chat_completion_stream(args)
test_regex(args)
if args.test_image:
test_chat_completion_image(args)
 import unittest
 
 import sglang as sgl
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 
 
 class TestBind(unittest.TestCase):
@@ -9,7 +9,7 @@ class TestBind(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
         sgl.set_default_backend(cls.backend)
 
     @classmethod
...
@@ -14,7 +14,7 @@ from sglang.test.test_programs import (
     test_stream,
     test_tool_use,
 )
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
 
 
 class TestSRTBackend(unittest.TestCase):
@@ -22,7 +22,7 @@ class TestSRTBackend(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.backend = sgl.Runtime(model_path=MODEL_NAME_FOR_TEST)
+        cls.backend = sgl.Runtime(model_path=DEFAULT_MODEL_NAME_FOR_TEST)
         sgl.set_default_backend(cls.backend)
 
     @classmethod
...
@@ -5,8 +5,9 @@ from sglang.test.test_utils import run_unittest_files
 suites = {
     "minimal": [
-        "test_openai_server.py",
         "test_eval_accuracy.py",
+        "test_openai_server.py",
+        "test_vision_openai_server.py",
         "test_chunked_prefill.py",
         "test_torch_compile.py",
         "models/test_causal_models.py",
...
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:8157"
         cls.process = popen_launch_server(
             cls.model,
...
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:8157"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
...
@@ -5,21 +5,21 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:8157"
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
         )
         cls.base_url += "/v1"
-        cls.tokenizer = get_tokenizer(MODEL_NAME_FOR_TEST)
+        cls.tokenizer = get_tokenizer(DEFAULT_MODEL_NAME_FOR_TEST)
 
     @classmethod
     def tearDownClass(cls):
@@ -147,6 +147,7 @@ class TestOpenAIServer(unittest.TestCase):
                 top_logprobs=logprobs,
                 n=parallel_sample_num,
             )
+
             if logprobs:
                 assert isinstance(
                     response.choices[0].logprobs.content[0].top_logprobs[0].token, str
@@ -158,6 +159,7 @@ class TestOpenAIServer(unittest.TestCase):
                 assert (
                     ret_num_top_logprobs == logprobs
                 ), f"{ret_num_top_logprobs} vs {logprobs}"
+
             assert len(response.choices) == parallel_sample_num
             assert response.choices[0].message.role == "assistant"
             assert isinstance(response.choices[0].message.content, str)
...
@@ -5,14 +5,14 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:{8157}"
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
...
@@ -3,14 +3,14 @@ from types import SimpleNamespace
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
 
 
 class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = MODEL_NAME_FOR_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
         cls.base_url = f"http://localhost:8157"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
...
import json
import unittest
import openai
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import popen_launch_server
class TestOpenAIVisionServer(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
cls.base_url = "http://localhost:8157"
cls.api_key = "sk-123456"
cls.process = popen_launch_server(
cls.model,
cls.base_url,
timeout=300,
api_key=cls.api_key,
other_args=[
"--chat-template",
"vicuna_v1.1",
"--tokenizer-path",
"llava-hf/llava-1.5-7b-hf",
"--log-requests",
],
)
cls.base_url += "/v1"
@classmethod
def tearDownClass(cls):
kill_child_process(cls.process.pid)
def test_chat_completion(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
model="default",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://github.com/sgl-project/sglang/blob/main/assets/logo.png?raw=true"
},
},
{"type": "text", "text": "Describe this image"},
],
},
],
temperature=0,
max_tokens=32,
)
assert response.choices[0].message.role == "assistant"
assert isinstance(response.choices[0].message.content, str)
assert response.id
assert response.created
assert response.usage.prompt_tokens > 0
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
if __name__ == "__main__":
unittest.main(warnings="ignore")
# t = TestOpenAIVisionServer()
# t.setUpClass()
# t.test_chat_completion()
# t.tearDownClass()
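Beyond the remote URL used above, the same test pattern can exercise local files by inlining them as a base64 data URL, assuming the server accepts data URLs the way the OpenAI chat format does. A small sketch, not part of the commit; the file path is a placeholder and the base URL assumes the vision server launched in setUpClass is still running.

```python
import base64

import openai

# Placeholder path; any local JPEG/PNG works.
with open("example.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

client = openai.Client(api_key="sk-123456", base_url="http://localhost:8157/v1")
response = client.chat.completions.create(
    model="default",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{encoded}"},
                },
                {"type": "text", "text": "Describe this image"},
            ],
        },
    ],
    temperature=0,
    max_tokens=32,
)
print(response.choices[0].message.content)
```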