change/sglang · Commits · ffd1a26e

Unverified commit ffd1a26e, authored Jun 18, 2025 by Jinn, committed via GitHub Jun 18, 2025.
Add more refactored openai test & in CI (#7284)
Parent: 09ae5b20

Showing 8 changed files with 576 additions and 1059 deletions (+576 −1059):

- python/sglang/srt/entrypoints/openai/api_server.py (+2 −2)
- test/srt/openai/conftest.py (+4 −3)
- test/srt/openai/test_protocol.py (+179 −177)
- test/srt/openai/test_server.py (+41 −5)
- test/srt/openai/test_serving_chat.py (+156 −562)
- test/srt/openai/test_serving_completions.py (+68 −143)
- test/srt/openai/test_serving_embedding.py (+121 −167)
- test/srt/run_suite.py (+5 −0)
python/sglang/srt/entrypoints/openai/api_server.py (view file @ ffd1a26e)

@@ -36,7 +36,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import Response
 from sglang.srt.disaggregation.utils import (
-    FakeBootstrapHost,
+    FAKE_BOOTSTRAP_HOST,
     register_disaggregation_server,
 )
 from sglang.srt.entrypoints.engine import Engine, _launch_subprocesses

@@ -265,7 +265,7 @@ def _wait_and_warmup(
             "max_new_tokens": 8,
             "ignore_eos": True,
         },
-        "bootstrap_host": [FakeBootstrapHost] * server_args.dp_size,
+        "bootstrap_host": [FAKE_BOOTSTRAP_HOST] * server_args.dp_size,
         # This is a hack to ensure fake transfer is enabled during prefill warmup
         # ensure each dp rank has a unique bootstrap_room during prefill warmup
         "bootstrap_room": [
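The second hunk only renames the constant to constant-case, but for orientation the warmup request body it sits in looks roughly like the sketch below. This is a reconstruction from the visible context lines: apart from the bootstrap_host entry and the two comments, the key nesting, the room generation, and the placeholder host value are assumptions.

# Hypothetical sketch of the warmup payload assembled in _wait_and_warmup.
# Only the bootstrap_host line and the two comments are confirmed by the hunk;
# everything else is assumed for illustration.
import random

FAKE_BOOTSTRAP_HOST = "1.2.3.4"  # placeholder; only the constant's name appears in the diff

def build_warmup_payload(dp_size: int) -> dict:
    return {
        "sampling_params": {"max_new_tokens": 8, "ignore_eos": True},
        "bootstrap_host": [FAKE_BOOTSTRAP_HOST] * dp_size,
        # ensure each dp rank has a unique bootstrap_room during prefill warmup
        "bootstrap_room": [random.randrange(2**63) for _ in range(dp_size)],
    }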
test/srt/openai/conftest.py (view file @ ffd1a26e)

@@ -12,9 +12,10 @@ import pytest
 import requests
 
 from sglang.srt.utils import kill_process_tree
+# reuse SGLang helper
+from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST
 
 SERVER_MODULE = "sglang.srt.entrypoints.openai.api_server"
-DEFAULT_MODEL = "dummy-model"
+DEFAULT_MODEL = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
 STARTUP_TIMEOUT = float(os.getenv("SGLANG_OPENAI_STARTUP_TIMEOUT", 120))

@@ -39,7 +40,7 @@ def _wait_until_healthy(proc: subprocess.Popen, base: str, timeout: float) -> No
 def launch_openai_server(model: str = DEFAULT_MODEL, **kw):
-    """Spawn the draft OpenAI-compatible server and wait until it’s ready."""
+    """Spawn the draft OpenAI-compatible server and wait until it's ready."""
     port = _pick_free_port()
     cmd = [
         sys.executable,

@@ -79,7 +80,7 @@ def launch_openai_server(model: str = DEFAULT_MODEL, **kw):
 @pytest.fixture(scope="session")
 def openai_server() -> Generator[str, None, None]:
-    """PyTest fixture that provides the server’s base URL and cleans up."""
+    """PyTest fixture that provides the server's base URL and cleans up."""
     proc, base, log_file = launch_openai_server()
     yield base
     kill_process_tree(proc.pid)
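The fixture above relies on two helpers whose bodies are elided in this view: `_pick_free_port` and `_wait_until_healthy` (only the latter's signature is visible in the second hunk header). A minimal sketch of what such helpers conventionally look like, assuming the standard bind-to-port-0 and poll-/health patterns; the bodies below are assumptions, not the actual code:

import socket
import subprocess
import time

import requests


def _pick_free_port() -> int:
    # bind to port 0 so the OS assigns an unused port, then release it
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]


def _wait_until_healthy(proc: subprocess.Popen, base: str, timeout: float) -> None:
    # poll GET {base}/health until the server responds or the timeout expires
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            raise RuntimeError("server process exited before becoming healthy")
        try:
            if requests.get(f"{base}/health", timeout=2).status_code == 200:
                return
        except requests.exceptions.ConnectionError:
            pass
        time.sleep(0.5)
    raise TimeoutError(f"server not healthy within {timeout}s")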
test/srt/openai/test_protocol.py (view file @ ffd1a26e)

(This diff is collapsed in the page view and is not reproduced here.)
test/srt/openai/test_server.py (view file @ ffd1a26e)

The page shows this file as an inline diff. The old placeholder for test_models_endpoint (an @pytest.mark.xfail marked "Endpoint skeleton not implemented yet" with a bare status-code assertion) is dropped, test_health gains an empty-body check, and two new tests are added. The resulting file:

# sglang/test/srt/openai/test_server.py
import pytest
import requests

from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST as MODEL_ID


def test_health(openai_server: str):
    r = requests.get(f"{openai_server}/health")
    assert r.status_code == 200
    # FastAPI returns an empty body → r.text == ""
    assert r.text == ""


def test_models_endpoint(openai_server: str):
    r = requests.get(f"{openai_server}/v1/models")
    assert r.status_code == 200, r.text
    payload = r.json()

    # Basic contract
    assert "data" in payload and isinstance(payload["data"], list) and payload["data"]

    # Validate fields of the first model card
    first = payload["data"][0]
    for key in ("id", "root", "max_model_len"):
        assert key in first, f"missing {key} in {first}"

    # max_model_len must be positive
    assert isinstance(first["max_model_len"], int) and first["max_model_len"] > 0

    # The server should report the same model id we launched it with
    ids = {m["id"] for m in payload["data"]}
    assert MODEL_ID in ids


def test_get_model_info(openai_server: str):
    r = requests.get(f"{openai_server}/get_model_info")
    assert r.status_code == 200, r.text
    info = r.json()

    expected_keys = {"model_path", "tokenizer_path", "is_generation"}
    assert expected_keys.issubset(info.keys())

    # model_path must end with the one we passed on the CLI
    assert info["model_path"].endswith(MODEL_ID)

    # is_generation is documented as a boolean
    assert isinstance(info["is_generation"], bool)


def test_unknown_route_returns_404(openai_server: str):
    r = requests.get(f"{openai_server}/definitely-not-a-real-route")
    assert r.status_code == 404
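Taken together, the assertions in test_models_endpoint encode a payload contract along these lines (an illustrative example, not captured server output; all concrete values are invented):

# Example /v1/models payload that would satisfy the assertions above.
example_payload = {
    "object": "list",
    "data": [
        {
            "id": "dummy-small-model",   # must match the MODEL_ID the server was launched with
            "object": "model",
            "root": "dummy-small-model",
            "max_model_len": 32768,      # must be a positive int
        }
    ],
}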
test/srt/openai/test_serving_chat.py (view file @ ffd1a26e)

(This diff is collapsed in the page view and is not reproduced here.)
test/srt/openai/test_serving_completions.py (view file @ ffd1a26e)

The page shows this file as an inline diff: the pytest fixtures (mock_tokenizer_manager, serving_completion) and the TestPromptHandling / TestEchoHandling classes are deleted, and their tests are rewritten into a single unittest.TestCase. The resulting file:

"""
Unit-tests for the refactored completions-serving handler (no pytest).

Run with:
    python -m unittest tests.test_serving_completions_unit -v
"""

import unittest
from unittest.mock import AsyncMock, Mock, patch

from sglang.srt.entrypoints.openai.protocol import CompletionRequest
from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
from sglang.srt.managers.io_struct import GenerateReqInput
from sglang.srt.managers.tokenizer_manager import TokenizerManager


class ServingCompletionTestCase(unittest.TestCase):
    """Bundle all prompt/echo tests in one TestCase."""

    # ---------- shared test fixtures ----------
    def setUp(self):
        # build the mock TokenizerManager once for every test
        tm = Mock(spec=TokenizerManager)
        tm.tokenizer = Mock()
        tm.tokenizer.encode.return_value = [1, 2, 3, 4]
        tm.tokenizer.decode.return_value = "decoded text"
        tm.tokenizer.bos_token_id = 1
        tm.model_config = Mock(is_multimodal=False)
        tm.server_args = Mock(enable_cache_report=False)
        tm.generate_request = AsyncMock()
        tm.create_abort_task = Mock()
        self.sc = OpenAIServingCompletion(tm)

    # ---------- prompt-handling ----------
    def test_single_string_prompt(self):
        req = CompletionRequest(model="x", prompt="Hello world", max_tokens=100)
        internal, _ = self.sc._convert_to_internal_request([req], ["id"])
        self.assertEqual(internal.text, "Hello world")

    def test_single_token_ids_prompt(self):
        req = CompletionRequest(model="x", prompt=[1, 2, 3, 4], max_tokens=100)
        internal, _ = self.sc._convert_to_internal_request([req], ["id"])
        self.assertEqual(internal.input_ids, [1, 2, 3, 4])

    def test_completion_template_handling(self):
        req = CompletionRequest(model="x", prompt="def f():", suffix="return 1", max_tokens=100)
        with patch(
            "sglang.srt.entrypoints.openai.serving_completions.is_completion_template_defined",
            return_value=True,
        ), patch(
            "sglang.srt.entrypoints.openai.serving_completions.generate_completion_prompt_from_request",
            return_value="processed_prompt",
        ):
            internal, _ = self.sc._convert_to_internal_request([req], ["id"])
        self.assertEqual(internal.text, "processed_prompt")

    # ---------- echo-handling ----------
    def test_echo_with_string_prompt_streaming(self):
        req = CompletionRequest(model="x", prompt="Hello", max_tokens=1, echo=True)
        self.assertEqual(self.sc._get_echo_text(req, 0), "Hello")

    def test_echo_with_list_of_strings_streaming(self):
        req = CompletionRequest(model="x", prompt=["A", "B"], max_tokens=1, echo=True, n=1)
        self.assertEqual(self.sc._get_echo_text(req, 0), "A")
        self.assertEqual(self.sc._get_echo_text(req, 1), "B")

    def test_echo_with_token_ids_streaming(self):
        req = CompletionRequest(model="x", prompt=[1, 2, 3], max_tokens=1, echo=True)
        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded_prompt"
        self.assertEqual(self.sc._get_echo_text(req, 0), "decoded_prompt")

    def test_echo_with_multiple_token_ids_streaming(self):
        req = CompletionRequest(model="x", prompt=[[1, 2], [3, 4]], max_tokens=1, echo=True, n=1)
        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded"
        self.assertEqual(self.sc._get_echo_text(req, 0), "decoded")

    def test_prepare_echo_prompts_non_streaming(self):
        # single string
        req = CompletionRequest(model="x", prompt="Hi", echo=True)
        self.assertEqual(self.sc._prepare_echo_prompts(req), ["Hi"])

        # list of strings
        req = CompletionRequest(model="x", prompt=["Hi", "Yo"], echo=True)
        self.assertEqual(self.sc._prepare_echo_prompts(req), ["Hi", "Yo"])

        # token IDs
        req = CompletionRequest(model="x", prompt=[1, 2, 3], echo=True)
        self.sc.tokenizer_manager.tokenizer.decode.return_value = "decoded"
        self.assertEqual(self.sc._prepare_echo_prompts(req), ["decoded"])


if __name__ == "__main__":
    unittest.main(verbosity=2)
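The echo tests above imply a type-based dispatch inside `_get_echo_text`: plain string prompts echo back directly, lists of strings are indexed by choice, and token-ID prompts pass through `tokenizer.decode` (which is why the mock's decode return value is set). A sketch of that implied dispatch, reconstructed from the test expectations rather than the actual method:

from typing import List, Union

def _get_echo_text_sketch(prompt: Union[str, List], tokenizer, index: int) -> str:
    # The real logic lives on OpenAIServingCompletion and may differ in detail.
    if isinstance(prompt, str):
        return prompt                       # "Hello" -> "Hello"
    if isinstance(prompt[0], str):
        return prompt[index]                # ["A", "B"], index 1 -> "B"
    if isinstance(prompt[0], int):
        return tokenizer.decode(prompt)     # [1, 2, 3] -> decoded text
    return tokenizer.decode(prompt[index])  # [[1, 2], [3, 4]] -> decoded text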
test/srt/openai/test_serving_embedding.py (view file @ ffd1a26e)

Like the completions tests, this module is converted from pytest fixtures (mock_tokenizer_manager, serving_embedding, mock_request, and the per-request fixtures) to a single unittest.TestCase whose setUp builds the same objects. Note that the class derives from plain unittest.TestCase, so the `async def` tests below are not awaited by the default runner; an async-aware base such as unittest.IsolatedAsyncioTestCase would be needed to actually execute them. The changed hunks follow; the large ones show the new side only.

@@ -8,11 +8,11 @@ with the original adapter.py functionality and follows OpenAI API specifications
 import asyncio
 import json
 import time
+import unittest
 import uuid
 from typing import Any, Dict, List
 from unittest.mock import AsyncMock, Mock, patch
 
-import pytest
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 from pydantic_core import ValidationError

@@ -30,7 +30,7 @@ from sglang.srt.managers.io_struct import EmbeddingReqInput
 # Mock TokenizerManager for embedding tests
-class MockTokenizerManager:
+class _MockTokenizerManager:
     def __init__(self):
         self.model_config = Mock()
         self.model_config.is_multimodal = False

@@ -58,141 +58,98 @@ class MockTokenizerManager: (new side shown)

        self.generate_request = Mock(return_value=mock_generate_embedding())


class ServingEmbeddingTestCase(unittest.TestCase):
    def setUp(self):
        """Set up test fixtures."""
        self.tokenizer_manager = _MockTokenizerManager()
        self.serving_embedding = OpenAIServingEmbedding(self.tokenizer_manager)
        self.request = Mock(spec=Request)
        self.request.headers = {}

        self.basic_req = EmbeddingRequest(
            model="test-model",
            input="Hello, how are you?",
            encoding_format="float",
        )
        self.list_req = EmbeddingRequest(
            model="test-model",
            input=["Hello, how are you?", "I am fine, thank you!"],
            encoding_format="float",
        )
        self.multimodal_req = EmbeddingRequest(
            model="test-model",
            input=[
                MultimodalEmbeddingInput(text="Hello", image="base64_image_data"),
                MultimodalEmbeddingInput(text="World", image=None),
            ],
            encoding_format="float",
        )
        self.token_ids_req = EmbeddingRequest(
            model="test-model",
            input=[1, 2, 3, 4, 5],
            encoding_format="float",
        )

    def test_convert_single_string_request(self):
        """Test converting single string request to internal format."""
        adapted_request, processed_request = (
            self.serving_embedding._convert_to_internal_request(
                [self.basic_req], ["test-id"]
            )
        )
        self.assertIsInstance(adapted_request, EmbeddingReqInput)
        self.assertEqual(adapted_request.text, "Hello, how are you?")
        self.assertEqual(adapted_request.rid, "test-id")
        self.assertEqual(processed_request, self.basic_req)

    def test_convert_list_string_request(self):
        """Test converting list of strings request to internal format."""
        adapted_request, processed_request = (
            self.serving_embedding._convert_to_internal_request(
                [self.list_req], ["test-id"]
            )
        )
        self.assertIsInstance(adapted_request, EmbeddingReqInput)
        self.assertEqual(
            adapted_request.text, ["Hello, how are you?", "I am fine, thank you!"]
        )
        self.assertEqual(adapted_request.rid, "test-id")
        self.assertEqual(processed_request, self.list_req)

    def test_convert_token_ids_request(self):
        """Test converting token IDs request to internal format."""
        adapted_request, processed_request = (
            self.serving_embedding._convert_to_internal_request(
                [self.token_ids_req], ["test-id"]
            )
        )
        self.assertIsInstance(adapted_request, EmbeddingReqInput)
        self.assertEqual(adapted_request.input_ids, [1, 2, 3, 4, 5])
        self.assertEqual(adapted_request.rid, "test-id")
        self.assertEqual(processed_request, self.token_ids_req)

    def test_convert_multimodal_request(self):
        """Test converting multimodal request to internal format."""
        adapted_request, processed_request = (
            self.serving_embedding._convert_to_internal_request(
                [self.multimodal_req], ["test-id"]
            )
        )
        self.assertIsInstance(adapted_request, EmbeddingReqInput)
        # Should extract text and images separately
        self.assertEqual(len(adapted_request.text), 2)
        self.assertIn("Hello", adapted_request.text)
        self.assertIn("World", adapted_request.text)
        self.assertEqual(adapted_request.image_data[0], "base64_image_data")
        self.assertIsNone(adapted_request.image_data[1])
        self.assertEqual(adapted_request.rid, "test-id")

    def test_build_single_embedding_response(self):
        """Test building response for single embedding."""
        ret_data = [
            {
...

@@ -201,19 +158,21 @@ (new side shown)

            }
        ]

        response = self.serving_embedding._build_embedding_response(ret_data, "test-model")

        self.assertIsInstance(response, EmbeddingResponse)
        self.assertEqual(response.model, "test-model")
        self.assertEqual(len(response.data), 1)
        self.assertEqual(response.data[0].embedding, [0.1, 0.2, 0.3, 0.4, 0.5])
        self.assertEqual(response.data[0].index, 0)
        self.assertEqual(response.data[0].object, "embedding")
        self.assertEqual(response.usage.prompt_tokens, 5)
        self.assertEqual(response.usage.total_tokens, 5)
        self.assertEqual(response.usage.completion_tokens, 0)

    def test_build_multiple_embedding_response(self):
        """Test building response for multiple embeddings."""
        ret_data = [
            {
...

@@ -226,25 +185,20 @@ (new side shown)

            },
        ]

        response = self.serving_embedding._build_embedding_response(ret_data, "test-model")

        self.assertIsInstance(response, EmbeddingResponse)
        self.assertEqual(len(response.data), 2)
        self.assertEqual(response.data[0].embedding, [0.1, 0.2, 0.3])
        self.assertEqual(response.data[0].index, 0)
        self.assertEqual(response.data[1].embedding, [0.4, 0.5, 0.6])
        self.assertEqual(response.data[1].index, 1)
        self.assertEqual(response.usage.prompt_tokens, 7)  # 3 + 4
        self.assertEqual(response.usage.total_tokens, 7)

    async def test_handle_request_success(self):
        """Test successful embedding request handling."""
        # Mock the generate_request to return expected data
...

@@ -254,32 +208,30 @@ (new side shown)

            "meta_info": {"prompt_tokens": 5},
        }

        self.serving_embedding.tokenizer_manager.generate_request = Mock(
            return_value=mock_generate()
        )

        response = await self.serving_embedding.handle_request(self.basic_req, self.request)

        self.assertIsInstance(response, EmbeddingResponse)
        self.assertEqual(len(response.data), 1)
        self.assertEqual(response.data[0].embedding, [0.1, 0.2, 0.3, 0.4, 0.5])

    async def test_handle_request_validation_error(self):
        """Test handling request with validation error."""
        invalid_request = EmbeddingRequest(model="test-model", input="")

        response = await self.serving_embedding.handle_request(invalid_request, self.request)

        self.assertIsInstance(response, ORJSONResponse)
        self.assertEqual(response.status_code, 400)

    async def test_handle_request_generation_error(self):
        """Test handling request with generation error."""
        # Mock generate_request to raise an error
...

@@ -287,30 +239,32 @@ (new side shown)

            raise ValueError("Generation failed")
            yield  # This won't be reached but needed for async generator

        self.serving_embedding.tokenizer_manager.generate_request = Mock(
            return_value=mock_generate_error()
        )

        response = await self.serving_embedding.handle_request(self.basic_req, self.request)

        self.assertIsInstance(response, ORJSONResponse)
        self.assertEqual(response.status_code, 400)

    async def test_handle_request_internal_error(self):
        """Test handling request with internal server error."""
        # Mock _convert_to_internal_request to raise an exception
        with patch.object(
            self.serving_embedding,
            "_convert_to_internal_request",
            side_effect=Exception("Internal error"),
        ):
            response = await self.serving_embedding.handle_request(self.basic_req, self.request)

        self.assertIsInstance(response, ORJSONResponse)
        self.assertEqual(response.status_code, 500)


if __name__ == "__main__":
    unittest.main(verbosity=2)
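A detail worth noting in the error-path tests above: `generate_request` is replaced with a plain Mock whose return value is an already-created async generator, so the handler only fails once it starts iterating. The pattern, extracted from the fragments above into a self-contained form:

from unittest.mock import Mock

tokenizer_manager = Mock()  # stands in for the _MockTokenizerManager instance


async def mock_generate_error():
    # raises as soon as the handler iterates the generator
    raise ValueError("Generation failed")
    yield  # This won't be reached but needed for async generator


# handle_request iterates this and surfaces the failure as a 400 response
tokenizer_manager.generate_request = Mock(return_value=mock_generate_error())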
test/srt/run_suite.py (view file @ ffd1a26e)

@@ -62,6 +62,11 @@ suites = {
         TestFile("test_openai_adapter.py", 1),
         TestFile("test_openai_function_calling.py", 60),
         TestFile("test_openai_server.py", 149),
+        TestFile("openai/test_server.py", 120),
+        TestFile("openai/test_protocol.py", 60),
+        TestFile("openai/test_serving_chat.py", 120),
+        TestFile("openai/test_serving_completions.py", 120),
+        TestFile("openai/test_serving_embedding.py", 120),
         TestFile("test_openai_server_hidden_states.py", 240),
         TestFile("test_penalty.py", 41),
         TestFile("test_page_size.py", 60),
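This hunk is what puts the refactored tests into CI: each new file is registered in the suite alongside the existing OpenAI tests. The second argument to TestFile appears to be a per-file time budget in seconds used when scheduling the suite; a sketch of the shape these entries presumably have (the actual class is defined elsewhere in run_suite.py, so this is an assumption):

from dataclasses import dataclass


@dataclass
class TestFile:
    name: str                 # path relative to test/srt
    estimated_time: int = 60  # seconds; presumed meaning of the numeric argument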