Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
711aa9d5
Commit
711aa9d5
authored
Jul 30, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.0' into v0.10.0-dev
parents
751c492c
6d8d0a24
Changes
519
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1961 additions
and
345 deletions
+1961
-345
tests/v1/entrypoints/openai/responses/conftest.py
tests/v1/entrypoints/openai/responses/conftest.py
+32
-0
tests/v1/entrypoints/openai/responses/test_basic.py
tests/v1/entrypoints/openai/responses/test_basic.py
+75
-0
tests/v1/entrypoints/openai/responses/test_image.py
tests/v1/entrypoints/openai/responses/test_image.py
+166
-0
tests/v1/entrypoints/openai/responses/test_stateful.py
tests/v1/entrypoints/openai/responses/test_stateful.py
+137
-0
tests/v1/entrypoints/openai/responses/test_structured_output.py
...v1/entrypoints/openai/responses/test_structured_output.py
+92
-0
tests/v1/entrypoints/openai/test_completion.py
tests/v1/entrypoints/openai/test_completion.py
+17
-1
tests/v1/entrypoints/openai/test_multi_api_servers.py
tests/v1/entrypoints/openai/test_multi_api_servers.py
+3
-120
tests/v1/kv_connector/__init__.py
tests/v1/kv_connector/__init__.py
+0
-0
tests/v1/kv_connector/unit/test_multi_connector.py
tests/v1/kv_connector/unit/test_multi_connector.py
+15
-69
tests/v1/kv_connector/unit/test_nixl_connector.py
tests/v1/kv_connector/unit/test_nixl_connector.py
+119
-80
tests/v1/kv_connector/unit/test_output_aggreagator.py
tests/v1/kv_connector/unit/test_output_aggreagator.py
+108
-0
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+62
-0
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+12
-6
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+47
-4
tests/v1/sample/test_sampling_params_e2e.py
tests/v1/sample/test_sampling_params_e2e.py
+36
-38
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+44
-24
tests/v1/test_async_llm_dp.py
tests/v1/test_async_llm_dp.py
+4
-2
tests/v1/test_external_lb_dp.py
tests/v1/test_external_lb_dp.py
+1
-1
tests/v1/test_hybrid_lb_dp.py
tests/v1/test_hybrid_lb_dp.py
+352
-0
tests/v1/test_internal_lb_dp.py
tests/v1/test_internal_lb_dp.py
+639
-0
No files found.
Too many changes to show.
To preserve performance only
519 of 519+
files are displayed.
Plain diff
Email patch
tests/v1/entrypoints/openai/responses/conftest.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
# Use a small reasoning model to test the responses API.
MODEL_NAME
=
"Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI flags used to launch the module-scoped test server."""
    context_flags = ["--max-model-len", "8192"]
    # Eager mode is requested for faster startup.
    startup_flags = ["--enforce-eager"]
    reasoning_flags = ["--reasoning-parser", "deepseek_r1"]
    return context_flags + startup_flags + reasoning_flags
@pytest.fixture(scope="module")
def server(default_server_args):
    """Yield a RemoteOpenAIServer for MODEL_NAME, kept open for the module."""
    server_context = RemoteOpenAIServer(MODEL_NAME, default_server_args)
    with server_context as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from the ``server`` fixture."""
    client_context = server.get_async_client()
    async with client_context as async_client:
        yield async_client
tests/v1/entrypoints/openai/responses/test_basic.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
openai
# use the official client for correctness check
import
pytest
@pytest.mark.asyncio
async def test_simple_input(client: openai.AsyncOpenAI):
    """Send a plain string prompt and check both the answer and the reasoning.

    The last output item must be a message containing "312"; the first
    output item must be a non-empty reasoning item.
    """
    result = await client.responses.create(input="What is 13 * 24?")
    print(result)

    items = result.output

    # Whether the output contains the answer.
    answer_item = items[-1]
    assert answer_item.type == "message"
    assert "312" in answer_item.content[0].text

    # Whether the output contains the reasoning.
    reasoning_item = items[0]
    assert reasoning_item.type == "reasoning"
    assert reasoning_item.text != ""
@pytest.mark.asyncio
async def test_instructions(client: openai.AsyncOpenAI):
    """Check that the final message has the answer and follows `instructions`."""
    request_kwargs = {
        "instructions": "Finish the answer with QED.",
        "input": "What is 13 * 24?",
    }
    result = await client.responses.create(**request_kwargs)
    print(result)

    answer_text = result.output[-1].content[0].text
    # Both the numeric answer and the requested suffix must be present.
    for expected in ("312", "QED"):
        assert expected in answer_text
@pytest.mark.asyncio
async def test_chat(client: openai.AsyncOpenAI):
    """Send a multi-turn chat as `input` and check the final message text."""
    conversation = [
        {"role": "system", "content": "Finish the answer with QED."},
        {"role": "user", "content": "What is 5 * 3?"},
        {"role": "assistant", "content": "15. QED."},
        {"role": "user", "content": "Multiply the result by 2."},
    ]
    result = await client.responses.create(input=conversation)
    print(result)

    answer_text = result.output[-1].content[0].text
    # The follow-up answer (15 * 2) and the system-requested suffix.
    for expected in ("30", "QED"):
        assert expected in answer_text
@pytest.mark.asyncio
async def test_chat_with_input_type(client: openai.AsyncOpenAI):
    """Send a user turn whose content is a list of typed parts."""
    text_part = {"type": "input_text", "text": "Hello!"}
    user_turn = {"role": "user", "content": [text_part]}

    result = await client.responses.create(input=[user_turn])
    print(result)

    assert result.status == "completed"
tests/v1/entrypoints/openai/responses/test_image.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
openai
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
# Use a small vision model for testing
MODEL_NAME
=
"Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES
=
2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS
=
[
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
,
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
,
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png"
,
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
]
@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the CLI flags used to launch the module-scoped image server."""
    # The per-prompt multimodal limit is passed as a JSON object.
    image_limit = json.dumps({"image": MAXIMUM_IMAGES})
    server_args = ["--enforce-eager"]
    server_args += ["--max-model-len", "6000"]
    server_args += ["--max-num-seqs", "128"]
    server_args += ["--limit-mm-per-prompt", image_limit]
    return server_args
@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Yield a RemoteOpenAIServer for MODEL_NAME, kept open for the module."""
    server_context = RemoteOpenAIServer(MODEL_NAME, default_image_server_args)
    with server_context as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(image_server):
    """Yield the async client obtained from the ``image_server`` fixture."""
    client_context = image_server.get_async_client()
    async with client_context as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
    """Map every URL in TEST_IMAGE_URLS to its fetched, base64-encoded image."""
    encoded_by_url = {}
    for url in TEST_IMAGE_URLS:
        encoded_by_url[url] = encode_image_base64(fetch_image(url))
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """Send one image by URL plus a question and expect non-empty text back."""
    image_part = {
        "type": "input_image",
        "image_url": image_url,
        "detail": "auto",
    }
    question_part = {"type": "input_text", "text": "What's in this image?"}
    user_turn = {"role": "user", "content": [image_part, question_part]}

    # test image url
    result = await client.responses.create(
        model=model_name,
        input=[user_turn],
    )
    assert len(result.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Send one image as a base64 data URL and expect non-empty text back."""
    encoded = base64_encoded_image[image_url]
    image_part = {
        "type": "input_image",
        "image_url": f"data:image/jpeg;base64,{encoded}",
        "detail": "auto",
    }
    question_part = {"type": "input_text", "text": "What's in this image?"}
    user_turn = {"role": "user", "content": [image_part, question_part]}

    # test image base64
    result = await client.responses.create(
        model=model_name,
        input=[user_turn],
    )
    assert len(result.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    """Send several images in one turn.

    Up to MAXIMUM_IMAGES images, the request must produce text. Above that,
    it must raise BadRequestError and a follow-up text-only request must
    still succeed.
    """
    content_parts = [{
        "type": "input_image",
        "image_url": url,
        "detail": "auto",
    } for url in image_urls]
    content_parts.append({
        "type": "input_text",
        "text": "What's in this image?"
    })
    messages = [{"role": "user", "content": content_parts}]

    if len(image_urls) <= MAXIMUM_IMAGES:
        accepted = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(accepted.output_text) > 0
        return

    # test multi-image input
    with pytest.raises(openai.BadRequestError):
        await client.responses.create(
            model=model_name,
            input=messages,
        )

    # the server should still work afterwards
    followup_turn = {
        "role": "user",
        "content": "What's the weather like in Paris today?",
    }
    followup = await client.responses.create(
        model=model_name,
        input=[followup_turn],
    )
    assert len(followup.output_text) > 0
tests/v1/entrypoints/openai/responses/test_stateful.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
openai
import
pytest
@pytest.mark.asyncio
async def test_store(client: openai.AsyncOpenAI):
    """Check retrieval of a response with the default `store` and `store=False`."""
    # By default, store is True.
    stored = await client.responses.create(input="Hello!")
    assert stored.status == "completed"

    # Retrieve the response.
    fetched = await client.responses.retrieve(stored.id)
    assert fetched.status == "completed"

    # Test store=False.
    unstored = await client.responses.create(
        input="Hello!",
        store=False,
    )
    assert unstored.status == "completed"

    # The response should not be found.
    expected_error = pytest.raises(openai.NotFoundError,
                                   match="Response with id .* not found.")
    with expected_error:
        await client.responses.retrieve(unstored.id)
@pytest.mark.asyncio
async def test_background(client: openai.AsyncOpenAI):
    """Create a background response and poll it until it completes.

    The response must start as "queued" and reach "completed" within
    `max_retries` one-second polls.
    """
    # NOTE: This query should be easy enough for the model to answer
    # within the 10 seconds.
    response = await client.responses.create(
        input="Hello!",
        background=True,
    )
    assert response.status == "queued"

    # Keep polling while the status is non-terminal. "in_progress" is a
    # valid intermediate Responses API status; stopping on it would make
    # the final assertion fail even though the request is still running.
    pending_statuses = ("queued", "in_progress")
    max_retries = 10
    for _ in range(max_retries):
        await asyncio.sleep(1)
        response = await client.responses.retrieve(response.id)
        if response.status not in pending_statuses:
            break
    print(response)

    assert response.status == "completed"
@pytest.mark.asyncio
async def test_background_error(client: openai.AsyncOpenAI):
    """Check that `background=True` with `store=False` is rejected."""
    expected_error = pytest.raises(
        openai.BadRequestError,
        match="background can only be used when `store` is true")
    with expected_error:
        _ = await client.responses.create(
            input="What is 13 * 24?",
            background=True,
            store=False,
        )
@pytest.mark.asyncio
async def test_background_cancel(client: openai.AsyncOpenAI):
    """Cancel a queued background response and check it stays cancelled."""
    queued = await client.responses.create(
        input="Write a long story about a cat.",
        background=True,
    )
    assert queued.status == "queued"
    response_id = queued.id

    # Cancel the response before it is completed.
    # FIXME: This test can be flaky.
    await asyncio.sleep(0.5)
    cancelled = await client.responses.cancel(response_id)
    assert cancelled.status == "cancelled"

    # Make sure the response status remains unchanged.
    await asyncio.sleep(5)
    rechecked = await client.responses.retrieve(cancelled.id)
    assert rechecked.status == "cancelled"
@pytest.mark.asyncio
async def test_cancel_completed(client: openai.AsyncOpenAI):
    """Check that cancelling a non-background response is rejected."""
    finished = await client.responses.create(input="Hello")
    assert finished.status == "completed"

    expected_error = pytest.raises(
        openai.BadRequestError,
        match="Cannot cancel a synchronous response.")
    with expected_error:
        await client.responses.cancel(finished.id)
@pytest.mark.asyncio
async def test_previous_response_id(client: openai.AsyncOpenAI):
    """Chain three responses via `previous_response_id`.

    The third answer must contain the corrected name ("Mark") and must not
    contain the original one ("John").
    """
    system_prompt = ("You are tested on your ability to retrieve the correct "
                     "information from the previous response.")
    first = await client.responses.create(
        instructions=system_prompt,
        input="Hello, my name is John.",
    )

    second = await client.responses.create(
        input="Actually, my name is not John. My real name is Mark.",
        previous_response_id=first.id,
    )

    third = await client.responses.create(
        input="What is my real name again? Answer in one word.",
        previous_response_id=second.id,
    )
    print(third)

    assert "Mark" in third.output[-1].content[0].text
    assert "John" not in third.output[-1].content[0].text
@pytest.mark.asyncio
async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
    """Branch two responses off the same previous response.

    The second branch never sees the correction made in the first branch,
    so its answer must contain "John" and must not contain "Mark".
    """
    system_prompt = ("You are tested on your ability to retrieve the correct "
                     "information from the previous response.")
    root = await client.responses.create(
        instructions=system_prompt,
        input="Hello, my name is John.",
    )

    # Both response 2 and 3 use response 1 as the previous response.
    # The two coroutines are created first and awaited afterwards, in order.
    correction_call = client.responses.create(
        input="Actually, my name is not John. My name is Mark.",
        previous_response_id=root.id,
    )
    question_call = client.responses.create(
        input="What is my name again? Answer in one word.",
        previous_response_id=root.id,
    )

    _ = await correction_call
    response3_result = await question_call
    print(response3_result)

    assert "John" in response3_result.output[-1].content[0].text
    assert "Mark" not in response3_result.output[-1].content[0].text
tests/v1/entrypoints/openai/responses/test_structured_output.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
openai
import
pytest
from
pydantic
import
BaseModel
@pytest.mark.asyncio
async def test_structured_output(client: openai.AsyncOpenAI):
    """Request JSON-schema output and check the parsed event fields."""
    event_schema = {
        "type": "object",
        "properties": {
            "event_name": {
                "type": "string"
            },
            "date": {
                "type": "string"
            },
            "participants": {
                "type": "array",
                "items": {
                    "type": "string"
                }
            },
        },
        "required": ["event_name", "date", "participants"],
        "additionalProperties": False,
    }
    text_format = {
        "type": "json_schema",
        "name": "calendar_event",
        "schema": event_schema,
        "description": "A calendar event.",
        "strict": True,
    }
    conversation = [
        {
            "role": "system",
            "content": "Extract the event information."
        },
        {
            "role": "user",
            "content": "Alice and Bob are going to a science fair on Friday.",
        },
    ]

    result = await client.responses.create(
        input=conversation,
        text={"format": text_format},
    )
    print(result)

    # NOTE: The JSON schema is applied to the output text, not reasoning.
    event = json.loads(result.output[-1].content[0].text)

    assert event["event_name"].lower() == "science fair"
    assert event["date"] == "Friday"
    attendees = event["participants"]
    assert len(attendees) == 2
    assert attendees[0] == "Alice"
    assert attendees[1] == "Bob"
@pytest.mark.asyncio
async def test_structured_output_with_parse(client: openai.AsyncOpenAI):
    """Use `responses.parse` with a pydantic model and check the parsed event."""

    class CalendarEvent(BaseModel):
        event_name: str
        date: str
        participants: list[str]

    result = await client.responses.parse(
        model=None,
        instructions="Extract the event information.",
        input="Alice and Bob are going to a science fair on Friday.",
        text_format=CalendarEvent,
    )
    print(result)

    # The output is successfully parsed.
    parsed_event = result.output_parsed
    assert parsed_event is not None

    # The output is correct.
    assert parsed_event.event_name.lower() == "science fair"
    assert parsed_event.date == "Friday"
    attendees = parsed_event.participants
    assert len(attendees) == 2
    assert attendees[0] == "Alice"
    assert attendees[1] == "Bob"
tests/v1/entrypoints/openai/test_completion.py
View file @
711aa9d5
...
...
@@ -7,6 +7,7 @@ import openai # use the official client for correctness check
import
pytest
import
pytest_asyncio
import
regex
as
re
import
requests
from
openai
import
BadRequestError
from
tests.utils
import
RemoteOpenAIServer
...
...
@@ -26,7 +27,8 @@ def default_server_args():
"2048"
,
"--max-num-seqs"
,
"128"
,
"--enforce-eager"
"--enforce-eager"
,
"--enable-prompt-tokens-details"
,
]
...
...
@@ -679,3 +681,17 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
prompt
=
prompt
,
extra_body
=
{
"guided_grammar"
:
invalid_simplified_sql_grammar
},
)
@pytest.mark.asyncio
async def test_completion_with_empty_prompt_embeds(
        client: openai.AsyncOpenAI) -> None:
    """Test completion with empty prompt embeds.

    Posts a raw JSON body with an empty `prompt_embeds` list to the
    completions endpoint and expects HTTP 200.
    """
    payload: dict[str, list] = {"prompt_embeds": []}
    headers: dict[str, str] = {"Content-Type": "application/json"}
    # base_url = http://localhost:8000/v1/completions
    # A timeout is required here: without one, `requests` waits forever on
    # an unresponsive server and the whole test run hangs.
    response = requests.post(f"{client.base_url}completions",
                             headers=headers,
                             json=payload,
                             timeout=60)
    assert response.status_code == 200, (
        f"Expected status code 200, got {response.status_code}. ")
tests/v1/entrypoints/openai/test_multi_api_servers.py
View file @
711aa9d5
...
...
@@ -2,136 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
import
re
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
MODEL_NAME
=
"ibm-research/PowerMoE-3b"
DP_SIZE
=
os
.
getenv
(
"DP_SIZE"
,
"1"
)
def get_prometheus_metrics(
        server: RemoteOpenAIServer) -> dict[str, dict[str, float]]:
    """Fetch the server's "metrics" URL and parse the Prometheus text.

    Returns:
        Dict mapping each metric name to a dict of label-set -> value.
        A labelled sample is keyed by its label text wrapped in braces
        (e.g. '{engine="0"}'); an unlabelled sample is keyed by ''.
        Blank lines, '#' comment lines, lines matching neither pattern and
        values that `float` cannot parse are skipped.

    Any exception raised while fetching or parsing fails the current test
    through `pytest.fail`.
    """
    try:
        reply = requests.get(server.url_for("metrics"), timeout=10)
        reply.raise_for_status()

        parsed: dict[str, dict[str, float]] = {}

        # Regex patterns for Prometheus metrics
        labelled_sample = re.compile(
            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\{([^}]*)\}\s+([\d\.\-\+e]+)$')
        bare_sample = re.compile(
            r'^([a-zA-Z_:][a-zA-Z0-9_:]*)\s+([\d\.\-\+e]+)$')

        for raw_line in reply.text.split('\n'):
            entry = raw_line.strip()
            # Skip comments and empty lines
            if not entry or entry.startswith('#'):
                continue

            # Try the labelled form first, then the bare form.
            hit = labelled_sample.match(entry)
            if hit:
                metric_name, labels_part, value_str = hit.groups()
                label_key = f'{{{labels_part}}}'
            else:
                hit = bare_sample.match(entry)
                if not hit:
                    continue
                metric_name, value_str = hit.groups()
                label_key = ''

            try:
                value = float(value_str)
            except ValueError:
                continue
            parsed.setdefault(metric_name, {})[label_key] = value

        return parsed

    except Exception as e:
        pytest.fail(f"Failed to fetch Prometheus metrics: {e}")
        return {}
def get_engine_request_counts(
        metrics: dict[str, dict[str, float]]) -> dict[str, float]:
    """Sum the "vllm:request_success_total" samples per engine label.

    Args:
        metrics: Metric name -> {label text -> value}.

    Returns:
        Dict mapping each `engine="..."` label value to the sum of its
        samples, for example: {"0": 15.0, "1": 12.0}. Samples without an
        engine label are ignored.
    """
    engine_label = re.compile(r'engine="([^"]*)"')
    totals: dict[str, float] = {}

    # Look for request success metrics with engine labels
    samples = metrics.get("vllm:request_success_total", {})
    for label_text, sample_value in samples.items():
        found = engine_label.search(label_text)
        if not found:
            continue
        engine_id = found.group(1)
        totals[engine_id] = totals.get(engine_id, 0.0) + sample_value

    return totals
def check_request_balancing(server: RemoteOpenAIServer):
    """Check request balancing via Prometheus metrics if DP_SIZE > 1.

    Asserts that exactly `int(DP_SIZE)` engines report a positive request
    count, and that every engine's count exceeds
    `total // (int(DP_SIZE) + 1)`.

    Args:
        server: The RemoteOpenAIServer instance
    """
    dp_size = int(DP_SIZE)
    if dp_size <= 1:
        return

    # Get metrics after all requests are completed
    engine_counts = get_engine_request_counts(get_prometheus_metrics(server))

    # Check that multiple engines received requests
    engines_with_requests = []
    for engine, count in engine_counts.items():
        if count > 0:
            engines_with_requests.append(engine)
    assert len(engines_with_requests) == dp_size, (
        f"Expected requests to be distributed across multiple engines,"
        f" but only engine(s) {engines_with_requests} received "
        f"requests. Engine counts: {engine_counts}")

    # Verify that the load is reasonably balanced
    # (no engine should handle all requests)
    total_requests = sum(engine_counts.values())
    minimum_share = total_requests // (dp_size + 1)
    for count in engine_counts.values():
        assert count > minimum_share, (
            f"requests are imbalanced: {engine_counts}")
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
():
return
[
...
...
@@ -217,7 +100,7 @@ async def test_single_completion(client: openai.AsyncOpenAI,
assert
all
(
completion
is
not
None
for
completion
in
results
)
# Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing
(
server
)
check_request_balancing
(
server
,
int
(
DP_SIZE
)
)
@
pytest
.
mark
.
asyncio
...
...
@@ -295,4 +178,4 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert
all
(
results
),
"Not all streaming requests completed successfully."
# Check request balancing via Prometheus metrics if DP_SIZE > 1
check_request_balancing
(
server
)
check_request_balancing
(
server
,
int
(
DP_SIZE
)
)
vllm/attention/ops/blocksparse_attention
/__init__.py
→
tests/v1/kv_connector
/__init__.py
View file @
711aa9d5
File moved
tests/v1/kv_connector/unit/test_multi_connector.py
View file @
711aa9d5
...
...
@@ -3,16 +3,10 @@
import
filecmp
import
shutil
import
tempfile
from
collections
import
defaultdict
from
pathlib
import
Path
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
,
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.factory
import
(
KVConnectorFactory
)
from
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector
import
(
# noqa
SharedStorageConnector
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.config
import
KVTransferConfig
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
...
...
@@ -25,62 +19,6 @@ PROMPTS = [
SAMPLING_PARAMS
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
20
)
class TestSharedStorageConnector(SharedStorageConnector):
    """SharedStorageConnector proxy that records every call it forwards.

    Attribute access is redirected to an inner SharedStorageConnector.
    Callable attributes are wrapped so that each call is counted in
    `call_record` and appended as one line to a per-connector event file,
    which can be read back in the main test process.
    """

    def __init__(self, config: VllmConfig, role):
        extra_config = config.kv_transfer_config.kv_connector_extra_config
        self.name = extra_config["name"]
        self._connector = SharedStorageConnector(config, role)
        self.call_record: dict[str, int] = defaultdict(int)

        # Use a unique temp file per connector
        temp_dir = tempfile.gettempdir()
        self._event_file = (
            temp_dir + f"/connector_{self.name}-{self.role.name}_events.log")

        # Start with an empty file
        with open(self._event_file, "w") as _:
            pass

    def __getattribute__(self, name):
        # These names live on the proxy itself; resolving them through the
        # inner connector would recurse back into this method.
        proxy_owned = ("_connector", "call_record", "name", "_event_file",
                       "__class__", "__dict__", "__getattribute__",
                       "__init__")
        if name in proxy_owned or not hasattr(self._connector, name):
            return object.__getattribute__(self, name)

        attr = getattr(self._connector, name)
        if not callable(attr):
            return attr

        # Intercept calls to the connector interface and write an event
        # for each one to a file, which can be read back in the main test proc.
        def wrapper(*args, **kwargs):
            self.call_record[name] += 1

            # Include args that we're interested in
            to_log = [name]
            for arg in args:
                if isinstance(arg, int):
                    to_log.append(str(arg))
                elif isinstance(arg, KVCacheBlocks):
                    block_counts = [len(b) for b in arg.blocks]
                    to_log.append(f"num_blocks={block_counts}")

            # Log the event as a line to the file
            try:
                with open(self._event_file, "a") as f:
                    f.write(' '.join(to_log) + "\n")
            except Exception as e:
                print(f"[ERROR] Could not log event {name} "
                      f"for {self.name}: {e}")
            return attr(*args, **kwargs)

        return wrapper
KVConnectorFactory
.
register_connector
(
"TestSharedStorageConnector"
,
TestSharedStorageConnector
.
__module__
,
TestSharedStorageConnector
.
__name__
)
# Helper function to compare directories recursively
def
_compare_directories
(
dir1
:
Path
,
dir2
:
Path
)
->
bool
:
"""Compares two directories recursively for identical content."""
...
...
@@ -115,19 +53,27 @@ def test_multi_shared_storage_connector_consistency():
kv_role
=
"kv_both"
,
kv_connector_extra_config
=
{
"connectors"
:
[{
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector_extra_config"
:
{
"shared_storage_path"
:
str
(
storage_1_path
),
"name"
:
"storage1"
,
}
},
"kv_connector_module_path"
:
"tests.v1.kv_connector.unit.utils"
,
},
{
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector_extra_config"
:
{
"shared_storage_path"
:
str
(
storage_2_path
),
"name"
:
"storage2"
,
}
},
"kv_connector_module_path"
:
"tests.v1.kv_connector.unit.utils"
,
}]
},
)
...
...
tests/v1/kv_connector/unit/test_nixl_connector.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
tempfile
import
textwrap
import
time
import
uuid
from
collections
import
defaultdict
from
typing
import
Optional
from
unittest.mock
import
patch
import
pytest
import
ray
from
vllm
import
LLM
from
vllm.config
import
KVTransferConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector
import
(
KVConnectorRole
,
NixlAgentMetadata
,
NixlConnector
,
NixlConnectorMetadata
,
NixlConnectorWorker
)
from
vllm.forward_context
import
ForwardContext
from
vllm.mocks.mock_nixl_connector
import
FakeNixlWrapper
from
vllm.sampling_params
import
SamplingParams
from
.utils
import
create_request
,
create_scheduler
,
create_vllm_config
def
_make_stub_pkg
()
->
str
:
"""Return a directory that makes
`from nixl._api import nixl_agent` resolve to our FakeNixlWrapper."""
td
=
tempfile
.
mkdtemp
()
pkg_root
=
os
.
path
.
join
(
td
,
"nixl"
,
"_api"
)
os
.
makedirs
(
pkg_root
,
exist_ok
=
True
)
stub
=
textwrap
.
dedent
(
"""
\
# Forward the real FakeNixlWrapper that the driver already defined.
print("In fake package")
from vllm.mocks.mock_nixl_connector import FakeNixlWrapper as nixl_agent
"""
)
with
open
(
os
.
path
.
join
(
pkg_root
,
"__init__.py"
),
"w"
)
as
f
:
f
.
write
(
stub
)
# touch parent package
open
(
os
.
path
.
join
(
td
,
"nixl"
,
"__init__.py"
),
"w"
).
close
()
return
td
def
test_basic_interface
():
"""Unit test for basic NixlConnector interface functionality."""
...
...
@@ -41,9 +66,9 @@ def test_basic_interface():
assert
kv_connector_metadata
is
not
None
assert
isinstance
(
kv_connector_metadata
,
NixlConnectorMetadata
)
assert
len
(
kv_connector_metadata
.
req
uests
)
==
1
assert
request_id
in
kv_connector_metadata
.
req
uests
req_meta
=
kv_connector_metadata
.
req
uests
[
request_id
]
assert
len
(
kv_connector_metadata
.
req
s_to_recv
)
==
1
assert
request_id
in
kv_connector_metadata
.
req
s_to_recv
req_meta
=
kv_connector_metadata
.
req
s_to_recv
[
request_id
]
for
block_id
,
block
in
zip
(
req_meta
.
local_block_ids
,
scheduler
.
kv_cache_manager
.
coordinator
.
...
...
@@ -78,83 +103,12 @@ def test_prompt_less_than_block_size():
kv_connector_metadata
=
scheduler_output
.
kv_connector_metadata
assert
kv_connector_metadata
is
not
None
assert
isinstance
(
kv_connector_metadata
,
NixlConnectorMetadata
)
assert
len
(
kv_connector_metadata
.
req
uests
)
==
0
assert
len
(
kv_connector_metadata
.
req
s_to_recv
)
==
0
# This request should be scheduled regularly.
assert
len
(
scheduler_output
.
scheduled_new_reqs
)
==
1
class FakeNixlWrapper:
    """Stand-in for NixlWrapper used by the tests.

    It deliberately does not subclass nixl._api.nixl_agent, because nixl
    may not be installed.
    """

    AGENT_METADATA = b"fake_agent_metadata"
    REMOTE_AGENT_NAME = "remote_agent"

    def __init__(self, agent_name: str, *args, **kwargs):
        # Number of "PROC" polls each handle reports before "DONE".
        self._cycles_before_xfer_done = 0
        # Per-handle count of "PROC" polls reported so far.
        self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict(
            int)

    def get_reg_descs(self, caches_data, memory_type: str) -> list:
        """Return one random UUID string per entry of `caches_data`."""
        descs = []
        for _ in caches_data:
            descs.append(str(uuid.uuid4()))
        return descs

    def register_memory(self, descs) -> None:
        """Do nothing."""

    def get_xfer_descs(self, blocks_data, memory_type: str) -> list:
        """Return one random UUID string per entry of `blocks_data`."""
        descs = []
        for _ in blocks_data:
            descs.append(str(uuid.uuid4()))
        return descs

    def prep_xfer_dlist(self, agent_name: str, descs: list) -> int:
        """Return a random integer handle."""
        return uuid.uuid4().int

    def get_agent_metadata(self) -> bytes:
        """Return the fixed AGENT_METADATA bytes."""
        return self.AGENT_METADATA

    def add_remote_agent(self, agent_metadata: bytes) -> str:
        """Return the fixed REMOTE_AGENT_NAME."""
        return self.REMOTE_AGENT_NAME

    def get_new_notifs(self) -> dict[str, list[bytes]]:
        """Return an empty notification dict."""
        # Used to collect done_sending, which we don't test yet.
        return {}

    def check_xfer_state(self, handle: int) -> str:
        """Return "PROC" for the configured number of polls, then "DONE"."""
        polls_so_far = self._check_xfer_state_cycles[handle]
        if polls_so_far < self._cycles_before_xfer_done:
            self._check_xfer_state_cycles[handle] = polls_so_far + 1
            return "PROC"
        return "DONE"

    def release_xfer_handle(self, handle: int) -> None:
        """Do nothing."""

    def send_notif(self, agent_name: str, notif_msg: bytes) -> None:
        """Do nothing."""

    def make_prepped_xfer(self,
                          xfer_type: str,
                          local_xfer_side_handle: int,
                          local_block_descs_ids: list[int],
                          remote_xfer_side_handle: int,
                          remote_block_descs_ids: list[int],
                          notif_msg: Optional[bytes] = None) -> int:
        """Return a random integer transfer handle."""
        return uuid.uuid4().int

    def transfer(self, handle: int) -> str:
        """Always report the transfer as in progress ("PROC")."""
        return "PROC"

    ############################################################
    # The following are for changing the behavior during testing.
    ############################################################

    def set_cycles_before_xfer_done(self, cycles: int):
        """Set the number of cycles before a transfer is considered done."""
        self._cycles_before_xfer_done = cycles
class
FakeNixlConnectorWorker
(
NixlConnectorWorker
):
REMOTE_ENGINE_ID
=
"remote_engine"
...
...
@@ -163,8 +117,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
_hand_shake_latency
=
hand_shake_latency
def
_nixl_handshake
(
self
,
host
:
str
,
port
:
int
,
remote_tp_size
:
int
)
->
dict
[
int
,
str
]:
def
_nixl_handshake
(
self
,
host
:
str
,
port
:
int
,
remote_tp_size
:
int
,
expected_engine_id
:
str
)
->
dict
[
int
,
str
]:
# Mimic slow _nixl_handshake, as well as bypass zmq communication.
time
.
sleep
(
self
.
_hand_shake_latency
)
# These should've been done in register_kv_caches(), called by
...
...
@@ -174,6 +128,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
self
.
num_blocks
=
1
self
.
dst_num_blocks
[
self
.
engine_id
]
=
self
.
num_blocks
assert
expected_engine_id
==
self
.
REMOTE_ENGINE_ID
remote_agent_name
=
self
.
add_remote_agent
(
NixlAgentMetadata
(
engine_id
=
self
.
REMOTE_ENGINE_ID
,
...
...
@@ -371,3 +327,86 @@ class TestNixlHandshake:
if
cnt_finished_reqs
==
total_reqs
:
return
raise
TimeoutError
(
"Took too long to complete async handshake."
)
# NOTE: resource cleanup in the mp backend is a bit finicky, so the order in
# which the backends are listed here is important. Run ray first (it will clean
# up the resources), then the rest of the tests.
@pytest.mark.parametrize("distributed_executor_backend", ["ray", None])
@patch(
    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
    FakeNixlWrapper)
def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
    """
    Test lifecycle of an aborted Remote Prefill request hitting the timeout.
    -----> P
            |  {process request}
    <-/--- |  {result is NOT delivered, eg proxy is down}
            |
            |
            |  {eventually free blocks}

    Args:
        monkeypatch: pytest fixture used to set the environment variables.
        distributed_executor_backend: "ray" or None, forwarded to ``LLM``.
    """
    model_name = "Qwen/Qwen3-0.6B"
    kv_transfer_config = KVTransferConfig(
        kv_connector="NixlConnector",
        kv_role="kv_both",
    )
    timeout = 6
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout))

    # Build runtime_env only if we're using Ray
    if distributed_executor_backend == "ray":
        runtime_env = {
            "working_dir": _make_stub_pkg(),  # ship stub package
            "env_vars": {
                "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout),
            },
        }
        ray.init(runtime_env=runtime_env)

    llm = LLM(
        model=model_name,
        enforce_eager=True,
        gpu_memory_utilization=0.5,
        kv_transfer_config=kv_transfer_config,
        distributed_executor_backend=distributed_executor_backend,
    )
    remote_prefill_opts = {
        "do_remote_decode": True,
        "do_remote_prefill": False,
        "remote_engine_id": None,
        "remote_block_ids": None,
        "remote_host": None,
        "remote_port": None,
    }
    # Simulate sidecar request
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=1,
        extra_args={"kv_transfer_params": remote_prefill_opts})
    scheduler = llm.llm_engine.engine_core.engine_core.scheduler
    req_to_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[
        0].req_to_blocks

    # The parentheses are required: without them only the first literal is
    # assigned and the remaining lines are no-op expression statements, which
    # leaves the prompt shorter than intended.
    padding = (
        "Just making this request a little longer so that we're sure "
        "we're not hitting the small-request lower bound beneath which we don't "
        "actually trigger the whole kv transfer, but rather just recompute the "
        "blocks on D.")
    _ = llm.generate([f"What is the capital of Japan? {padding}"],
                     sampling_params)

    # Request finished but not freed
    assert '0' in scheduler.finished_req_ids and '0' in req_to_blocks
    # Some other request, 0 still not freed
    _ = llm.generate([f"What is the capital of Italy? {padding}"],
                     sampling_params)
    assert '0' in req_to_blocks
    assert '1' in scheduler.finished_req_ids and '1' in req_to_blocks

    # Wait for timeout and trigger another scheduler loop
    time.sleep(timeout)
    _ = llm.generate([f"What is the capital of France? {padding}"],
                     sampling_params)
    # Request-0 times out and is cleared!
    assert '0' not in req_to_blocks
tests/v1/kv_connector/unit/test_output_aggreagator.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
concurrent.futures
import
Future
from
typing
import
Optional
from
vllm.distributed.kv_transfer.kv_connector.utils
import
KVOutputAggregator
from
vllm.v1.outputs
import
ModelRunnerOutput
class DummyModelRunnerOutput(ModelRunnerOutput):
    """Bare-bones ``ModelRunnerOutput`` carrying only the two KV-transfer sets.

    The parent ``__init__`` is not invoked; only ``finished_sending`` and
    ``finished_recving`` are set on the instance.
    """

    def __init__(self,
                 finished_sending: Optional[set[str]] = None,
                 finished_recving: Optional[set[str]] = None):
        # Store both request-id sets (or None) exactly as given.
        self.finished_sending, self.finished_recving = (finished_sending,
                                                        finished_recving)
def test_aggregate_workers_output():
    """Drive ``KVOutputAggregator.aggregate`` through three consecutive steps.

    A single aggregator (world_size=2) is reused for all steps, in order.
    The expected values of the later steps rely on what earlier steps
    reported, so the steps must not be reordered.
    """
    aggregator = KVOutputAggregator(world_size=2)

    # (worker-1 kwargs, worker-2 kwargs, expected sending, expected recving)
    steps = [
        (
            dict(finished_sending={'req1'}, finished_recving={'req2'}),
            dict(finished_sending=None, finished_recving=None),
            None,
            None,
        ),
        (
            dict(finished_sending=None, finished_recving=None),
            dict(finished_sending={'req1'}, finished_recving=None),
            {'req1'},
            None,
        ),
        (
            dict(finished_sending=None, finished_recving=None),
            dict(finished_sending={'req1'}, finished_recving={'req2'}),
            None,
            {'req2'},
        ),
    ]

    for first_kwargs, second_kwargs, want_sending, want_recving in steps:
        output1 = DummyModelRunnerOutput(**first_kwargs)
        output2 = DummyModelRunnerOutput(**second_kwargs)

        aggregated = aggregator.aggregate([output1, output2])

        # The first worker's output object is the one handed back.
        assert aggregated is output1
        checks = (
            (aggregated.finished_sending, want_sending),
            (aggregated.finished_recving, want_recving),
        )
        for actual, expected in checks:
            if expected is None:
                assert actual is None
            else:
                assert actual == expected
def test_async_aggregate_workers_output():
    """Drive ``KVOutputAggregator.async_aggregate`` through three steps.

    For every step two fresh futures are handed to the aggregator first and
    resolved afterwards; the returned future must then be done. One
    aggregator (world_size=2) is reused for all steps, in order, and the
    later expectations rely on what earlier steps reported.
    """
    aggregator = KVOutputAggregator(world_size=2)

    # (worker-1 kwargs, worker-2 kwargs, expected sending, expected recving)
    steps = [
        (
            dict(finished_sending={'req1'}, finished_recving={'req2'}),
            dict(finished_sending=None, finished_recving=None),
            None,
            None,
        ),
        (
            dict(finished_sending=None, finished_recving=None),
            dict(finished_sending={'req1'}, finished_recving=None),
            {'req1'},
            None,
        ),
        (
            dict(finished_sending=None, finished_recving=None),
            dict(finished_sending={'req1'}, finished_recving={'req2'}),
            None,
            {'req2'},
        ),
    ]

    for first_kwargs, second_kwargs, want_sending, want_recving in steps:
        future1: Future[DummyModelRunnerOutput] = Future()
        future2: Future[DummyModelRunnerOutput] = Future()
        # Register the pending futures before any result exists.
        result_future = aggregator.async_aggregate([future1, future2])

        output1 = DummyModelRunnerOutput(**first_kwargs)
        output2 = DummyModelRunnerOutput(**second_kwargs)
        future1.set_result(output1)
        future2.set_result(output2)

        assert result_future.done()
        aggregated = result_future.result()

        # The first worker's output object is the one handed back.
        assert aggregated is output1
        checks = (
            (aggregated.finished_sending, want_sending),
            (aggregated.finished_recving, want_recving),
        )
        for actual, expected in checks:
            if expected is None:
                assert actual is None
            else:
                assert actual == expected
tests/v1/kv_connector/unit/utils.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
tempfile
from
collections
import
defaultdict
from
typing
import
Any
,
Optional
import
torch
...
...
@@ -7,6 +9,11 @@ import torch
from
vllm
import
SamplingParams
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
KVTransferConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
)
from
vllm.distributed.kv_transfer.kv_connector.factory
import
(
KVConnectorFactory
)
from
vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector
import
(
# noqa
SharedStorageConnector
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.core.sched.scheduler
import
Scheduler
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
KVCacheGroupSpec
)
...
...
@@ -187,3 +194,58 @@ def create_model_runner_output(
finished_sending
=
finished_sending
,
finished_recving
=
finished_recving
,
)
class TestSharedStorageConnector(SharedStorageConnector):
    """Recording proxy around a real ``SharedStorageConnector``.

    Attribute access is redirected to an inner connector instance. Callable
    attributes are wrapped so that each call is counted in ``call_record``
    and appended as one line to a per-connector event file.
    """

    def __init__(self, config: VllmConfig, role):
        """Create the inner connector and an empty event file.

        Args:
            config: vLLM config; ``kv_connector_extra_config["name"]`` supplies
                this connector's name.
            role: passed straight through to ``SharedStorageConnector``.
        """
        self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
        self._connector = SharedStorageConnector(config, role)
        self.call_record: dict[str, int] = defaultdict(int)
        # Use a unique temp file per connector
        # NOTE(review): ``self.role`` is not set in this method; it goes
        # through __getattribute__ below and presumably resolves to the inner
        # connector's ``role`` - confirm.
        self._event_file = tempfile.gettempdir(
        ) + f"/connector_{self.name}-{self.role.name}_events.log"
        # Start with an empty file
        with open(self._event_file, "w") as _:
            pass

    def __getattribute__(self, name):
        """Resolve ``name`` on the inner connector, wrapping callables.

        Names in the tuple below, and names the inner connector does not
        have, are looked up on this object directly. Everything else comes
        from the inner connector; callables are returned wrapped so the call
        is recorded before being forwarded.
        """
        if name in ("_connector", "call_record", "name", "_event_file",
                    "__class__", "__dict__", "__getattribute__", "__init__"):
            # avoid recursion
            return object.__getattribute__(self, name)
        if not hasattr(self._connector, name):
            return object.__getattribute__(self, name)
        attr = getattr(self._connector, name)

        # Intercept calls to the connector interface and write an event
        # for each one to a file, which can be read back in the main test proc.
        if callable(attr):

            def wrapper(*args, **kwargs):
                self.call_record[name] += 1

                # Include args that we're interested in
                # (only positional ints and KVCacheBlocks are logged;
                # keyword arguments are not inspected)
                to_log = [name]
                for arg in args:
                    if isinstance(arg, int):
                        to_log.append(str(arg))
                    elif isinstance(arg, KVCacheBlocks):
                        to_log.append(
                            f"num_blocks={[len(b) for b in arg.blocks]}")

                # Log the event as a line to the file
                try:
                    with open(self._event_file, "a") as f:
                        f.write(' '.join(to_log) + "\n")
                except Exception as e:
                    # Best effort: a logging failure is printed and the call
                    # is still forwarded.
                    print(f"[ERROR] Could not log event {name} "
                          f"for {self.name}: {e}")
                return attr(*args, **kwargs)

            return wrapper
        return attr
# Register the recording connector under the name "TestSharedStorageConnector".
# The factory receives this module's name and the class name as strings rather
# than the class object (presumably so it can import the class on demand -
# confirm against KVConnectorFactory).
KVConnectorFactory.register_connector("TestSharedStorageConnector", __name__,
                                      TestSharedStorageConnector.__name__)
tests/v1/metrics/test_ray_metrics.py
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
ray
from
vllm.config
import
ModelDType
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusStatLogger
...
...
@@ -27,7 +30,7 @@ MODELS = [
def
test_engine_log_metrics_ray
(
example_prompts
,
model
:
str
,
dtype
:
str
,
dtype
:
ModelDType
,
max_tokens
:
int
,
)
->
None
:
""" Simple smoke test, verifying this can be used without exceptions.
...
...
@@ -37,11 +40,14 @@ def test_engine_log_metrics_ray(
class
EngineTestActor
:
async
def
run
(
self
):
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
)
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os
.
environ
[
'VLLM_USE_V1'
]
=
'1'
engine_args
=
AsyncEngineArgs
(
model
=
model
,
dtype
=
dtype
,
disable_log_stats
=
False
,
enforce_eager
=
True
)
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
,
stat_loggers
=
[
RayPrometheusStatLogger
])
...
...
tests/v1/sample/test_logprobs.py
View file @
711aa9d5
...
...
@@ -13,6 +13,7 @@ from tests.v1.sample.utils import (
assert_incr_detok_str_matches_non_incr_detok_str
,
compute_correct_cumulative_logprob
,
get_test_batch
)
from
vllm
import
SamplingParams
from
vllm.config
import
LogprobsMode
from
...conftest
import
HfRunner
,
VllmRunner
from
...utils
import
models_path_prefix
...
...
@@ -114,7 +115,7 @@ def _run_and_validate(
max_tokens
:
int
,
do_apc
:
bool
,
)
->
None
:
vllm_results
=
vllm_model
.
model
.
generate
(
vllm_results
=
vllm_model
.
llm
.
generate
(
test_prompts
,
sampling_params
=
vllm_sampling_params
)
for
vllm_result
,
hf_logprob
,
hf_output
,
logprob_prompt_logprob
in
zip
(
...
...
@@ -290,7 +291,7 @@ def test_get_logprobs_and_prompt_logprobs(
"""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
do_apc
=
vllm_model
.
model
.
llm_engine
.
cache_config
.
enable_prefix_caching
do_apc
=
vllm_model
.
llm
.
llm_engine
.
cache_config
.
enable_prefix_caching
if
do_apc
and
(
temperature
<
2.0
or
batch_logprobs_composition
!=
SAMPLE_PROMPT
):
# Skip some test-cases to save time.
...
...
@@ -380,7 +381,7 @@ def test_none_logprobs(vllm_model, example_prompts,
prompt_logprobs
=
None
,
temperature
=
0.0
,
)
results_logprobs_none
=
vllm_model
.
model
.
generate
(
results_logprobs_none
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_none
,
)
...
...
@@ -410,7 +411,7 @@ def test_zero_logprobs(vllm_model, example_prompts,
logprobs
=
0
,
prompt_logprobs
=
0
,
temperature
=
0.0
)
results_logprobs_zero
=
vllm_model
.
model
.
generate
(
results_logprobs_zero
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params_logprobs_zero
)
for
i
in
range
(
len
(
results_logprobs_zero
)):
...
...
@@ -428,3 +429,45 @@ def test_zero_logprobs(vllm_model, example_prompts,
# prompt token
assert
prompt_logprobs
is
not
None
assert
len
(
prompt_token_ids
)
==
len
(
prompt_logprobs
)
@pytest.mark.parametrize(
    "logprobs_mode",
    ["raw_logprobs", "raw_logits", "processed_logprobs", "processed_logits"])
def test_logprobs_mode(logprobs_mode: LogprobsMode,
                       monkeypatch: pytest.MonkeyPatch):
    """Run the LLM engine under each ``logprobs_mode`` and check value signs.

    For the logprobs modes, every reported value must be non-positive.
    For the logits modes, at least one reported value must be positive.
    """
    from vllm import LLM

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine_kwargs = dict(
            max_logprobs=5,
            enable_prefix_caching=False,
            # 2 other llms alive during whole session
            gpu_memory_utilization=0.05,
            max_model_len=16,
            logprobs_mode=logprobs_mode,
        )
        llm = LLM("facebook/opt-125m", **engine_kwargs)
        vllm_sampling_params = SamplingParams(logprobs=1)
        results = llm.generate(["Hello world"],
                               sampling_params=vllm_sampling_params)

        must_be_non_positive = "logprobs" in logprobs_mode
        num_entries = 0
        num_positive = 0
        completions = results[0].outputs
        for completion in completions:
            for position_logprobs in completion.logprobs:
                for entry in position_logprobs.values():
                    if must_be_non_positive:
                        assert entry.logprob <= 0
                    if entry.logprob > 0:
                        num_positive += 1
                    num_entries += 1

        # At least one logprob entry per returned completion.
        assert num_entries >= len(completions)
        if "logits" in logprobs_mode:
            assert num_positive > 0
        del llm
tests/v1/sample/test_sampling_params_e2e.py
View file @
711aa9d5
...
...
@@ -15,30 +15,30 @@ PROMPT = "Hello my name is Robert and I"
@
pytest
.
fixture
(
scope
=
"module"
)
def
model
()
->
LLM
:
def
llm
()
->
LLM
:
# Disable prefix caching so that we can test prompt logprobs.
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949
# is merged
return
LLM
(
MODEL
,
enforce_eager
=
True
,
enable_prefix_caching
=
False
)
def
test_n_gt_1
(
model
):
def
test_n_gt_1
(
llm
):
"""ParallelSampling is supported."""
params
=
SamplingParams
(
n
=
3
)
outputs
=
model
.
generate
(
PROMPT
,
params
)
outputs
=
llm
.
generate
(
PROMPT
,
params
)
assert
len
(
outputs
[
0
].
outputs
)
==
3
def
test_best_of
(
model
):
def
test_best_of
(
llm
):
"""Raise a ValueError since best_of is deprecated."""
params
=
SamplingParams
(
n
=
2
,
best_of
=
3
)
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
params
)
_
=
llm
.
generate
(
PROMPT
,
params
)
def
test_penalties
(
model
):
def
test_penalties
(
llm
):
"""Check that we do not get errors if applied."""
params
=
SamplingParams
(
...
...
@@ -50,18 +50,18 @@ def test_penalties(model):
top_p
=
0.5
,
top_k
=
3
,
)
_
=
model
.
generate
(
PROMPT
,
params
)
_
=
llm
.
generate
(
PROMPT
,
params
)
def
test_stop
(
model
):
def
test_stop
(
llm
):
"""Check that we respect the stop words."""
output
=
model
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
split_text
=
output
[
0
].
outputs
[
0
].
text
.
split
()
STOP_IDX
=
5
params
=
SamplingParams
(
temperature
=
0
,
stop
=
split_text
[
STOP_IDX
])
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
new_split_text
=
output
[
0
].
outputs
[
0
].
text
.
split
()
# Output should not contain the stop word.
...
...
@@ -70,40 +70,40 @@ def test_stop(model):
params
=
SamplingParams
(
temperature
=
0
,
stop
=
split_text
[
STOP_IDX
],
include_stop_str_in_output
=
True
)
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
new_split_text
=
output
[
0
].
outputs
[
0
].
text
.
split
()
# Output should contain the stop word.
assert
len
(
new_split_text
)
==
STOP_IDX
+
1
def
test_stop_token_ids
(
model
):
def
test_stop_token_ids
(
llm
):
"""Check that we respect the stop token ids."""
output
=
model
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
stop_token_id_0
=
output
[
0
].
outputs
[
0
].
token_ids
[
5
]
stop_token_id_1
=
output
[
0
].
outputs
[
0
].
token_ids
[
6
]
stop_token_ids
=
[
stop_token_id_1
,
stop_token_id_0
]
params
=
SamplingParams
(
temperature
=
0
,
stop_token_ids
=
stop_token_ids
)
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
assert
output
[
0
].
outputs
[
0
].
token_ids
[
-
1
]
==
stop_token_id_0
stop_token_ids
=
[
stop_token_id_0
,
stop_token_id_1
]
params
=
SamplingParams
(
temperature
=
0
,
stop_token_ids
=
stop_token_ids
)
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
assert
output
[
0
].
outputs
[
0
].
token_ids
[
-
1
]
==
stop_token_id_0
def
test_detokenize_false
(
model
):
def
test_detokenize_false
(
llm
):
"""Check that detokenize=False option works."""
output
=
model
.
generate
(
PROMPT
,
SamplingParams
(
detokenize
=
False
))
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
detokenize
=
False
))
assert
len
(
output
[
0
].
outputs
[
0
].
token_ids
)
>
0
assert
len
(
output
[
0
].
outputs
[
0
].
text
)
==
0
output
=
model
.
generate
(
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
detokenize
=
False
,
logprobs
=
3
,
prompt_logprobs
=
3
))
assert
len
(
output
[
0
].
outputs
[
0
].
token_ids
)
>
0
...
...
@@ -119,28 +119,28 @@ def test_detokenize_false(model):
assert
all
(
lp
.
decoded_token
is
None
for
lp
in
logprobs
.
values
())
def
test_bad_words
(
model
):
def
test_bad_words
(
llm
):
"""Check that we respect bad words."""
output
=
model
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
temperature
=
0
))
split_text
=
output
[
0
].
outputs
[
0
].
text
.
split
()
bad_words_1
=
" "
.
join
(
split_text
[:
2
])
params
=
SamplingParams
(
temperature
=
0
,
bad_words
=
[
bad_words_1
])
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
new_text
=
output
[
0
].
outputs
[
0
].
text
assert
bad_words_1
not
in
new_text
bad_words_2
=
new_text
.
split
()[
-
1
]
params
=
SamplingParams
(
temperature
=
0
,
bad_words
=
[
bad_words_1
,
bad_words_2
])
output
=
model
.
generate
(
PROMPT
,
params
)
output
=
llm
.
generate
(
PROMPT
,
params
)
new_text
=
output
[
0
].
outputs
[
0
].
text
assert
bad_words_1
not
in
new_text
assert
bad_words_2
not
in
new_text
def
test_logits_processor
(
model
):
def
test_logits_processor
(
llm
):
"""Check that we reject logits processor."""
# This sample logits processor gives infinite score to the i-th token,
...
...
@@ -151,47 +151,45 @@ def test_logits_processor(model):
return
logits
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
SamplingParams
(
logits_processors
=
[
pick_ith
]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
logits_processors
=
[
pick_ith
]))
def
test_allowed_token_ids
(
model
):
def
test_allowed_token_ids
(
llm
):
"""Check that we can use allowed_token_ids."""
TOKEN_ID
=
10
allowed_token_ids
=
[
TOKEN_ID
]
output
=
model
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
allowed_token_ids
))
output
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
allowed_token_ids
))
assert
output
[
0
].
outputs
[
0
].
token_ids
[
-
1
]
==
TOKEN_ID
# Reject empty allowed_token_ids.
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[]))
# Reject negative token id.
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
-
1
]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
-
1
]))
# Reject out of vocabulary.
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
_
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
allowed_token_ids
=
[
10000000
]))
def
test_priority
(
model
):
def
test_priority
(
llm
):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with
pytest
.
raises
(
ValueError
):
_
=
model
.
generate
(
PROMPT
,
priority
=
[
1
])
_
=
llm
.
generate
(
PROMPT
,
priority
=
[
1
])
def
test_seed
(
model
):
def
test_seed
(
llm
):
"""Check that seed impacts randomness."""
out_1
=
model
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
42
))
out_2
=
model
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
42
))
out_3
=
model
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
43
))
out_1
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
42
))
out_2
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
42
))
out_3
=
llm
.
generate
(
PROMPT
,
SamplingParams
(
seed
=
43
))
assert
out_1
[
0
].
outputs
[
0
].
text
==
out_2
[
0
].
outputs
[
0
].
text
assert
out_1
[
0
].
outputs
[
0
].
text
!=
out_3
[
0
].
outputs
[
0
].
text
tests/v1/spec_decode/test_eagle.py
View file @
711aa9d5
...
...
@@ -6,6 +6,10 @@ from unittest import mock
import
pytest
import
torch
from
tests.v1.attention.utils
import
(
BatchSpec
,
_Backend
,
create_common_attn_metadata
,
create_standard_kv_cache_spec
,
get_attention_backend
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
SpeculativeConfig
,
VllmConfig
)
...
...
@@ -64,13 +68,19 @@ def test_prepare_inputs():
"""
device
=
torch
.
device
(
current_platform
.
device_type
)
#
a
= 4,
b
= 7,
c
= 5
#
q1
= 4,
q2
= 7,
q3
= 5
# n1 = 1, n2 = 3, n3 = 2
# Cumulative lengths: [0, 4, 11, 16]
cu_target_query_lens
=
torch
.
tensor
([
0
,
4
,
11
,
16
],
dtype
=
torch
.
int32
,
device
=
device
)
batch_spec
=
BatchSpec
(
seq_lens
=
[
4
,
7
,
5
],
query_lens
=
[
4
,
7
,
5
],
)
common_attn_metadata
=
create_common_attn_metadata
(
batch_spec
,
block_size
=
16
,
device
=
device
,
)
# Rejected tokens per request: [1, 3, 2]
num_rejected_tokens
=
torch
.
tensor
([
1
,
3
,
2
],
...
...
@@ -104,15 +114,13 @@ def test_prepare_inputs():
],
dtype
=
torch
.
int32
,
device
=
device
)
proposer
=
_create_proposer
(
"eagle"
,
1
)
# n1 + n2 + n3 - a - b -c
num_tokens
=
cu_target_query_lens
[
-
1
].
item
()
-
num_rejected_tokens
.
sum
(
).
item
()
updated_metadata
,
token_indices
=
proposer
.
prepare_inputs
(
common_attn_metadata
,
num_rejected_tokens
.
cpu
())
cu_num_tokens
,
token_indices
=
EagleProposer
.
prepare_inputs
(
cu_target_query_lens
,
num_rejected_tokens
,
num_tokens
)
assert
torch
.
equal
(
cu_num_tokens
,
expected_cu_num_tokens
)
assert
torch
.
equal
(
updated_metadata
.
query_start_loc
,
expected_cu_num_tokens
)
assert
token_indices
.
shape
[
0
]
==
expected_cu_num_tokens
[
-
1
].
item
()
assert
torch
.
equal
(
token_indices
,
expected_token_indices
)
...
...
@@ -209,6 +217,7 @@ def test_propose(num_speculative_tokens):
seq_len_2
=
3
total_tokens
=
seq_len_1
+
seq_len_2
vocab_size
=
100
seq_lens
=
[
seq_len_1
,
seq_len_2
]
# Create proposer first so we can use its actual hidden_size
proposer
=
_create_proposer
(
"eagle"
,
num_speculative_tokens
)
...
...
@@ -270,9 +279,16 @@ def test_propose(num_speculative_tokens):
proposer
.
attn_layer_names
=
[
"layer.0"
]
# Create input tensors
cu_num_tokens
=
torch
.
tensor
([
0
,
seq_len_1
,
total_tokens
],
dtype
=
torch
.
int32
,
device
=
device
)
batch_spec
=
BatchSpec
(
seq_lens
=
seq_lens
,
query_lens
=
seq_lens
,
)
common_attn_metadata
=
create_common_attn_metadata
(
batch_spec
,
block_size
=
16
,
device
=
device
,
)
target_token_ids
=
torch
.
randint
(
0
,
vocab_size
,
(
total_tokens
,
),
...
...
@@ -284,25 +300,29 @@ def test_propose(num_speculative_tokens):
target_hidden_states
=
torch
.
randn
(
total_tokens
,
hidden_size
,
device
=
device
)
target_slot_mapping
=
torch
.
randint
(
0
,
100
,
(
total_tokens
,
),
device
=
device
)
next_token_ids
=
torch
.
randint
(
0
,
vocab_size
,
(
batch_size
,
),
dtype
=
torch
.
int32
,
device
=
device
)
block_table
=
torch
.
randint
(
0
,
10
,
(
batch_size
,
10
),
device
=
device
)
sampling_metadata
=
mock
.
MagicMock
()
# Call the method under test
attn_metadata_builder_cls
,
_
=
get_attention_backend
(
_Backend
.
FLASH_ATTN_VLLM_V1
)
attn_metadata_builder
=
attn_metadata_builder_cls
(
kv_cache_spec
=
create_standard_kv_cache_spec
(
proposer
.
vllm_config
),
vllm_config
=
proposer
.
vllm_config
,
device
=
device
,
)
# Mock runner for attention metadata building
proposer
.
runner
=
mock
.
MagicMock
()
proposer
.
runner
.
attn_metadata_builders
=
[
attn_metadata_builder
]
result
=
proposer
.
propose
(
target_token_ids
=
target_token_ids
,
target_positions
=
target_positions
,
target_hidden_states
=
target_hidden_states
,
target_slot_mapping
=
target_slot_mapping
,
next_token_ids
=
next_token_ids
,
cu_num_tokens
=
cu_num_tokens
,
block_table
=
block_table
,
common_attn_metadata
=
common_attn_metadata
,
sampling_metadata
=
sampling_metadata
)
assert
result
.
shape
==
(
batch_size
,
num_speculative_tokens
)
...
...
tests/v1/test_async_llm_dp.py
View file @
711aa9d5
...
...
@@ -93,8 +93,10 @@ async def test_load(output_kind: RequestOutputKind,
def
__init__
(
self
,
vllm_config
:
VllmConfig
,
engine_index
:
int
=
0
):
stats_loggers
[
engine_index
]
=
self
def
record
(
self
,
scheduler_stats
:
Optional
[
SchedulerStats
],
iteration_stats
:
Optional
[
IterationStats
]):
def
record
(
self
,
scheduler_stats
:
Optional
[
SchedulerStats
],
iteration_stats
:
Optional
[
IterationStats
],
engine_idx
:
int
=
0
):
if
iteration_stats
:
self
.
finished_req_count
+=
len
(
iteration_stats
.
finished_requests
)
...
...
tests/v1/test_external_lb_dp.py
View file @
711aa9d5
...
...
@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for external LB testing
DP_SIZE
=
int
(
os
.
getenv
(
"DP_SIZE"
,
"2"
))
# Default tensor parallel
l
size to use
# Default tensor parallel size to use
TP_SIZE
=
int
(
os
.
getenv
(
"TP_SIZE"
,
"1"
))
...
...
tests/v1/test_hybrid_lb_dp.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
import
threading
import
time
from
contextlib
import
AsyncExitStack
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
from
vllm.platforms
import
Platform
# Model used by the hybrid LB servers and requests in this module.
MODEL_NAME = "ibm-research/PowerMoE-3b"

# Number of data parallel ranks for hybrid LB testing (4 total by default;
# overridable through the DP_SIZE environment variable)
DP_SIZE = int(os.getenv("DP_SIZE", "4"))
# Default tensor parallel size to use (overridable through TP_SIZE)
TP_SIZE = int(os.getenv("TP_SIZE", "1"))

# Number of nodes (2 nodes, each with 2 DP ranks when DP_SIZE is the default)
NUM_NODES = 2
DP_SIZE_LOCAL = DP_SIZE // NUM_NODES  # 2 ranks per node with the default DP_SIZE
class HybridLBServerManager:
    """Manages hybrid data parallel vLLM server instances where each node
    runs a single logical API server that balances requests only to the
    DP engines running on that same node.

    Use as a context manager: entering starts one server per node and returns
    the list of ``(server, server_args)`` tuples; exiting stops them.
    """

    def __init__(self,
                 model_name: str,
                 dp_size: int,
                 api_server_count: int,
                 base_server_args: list,
                 dp_size_local: int = DP_SIZE_LOCAL,
                 tp_size: int = TP_SIZE):
        """Store the configuration; no server is started here.

        Args:
            model_name: model passed to every server.
            dp_size: total number of data parallel ranks.
            api_server_count: value for ``--api-server-count`` on each node.
            base_server_args: CLI arguments shared by all nodes (copied per
                node, never mutated).
            dp_size_local: data parallel ranks per node.
            tp_size: tensor parallel size per rank.
        """
        self.model_name = model_name
        self.dp_size = dp_size
        self.dp_size_local = dp_size_local
        self.tp_size = tp_size
        self.api_server_count = api_server_count
        self.base_server_args = base_server_args
        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
        self.server_threads: list[threading.Thread] = []
        self.num_nodes = dp_size // dp_size_local

    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
        """Start all server instances for hybrid LB mode.

        Returns:
            The started ``(server, server_args)`` tuples, in the order the
            start-up threads finished (not necessarily node order).

        Raises:
            Exception: if fewer servers than nodes came up. Servers that did
                start are stopped first, because ``__exit__`` is not invoked
                by the ``with`` statement when ``__enter__`` raises.
        """
        for node_id in range(self.num_nodes):
            # Create server args for this specific node
            server_args = self.base_server_args.copy()

            # Calculate start rank for this node
            start_rank = node_id * self.dp_size_local

            # Add hybrid LB specific arguments
            server_args.extend([
                "--data-parallel-size",
                str(self.dp_size),
                "--data-parallel-size-local",
                str(self.dp_size_local),
                "--data-parallel-start-rank",
                str(start_rank),
                "--data-parallel-hybrid-lb",  # Enable hybrid LB mode
                "--tensor-parallel-size",
                str(self.tp_size),
                "--port",
                str(8000 + node_id),  # Different port for each node
                "--api-server-count",
                str(self.api_server_count),
                "--data-parallel-address",
                "127.0.0.1",
                "--data-parallel-rpc-port",
                "13345",
            ])

            # Use a thread to start each server to allow parallel initialization
            def start_server(node: int, sargs: list[str]):
                try:
                    # Calculate GPU devices for this node
                    gpus_per_node = self.dp_size_local * self.tp_size
                    gpu_start = node * gpus_per_node
                    gpu_end = gpu_start + gpus_per_node

                    # Start the server
                    server = RemoteOpenAIServer(
                        self.model_name,
                        sargs,
                        auto_port=False,
                        env_dict={
                            "CUDA_VISIBLE_DEVICES":
                            ",".join(
                                str(Platform.device_id_to_physical_device_id(
                                    i)) for i in range(gpu_start, gpu_end))
                        })
                    server.__enter__()
                    print(f"Hybrid LB node {node} started successfully with "
                          f"{self.dp_size_local} local DP ranks and "
                          f"{self.api_server_count} API servers")
                    self.servers.append((server, sargs))
                except Exception as e:
                    print(f"Failed to start hybrid LB node {node}: {e}")
                    raise

            thread = threading.Thread(target=start_server,
                                      args=(node_id, server_args))
            thread.start()
            self.server_threads.append(thread)

        # Wait for all servers to start
        for thread in self.server_threads:
            thread.join()

        if len(self.servers) != self.num_nodes:
            # A failure inside a start-up thread only ends that thread, so it
            # shows up here as a missing entry. Stop whatever did start before
            # raising; otherwise those server processes would be leaked.
            self.__exit__(None, None, None)
            raise Exception("Servers failed to start")

        # Give servers additional time to fully initialize and coordinate
        time.sleep(3)

        return self.servers

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop all server instances.

        Servers are stopped last-started first; an error while stopping one
        server is printed and does not prevent stopping the others.
        """
        while self.servers:
            try:
                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                print(f"Error stopping server: {e}")
@pytest.fixture(scope="module")
def default_server_args():
    """Module-scoped fixture: base CLI arguments shared by every server."""
    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
    ]
    cli_args += ["--max-model-len", "2048"]
    cli_args += ["--max-num-seqs", "128"]
    cli_args.append("--enforce-eager")
    return cli_args
@pytest.fixture(scope="module", params=[1])  # Only 1 API server for now
def servers(request, default_server_args):
    """Module-scoped fixture yielding the running hybrid LB servers.

    The servers are started when the manager is entered and stopped when the
    fixture is torn down.
    """
    manager = HybridLBServerManager(
        model_name=MODEL_NAME,
        dp_size=DP_SIZE,
        api_server_count=request.param,
        base_server_args=default_server_args,
        dp_size_local=DP_SIZE_LOCAL,
        tp_size=TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest_asyncio.fixture
async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
    """Yield one async client per server, closing them all on teardown."""
    # Create a client for each node (each node has its own API endpoint)
    async with AsyncExitStack() as stack:
        node_clients = []
        for server, _ in servers:
            client = await stack.enter_async_context(
                server.get_async_client())
            node_clients.append(client)
        yield node_clients
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_hybrid_lb_completion(clients: list[openai.AsyncOpenAI],
                                    servers: list[tuple[RemoteOpenAIServer,
                                                        list[str]]],
                                    model_name: str) -> None:
    """Exercise non-streaming completions against every hybrid-LB node.

    Sends one request to each client, then two concurrent bursts of 25
    requests per client, and finally calls check_request_balancing for each
    server with DP_SIZE_LOCAL.
    """

    async def make_request(client: openai.AsyncOpenAI):
        """Issue one sampled completion and sanity-check the response."""
        response = await client.completions.create(model=model_name,
                                                   prompt="Hello, my name is",
                                                   max_tokens=10,
                                                   temperature=1.0)

        assert response.id is not None
        assert response.choices is not None and len(response.choices) == 1

        first_choice = response.choices[0]
        # With temperature=1.0 the output length is not fixed, so only a
        # minimum length is required.
        assert len(first_choice.text) >= 1
        # Sampling may end the generation early, so either reason is fine.
        assert first_choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        assert response.usage.completion_tokens > 0
        assert response.usage.prompt_tokens > 0
        assert response.usage.total_tokens > 0
        return response

    # Test single request to each node
    for node_idx, node_client in enumerate(clients):
        single = await make_request(node_client)
        assert single is not None
        print(f"Hybrid LB node {node_idx} handled single completion "
              "request successfully")

    # Two bursts to all nodes - each node should balance within its local DP
    # ranks. Every burst is preceded by a short pause.
    num_requests_per_node = 25
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [
            make_request(node_client) for node_client in clients
            for _ in range(num_requests_per_node)
        ]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests_per_node * len(clients)
        assert all(item is not None for item in results)

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed hybrid LB test with {len(clients)} nodes "
          f"({DP_SIZE_LOCAL} DP ranks each, "
          f"API server count: {api_server_count})")

    # Check request balancing within each node
    for node_idx, (node_server, _) in enumerate(servers):
        print(f"Checking request balancing for node {node_idx}")
        check_request_balancing(node_server, DP_SIZE_LOCAL)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_hybrid_lb_completion_streaming(clients: list[
    openai.AsyncOpenAI], servers: list[tuple[RemoteOpenAIServer, list[str]]],
                                              model_name: str) -> None:
    """Exercise streaming completions against every hybrid-LB node.

    Each request is made twice with temperature=0.0, once non-streamed and
    once streamed, and the concatenated stream must equal the non-streamed
    text. One request per client is followed by two concurrent bursts of 25
    requests per client and a per-server check_request_balancing call.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request(client: openai.AsyncOpenAI):
        """Compare a streamed completion with its non-streamed counterpart."""
        # Non-streaming request that provides the expected full output
        reference = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Streaming request with the same parameters
        stream = await client.completions.create(model=model_name,
                                                 prompt=prompt,
                                                 max_tokens=5,
                                                 temperature=0.0,
                                                 stream=True)

        pieces: list[str] = []
        finish_reason_count = 0
        last_chunk = None
        async for chunk in stream:
            pieces.append(chunk.choices[0].text)
            if chunk.choices[0].finish_reason is not None:
                finish_reason_count += 1
            # Remember the most recent chunk for the checks below
            last_chunk = chunk

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # The joined stream must reproduce the non-streamed text.
        streamed_text = "".join(pieces)
        assert streamed_text == single_output, \
            "Streamed output should match non-streamed output."
        return True  # Indicate success for this request

    # Test single request to each node
    for node_idx, node_client in enumerate(clients):
        single = await make_streaming_request(node_client)
        assert single is not None
        print(f"Hybrid LB node {node_idx} handled single streaming "
              "request successfully")

    # Two bursts of streaming requests to all nodes, each preceded by a
    # short pause.
    num_requests_per_node = 25
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [
            make_streaming_request(node_client) for node_client in clients
            for _ in range(num_requests_per_node)
        ]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests_per_node * len(clients)
        assert all(
            results), "Not all streaming requests completed successfully."

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed hybrid LB streaming test with "
          f"{len(clients)} nodes ({DP_SIZE_LOCAL} DP ranks each, "
          f"API server count: {api_server_count})")

    # Check request balancing within each node
    for node_idx, (node_server, _) in enumerate(servers):
        print(f"Checking streaming request balancing for node {node_idx}")
        check_request_balancing(node_server, DP_SIZE_LOCAL)
tests/v1/test_internal_lb_dp.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
import
threading
import
time
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.v1.test_utils
import
check_request_balancing
from
vllm.platforms
import
Platform
# Model served by every server started in this module
MODEL_NAME = "ibm-research/PowerMoE-3b"

# Number of nodes to simulate
NUM_NODES = 2

# Number of data parallel ranks for multi-node internal LB testing
# (read from the DP_SIZE environment variable, 2 when unset)
DP_SIZE = int(os.environ.get("DP_SIZE", "2"))

# Default tensor parallel size to use
# (read from the TP_SIZE environment variable, 1 when unset)
TP_SIZE = int(os.environ.get("TP_SIZE", "1"))
class MultinodeInternalLBServerManager:
    """Manages multi-node data parallel vLLM server instances for internal
    load balancer testing using --headless mode.

    One RemoteOpenAIServer is launched per simulated node. The node that
    starts at DP rank 0 is the head node: it is the only one given --port and
    --api-server-count. Every other node is launched with --headless and a
    --data-parallel-start-rank.

    Entering the context returns the started ``(server, args)`` pairs ordered
    by starting DP rank, so index 0 is always the head node.
    """

    def __init__(self,
                 model_name: str,
                 dp_size: int,
                 api_server_count: int,
                 base_server_args: list,
                 dp_per_node: int = 1,
                 tp_size: int = TP_SIZE):
        """Record the launch configuration; nothing is started here.

        Args:
            model_name: Model name handed to every RemoteOpenAIServer.
            dp_size: Total number of data parallel ranks.
            api_server_count: Value passed as --api-server-count on the head
                node.
            base_server_args: CLI arguments shared by all nodes; copied for
                each node before node-specific flags are appended.
            dp_per_node: Number of DP ranks hosted by each node.
            tp_size: Value passed as --tensor-parallel-size on every node.
        """
        self.model_name = model_name
        self.dp_size = dp_size
        self.dp_per_node = dp_per_node
        self.tp_size = tp_size
        self.api_server_count = api_server_count
        self.base_server_args = base_server_args
        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
        self.server_threads: list[threading.Thread] = []

    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
        """Start all server instances for multi-node internal LB mode.

        Returns:
            The ``(server, args)`` pairs, ordered by starting DP rank (head
            node first).

        Raises:
            RuntimeError: If fewer nodes than expected came up. Nodes that
                did start are stopped before the error is raised.
        """
        # Startup results are keyed by starting DP rank so the final order
        # does not depend on which thread happens to finish first.
        started: dict[int, tuple[RemoteOpenAIServer, list[str]]] = {}
        gpus_per_node = self.tp_size * self.dp_per_node

        def start_server(r: int, sargs: list[str]):
            # Every DP rank occupies tp_size devices, so the node starting at
            # DP rank r owns devices [r * tp_size, r * tp_size + gpus_per_node).
            # Using r directly as the offset makes nodes overlap as soon as
            # tp_size > 1.
            gpu_start = r * self.tp_size
            try:
                # Start the server
                server = RemoteOpenAIServer(
                    self.model_name,
                    sargs,
                    auto_port=False,
                    env_dict={
                        "CUDA_VISIBLE_DEVICES":
                        ",".join(
                            str(Platform.device_id_to_physical_device_id(i))
                            for i in range(gpu_start,
                                           gpu_start + gpus_per_node))
                    })
                server.__enter__()
                if r == 0:
                    print(f"Head node (rank {r}) started successfully with "
                          f"{self.api_server_count} API servers")
                else:
                    print(f"Headless node (rank {r}) started successfully")
                started[r] = (server, sargs)
            except Exception as e:
                print(f"Failed to start server rank {r}: {e}")
                raise

        for rank in range(0, self.dp_size, self.dp_per_node):
            # Create server args for this specific rank
            server_args = self.base_server_args.copy()

            if rank == 0:
                # Head node - runs API server and first DP rank
                server_args.extend([
                    "--data-parallel-size",
                    str(self.dp_size),
                    "--data-parallel-size-local",
                    str(self.dp_per_node),
                    "--tensor-parallel-size",
                    str(self.tp_size),
                    "--port",
                    "8000",  # Single endpoint for all requests
                    "--api-server-count",
                    str(self.api_server_count),
                    "--data-parallel-address",
                    "127.0.0.1",
                    "--data-parallel-rpc-port",
                    "13345",
                ])
            else:
                # Secondary nodes - run in headless mode
                server_args.extend([
                    "--headless",
                    "--data-parallel-size",
                    str(self.dp_size),
                    "--data-parallel-size-local",
                    str(self.dp_per_node),
                    "--data-parallel-start-rank",
                    str(rank),
                    "--tensor-parallel-size",
                    str(self.tp_size),
                    "--data-parallel-address",
                    "127.0.0.1",
                    "--data-parallel-rpc-port",
                    "13345",
                ])

            # Use a thread to start each server to allow parallel
            # initialization
            thread = threading.Thread(target=start_server,
                                      args=(rank, server_args))
            thread.start()
            self.server_threads.append(thread)

        # Wait for all servers to start
        for thread in self.server_threads:
            thread.join()

        # Register what came up, head node first.
        self.servers.extend(started[r] for r in sorted(started))

        if len(self.servers) != self.dp_size // self.dp_per_node:
            # __exit__ is not invoked when __enter__ raises, so stop the
            # nodes that did start instead of leaking their processes.
            self.__exit__(None, None, None)
            raise RuntimeError("Servers failed to start")

        # Give servers additional time to fully initialize and coordinate
        time.sleep(3)

        return self.servers

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop all server instances.

        Servers are stopped from the end of the list. An error while stopping
        one server is printed and the remaining servers are still stopped.
        """
        while self.servers:
            try:
                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                print(f"Error stopping server: {e}")
class APIOnlyServerManager:
    """Manages API-only server (Node 0) and headless engines server (Node 1)
    for testing separated API server and engine configuration.

    Entering the context returns two ``(server, args)`` pairs: the API-only
    server at index 0 and the headless engines server at index 1.
    """

    def __init__(self,
                 model_name: str,
                 dp_size: int,
                 api_server_count: int,
                 base_server_args: list,
                 tp_size: int = TP_SIZE):
        """Record the launch configuration; nothing is started here.

        Args:
            model_name: Model name handed to both RemoteOpenAIServer
                instances.
            dp_size: Total number of data parallel ranks; all of them are
                placed on the headless engines server.
            api_server_count: Value passed as --api-server-count on the
                API-only server.
            base_server_args: CLI arguments shared by both servers; copied
                before server-specific flags are appended.
            tp_size: Value passed as --tensor-parallel-size on both servers.
        """
        self.model_name = model_name
        self.dp_size = dp_size
        self.tp_size = tp_size
        self.api_server_count = api_server_count
        self.base_server_args = base_server_args
        self.servers: list[tuple[RemoteOpenAIServer, list[str]]] = []
        self.server_threads: list[threading.Thread] = []

    def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
        """Start API-only server and headless engines server.

        Returns:
            ``[(api_server, api_args), (engines_server, engines_args)]``.

        Raises:
            RuntimeError: If either server failed to start. A server that did
                start is stopped before the error is raised.
        """
        # Start API-only server (Node 0) - no engines, only API server
        api_server_args = self.base_server_args.copy()
        api_server_args.extend([
            "--data-parallel-size",
            str(self.dp_size),
            "--data-parallel-size-local",
            "0",  # No engines on this node
            "--tensor-parallel-size",
            str(self.tp_size),
            "--port",
            "8000",
            "--api-server-count",
            str(self.api_server_count),
            "--data-parallel-address",
            "127.0.0.1",
            "--data-parallel-rpc-port",
            "13345",
        ])

        # Start headless engines server (Node 1) - all engines, no API server
        engines_server_args = self.base_server_args.copy()
        engines_server_args.extend([
            "--headless",
            "--data-parallel-size",
            str(self.dp_size),
            "--data-parallel-size-local",
            str(self.dp_size),  # All engines on this node
            "--tensor-parallel-size",
            str(self.tp_size),
            "--data-parallel-address",
            "127.0.0.1",
            "--data-parallel-rpc-port",
            "13345",
        ])

        # Startup results go into fixed slots (0 = API server, 1 = engines)
        # so the returned order does not depend on which thread finishes
        # first.
        started: dict[int, tuple[RemoteOpenAIServer, list[str]]] = {}

        # Use threads to start both servers in parallel
        def start_api_server():
            try:
                server = RemoteOpenAIServer(
                    self.model_name,
                    api_server_args,
                    auto_port=False,
                    env_dict={})  # No GPUs needed for API-only server
                server.__enter__()
                print(f"API-only server started successfully with "
                      f"{self.api_server_count} API servers")
                started[0] = (server, api_server_args)
            except Exception as e:
                print(f"Failed to start API-only server: {e}")
                raise

        def start_engines_server():
            try:
                server = RemoteOpenAIServer(
                    self.model_name,
                    engines_server_args,
                    auto_port=False,
                    env_dict={
                        "CUDA_VISIBLE_DEVICES":
                        ",".join(
                            str(Platform.device_id_to_physical_device_id(i))
                            for i in range(self.dp_size * self.tp_size))
                    })
                server.__enter__()
                print(f"Headless engines server started successfully with "
                      f"{self.dp_size} engines")
                started[1] = (server, engines_server_args)
            except Exception as e:
                print(f"Failed to start headless engines server: {e}")
                raise

        # Start API server first
        api_thread = threading.Thread(target=start_api_server)
        api_thread.start()
        self.server_threads.append(api_thread)

        # Start engines server second
        engines_thread = threading.Thread(target=start_engines_server)
        engines_thread.start()
        self.server_threads.append(engines_thread)

        # Wait for both servers to start
        for thread in self.server_threads:
            thread.join()

        # Register what came up, API server first.
        self.servers.extend(started[slot] for slot in sorted(started))

        if len(self.servers) != 2:
            # __exit__ is not invoked when __enter__ raises, so stop the
            # server that did start instead of leaking its process.
            self.__exit__(None, None, None)
            raise RuntimeError("Both servers failed to start")

        # Give servers additional time to fully initialize and coordinate
        time.sleep(3)

        return self.servers

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop both server instances.

        Servers are stopped from the end of the list. An error while stopping
        one server is printed and the other server is still stopped.
        """
        while self.servers:
            try:
                self.servers.pop()[0].__exit__(exc_type, exc_val, exc_tb)
            except Exception as e:
                print(f"Error stopping server: {e}")
@pytest.fixture(scope="module")
def default_server_args():
    """Return the CLI arguments shared by every server started in this module."""
    args = []
    # use half precision for speed and memory savings in CI environment
    args += ["--dtype", "bfloat16"]
    args += ["--max-model-len", "2048"]
    args += ["--max-num-seqs", "128"]
    args.append("--enforce-eager")
    return args
@pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args):
    """Start the multi-node internal-LB servers once per module and
    parametrization (API server count 1 and 4) and yield the list that the
    manager returns on entry."""
    dp_per_node = DP_SIZE // NUM_NODES
    manager = MultinodeInternalLBServerManager(
        MODEL_NAME,
        DP_SIZE,
        request.param,  # api_server_count for this parametrization
        default_server_args,
        dp_per_node,
        TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest.fixture(scope="module", params=[1, 4])
def api_only_servers(request, default_server_args):
    """Start the API-only server plus headless engines server once per module
    and parametrization (API server count 1 and 4) and yield the list that
    the manager returns on entry."""
    manager = APIOnlyServerManager(
        MODEL_NAME,
        DP_SIZE,
        request.param,  # api_server_count for this parametrization
        default_server_args,
        TP_SIZE,
    )
    with manager as server_list:
        yield server_list
@pytest_asyncio.fixture
async def client(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
    """Yield an async client created from the first server in ``servers``."""
    # For internal LB, we only connect to the head node (rank 0)
    # which provides the single API endpoint
    async with servers[0][0].get_async_client() as async_client:
        yield async_client
@pytest_asyncio.fixture
async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
                                                       list[str]]]):
    """Yield an async client created from the first server in
    ``api_only_servers``."""
    # Connect to the API-only server (first server in the list)
    async with api_only_servers[0][0].get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_multinode_dp_completion(client: openai.AsyncOpenAI,
                                       servers: list[tuple[RemoteOpenAIServer,
                                                           list[str]]],
                                       model_name: str) -> None:
    """Exercise non-streaming completions through the internal-LB endpoint.

    Sends one request, then two concurrent bursts of 50 requests, and finally
    calls check_request_balancing on the first server with DP_SIZE.
    """

    async def make_request():
        """Issue one sampled completion and sanity-check the response."""
        response = await client.completions.create(model=model_name,
                                                   prompt="Hello, my name is",
                                                   max_tokens=10,
                                                   temperature=1.0)

        assert response.id is not None
        assert response.choices is not None and len(response.choices) == 1

        first_choice = response.choices[0]
        # With temperature=1.0 the output length is not fixed, so only a
        # minimum length is required.
        assert len(first_choice.text) >= 1
        # Sampling may end the generation early, so either reason is fine.
        assert first_choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        assert response.usage.completion_tokens > 0
        assert response.usage.prompt_tokens > 0
        assert response.usage.total_tokens > 0
        return response

    # Test single request
    single = await make_request()
    assert single is not None
    print(
        "Multi-node internal LB handled single completion request successfully"
    )

    # Two bursts - internal LB should distribute across DP ranks. Every burst
    # is preceded by a short pause.
    num_requests = 50
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [make_request() for _ in range(num_requests)]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests
        assert all(item is not None for item in results)

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed multi-node internal LB test with "
          f"{len(servers)} DP ranks (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(servers[0][0], DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_multinode_dp_completion_streaming(
        client: openai.AsyncOpenAI,
        servers: list[tuple[RemoteOpenAIServer, list[str]]],
        model_name: str) -> None:
    """Exercise streaming completions through the internal-LB endpoint.

    Each request is made twice with temperature=0.0, once non-streamed and
    once streamed, and the concatenated stream must equal the non-streamed
    text. One request is followed by two concurrent bursts of 50 requests and
    a check_request_balancing call on the first server with DP_SIZE.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request():
        """Compare a streamed completion with its non-streamed counterpart."""
        # Non-streaming request that provides the expected full output
        reference = await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Streaming request with the same parameters
        stream = await client.completions.create(model=model_name,
                                                 prompt=prompt,
                                                 max_tokens=5,
                                                 temperature=0.0,
                                                 stream=True)

        pieces: list[str] = []
        finish_reason_count = 0
        last_chunk = None
        async for chunk in stream:
            pieces.append(chunk.choices[0].text)
            if chunk.choices[0].finish_reason is not None:
                finish_reason_count += 1
            # Remember the most recent chunk for the checks below
            last_chunk = chunk

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # The joined stream must reproduce the non-streamed text.
        streamed_text = "".join(pieces)
        assert streamed_text == single_output, \
            "Streamed output should match non-streamed output."
        return True  # Indicate success for this request

    # Test single streaming request
    single = await make_streaming_request()
    assert single is not None
    print(
        "Multi-node internal LB handled single streaming request successfully"
    )

    # Two bursts of streaming requests - internal LB should distribute across
    # DP ranks. Every burst is preceded by a short pause.
    num_requests = 50
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [make_streaming_request() for _ in range(num_requests)]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests
        assert all(
            results), "Not all streaming requests completed successfully."

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed multi-node internal LB streaming test with "
          f"{len(servers)} DP ranks (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(servers[0][0], DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_api_only_multinode_dp_completion(
        api_only_client: openai.AsyncOpenAI,
        api_only_servers: list[tuple[RemoteOpenAIServer,
                                     list[str]]], model_name: str) -> None:
    """Test API-only server with all engines on separate headless server.

    Sends one request, then two concurrent bursts of 50 requests, and finally
    calls check_request_balancing on the first server with DP_SIZE.
    """

    async def make_request():
        """Issue one sampled completion and sanity-check the response."""
        response = await api_only_client.completions.create(
            model=model_name,
            prompt="Hello, my name is",
            max_tokens=10,
            temperature=1.0)

        assert response.id is not None
        assert response.choices is not None and len(response.choices) == 1

        first_choice = response.choices[0]
        # With temperature=1.0 the output length is not fixed, so only a
        # minimum length is required.
        assert len(first_choice.text) >= 1
        # Sampling may end the generation early, so either reason is fine.
        assert first_choice.finish_reason in ("length", "stop")

        # Token counts vary as well; they only need to be positive.
        assert response.usage.completion_tokens > 0
        assert response.usage.prompt_tokens > 0
        assert response.usage.total_tokens > 0
        return response

    # Test single request
    single = await make_request()
    assert single is not None
    print("API-only server handled single completion request successfully")

    # Two bursts - should be distributed across engines on the headless
    # server. Every burst is preceded by a short pause.
    num_requests = 50
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [make_request() for _ in range(num_requests)]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests
        assert all(item is not None for item in results)

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = api_only_servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed API-only multi-node test with {DP_SIZE} "
          f"engines on headless server (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(api_only_servers[0][0], DP_SIZE)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_api_only_multinode_dp_completion_streaming(
        api_only_client: openai.AsyncOpenAI,
        api_only_servers: list[tuple[RemoteOpenAIServer,
                                     list[str]]], model_name: str) -> None:
    """Test API-only server streaming with all engines on separate
    headless server.

    Each request is made twice with temperature=0.0, once non-streamed and
    once streamed, and the concatenated stream must equal the non-streamed
    text. One request is followed by two concurrent bursts of 50 requests and
    a check_request_balancing call on the first server with DP_SIZE.
    """
    prompt = "What is an LLM?"

    async def make_streaming_request():
        """Compare a streamed completion with its non-streamed counterpart."""
        # Non-streaming request that provides the expected full output
        reference = await api_only_client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
        )
        single_output = reference.choices[0].text

        # Streaming request with the same parameters
        stream = await api_only_client.completions.create(model=model_name,
                                                          prompt=prompt,
                                                          max_tokens=5,
                                                          temperature=0.0,
                                                          stream=True)

        pieces: list[str] = []
        finish_reason_count = 0
        last_chunk = None
        async for chunk in stream:
            pieces.append(chunk.choices[0].text)
            if chunk.choices[0].finish_reason is not None:
                finish_reason_count += 1
            # Remember the most recent chunk for the checks below
            last_chunk = chunk

        # finish reason should only return in the last block for OpenAI API
        assert finish_reason_count == 1, (
            "Finish reason should appear exactly once.")
        assert last_chunk is not None, (
            "Stream should have yielded at least one chunk.")
        assert last_chunk.choices[
            0].finish_reason == "length", "Finish reason should be 'length'."
        # The joined stream must reproduce the non-streamed text.
        streamed_text = "".join(pieces)
        assert streamed_text == single_output, \
            "Streamed output should match non-streamed output."
        return True  # Indicate success for this request

    # Test single streaming request
    single = await make_streaming_request()
    assert single is not None
    print("API-only server handled single streaming request successfully")

    # Two bursts of streaming requests - should be distributed across
    # engines. Every burst is preceded by a short pause.
    num_requests = 50
    for _burst in range(2):
        await asyncio.sleep(0.5)

        pending = [make_streaming_request() for _ in range(num_requests)]
        results = await asyncio.gather(*pending)

        assert len(results) == num_requests
        assert all(
            results), "Not all streaming requests completed successfully."

    # Recover the API server count from the first server's argument list,
    # falling back to 1 when the flag is absent (or its value is falsy).
    _, first_args = api_only_servers[0]
    count_flag = '--api-server-count'
    if first_args.count(count_flag):
        api_server_count = first_args[first_args.index(count_flag) + 1] or 1
    else:
        api_server_count = 1

    print(f"Successfully completed API-only streaming test with {DP_SIZE} "
          f"engines on headless server (API server count: {api_server_count})")

    # Check request balancing via Prometheus metrics
    check_request_balancing(api_only_servers[0][0], DP_SIZE)
Prev
1
…
18
19
20
21
22
23
24
25
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment