Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
3f57b00a
Unverified
Commit
3f57b00a
authored
Apr 21, 2025
by
Yongtong Wu
Committed by
GitHub
Apr 21, 2025
Browse files
Support PD bootstrap fields on /v1/chat/completions endpoint (#5488)
parent
453d412c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
48 additions
and
6 deletions
+48
-6
python/sglang/srt/disaggregation/mini_lb.py
python/sglang/srt/disaggregation/mini_lb.py
+42
-6
python/sglang/srt/openai_api/adapter.py
python/sglang/srt/openai_api/adapter.py
+2
-0
python/sglang/srt/openai_api/protocol.py
python/sglang/srt/openai_api/protocol.py
+4
-0
No files found.
python/sglang/srt/disaggregation/mini_lb.py
View file @
3f57b00a
...
@@ -23,8 +23,9 @@ class MiniLoadBalancer:
...
@@ -23,8 +23,9 @@ class MiniLoadBalancer:
return
random
.
choice
(
self
.
prefill_servers
),
random
.
choice
(
self
.
decode_servers
)
return
random
.
choice
(
self
.
prefill_servers
),
random
.
choice
(
self
.
decode_servers
)
async
def
generate
(
async
def
generate
(
self
,
modified_request
,
prefill_server
,
decode_server
self
,
modified_request
,
prefill_server
,
decode_server
,
endpoint
)
->
ORJSONResponse
:
)
->
ORJSONResponse
:
assert
endpoint
[
0
]
!=
"/"
,
f
"Endpoint should not start with '/':
{
endpoint
}
"
async
with
aiohttp
.
ClientSession
(
async
with
aiohttp
.
ClientSession
(
timeout
=
aiohttp
.
ClientTimeout
(
timeout
=
aiohttp
.
ClientTimeout
(
...
@@ -32,8 +33,8 @@ class MiniLoadBalancer:
...
@@ -32,8 +33,8 @@ class MiniLoadBalancer:
)
# Add timeout for request reliability
)
# Add timeout for request reliability
)
as
session
:
)
as
session
:
tasks
=
[
tasks
=
[
session
.
post
(
f
"
{
prefill_server
}
/
g
en
erate
"
,
json
=
modified_request
),
session
.
post
(
f
"
{
prefill_server
}
/
{
en
dpoint
}
"
,
json
=
modified_request
),
session
.
post
(
f
"
{
decode_server
}
/
g
en
erate
"
,
json
=
modified_request
),
session
.
post
(
f
"
{
decode_server
}
/
{
en
dpoint
}
"
,
json
=
modified_request
),
]
]
# Wait for both responses to complete. Prefill should end first.
# Wait for both responses to complete. Prefill should end first.
prefill_response
,
decode_response
=
await
asyncio
.
gather
(
*
tasks
)
prefill_response
,
decode_response
=
await
asyncio
.
gather
(
*
tasks
)
...
@@ -43,7 +44,11 @@ class MiniLoadBalancer:
...
@@ -43,7 +44,11 @@ class MiniLoadBalancer:
status_code
=
decode_response
.
status
,
status_code
=
decode_response
.
status
,
)
)
async
def
generate_stream
(
self
,
modified_request
,
prefill_server
,
decode_server
):
async
def
generate_stream
(
self
,
modified_request
,
prefill_server
,
decode_server
,
endpoint
=
"generate"
):
assert
endpoint
[
0
]
!=
"/"
,
f
"Endpoint should not start with '/':
{
endpoint
}
"
async
def
stream_results
():
async
def
stream_results
():
async
with
aiohttp
.
ClientSession
(
async
with
aiohttp
.
ClientSession
(
timeout
=
aiohttp
.
ClientTimeout
(
timeout
=
aiohttp
.
ClientTimeout
(
...
@@ -54,10 +59,10 @@ class MiniLoadBalancer:
...
@@ -54,10 +59,10 @@ class MiniLoadBalancer:
# Create the tasks for both prefill and decode requests
# Create the tasks for both prefill and decode requests
tasks
=
[
tasks
=
[
session
.
post
(
session
.
post
(
f
"
{
prefill_server
}
/
g
en
erate
"
,
json
=
modified_request
f
"
{
prefill_server
}
/
{
en
dpoint
}
"
,
json
=
modified_request
),
),
session
.
post
(
session
.
post
(
f
"
{
decode_server
}
/
g
en
erate
"
,
json
=
modified_request
f
"
{
decode_server
}
/
{
en
dpoint
}
"
,
json
=
modified_request
),
),
]
]
# Wait for both responses to complete. Since this is streaming, they return immediately.
# Wait for both responses to complete. Since this is streaming, they return immediately.
...
@@ -190,6 +195,37 @@ async def handle_generate_request(request_data: dict):
...
@@ -190,6 +195,37 @@ async def handle_generate_request(request_data: dict):
)
)
@app.post("/v1/chat/completions")
async def handle_completion_request(request_data: dict):
    """Proxy an OpenAI-style chat-completions request through PD disaggregation.

    Selects one prefill server and one decode server, injects the PD
    bootstrap fields (``bootstrap_host`` and ``bootstrap_room``) into a copy
    of the request payload, and forwards it to both servers via the load
    balancer — streaming or non-streaming depending on the request's
    ``stream`` flag.

    Args:
        request_data: The raw JSON body of the /v1/chat/completions request.

    Returns:
        The response produced by ``load_balancer.generate_stream`` or
        ``load_balancer.generate``.
    """
    prefill_server, decode_server = load_balancer.select_pair()

    # Parse and transform prefill_server for bootstrap data: only the
    # hostname is forwarded, since the decode side pairs with the prefill
    # host using this value plus the room id.
    parsed_url = urllib.parse.urlparse(prefill_server)
    hostname = parsed_url.hostname

    # Copy so the caller's dict is not mutated.
    modified_request = request_data.copy()
    modified_request.update(
        {
            "bootstrap_host": hostname,
            # Use the shared helper rather than duplicating the
            # random.randint(0, 2**63 - 1) call inline, so every endpoint
            # generates room ids the same way.
            "bootstrap_room": _generate_bootstrap_room(),
        }
    )

    if request_data.get("stream", False):
        return await load_balancer.generate_stream(
            modified_request,
            prefill_server,
            decode_server,
            endpoint="v1/chat/completions",
        )
    else:
        return await load_balancer.generate(
            modified_request,
            prefill_server,
            decode_server,
            endpoint="v1/chat/completions",
        )
def
_generate_bootstrap_room
():
def
_generate_bootstrap_room
():
return
random
.
randint
(
0
,
2
**
63
-
1
)
return
random
.
randint
(
0
,
2
**
63
-
1
)
...
...
python/sglang/srt/openai_api/adapter.py
View file @
3f57b00a
...
@@ -1174,6 +1174,8 @@ def v1_chat_generate_request(
...
@@ -1174,6 +1174,8 @@ def v1_chat_generate_request(
rid
=
request_ids
,
rid
=
request_ids
,
modalities
=
modalities_list
,
modalities
=
modalities_list
,
lora_path
=
lora_paths
,
lora_path
=
lora_paths
,
bootstrap_host
=
all_requests
[
0
].
bootstrap_host
,
bootstrap_room
=
all_requests
[
0
].
bootstrap_room
,
)
)
return
adapted_request
,
all_requests
if
len
(
all_requests
)
>
1
else
all_requests
[
0
]
return
adapted_request
,
all_requests
if
len
(
all_requests
)
>
1
else
all_requests
[
0
]
...
...
python/sglang/srt/openai_api/protocol.py
View file @
3f57b00a
...
@@ -362,6 +362,10 @@ class ChatCompletionRequest(BaseModel):
...
@@ -362,6 +362,10 @@ class ChatCompletionRequest(BaseModel):
separate_reasoning
:
bool
=
True
separate_reasoning
:
bool
=
True
stream_reasoning
:
bool
=
True
stream_reasoning
:
bool
=
True
# For PD disaggregation
bootstrap_host
:
Optional
[
str
]
=
None
bootstrap_room
:
Optional
[
int
]
=
None
class
FunctionResponse
(
BaseModel
):
class
FunctionResponse
(
BaseModel
):
"""Function response."""
"""Function response."""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment