Unverified Commit b1c8d4e9 authored by shangmingc's avatar shangmingc Committed by GitHub
Browse files

[PD] Abort unbootstrapped prefill requests through timeout (#6685)


Signed-off-by: default avatarShangming Cai <caishangming@linux.alibaba.com>
parent c25231c6
...@@ -168,6 +168,9 @@ class MooncakeKVManager(BaseKVManager): ...@@ -168,6 +168,9 @@ class MooncakeKVManager(BaseKVManager):
min(max(1, cpu_count // 8), 8), min(max(1, cpu_count // 8), 8),
) )
) )
self.bootstrap_time_out = get_int_env_var(
"SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT", 30
)
elif self.disaggregation_mode == DisaggregationMode.DECODE: elif self.disaggregation_mode == DisaggregationMode.DECODE:
self.heartbeat_failures = {} self.heartbeat_failures = {}
self.session_pool = defaultdict(requests.Session) self.session_pool = defaultdict(requests.Session)
...@@ -351,7 +354,7 @@ class MooncakeKVManager(BaseKVManager): ...@@ -351,7 +354,7 @@ class MooncakeKVManager(BaseKVManager):
if req.mooncake_session_id in self.failed_sessions: if req.mooncake_session_id in self.failed_sessions:
self.record_failure( self.record_failure(
kv_chunk.room, kv_chunk.room,
f"Decode instance could be dead, {req.mooncake_session_id} failed due to multiple errors", f"Decode instance could be dead, remote mooncake session {req.mooncake_session_id} is not alive",
) )
self.update_status(kv_chunk.room, KVPoll.Failed) self.update_status(kv_chunk.room, KVPoll.Failed)
self.sync_status_to_decode_endpoint( self.sync_status_to_decode_endpoint(
...@@ -447,7 +450,8 @@ class MooncakeKVManager(BaseKVManager): ...@@ -447,7 +450,8 @@ class MooncakeKVManager(BaseKVManager):
kv_chunk.room not in self.request_status kv_chunk.room not in self.request_status
or self.check_status(kv_chunk.room) == KVPoll.Success or self.check_status(kv_chunk.room) == KVPoll.Success
): ):
self.transfer_infos.pop(kv_chunk.room) if kv_chunk.room in self.transfer_infos:
self.transfer_infos.pop(kv_chunk.room)
except queue.Empty: except queue.Empty:
continue continue
...@@ -630,7 +634,8 @@ class MooncakeKVManager(BaseKVManager): ...@@ -630,7 +634,8 @@ class MooncakeKVManager(BaseKVManager):
possible_affected_rooms = self.addr_to_rooms_tracker.get( possible_affected_rooms = self.addr_to_rooms_tracker.get(
failed_bootstrap_addr, [] failed_bootstrap_addr, []
) )
del self.addr_to_rooms_tracker[failed_bootstrap_addr] if failed_bootstrap_addr in self.addr_to_rooms_tracker:
del self.addr_to_rooms_tracker[failed_bootstrap_addr]
# Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed # Report the requests associated with the failed bootstrap addr and mark their status as KVPoll.Failed
affected_rooms = [] affected_rooms = []
...@@ -660,6 +665,7 @@ class MooncakeKVSender(BaseKVSender): ...@@ -660,6 +665,7 @@ class MooncakeKVSender(BaseKVSender):
self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping) self.kv_mgr.update_status(bootstrap_room, KVPoll.Bootstrapping)
self.aux_index = None self.aux_index = None
self.bootstrap_server_url = bootstrap_addr self.bootstrap_server_url = bootstrap_addr
self.init_time = time.time()
self.conclude_state = None self.conclude_state = None
# inner state # inner state
self.curr_idx = 0 self.curr_idx = 0
...@@ -694,13 +700,24 @@ class MooncakeKVSender(BaseKVSender): ...@@ -694,13 +700,24 @@ class MooncakeKVSender(BaseKVSender):
status = self.kv_mgr.check_status(self.bootstrap_room) status = self.kv_mgr.check_status(self.bootstrap_room)
if status in (KVPoll.Success, KVPoll.Failed): if status in (KVPoll.Success, KVPoll.Failed):
self.conclude_state = status self.conclude_state = status
elif status == KVPoll.Bootstrapping:
now = time.time()
elapsed = now - self.init_time
if elapsed >= self.kv_mgr.bootstrap_time_out:
self.kv_mgr.record_failure(
self.bootstrap_room,
f"Request {self.bootstrap_room} timed out after {elapsed:.1f}s in KVPoll.Bootstrapping",
)
self.conclude_state = KVPoll.Failed
return KVPoll.Failed
return status return status
else: else:
return self.conclude_state return self.conclude_state
def clear(self) -> None: def clear(self) -> None:
self.kv_mgr.request_status.pop(self.bootstrap_room) if self.bootstrap_room in self.kv_mgr.request_status:
self.kv_mgr.request_status.pop(self.bootstrap_room)
def failure_exception(self): def failure_exception(self):
self.clear() self.clear()
...@@ -956,7 +973,8 @@ class MooncakeKVReceiver(BaseKVReceiver): ...@@ -956,7 +973,8 @@ class MooncakeKVReceiver(BaseKVReceiver):
return self.conclude_state return self.conclude_state
def clear(self) -> None: def clear(self) -> None:
self.kv_mgr.request_status.pop(self.bootstrap_room) if self.bootstrap_room in self.kv_mgr.request_status:
self.kv_mgr.request_status.pop(self.bootstrap_room)
def failure_exception(self): def failure_exception(self):
self.clear() self.clear()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment