Commit cb7d90a7 authored by jujl1's avatar jujl1
Browse files

fix: 修复pp资源抢占bug

parent 3a58da2c
...@@ -281,20 +281,27 @@ class Scheduler(SchedulerInterface): ...@@ -281,20 +281,27 @@ class Scheduler(SchedulerInterface):
num_draft_tokens=num_draft_tokens, num_draft_tokens=num_draft_tokens,
num_lookahead_tokens=self.num_lookahead_tokens) num_lookahead_tokens=self.num_lookahead_tokens)
if new_blocks is None: if new_blocks is None:
# The request cannot be scheduled. if new_blocks is None:
# Preempt the lowest-priority request. if self.use_pp:
if self.policy == SchedulingPolicy.PRIORITY: preemptable_reqs = [r for r in self.running if
preempted_req = max( r.num_tokens_with_spec != r.num_computed_tokens]
self.running, else:
key=lambda r: (r.priority, r.arrival_time), preemptable_reqs = self.running
) # The request cannot be scheduled.
# Preempt the lowest-priority request.
if self.policy == SchedulingPolicy.PRIORITY:
preempted_req = max(
preemptable_reqs,
key=lambda r: (r.priority, r.arrival_time),
)
else:
preempted_req = preemptable_reqs[-1]
self.running.remove(preempted_req) self.running.remove(preempted_req)
else:
preempted_req = self.running.pop()
self.kv_cache_manager.free(preempted_req) self.kv_cache_manager.free(preempted_req)
preempted_req.status = RequestStatus.PREEMPTED preempted_req.status = RequestStatus.PREEMPTED
preempted_req.num_computed_tokens = 0 preempted_req.num_computed_tokens = 0
preempted_req.spec_token_ids = []
if self.log_stats: if self.log_stats:
preempted_req.record_event( preempted_req.record_event(
EngineCoreEventType.PREEMPTED, scheduled_timestamp) EngineCoreEventType.PREEMPTED, scheduled_timestamp)
...@@ -901,20 +908,26 @@ class Scheduler(SchedulerInterface): ...@@ -901,20 +908,26 @@ class Scheduler(SchedulerInterface):
num_draft_tokens=num_draft_tokens, num_draft_tokens=num_draft_tokens,
num_lookahead_tokens=self.num_lookahead_tokens) num_lookahead_tokens=self.num_lookahead_tokens)
if new_blocks is None: if new_blocks is None:
if self.use_pp:
preemptable_reqs = [r for r in self.running if
r.num_tokens_with_spec != r.num_computed_tokens]
else:
preemptable_reqs = self.running
# The request cannot be scheduled. # The request cannot be scheduled.
# Preempt the lowest-priority request. # Preempt the lowest-priority request.
if self.policy == SchedulingPolicy.PRIORITY: if self.policy == SchedulingPolicy.PRIORITY:
preempted_req = max( preempted_req = max(
self.running, preemptable_reqs,
key=lambda r: (r.priority, r.arrival_time), key=lambda r: (r.priority, r.arrival_time),
) )
self.running.remove(preempted_req)
else: else:
preempted_req = self.running.pop() preempted_req = preemptable_reqs[-1]
self.running.remove(preempted_req)
self.kv_cache_manager.free(preempted_req) self.kv_cache_manager.free(preempted_req)
preempted_req.status = RequestStatus.PREEMPTED preempted_req.status = RequestStatus.PREEMPTED
preempted_req.num_computed_tokens = 0 preempted_req.num_computed_tokens = 0
preempted_req.spec_token_ids = []
if self.log_stats: if self.log_stats:
preempted_req.record_event( preempted_req.record_event(
EngineCoreEventType.PREEMPTED, scheduled_timestamp) EngineCoreEventType.PREEMPTED, scheduled_timestamp)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment