Unverified commit e21d7687, authored by 陈序 and committed by GitHub
Browse files

Fix hanging when prompt exceeds limit (#1029)

parent ff36139f
@@ -175,7 +175,7 @@ class Scheduler:
                 num_curr_seqs += num_new_seqs
                 scheduled.append(seq_group)
 
-            if scheduled:
+            if scheduled or ignored_seq_groups:
                 scheduler_outputs = SchedulerOutputs(
                     scheduled_seq_groups=scheduled,
                     prompt_run=True,
......
@@ -294,14 +294,12 @@ class LLMEngine:
     def _schedule(
         self
     ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs,
-               Optional[List[RequestOutput]]]:
+               List[RequestOutput]]:
         seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
-        if scheduler_outputs.is_empty():
-            return seq_group_metadata_list, scheduler_outputs, [
-                RequestOutput.from_seq_group(seq_group)
-                for seq_group in scheduler_outputs.ignored_seq_groups
-            ]
-        return seq_group_metadata_list, scheduler_outputs, None
+        return seq_group_metadata_list, scheduler_outputs, [
+            RequestOutput.from_seq_group(seq_group)
+            for seq_group in scheduler_outputs.ignored_seq_groups
+        ]
 
     def _check_beam_search_early_stopping(
         self,
@@ -545,10 +543,9 @@ class LLMEngine:
         and updates the scheduler with the model outputs. Finally, it decodes
         the sequences and returns the newly generated results.
         """
-        (seq_group_metadata_list, scheduler_outputs,
-         early_return) = self._schedule()
-        if early_return is not None:
-            return early_return
+        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
+        if scheduler_outputs.is_empty():
+            return ignored
 
         # Execute the model.
         output = self._run_workers(
@@ -559,7 +556,7 @@ class LLMEngine:
             blocks_to_copy=scheduler_outputs.blocks_to_copy,
         )
 
-        return self._process_model_outputs(output, scheduler_outputs)
+        return self._process_model_outputs(output, scheduler_outputs) + ignored
 
     def _log_system_stats(
         self,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment