Commit e70b0ea0 authored by zhuwenwen
Browse files

debug and fix error

parent 1555157e
......@@ -189,6 +189,7 @@ class Worker(WorkerBase):
# Construct the model runner
if envs.VLLM_ZERO_OVERHEAD:
logger.info('use zero overhead model_runner')
self.model_runner: GPUModelRunner = V1ZeroModelRunner(
self.vllm_config, self.device)
else:
......
......@@ -175,6 +175,8 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
# expensive operations inside the loop.
for request in scheduler.running:
if request.is_finished():
if req_id in requsets_valid_token_len:
requsets_valid_token_len.pop(req_id)
continue
req_id = request.request_id
num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0)
......@@ -209,6 +211,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
if request.has_encoder_inputs:
scheduler._free_encoder_inputs(request)
stopped = False
new_logprobs = None
new_token_ids = generated_token_ids
kv_transfer_params = None
......@@ -290,7 +293,10 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
num_cached_tokens=request.num_cached_tokens,
))
if not stopped:
if stopped:
if req_id in requsets_valid_token_len:
requsets_valid_token_len.pop(req_id)
else:
new_running.append(request)
scheduler.running = new_running
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment