Unverified commit 53da4cd3, authored by Li, Jiang and committed by GitHub
Browse files

[Bugfix][CPU] Fix InputBatch for pooling models in the CPU v1 (#20014)


Signed-off-by: jiang1.li <jiang1.li@intel.com>
parent 9a3b8832
......@@ -101,4 +101,4 @@ def test_prm_models(
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)
- assert torch.allclose(hf_output, vllm_output, 1e-2)
+ assert torch.allclose(hf_output, vllm_output, 1.5e-2)
......@@ -7,6 +7,7 @@ import torch
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
+ from vllm.model_executor.models.interfaces import has_step_pooler
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
logger = init_logger(__name__)
......@@ -52,6 +53,9 @@ class CPUModelRunner(GPUModelRunner):
logger.info("Starting to load model %s...", self.model_config.model)
self.model = get_model(vllm_config=self.vllm_config)
+ if has_step_pooler(self.model):
+     self.input_batch.logits_processing_needs_token_ids = True
if self.lora_config:
self.model = self.load_lora_model(self.model, self.model_config,
self.scheduler_config,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment