Unverified Commit 8d9338fa authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Chore] Rename `Processor` to `InputProcessor` (#29682)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent d40c8540
......@@ -114,7 +114,7 @@ def mock_serving_setup():
mock_engine.add_lora.reset_mock()
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
models = OpenAIServingModels(
......
......@@ -429,7 +429,7 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine)
......@@ -459,7 +459,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine)
......@@ -492,7 +492,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
# Initialize the serving chat
......@@ -537,7 +537,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
# Initialize the serving chat
......@@ -583,7 +583,7 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
# Initialize the serving chat
......@@ -629,7 +629,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine)
......@@ -662,7 +662,7 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
# Mock the generate method to return an async generator
......
......@@ -23,7 +23,7 @@ def serving() -> OpenAIServing:
model_config.max_model_len = 32768
models = Mock(spec=OpenAIServingModels)
models.model_config = model_config
models.processor = Mock()
models.input_processor = Mock()
models.io_processor = Mock()
serving = OpenAIServing(
......
......@@ -30,7 +30,7 @@ async def _async_serving_models_init() -> OpenAIServingModels:
mock_model_config = MagicMock(spec=ModelConfig)
mock_model_config.max_model_len = 2048
mock_engine_client.model_config = mock_model_config
mock_engine_client.processor = MagicMock()
mock_engine_client.input_processor = MagicMock()
mock_engine_client.io_processor = MagicMock()
serving_models = OpenAIServingModels(
......
......@@ -127,7 +127,7 @@ class TestInitializeToolSessions:
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config
engine_client.processor = MagicMock()
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()
models = MagicMock()
......@@ -213,7 +213,7 @@ class TestValidateGeneratorInput:
model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config
engine_client.processor = MagicMock()
engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock()
models = MagicMock()
......
......@@ -7,18 +7,17 @@ from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import processor as processor_mod
from vllm.v1.engine.processor import Processor
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
# Mock processor for testing
def _mk_processor(
def _mock_input_processor(
monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
) -> Processor:
) -> InputProcessor:
"""
Create a Processor instance with minimal configuration suitable for unit
tests without accessing external resources.
......@@ -36,7 +35,7 @@ def _mk_processor(
raising=True,
)
monkeypatch.setattr(
processor_mod,
input_processor_mod,
"processor_cache_from_config",
lambda vllm_config, mm_registry: None,
raising=True,
......@@ -65,11 +64,11 @@ def _mk_processor(
device_config=DeviceConfig(device="cpu"),
)
return Processor(vllm_config, tokenizer=None)
return InputProcessor(vllm_config, tokenizer=None)
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
processor = _mk_processor(monkeypatch)
input_processor = _mock_input_processor(monkeypatch)
prompt = {
"prompt": "USER: <image>\nDescribe\nASSISTANT:",
......@@ -79,7 +78,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
}
with pytest.raises(ValueError, match="must have same length as data"):
processor.process_inputs(
input_processor.process_inputs(
request_id="req-1",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
......@@ -87,7 +86,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
processor = _mk_processor(monkeypatch)
input_processor = _mock_input_processor(monkeypatch)
prompt = {
"prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
......@@ -101,7 +100,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
}
with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
processor.process_inputs(
input_processor.process_inputs(
request_id="req-2",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
......@@ -119,7 +118,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
def test_multi_modal_uuids_accepts_none_and_passes_through(
monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
):
processor = _mk_processor(
input_processor = _mock_input_processor(
monkeypatch,
mm_cache_gb=mm_cache_gb,
enable_prefix_caching=enable_prefix_caching,
......@@ -137,7 +136,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
# Monkeypatch only the bound preprocess method on this instance
monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
)
# Use a consistent two-image scenario across all configurations
......@@ -151,7 +150,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
"multi_modal_uuids": mm_uuids,
}
processor.process_inputs(
input_processor.process_inputs(
request_id="req-3",
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
......@@ -163,7 +162,9 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
# When both processor cache is 0 and prefix caching disabled, the
# processor builds overrides from request id instead of using user UUIDs.
processor = _mk_processor(monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False)
input_processor = _mock_input_processor(
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, object] = {}
......@@ -174,7 +175,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
return {"type": "token", "prompt_token_ids": [1]}
monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
)
request_id = "req-42"
......@@ -188,7 +189,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
"multi_modal_uuids": mm_uuids,
}
processor.process_inputs(
input_processor.process_inputs(
request_id=request_id,
prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(),
......
......@@ -15,7 +15,7 @@ from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.engine.input_processor import InputProcessor
class EngineClient(ABC):
......@@ -23,7 +23,7 @@ class EngineClient(ABC):
vllm_config: VllmConfig
model_config: ModelConfig
processor: Processor
input_processor: InputProcessor
io_processor: IOProcessor | None
@property
......
......@@ -347,7 +347,7 @@ class LLM:
self.supported_tasks = supported_tasks
self.model_config = self.llm_engine.model_config
self.processor = self.llm_engine.processor
self.input_processor = self.llm_engine.input_processor
self.io_processor = self.llm_engine.io_processor
def get_tokenizer(self) -> AnyTokenizer:
......@@ -364,7 +364,7 @@ class LLM:
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
self.llm_engine.reset_mm_cache()
def get_default_sampling_params(self) -> SamplingParams:
......@@ -1674,7 +1674,7 @@ class LLM:
tokenization_kwargs,
)
engine_request = self.processor.process_inputs(
engine_request = self.input_processor.process_inputs(
request_id,
engine_prompt,
params,
......
......@@ -284,7 +284,7 @@ class OpenAIServing:
self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack
self.processor = self.models.processor
self.input_processor = self.models.input_processor
self.io_processor = self.models.io_processor
self.model_config = self.models.model_config
self.max_model_len = self.model_config.max_model_len
......@@ -330,7 +330,7 @@ class OpenAIServing:
return parser
async def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
await self.engine_client.reset_mm_cache()
async def beam_search(
......@@ -348,8 +348,8 @@ class OpenAIServing:
length_penalty = params.length_penalty
include_stop_str_in_output = params.include_stop_str_in_output
processor = self.processor
tokenizer = processor.tokenizer
input_processor = self.input_processor
tokenizer = input_processor.tokenizer
if tokenizer is None:
raise ValueError(
"You cannot use beam search when `skip_tokenizer_init` is True"
......@@ -1214,7 +1214,7 @@ class OpenAIServing:
self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs
)
engine_request = self.processor.process_inputs(
engine_request = self.input_processor.process_inputs(
request_id,
engine_prompt,
params,
......
......@@ -69,7 +69,7 @@ class OpenAIServingModels:
)
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)
self.processor = self.engine_client.processor
self.input_processor = self.engine_client.input_processor
self.io_processor = self.engine_client.io_processor
self.model_config = self.engine_client.model_config
self.max_model_len = self.model_config.max_model_len
......
......@@ -10,6 +10,7 @@ from typing import Any, cast
import numpy as np
import torch
from typing_extensions import deprecated
import vllm.envs as envs
from vllm.config import VllmConfig
......@@ -35,9 +36,9 @@ from vllm.utils.math_utils import cdiv
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor import Executor
from vllm.v1.metrics.loggers import (
StatLoggerFactory,
......@@ -112,7 +113,7 @@ class AsyncLLM(EngineClient):
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.model_config.io_processor_plugin,
......@@ -193,6 +194,14 @@ class AsyncLLM(EngineClient):
else:
self.profiler = None
@property
@deprecated(
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
"The old name will be removed in v0.13."
)
def processor(self):
return self.input_processor
@classmethod
def from_vllm_config(
cls,
......@@ -293,11 +302,7 @@ class AsyncLLM(EngineClient):
request = prompt
else:
assert prompt_text is None
logger.warning_once(
"Processor has been moved under OpenAIServing and will "
"be removed from AsyncLLM in v0.13."
)
request = self.processor.process_inputs(
request = self.input_processor.process_inputs(
request_id,
prompt,
params,
......@@ -481,7 +486,7 @@ class AsyncLLM(EngineClient):
output_processor = self.output_processor
log_stats = self.log_stats
logger_manager = self.logger_manager
processor = self.processor
input_processor = self.input_processor
async def output_handler():
try:
......@@ -532,7 +537,7 @@ class AsyncLLM(EngineClient):
engine_idx=outputs.engine_index,
scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats,
mm_cache_stats=processor.stat_mm_cache(),
mm_cache_stats=input_processor.stat_mm_cache(),
)
except Exception as e:
logger.exception("AsyncLLM output_handler failed.")
......@@ -699,11 +704,11 @@ class AsyncLLM(EngineClient):
@property
def tokenizer(self) -> AnyTokenizer | None:
return self.processor.tokenizer
return self.input_processor.tokenizer
@tokenizer.setter
def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
self.processor.tokenizer = tokenizer
self.input_processor.tokenizer = tokenizer
async def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None:
......@@ -738,7 +743,7 @@ class AsyncLLM(EngineClient):
await asyncio.gather(*coros)
async def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
await self.engine_core.reset_mm_cache_async()
async def reset_prefix_cache(self) -> None:
......
This diff is collapsed.
......@@ -7,7 +7,7 @@ from copy import copy
from typing import Any, cast
import torch.nn as nn
from typing_extensions import TypeVar
from typing_extensions import TypeVar, deprecated
import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig
......@@ -28,9 +28,9 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
......@@ -88,7 +88,7 @@ class LLMEngine:
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.model_config.io_processor_plugin,
......@@ -135,6 +135,14 @@ class LLMEngine:
# Don't keep the dummy data in memory
self.reset_mm_cache()
@property
@deprecated(
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
"The old name will be removed in v0.13."
)
def processor(self):
return self.input_processor
@classmethod
def from_vllm_config(
cls,
......@@ -231,11 +239,7 @@ class LLMEngine:
request = prompt
else:
assert prompt_text is None
logger.warning_once(
"Processor has been moved under LLM and will "
"be removed from LLMEngine in v0.13."
)
request = self.processor.process_inputs(
request = self.input_processor.process_inputs(
request_id,
prompt,
params,
......@@ -307,7 +311,7 @@ class LLMEngine:
self.logger_manager.record(
scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats,
mm_cache_stats=self.processor.stat_mm_cache(),
mm_cache_stats=self.input_processor.stat_mm_cache(),
)
self.do_log_stats_with_interval()
......@@ -320,7 +324,7 @@ class LLMEngine:
self.engine_core.profile(False)
def reset_mm_cache(self):
self.processor.clear_mm_cache()
self.input_processor.clear_mm_cache()
self.engine_core.reset_mm_cache()
def reset_prefix_cache(self):
......@@ -347,11 +351,11 @@ class LLMEngine:
@property
def tokenizer(self) -> AnyTokenizer | None:
return self.processor.tokenizer
return self.input_processor.tokenizer
@tokenizer.setter
def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
self.processor.tokenizer = tokenizer
self.input_processor.tokenizer = tokenizer
def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None:
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment