"vscode:/vscode.git/clone" did not exist on "fca0956ac1fbaae87284169b92d078de67b46fb5"
Unverified Commit 8d9338fa authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Chore] Rename `Processor` to `InputProcessor` (#29682)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent d40c8540
...@@ -114,7 +114,7 @@ def mock_serving_setup(): ...@@ -114,7 +114,7 @@ def mock_serving_setup():
mock_engine.add_lora.reset_mock() mock_engine.add_lora.reset_mock()
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
models = OpenAIServingModels( models = OpenAIServingModels(
......
...@@ -429,7 +429,7 @@ async def test_serving_chat_returns_correct_model_name(): ...@@ -429,7 +429,7 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine) serving_chat = _build_serving_chat(mock_engine)
...@@ -459,7 +459,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -459,7 +459,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine) serving_chat = _build_serving_chat(mock_engine)
...@@ -492,7 +492,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -492,7 +492,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
# Initialize the serving chat # Initialize the serving chat
...@@ -537,7 +537,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -537,7 +537,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
# Initialize the serving chat # Initialize the serving chat
...@@ -583,7 +583,7 @@ async def test_serving_chat_could_load_correct_generation_config(): ...@@ -583,7 +583,7 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
# Initialize the serving chat # Initialize the serving chat
...@@ -629,7 +629,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): ...@@ -629,7 +629,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = mock_model_config mock_engine.model_config = mock_model_config
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
serving_chat = _build_serving_chat(mock_engine) serving_chat = _build_serving_chat(mock_engine)
...@@ -662,7 +662,7 @@ async def test_serving_chat_data_parallel_rank_extraction(): ...@@ -662,7 +662,7 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
mock_engine.model_config = MockModelConfig() mock_engine.model_config = MockModelConfig()
mock_engine.processor = MagicMock() mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock() mock_engine.io_processor = MagicMock()
# Mock the generate method to return an async generator # Mock the generate method to return an async generator
......
...@@ -23,7 +23,7 @@ def serving() -> OpenAIServing: ...@@ -23,7 +23,7 @@ def serving() -> OpenAIServing:
model_config.max_model_len = 32768 model_config.max_model_len = 32768
models = Mock(spec=OpenAIServingModels) models = Mock(spec=OpenAIServingModels)
models.model_config = model_config models.model_config = model_config
models.processor = Mock() models.input_processor = Mock()
models.io_processor = Mock() models.io_processor = Mock()
serving = OpenAIServing( serving = OpenAIServing(
......
...@@ -30,7 +30,7 @@ async def _async_serving_models_init() -> OpenAIServingModels: ...@@ -30,7 +30,7 @@ async def _async_serving_models_init() -> OpenAIServingModels:
mock_model_config = MagicMock(spec=ModelConfig) mock_model_config = MagicMock(spec=ModelConfig)
mock_model_config.max_model_len = 2048 mock_model_config.max_model_len = 2048
mock_engine_client.model_config = mock_model_config mock_engine_client.model_config = mock_model_config
mock_engine_client.processor = MagicMock() mock_engine_client.input_processor = MagicMock()
mock_engine_client.io_processor = MagicMock() mock_engine_client.io_processor = MagicMock()
serving_models = OpenAIServingModels( serving_models = OpenAIServingModels(
......
...@@ -127,7 +127,7 @@ class TestInitializeToolSessions: ...@@ -127,7 +127,7 @@ class TestInitializeToolSessions:
model_config.get_diff_sampling_param.return_value = {} model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config engine_client.model_config = model_config
engine_client.processor = MagicMock() engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock() engine_client.io_processor = MagicMock()
models = MagicMock() models = MagicMock()
...@@ -213,7 +213,7 @@ class TestValidateGeneratorInput: ...@@ -213,7 +213,7 @@ class TestValidateGeneratorInput:
model_config.get_diff_sampling_param.return_value = {} model_config.get_diff_sampling_param.return_value = {}
engine_client.model_config = model_config engine_client.model_config = model_config
engine_client.processor = MagicMock() engine_client.input_processor = MagicMock()
engine_client.io_processor = MagicMock() engine_client.io_processor = MagicMock()
models = MagicMock() models = MagicMock()
......
...@@ -7,18 +7,17 @@ from vllm.assets.image import ImageAsset ...@@ -7,18 +7,17 @@ from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.v1.engine import processor as processor_mod from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.processor import Processor from vllm.v1.engine.input_processor import InputProcessor
cherry_pil_image = ImageAsset("cherry_blossom").pil_image cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
# Mock processor for testing def _mock_input_processor(
def _mk_processor(
monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
) -> Processor: ) -> InputProcessor:
""" """
Create a Processor instance with minimal configuration suitable for unit Create a Processor instance with minimal configuration suitable for unit
tests without accessing external resources. tests without accessing external resources.
...@@ -36,7 +35,7 @@ def _mk_processor( ...@@ -36,7 +35,7 @@ def _mk_processor(
raising=True, raising=True,
) )
monkeypatch.setattr( monkeypatch.setattr(
processor_mod, input_processor_mod,
"processor_cache_from_config", "processor_cache_from_config",
lambda vllm_config, mm_registry: None, lambda vllm_config, mm_registry: None,
raising=True, raising=True,
...@@ -65,11 +64,11 @@ def _mk_processor( ...@@ -65,11 +64,11 @@ def _mk_processor(
device_config=DeviceConfig(device="cpu"), device_config=DeviceConfig(device="cpu"),
) )
return Processor(vllm_config, tokenizer=None) return InputProcessor(vllm_config, tokenizer=None)
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
processor = _mk_processor(monkeypatch) input_processor = _mock_input_processor(monkeypatch)
prompt = { prompt = {
"prompt": "USER: <image>\nDescribe\nASSISTANT:", "prompt": "USER: <image>\nDescribe\nASSISTANT:",
...@@ -79,7 +78,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): ...@@ -79,7 +78,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
} }
with pytest.raises(ValueError, match="must have same length as data"): with pytest.raises(ValueError, match="must have same length as data"):
processor.process_inputs( input_processor.process_inputs(
request_id="req-1", request_id="req-1",
prompt=prompt, # type: ignore[arg-type] prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(), params=SamplingParams(),
...@@ -87,7 +86,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch): ...@@ -87,7 +86,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
def test_multi_modal_uuids_missing_modality_raises(monkeypatch): def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
processor = _mk_processor(monkeypatch) input_processor = _mock_input_processor(monkeypatch)
prompt = { prompt = {
"prompt": "USER: <image><video>\nDescribe\nASSISTANT:", "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
...@@ -101,7 +100,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch): ...@@ -101,7 +100,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
} }
with pytest.raises(ValueError, match="must be provided if multi_modal_data"): with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
processor.process_inputs( input_processor.process_inputs(
request_id="req-2", request_id="req-2",
prompt=prompt, # type: ignore[arg-type] prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(), params=SamplingParams(),
...@@ -119,7 +118,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch): ...@@ -119,7 +118,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
def test_multi_modal_uuids_accepts_none_and_passes_through( def test_multi_modal_uuids_accepts_none_and_passes_through(
monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
): ):
processor = _mk_processor( input_processor = _mock_input_processor(
monkeypatch, monkeypatch,
mm_cache_gb=mm_cache_gb, mm_cache_gb=mm_cache_gb,
enable_prefix_caching=enable_prefix_caching, enable_prefix_caching=enable_prefix_caching,
...@@ -137,7 +136,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( ...@@ -137,7 +136,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
# Monkeypatch only the bound preprocess method on this instance # Monkeypatch only the bound preprocess method on this instance
monkeypatch.setattr( monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
) )
# Use a consistent two-image scenario across all configurations # Use a consistent two-image scenario across all configurations
...@@ -151,7 +150,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( ...@@ -151,7 +150,7 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
"multi_modal_uuids": mm_uuids, "multi_modal_uuids": mm_uuids,
} }
processor.process_inputs( input_processor.process_inputs(
request_id="req-3", request_id="req-3",
prompt=prompt, # type: ignore[arg-type] prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(), params=SamplingParams(),
...@@ -163,7 +162,9 @@ def test_multi_modal_uuids_accepts_none_and_passes_through( ...@@ -163,7 +162,9 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
# When both processor cache is 0 and prefix caching disabled, the # When both processor cache is 0 and prefix caching disabled, the
# processor builds overrides from request id instead of using user UUIDs. # processor builds overrides from request id instead of using user UUIDs.
processor = _mk_processor(monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False) input_processor = _mock_input_processor(
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, object] = {} captured: dict[str, object] = {}
...@@ -174,7 +175,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ...@@ -174,7 +175,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
return {"type": "token", "prompt_token_ids": [1]} return {"type": "token", "prompt_token_ids": [1]}
monkeypatch.setattr( monkeypatch.setattr(
processor.input_preprocessor, "preprocess", fake_preprocess, raising=True input_processor.input_preprocessor, "preprocess", fake_preprocess, raising=True
) )
request_id = "req-42" request_id = "req-42"
...@@ -188,7 +189,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch): ...@@ -188,7 +189,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
"multi_modal_uuids": mm_uuids, "multi_modal_uuids": mm_uuids,
} }
processor.process_inputs( input_processor.process_inputs(
request_id=request_id, request_id=request_id,
prompt=prompt, # type: ignore[arg-type] prompt=prompt, # type: ignore[arg-type]
params=SamplingParams(), params=SamplingParams(),
......
...@@ -15,7 +15,7 @@ from vllm.sampling_params import SamplingParams ...@@ -15,7 +15,7 @@ from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask from vllm.tasks import SupportedTask
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.processor import Processor from vllm.v1.engine.input_processor import InputProcessor
class EngineClient(ABC): class EngineClient(ABC):
...@@ -23,7 +23,7 @@ class EngineClient(ABC): ...@@ -23,7 +23,7 @@ class EngineClient(ABC):
vllm_config: VllmConfig vllm_config: VllmConfig
model_config: ModelConfig model_config: ModelConfig
processor: Processor input_processor: InputProcessor
io_processor: IOProcessor | None io_processor: IOProcessor | None
@property @property
......
...@@ -347,7 +347,7 @@ class LLM: ...@@ -347,7 +347,7 @@ class LLM:
self.supported_tasks = supported_tasks self.supported_tasks = supported_tasks
self.model_config = self.llm_engine.model_config self.model_config = self.llm_engine.model_config
self.processor = self.llm_engine.processor self.input_processor = self.llm_engine.input_processor
self.io_processor = self.llm_engine.io_processor self.io_processor = self.llm_engine.io_processor
def get_tokenizer(self) -> AnyTokenizer: def get_tokenizer(self) -> AnyTokenizer:
...@@ -364,7 +364,7 @@ class LLM: ...@@ -364,7 +364,7 @@ class LLM:
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
def reset_mm_cache(self) -> None: def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache() self.input_processor.clear_mm_cache()
self.llm_engine.reset_mm_cache() self.llm_engine.reset_mm_cache()
def get_default_sampling_params(self) -> SamplingParams: def get_default_sampling_params(self) -> SamplingParams:
...@@ -1674,7 +1674,7 @@ class LLM: ...@@ -1674,7 +1674,7 @@ class LLM:
tokenization_kwargs, tokenization_kwargs,
) )
engine_request = self.processor.process_inputs( engine_request = self.input_processor.process_inputs(
request_id, request_id,
engine_prompt, engine_prompt,
params, params,
......
...@@ -284,7 +284,7 @@ class OpenAIServing: ...@@ -284,7 +284,7 @@ class OpenAIServing:
self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {}
self.log_error_stack = log_error_stack self.log_error_stack = log_error_stack
self.processor = self.models.processor self.input_processor = self.models.input_processor
self.io_processor = self.models.io_processor self.io_processor = self.models.io_processor
self.model_config = self.models.model_config self.model_config = self.models.model_config
self.max_model_len = self.model_config.max_model_len self.max_model_len = self.model_config.max_model_len
...@@ -330,7 +330,7 @@ class OpenAIServing: ...@@ -330,7 +330,7 @@ class OpenAIServing:
return parser return parser
async def reset_mm_cache(self) -> None: async def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache() self.input_processor.clear_mm_cache()
await self.engine_client.reset_mm_cache() await self.engine_client.reset_mm_cache()
async def beam_search( async def beam_search(
...@@ -348,8 +348,8 @@ class OpenAIServing: ...@@ -348,8 +348,8 @@ class OpenAIServing:
length_penalty = params.length_penalty length_penalty = params.length_penalty
include_stop_str_in_output = params.include_stop_str_in_output include_stop_str_in_output = params.include_stop_str_in_output
processor = self.processor input_processor = self.input_processor
tokenizer = processor.tokenizer tokenizer = input_processor.tokenizer
if tokenizer is None: if tokenizer is None:
raise ValueError( raise ValueError(
"You cannot use beam search when `skip_tokenizer_init` is True" "You cannot use beam search when `skip_tokenizer_init` is True"
...@@ -1214,7 +1214,7 @@ class OpenAIServing: ...@@ -1214,7 +1214,7 @@ class OpenAIServing:
self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs self.max_model_len, params.truncate_prompt_tokens, tokenization_kwargs
) )
engine_request = self.processor.process_inputs( engine_request = self.input_processor.process_inputs(
request_id, request_id,
engine_prompt, engine_prompt,
params, params,
......
...@@ -69,7 +69,7 @@ class OpenAIServingModels: ...@@ -69,7 +69,7 @@ class OpenAIServingModels:
) )
self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock) self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock)
self.processor = self.engine_client.processor self.input_processor = self.engine_client.input_processor
self.io_processor = self.engine_client.io_processor self.io_processor = self.engine_client.io_processor
self.model_config = self.engine_client.model_config self.model_config = self.engine_client.model_config
self.max_model_len = self.model_config.max_model_len self.max_model_len = self.model_config.max_model_len
......
...@@ -10,6 +10,7 @@ from typing import Any, cast ...@@ -10,6 +10,7 @@ from typing import Any, cast
import numpy as np import numpy as np
import torch import torch
from typing_extensions import deprecated
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -35,9 +36,9 @@ from vllm.utils.math_utils import cdiv ...@@ -35,9 +36,9 @@ from vllm.utils.math_utils import cdiv
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector from vllm.v1.engine.output_processor import OutputProcessor, RequestOutputCollector
from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor import Executor from vllm.v1.executor import Executor
from vllm.v1.metrics.loggers import ( from vllm.v1.metrics.loggers import (
StatLoggerFactory, StatLoggerFactory,
...@@ -112,7 +113,7 @@ class AsyncLLM(EngineClient): ...@@ -112,7 +113,7 @@ class AsyncLLM(EngineClient):
else: else:
tokenizer = init_tokenizer_from_configs(self.model_config) tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer) self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor( self.io_processor = get_io_processor(
self.vllm_config, self.vllm_config,
self.model_config.io_processor_plugin, self.model_config.io_processor_plugin,
...@@ -193,6 +194,14 @@ class AsyncLLM(EngineClient): ...@@ -193,6 +194,14 @@ class AsyncLLM(EngineClient):
else: else:
self.profiler = None self.profiler = None
@property
@deprecated(
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
"The old name will be removed in v0.13."
)
def processor(self):
return self.input_processor
@classmethod @classmethod
def from_vllm_config( def from_vllm_config(
cls, cls,
...@@ -293,11 +302,7 @@ class AsyncLLM(EngineClient): ...@@ -293,11 +302,7 @@ class AsyncLLM(EngineClient):
request = prompt request = prompt
else: else:
assert prompt_text is None assert prompt_text is None
logger.warning_once( request = self.input_processor.process_inputs(
"Processor has been moved under OpenAIServing and will "
"be removed from AsyncLLM in v0.13."
)
request = self.processor.process_inputs(
request_id, request_id,
prompt, prompt,
params, params,
...@@ -481,7 +486,7 @@ class AsyncLLM(EngineClient): ...@@ -481,7 +486,7 @@ class AsyncLLM(EngineClient):
output_processor = self.output_processor output_processor = self.output_processor
log_stats = self.log_stats log_stats = self.log_stats
logger_manager = self.logger_manager logger_manager = self.logger_manager
processor = self.processor input_processor = self.input_processor
async def output_handler(): async def output_handler():
try: try:
...@@ -532,7 +537,7 @@ class AsyncLLM(EngineClient): ...@@ -532,7 +537,7 @@ class AsyncLLM(EngineClient):
engine_idx=outputs.engine_index, engine_idx=outputs.engine_index,
scheduler_stats=outputs.scheduler_stats, scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats, iteration_stats=iteration_stats,
mm_cache_stats=processor.stat_mm_cache(), mm_cache_stats=input_processor.stat_mm_cache(),
) )
except Exception as e: except Exception as e:
logger.exception("AsyncLLM output_handler failed.") logger.exception("AsyncLLM output_handler failed.")
...@@ -699,11 +704,11 @@ class AsyncLLM(EngineClient): ...@@ -699,11 +704,11 @@ class AsyncLLM(EngineClient):
@property @property
def tokenizer(self) -> AnyTokenizer | None: def tokenizer(self) -> AnyTokenizer | None:
return self.processor.tokenizer return self.input_processor.tokenizer
@tokenizer.setter @tokenizer.setter
def tokenizer(self, tokenizer: AnyTokenizer | None) -> None: def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
self.processor.tokenizer = tokenizer self.input_processor.tokenizer = tokenizer
async def get_tokenizer(self) -> AnyTokenizer: async def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None: if self.tokenizer is None:
...@@ -738,7 +743,7 @@ class AsyncLLM(EngineClient): ...@@ -738,7 +743,7 @@ class AsyncLLM(EngineClient):
await asyncio.gather(*coros) await asyncio.gather(*coros)
async def reset_mm_cache(self) -> None: async def reset_mm_cache(self) -> None:
self.processor.clear_mm_cache() self.input_processor.clear_mm_cache()
await self.engine_core.reset_mm_cache_async() await self.engine_core.reset_mm_cache_async()
async def reset_prefix_cache(self) -> None: async def reset_prefix_cache(self) -> None:
......
This diff is collapsed.
...@@ -7,7 +7,7 @@ from copy import copy ...@@ -7,7 +7,7 @@ from copy import copy
from typing import Any, cast from typing import Any, cast
import torch.nn as nn import torch.nn as nn
from typing_extensions import TypeVar from typing_extensions import TypeVar, deprecated
import vllm.envs as envs import vllm.envs as envs
from vllm.config import ParallelConfig, VllmConfig from vllm.config import ParallelConfig, VllmConfig
...@@ -28,9 +28,9 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_ ...@@ -28,9 +28,9 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.input_processor import InputProcessor
from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.output_processor import OutputProcessor
from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.parallel_sampling import ParentRequest
from vllm.v1.engine.processor import Processor
from vllm.v1.executor import Executor from vllm.v1.executor import Executor
from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager
from vllm.v1.metrics.reader import Metric, get_metrics_snapshot from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
...@@ -88,7 +88,7 @@ class LLMEngine: ...@@ -88,7 +88,7 @@ class LLMEngine:
else: else:
tokenizer = init_tokenizer_from_configs(self.model_config) tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer) self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor( self.io_processor = get_io_processor(
self.vllm_config, self.vllm_config,
self.model_config.io_processor_plugin, self.model_config.io_processor_plugin,
...@@ -135,6 +135,14 @@ class LLMEngine: ...@@ -135,6 +135,14 @@ class LLMEngine:
# Don't keep the dummy data in memory # Don't keep the dummy data in memory
self.reset_mm_cache() self.reset_mm_cache()
@property
@deprecated(
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
"The old name will be removed in v0.13."
)
def processor(self):
return self.input_processor
@classmethod @classmethod
def from_vllm_config( def from_vllm_config(
cls, cls,
...@@ -231,11 +239,7 @@ class LLMEngine: ...@@ -231,11 +239,7 @@ class LLMEngine:
request = prompt request = prompt
else: else:
assert prompt_text is None assert prompt_text is None
logger.warning_once( request = self.input_processor.process_inputs(
"Processor has been moved under LLM and will "
"be removed from LLMEngine in v0.13."
)
request = self.processor.process_inputs(
request_id, request_id,
prompt, prompt,
params, params,
...@@ -307,7 +311,7 @@ class LLMEngine: ...@@ -307,7 +311,7 @@ class LLMEngine:
self.logger_manager.record( self.logger_manager.record(
scheduler_stats=outputs.scheduler_stats, scheduler_stats=outputs.scheduler_stats,
iteration_stats=iteration_stats, iteration_stats=iteration_stats,
mm_cache_stats=self.processor.stat_mm_cache(), mm_cache_stats=self.input_processor.stat_mm_cache(),
) )
self.do_log_stats_with_interval() self.do_log_stats_with_interval()
...@@ -320,7 +324,7 @@ class LLMEngine: ...@@ -320,7 +324,7 @@ class LLMEngine:
self.engine_core.profile(False) self.engine_core.profile(False)
def reset_mm_cache(self): def reset_mm_cache(self):
self.processor.clear_mm_cache() self.input_processor.clear_mm_cache()
self.engine_core.reset_mm_cache() self.engine_core.reset_mm_cache()
def reset_prefix_cache(self): def reset_prefix_cache(self):
...@@ -347,11 +351,11 @@ class LLMEngine: ...@@ -347,11 +351,11 @@ class LLMEngine:
@property @property
def tokenizer(self) -> AnyTokenizer | None: def tokenizer(self) -> AnyTokenizer | None:
return self.processor.tokenizer return self.input_processor.tokenizer
@tokenizer.setter @tokenizer.setter
def tokenizer(self, tokenizer: AnyTokenizer | None) -> None: def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
self.processor.tokenizer = tokenizer self.input_processor.tokenizer = tokenizer
def get_tokenizer(self) -> AnyTokenizer: def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None: if self.tokenizer is None:
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment