Unverified commit 4bdf7ac5, authored by Cyrus Leung and committed by GitHub
Browse files

[Bugfix] Fix SHM cache initialization (#26427)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent dc7976dd
......@@ -49,7 +49,6 @@ from openai.types.responses.response_reasoning_item import (
from openai_harmony import Message as OpenAIHarmonyMessage
from vllm import envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
......@@ -109,7 +108,6 @@ class OpenAIServingResponses(OpenAIServing):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -127,7 +125,6 @@ class OpenAIServingResponses(OpenAIServing):
) -> None:
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
......@@ -176,7 +173,7 @@ class OpenAIServingResponses(OpenAIServing):
"the store."
)
self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
self.use_harmony = self.model_config.hf_config.model_type == "gpt_oss"
if self.use_harmony:
logger.warning(
"For gpt-oss, we ignore --enable-auto-tool-choice "
......
......@@ -7,7 +7,6 @@ from typing import Any, Optional, Union
from fastapi import Request
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
......@@ -47,7 +46,6 @@ class ServingScores(OpenAIServing):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -55,7 +53,6 @@ class ServingScores(OpenAIServing):
) -> None:
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
log_error_stack=log_error_stack,
......
......@@ -6,7 +6,6 @@ from typing import Any, Final, Optional, Union
import jinja2
from fastapi import Request
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
......@@ -32,7 +31,6 @@ class OpenAIServingTokenization(OpenAIServing):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -43,7 +41,6 @@ class OpenAIServingTokenization(OpenAIServing):
) -> None:
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
log_error_stack=log_error_stack,
......
......@@ -5,7 +5,6 @@ from typing import Optional, Union
from fastapi import Request
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
......@@ -34,7 +33,6 @@ class OpenAIServingTranscription(OpenAISpeechToText):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -43,7 +41,6 @@ class OpenAIServingTranscription(OpenAISpeechToText):
):
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
......@@ -95,7 +92,6 @@ class OpenAIServingTranslation(OpenAISpeechToText):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -104,7 +100,6 @@ class OpenAIServingTranslation(OpenAISpeechToText):
):
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
......
......@@ -12,7 +12,6 @@ import numpy as np
from fastapi import Request
import vllm.envs as envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
......@@ -53,7 +52,6 @@ class OpenAISpeechToText(OpenAIServing):
def __init__(
self,
engine_client: EngineClient,
model_config: ModelConfig,
models: OpenAIServingModels,
*,
request_logger: Optional[RequestLogger],
......@@ -63,7 +61,6 @@ class OpenAISpeechToText(OpenAIServing):
):
super().__init__(
engine_client=engine_client,
model_config=model_config,
models=models,
request_logger=request_logger,
return_tokens_as_token_ids=return_tokens_as_token_ids,
......@@ -74,7 +71,7 @@ class OpenAISpeechToText(OpenAIServing):
self.task_type = task_type
self.asr_config = self.model_cls.get_speech_to_text_config(
model_config, task_type
self.model_config, task_type
)
self.max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB
......
......@@ -20,13 +20,13 @@ class TextPrompt(TypedDict):
prompt: str
"""The input text to be tokenized before passing to the model."""
multi_modal_data: NotRequired["MultiModalDataDict"]
multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs: NotRequired[dict[str, Any]]
mm_processor_kwargs: NotRequired[Optional[dict[str, Any]]]
"""
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
......@@ -61,13 +61,13 @@ class TokensPrompt(TypedDict):
token_type_ids: NotRequired[list[int]]
"""A list of token type IDs to pass to the cross encoder model."""
multi_modal_data: NotRequired["MultiModalDataDict"]
multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs: NotRequired[dict[str, Any]]
mm_processor_kwargs: NotRequired[Optional[dict[str, Any]]]
"""
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
......
......@@ -17,7 +17,7 @@ from vllm.multimodal.inputs import (
MultiModalUUIDDict,
)
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils.jsontree import json_iter_leaves
from .data import (
......@@ -45,20 +45,17 @@ class InputPreprocessor:
def __init__(
self,
model_config: ModelConfig,
tokenizer: Optional[AnyTokenizer],
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
) -> None:
super().__init__()
self.model_config = model_config
self.tokenizer = tokenizer
self.mm_registry = mm_registry
self.mm_processor_cache = mm_processor_cache
if model_config.skip_tokenizer_init:
self.tokenizer = None
else:
self.tokenizer = init_tokenizer_from_configs(model_config)
def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None:
raise ValueError(
......@@ -351,8 +348,8 @@ class InputPreprocessor:
if self.model_config.is_multimodal_model:
inputs = self._process_multimodal(
prompt_token_ids,
parsed_content.get("multi_modal_data", {}),
parsed_content.get("mm_processor_kwargs"),
parsed_content.get("multi_modal_data") or {},
parsed_content.get("mm_processor_kwargs") or {},
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
......@@ -380,8 +377,8 @@ class InputPreprocessor:
if self.model_config.is_multimodal_model:
inputs = self._process_multimodal(
prompt_text,
parsed_content.get("multi_modal_data", {}),
parsed_content.get("mm_processor_kwargs"),
parsed_content.get("multi_modal_data") or {},
parsed_content.get("mm_processor_kwargs") or {},
tokenization_kwargs=tokenization_kwargs,
mm_uuids=mm_uuids,
)
......
......@@ -12,23 +12,23 @@ import numpy as np
import torch
import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.utils import _validate_truncation_size
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
from vllm.inputs import PromptType
from vllm.inputs.preprocess import InputPreprocessor
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tracing import init_tracer
from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device, as_list, cancel_task_threadsafe, cdiv, deprecate_kwargs
from vllm.v1.engine import EngineCoreRequest
......@@ -104,8 +104,16 @@ class AsyncLLM(EngineClient):
"logger list; enabling logging without default stat loggers"
)
# Processor (converts Inputs --> EngineCoreRequests).
self.processor = Processor(vllm_config, mm_registry=mm_registry)
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.model_config.io_processor_plugin,
)
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(
......@@ -245,10 +253,6 @@ class AsyncLLM(EngineClient):
cancel_task_threadsafe(getattr(self, "output_handler", None))
@property
def tokenizer(self) -> Optional[AnyTokenizer]:
return self.processor.tokenizer
async def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
return await self.engine_core.get_supported_tasks_async()
......@@ -615,14 +619,13 @@ class AsyncLLM(EngineClient):
logger.info("Request %s failed.", request_id)
raise EngineGenerateError() from e
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config
async def get_model_config(self) -> ModelConfig:
return self.model_config
@property
def tokenizer(self) -> Optional[AnyTokenizer]:
return self.processor.tokenizer
async def get_input_preprocessor(self) -> InputPreprocessor:
return self.processor.input_preprocessor
@tokenizer.setter
def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
self.processor.tokenizer = tokenizer
async def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None:
......
......@@ -19,11 +19,12 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.outputs import PoolingRequestOutput, RequestOutput
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.tasks import SupportedTask
from vllm.tracing import init_tracer
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Device
from vllm.v1.engine import EngineCoreRequest
......@@ -95,8 +96,16 @@ class LLMEngine:
self.dp_group = None
self.should_execute_dummy_batch = False
# Processor (convert Inputs --> EngineCoreRequests)
self.processor = Processor(vllm_config, mm_registry=mm_registry)
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_configs(self.model_config)
self.processor = Processor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.model_config.io_processor_plugin,
)
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(
......@@ -204,14 +213,6 @@ class LLMEngine:
def validate_outputs(cls, outputs, output_type):
return outputs
@property
def tokenizer(self) -> Optional[AnyTokenizer]:
return self.processor.tokenizer
@tokenizer.setter
def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
self.processor.tokenizer = tokenizer
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
return self.engine_core.get_supported_tasks()
......@@ -313,12 +314,6 @@ class LLMEngine:
return processed_outputs.request_outputs
def get_vllm_config(self):
return self.vllm_config
def get_model_config(self):
return self.model_config
def start_profile(self):
self.engine_core.profile(True)
......@@ -345,6 +340,14 @@ class LLMEngine:
assert self.log_stats, "Stat logging disabled"
return get_metrics_snapshot()
@property
def tokenizer(self) -> Optional[AnyTokenizer]:
return self.processor.tokenizer
@tokenizer.setter
def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
self.processor.tokenizer = tokenizer
def get_tokenizer(self) -> AnyTokenizer:
if self.tokenizer is None:
raise ValueError(
......
......@@ -37,6 +37,7 @@ class Processor:
def __init__(
self,
vllm_config: VllmConfig,
tokenizer: Optional[AnyTokenizer],
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
self.vllm_config = vllm_config
......@@ -52,6 +53,7 @@ class Processor:
self.input_preprocessor = InputPreprocessor(
self.model_config,
tokenizer,
mm_registry,
mm_processor_cache=self.mm_processor_cache,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment