Commit 2d940766 authored by guanyu1's avatar guanyu1
Browse files

VLLM_ENCODER_CACHE_SIZE控制encoder_cache_size大小

parent 06185134
...@@ -157,6 +157,7 @@ if TYPE_CHECKING: ...@@ -157,6 +157,7 @@ if TYPE_CHECKING:
VLLM_MXFP4_USE_MARLIN: bool | None = None VLLM_MXFP4_USE_MARLIN: bool | None = None
VLLM_DEEPEPLL_NVFP4_DISPATCH: bool = False VLLM_DEEPEPLL_NVFP4_DISPATCH: bool = False
VLLM_V1_USE_OUTLINES_CACHE: bool = False VLLM_V1_USE_OUTLINES_CACHE: bool = False
VLLM_ENCODER_CACHE_SIZE: int | None = None
VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_BUCKET_PADDING_GAP: int = 0
VLLM_TPU_MOST_MODEL_LEN: int | None = None VLLM_TPU_MOST_MODEL_LEN: int | None = None
VLLM_TPU_USING_PATHWAYS: bool = False VLLM_TPU_USING_PATHWAYS: bool = False
...@@ -1925,6 +1926,8 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1925,6 +1926,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_MOE_W16A16_TRITON": "VLLM_USE_MOE_W16A16_TRITON":
lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in
("true", "1")), ("true", "1")),
"VLLM_ENCODER_CACHE_SIZE":
lambda: maybe_convert_int(os.environ.get("VLLM_ENCODER_CACHE_SIZE", None)),
#If set to 1/True, enable the V1 fast token-id copy path in InputBatch. #If set to 1/True, enable the V1 fast token-id copy path in InputBatch.
"VLLM_V1_FAST_TOKEN_ID_COPY": "VLLM_V1_FAST_TOKEN_ID_COPY":
lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in
......
...@@ -5,6 +5,7 @@ from collections import OrderedDict ...@@ -5,6 +5,7 @@ from collections import OrderedDict
from collections.abc import Mapping from collections.abc import Mapping
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import vllm.envs as envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.v1.request import Request from vllm.v1.request import Request
...@@ -15,6 +16,16 @@ if TYPE_CHECKING: ...@@ -15,6 +16,16 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
def _get_encoder_cache_size_override() -> int | None:
    """Return the encoder cache size requested via ``VLLM_ENCODER_CACHE_SIZE``.

    Reads ``envs.VLLM_ENCODER_CACHE_SIZE``. When the value is ``None`` no
    override is requested and ``None`` is returned. Otherwise the value is
    reported through ``logger.info_once`` and returned unchanged.

    Returns:
        The override value, or ``None`` when no override is configured.
    """
    override = envs.VLLM_ENCODER_CACHE_SIZE
    if override is None:
        # Nothing configured: callers fall back to their own sizing logic.
        return None

    logger.info_once(
        "Using VLLM_ENCODER_CACHE_SIZE=%d to override encoder cache size.",
        override,
    )
    return override
class EncoderCacheManager: class EncoderCacheManager:
"""Manages caching of encoder outputs for multimodal models in vLLM V1. """Manages caching of encoder outputs for multimodal models in vLLM V1.
...@@ -342,6 +353,10 @@ def compute_mm_encoder_budget( ...@@ -342,6 +353,10 @@ def compute_mm_encoder_budget(
encoder_compute_budget = max( encoder_compute_budget = max(
scheduler_config.max_num_encoder_input_tokens, max_tokens_per_mm_item scheduler_config.max_num_encoder_input_tokens, max_tokens_per_mm_item
) )
encoder_cache_size_override = _get_encoder_cache_size_override()
if encoder_cache_size_override is not None:
encoder_cache_size = encoder_cache_size_override
else:
encoder_cache_size = max( encoder_cache_size = max(
scheduler_config.encoder_cache_size, max_tokens_per_mm_item scheduler_config.encoder_cache_size, max_tokens_per_mm_item
) )
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment