Unverified commit 16abe6b8, authored by Roger Wang, committed by GitHub
Browse files

[Misc] Set default torch num threads for input processing (#31879)


Signed-off-by: Roger Wang <hey@rogerw.io>
parent 1eb61ab3
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
# Copyright (c) 2023 OpenGVLab # Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Any, Literal, TypeAlias, TypeVar from typing import Annotated, Any, Literal, TypeAlias, TypeVar
...@@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder ...@@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.utils.torch_utils import set_default_torch_num_threads
from .interfaces import ( from .interfaces import (
MultiModalEmbeddings, MultiModalEmbeddings,
...@@ -143,19 +141,7 @@ def build_transform(input_size: int): ...@@ -143,19 +141,7 @@ def build_transform(input_size: int):
T.Normalize(mean=MEAN, std=STD), T.Normalize(mean=MEAN, std=STD),
] ]
) )
# Image transformation operations (which include tensor computations return transform
# on the CPU) can occupy a substantial number of CPU cores, introducing
# overhead due to CPU contention. This issue becomes particularly
# noticeable when deploying multiple vLLM instances on a single machine.
# Therefore, it is necessary to limit the number of threads allocated to
# image transformation tasks.
num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
def apply(img):
with set_default_torch_num_threads(num_threads):
return transform(img)
return apply
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time import time
from collections.abc import Mapping from collections.abc import Mapping
from typing import Any, Literal, cast from typing import Any, Literal, cast
...@@ -23,6 +24,7 @@ from vllm.sampling_params import _SAMPLING_EPS, SamplingParams ...@@ -23,6 +24,7 @@ from vllm.sampling_params import _SAMPLING_EPS, SamplingParams
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine import EngineCoreRequest
from vllm.v1.metrics.stats import MultiModalCacheStats from vllm.v1.metrics.stats import MultiModalCacheStats
from vllm.v1.structured_output.backend_guidance import ( from vllm.v1.structured_output.backend_guidance import (
...@@ -493,7 +495,15 @@ class InputProcessor: ...@@ -493,7 +495,15 @@ class InputProcessor:
# 1. Tokenize text prompt, with LoRA request if one exists. # 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess # 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly. # multimodal data and expand prompt token ids accordingly.
with set_request_id(request_id): num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
if "OMP_NUM_THREADS" not in os.environ:
logger.debug_once(
"OMP_NUM_THREADS is not set; defaulting Torch threads to %d for "
"input preprocessing.",
num_threads,
)
with set_request_id(request_id), set_default_torch_num_threads(num_threads):
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt, prompt,
tokenization_kwargs=tokenization_kwargs, tokenization_kwargs=tokenization_kwargs,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment