Unverified Commit 267c80d3 authored by li-jinpeng's avatar li-jinpeng Committed by GitHub
Browse files

[Model] Limit CPU threads for image transformations in InternVL to reduce cpu contention. (#24519)


Signed-off-by: li-jinpeng <3332126450@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
parent 77f62613
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
# Copyright (c) 2023 OpenGVLab # Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details] # Licensed under The MIT License [see LICENSE for details]
# -------------------------------------------------------- # --------------------------------------------------------
import os
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Any, Literal, Optional, TypeVar, Union from typing import Annotated, Any, Literal, Optional, TypeVar, Union
...@@ -37,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, ...@@ -37,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import set_default_torch_num_threads
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
...@@ -115,13 +117,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs, ...@@ -115,13 +117,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs,
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the image preprocessing callable used for InternVL inputs.

    The returned callable converts an image to RGB, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it
    to a tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    The whole pipeline runs inside ``set_default_torch_num_threads`` so that
    the CPU thread count used by the tensor operations is capped.

    Args:
        input_size: Side length (in pixels) of the square output image.

    Returns:
        A callable taking one image and returning the transformed result of
        the composed pipeline.
    """
    transform = T.Compose([
        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
        T.Resize((input_size, input_size),
                 interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

    # Image transformation operations (which include tensor computations
    # on the CPU) can occupy a substantial number of CPU cores, introducing
    # overhead due to CPU contention. This issue becomes particularly
    # noticeable when deploying multiple vLLM instances on a single machine.
    # Therefore, it is necessary to limit the number of threads allocated to
    # image transformation tasks.
    #
    # The limit is read once, when the transform is built. A missing, empty,
    # non-numeric or non-positive OMP_NUM_THREADS falls back to a single
    # thread instead of raising while the processor is being constructed.
    try:
        num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
    except ValueError:
        num_threads = 1
    num_threads = max(num_threads, 1)

    def apply(img):
        with set_default_torch_num_threads(num_threads):
            return transform(img)

    return apply
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment