[Model] Limit CPU threads for image transformations in InternVL to reduce CPU contention. (#24519)

Signed-off-by: li-jinpeng <3332126450@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>

[Model] Limit CPU threads for image transformations in InternVL to reduce CPU contention. (#24519)
Signed-off-by: li-jinpeng <3332126450@qq.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
267c80d3 · li-jinpeng · GitHub · 77f62613 · 267c80d3
Unverified Commit 267c80d3 authored Sep 10, 2025 by li-jinpeng Committed by GitHub Sep 10, 2025
Show whitespace changes
Inline Side-by-side

Showing with 16 additions and 1 deletion

vllm/model_executor/models/internvl.py vllm/model_executor/models/internvl.py +16 -1

No files found.
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -7,6 +7,7 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
+import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Annotated, Any, Literal, Optional, TypeVar, Union
@@ -37,6 +38,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import set_default_torch_num_threads
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -115,13 +117,26 @@ InternVLVideoInputs = Union[InternVLVideoPixelInputs,
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose([
+    transform = T.Compose([
        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
        T.Resize((input_size, input_size),
                 interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
+    # Image transformation operations (which include tensor computations
+    # on the CPU) can occupy a substantial number of CPU cores, introducing
+    # overhead due to CPU contention. This issue becomes particularly
+    # noticeable when deploying multiple vLLM instances on a single machine.
+    # Therefore, it is necessary to limit the number of threads allocated to
+    # image transformation tasks.
+    num_threads = int(os.environ.get("OMP_NUM_THREADS", "1"))
+
+    def apply(img):
+        with set_default_torch_num_threads(num_threads):
+            return transform(img)
+
+    return apply


 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B