Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
16abe6b8
Unverified
Commit
16abe6b8
authored
Jan 12, 2026
by
Roger Wang
Committed by
GitHub
Jan 12, 2026
Browse files
[Misc] Set default torch num threads for input processing (#31879)
Signed-off-by:
Roger Wang
<
hey@rogerw.io
>
parent
1eb61ab3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
16 deletions
+12
-16
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+1
-15
vllm/v1/engine/input_processor.py
vllm/v1/engine/input_processor.py
+11
-1
No files found.
vllm/model_executor/models/internvl.py
View file @
16abe6b8
...
...
@@ -7,7 +7,6 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
os
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Annotated
,
Any
,
Literal
,
TypeAlias
,
TypeVar
...
...
@@ -52,7 +51,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.tokenizers
import
TokenizerLike
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
vllm.utils.torch_utils
import
set_default_torch_num_threads
from
.interfaces
import
(
MultiModalEmbeddings
,
...
...
@@ -143,19 +141,7 @@ def build_transform(input_size: int):
T
.
Normalize
(
mean
=
MEAN
,
std
=
STD
),
]
)
# Image transformation operations (which include tensor computations
# on the CPU) can occupy a substantial number of CPU cores, introducing
# overhead due to CPU contention. This issue becomes particularly
# noticeable when deploying multiple vLLM instances on a single machine.
# Therefore, it is necessary to limit the number of threads allocated to
# image transformation tasks.
num_threads
=
int
(
os
.
environ
.
get
(
"OMP_NUM_THREADS"
,
"1"
))
def
apply
(
img
):
with
set_default_torch_num_threads
(
num_threads
):
return
transform
(
img
)
return
apply
return
transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
...
...
vllm/v1/engine/input_processor.py
View file @
16abe6b8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
time
from
collections.abc
import
Mapping
from
typing
import
Any
,
Literal
,
cast
...
...
@@ -23,6 +24,7 @@ from vllm.sampling_params import _SAMPLING_EPS, SamplingParams
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.utils
import
length_from_prompt_token_ids_or_embeds
,
random_uuid
from
vllm.utils.torch_utils
import
set_default_torch_num_threads
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.metrics.stats
import
MultiModalCacheStats
from
vllm.v1.structured_output.backend_guidance
import
(
...
...
@@ -493,7 +495,15 @@ class InputProcessor:
# 1. Tokenize text prompt, with LoRA request if one exists.
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
with
set_request_id
(
request_id
):
num_threads
=
int
(
os
.
environ
.
get
(
"OMP_NUM_THREADS"
,
"1"
))
if
"OMP_NUM_THREADS"
not
in
os
.
environ
:
logger
.
debug_once
(
"OMP_NUM_THREADS is not set; defaulting Torch threads to %d for "
"input preprocessing."
,
num_threads
,
)
with
set_request_id
(
request_id
),
set_default_torch_num_threads
(
num_threads
):
processed_inputs
:
ProcessorInputs
=
self
.
input_preprocessor
.
preprocess
(
prompt
,
tokenization_kwargs
=
tokenization_kwargs
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment