Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c373b5c0
Unverified
Commit
c373b5c0
authored
Mar 18, 2026
by
Wentao Ye
Committed by
GitHub
Mar 18, 2026
Browse files
[Log] Reduce duplicate log (#37313)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
de1a86b7
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
20 additions
and
10 deletions
+20
-10
vllm/compilation/backends.py
vllm/compilation/backends.py
+3
-1
vllm/config/scheduler.py
vllm/config/scheduler.py
+2
-1
vllm/model_executor/layers/attention/mm_encoder_attention.py
vllm/model_executor/layers/attention/mm_encoder_attention.py
+3
-1
vllm/model_executor/models/qwen3_next.py
vllm/model_executor/models/qwen3_next.py
+4
-3
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+2
-1
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+2
-1
vllm/v1/worker/dp_utils.py
vllm/v1/worker/dp_utils.py
+2
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-1
No files found.
vllm/compilation/backends.py
View file @
c373b5c0
...
@@ -371,13 +371,15 @@ class CompilerManager:
...
@@ -371,13 +371,15 @@ class CompilerManager:
logger
.
info_once
(
logger
.
info_once
(
"Cache the graph of compile range %s for later use"
,
"Cache the graph of compile range %s for later use"
,
str
(
compile_range
),
str
(
compile_range
),
scope
=
"local"
,
)
)
logger
.
debug
(
logger
.
debug
_once
(
"Store the %s-th graph for compile range%s from %s via handle %s"
,
"Store the %s-th graph for compile range%s from %s via handle %s"
,
graph_index
,
graph_index
,
str
(
compile_range
),
str
(
compile_range
),
self
.
compiler
.
name
,
self
.
compiler
.
name
,
handle
,
handle
,
scope
=
"local"
,
)
)
# after compiling the last graph, record the end time
# after compiling the last graph, record the end time
...
...
vllm/config/scheduler.py
View file @
c373b5c0
...
@@ -228,9 +228,10 @@ class SchedulerConfig:
...
@@ -228,9 +228,10 @@ class SchedulerConfig:
self
.
encoder_cache_size
=
self
.
max_num_batched_tokens
self
.
encoder_cache_size
=
self
.
max_num_batched_tokens
if
self
.
enable_chunked_prefill
:
if
self
.
enable_chunked_prefill
:
logger
.
info
(
logger
.
info
_once
(
"Chunked prefill is enabled with max_num_batched_tokens=%d."
,
"Chunked prefill is enabled with max_num_batched_tokens=%d."
,
self
.
max_num_batched_tokens
,
self
.
max_num_batched_tokens
,
scope
=
"local"
,
)
)
if
self
.
max_num_partial_prefills
>
1
:
if
self
.
max_num_partial_prefills
>
1
:
...
...
vllm/model_executor/layers/attention/mm_encoder_attention.py
View file @
c373b5c0
...
@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
...
@@ -227,7 +227,9 @@ class MMEncoderAttention(CustomOp):
if
self
.
attn_backend
==
AttentionBackendEnum
.
FLASHINFER
:
if
self
.
attn_backend
==
AttentionBackendEnum
.
FLASHINFER
:
_get_flashinfer_workspace_buffer
()
_get_flashinfer_workspace_buffer
()
logger
.
info_once
(
f
"Using
{
self
.
attn_backend
}
for MMEncoderAttention."
)
logger
.
info_once
(
f
"Using
{
self
.
attn_backend
}
for MMEncoderAttention."
,
scope
=
"local"
)
@
classmethod
@
classmethod
def
enabled
(
cls
)
->
bool
:
def
enabled
(
cls
)
->
bool
:
...
...
vllm/model_executor/models/qwen3_next.py
View file @
c373b5c0
...
@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
...
@@ -192,14 +192,15 @@ class ChunkGatedDeltaRule(CustomOp):
use_flashinfer
=
supports_flashinfer
use_flashinfer
=
supports_flashinfer
if
use_flashinfer
:
if
use_flashinfer
:
logger
.
info_once
(
"Using FlashInfer GDN prefill kernel"
)
logger
.
info_once
(
"Using FlashInfer GDN prefill kernel"
,
scope
=
"local"
)
logger
.
info_once
(
logger
.
info_once
(
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"FlashInfer GDN prefill kernel is JIT-compiled; first run may "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"take a while to compile. Set `--gdn-prefill-backend triton` to "
"avoid JIT compile time."
"avoid JIT compile time."
,
scope
=
"local"
,
)
)
else
:
else
:
logger
.
info_once
(
"Using Triton/FLA GDN prefill kernel"
)
logger
.
info_once
(
"Using Triton/FLA GDN prefill kernel"
,
scope
=
"local"
)
self
.
_forward_method
=
(
self
.
_forward_method
=
(
self
.
forward_cuda
if
use_flashinfer
else
self
.
forward_native
self
.
forward_cuda
if
use_flashinfer
else
self
.
forward_native
...
...
vllm/platforms/cuda.py
View file @
c373b5c0
...
@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
...
@@ -387,7 +387,8 @@ class CudaPlatformBase(Platform):
)
)
if
is_backend_supported
:
if
is_backend_supported
:
logger
.
info_once
(
logger
.
info_once
(
f
"Using backend
{
vit_attn_backend
}
for vit attention"
f
"Using backend
{
vit_attn_backend
}
for vit attention"
,
scope
=
"local"
,
)
)
return
vit_attn_backend
return
vit_attn_backend
except
ImportError
:
except
ImportError
:
...
...
vllm/v1/executor/multiproc_executor.py
View file @
c373b5c0
...
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
...
@@ -998,12 +998,13 @@ def set_multiprocessing_worker_envs():
"OMP_NUM_THREADS"
not
in
os
.
environ
"OMP_NUM_THREADS"
not
in
os
.
environ
and
(
current_parallelism
:
=
torch
.
get_num_threads
())
>
default_omp_num_threads
and
(
current_parallelism
:
=
torch
.
get_num_threads
())
>
default_omp_num_threads
):
):
logger
.
warning
(
logger
.
warning
_once
(
"Reducing Torch parallelism from %d threads to %d to avoid "
"Reducing Torch parallelism from %d threads to %d to avoid "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"unnecessary CPU contention. Set OMP_NUM_THREADS in the "
"external environment to tune this value as needed."
,
"external environment to tune this value as needed."
,
current_parallelism
,
current_parallelism
,
default_omp_num_threads
,
default_omp_num_threads
,
scope
=
"local"
,
)
)
os
.
environ
[
"OMP_NUM_THREADS"
]
=
str
(
default_omp_num_threads
)
os
.
environ
[
"OMP_NUM_THREADS"
]
=
str
(
default_omp_num_threads
)
torch
.
set_num_threads
(
default_omp_num_threads
)
torch
.
set_num_threads
(
default_omp_num_threads
)
vllm/v1/worker/dp_utils.py
View file @
c373b5c0
...
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
...
@@ -28,7 +28,8 @@ def _get_device_and_group(parallel_config: ParallelConfig):
# this optimization if we run into this case.
# this optimization if we run into this case.
if
parallel_config
.
disable_nccl_for_dp_synchronization
:
if
parallel_config
.
disable_nccl_for_dp_synchronization
:
logger
.
info_once
(
logger
.
info_once
(
"Using CPU all reduce to synchronize DP padding between ranks."
"Using CPU all reduce to synchronize DP padding between ranks."
,
scope
=
"local"
,
)
)
device
=
"cpu"
device
=
"cpu"
group
=
get_dp_group
().
cpu_group
group
=
get_dp_group
().
cpu_group
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
c373b5c0
...
@@ -5510,13 +5510,14 @@ class GPUModelRunner(
...
@@ -5510,13 +5510,14 @@ class GPUModelRunner(
dummy_modality
dummy_modality
]
]
logger
.
info
(
logger
.
info
_once
(
"Encoder cache will be initialized with a "
"Encoder cache will be initialized with a "
"budget of %s tokens, and profiled with "
"budget of %s tokens, and profiled with "
"%s %s items of the maximum feature size."
,
"%s %s items of the maximum feature size."
,
encoder_budget
,
encoder_budget
,
max_mm_items_per_batch
,
max_mm_items_per_batch
,
dummy_modality
,
dummy_modality
,
scope
=
"local"
,
)
)
# Create dummy batch of multimodal inputs.
# Create dummy batch of multimodal inputs.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment