Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
52efc34e
Unverified
Commit
52efc34e
authored
Oct 24, 2025
by
Wentao Ye
Committed by
GitHub
Oct 24, 2025
Browse files
[Log] Optimize Startup Log (#26740)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
d95d0f4b
Changes
21
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
145 additions
and
61 deletions
+145
-61
vllm/compilation/backends.py
vllm/compilation/backends.py
+25
-10
vllm/compilation/monitor.py
vllm/compilation/monitor.py
+4
-2
vllm/distributed/device_communicators/cuda_communicator.py
vllm/distributed/device_communicators/cuda_communicator.py
+5
-6
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+1
-1
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl.py
+3
-1
vllm/distributed/device_communicators/shm_broadcast.py
vllm/distributed/device_communicators/shm_broadcast.py
+1
-1
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+25
-2
vllm/logger.py
vllm/logger.py
+40
-13
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+4
-2
vllm/model_executor/model_loader/default_loader.py
vllm/model_executor/model_loader/default_loader.py
+2
-1
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/weight_utils.py
+1
-1
vllm/platforms/__init__.py
vllm/platforms/__init__.py
+4
-2
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+3
-1
vllm/utils/gc_utils.py
vllm/utils/gc_utils.py
+1
-1
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+1
-1
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+4
-4
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+1
-1
vllm/v1/sample/ops/topk_topp_sampler.py
vllm/v1/sample/ops/topk_topp_sampler.py
+4
-1
vllm/v1/worker/dp_utils.py
vllm/v1/worker/dp_utils.py
+7
-7
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+9
-3
No files found.
vllm/compilation/backends.py
View file @
52efc34e
...
@@ -245,10 +245,14 @@ class CompilerManager:
...
@@ -245,10 +245,14 @@ class CompilerManager:
if
graph_index
==
0
:
if
graph_index
==
0
:
# adds some info logging for the first graph
# adds some info logging for the first graph
if
runtime_shape
is
None
:
if
runtime_shape
is
None
:
logger
.
info
(
"Cache the graph for dynamic shape for later use"
)
logger
.
info_once
(
"Cache the graph for dynamic shape for later use"
,
scope
=
"local"
)
else
:
else
:
logger
.
info
(
logger
.
info_once
(
"Cache the graph of shape %s for later use"
,
str
(
runtime_shape
)
"Cache the graph of shape %s for later use"
,
str
(
runtime_shape
),
scope
=
"local"
,
)
)
if
runtime_shape
is
None
:
if
runtime_shape
is
None
:
logger
.
debug
(
logger
.
debug
(
...
@@ -272,12 +276,17 @@ class CompilerManager:
...
@@ -272,12 +276,17 @@ class CompilerManager:
elapsed
=
now
-
compilation_start_time
elapsed
=
now
-
compilation_start_time
compilation_config
.
compilation_time
+=
elapsed
compilation_config
.
compilation_time
+=
elapsed
if
runtime_shape
is
None
:
if
runtime_shape
is
None
:
logger
.
info
(
"Compiling a graph for dynamic shape takes %.2f s"
,
elapsed
)
logger
.
info_once
(
"Compiling a graph for dynamic shape takes %.2f s"
,
elapsed
,
scope
=
"local"
,
)
else
:
else
:
logger
.
info
(
logger
.
info
_once
(
"Compiling a graph for shape %s takes %.2f s"
,
"Compiling a graph for shape %s takes %.2f s"
,
runtime_shape
,
runtime_shape
,
elapsed
,
elapsed
,
scope
=
"local"
,
)
)
return
compiled_graph
return
compiled_graph
...
@@ -604,10 +613,12 @@ class VllmBackend:
...
@@ -604,10 +613,12 @@ class VllmBackend:
disable_cache
=
envs
.
VLLM_DISABLE_COMPILE_CACHE
disable_cache
=
envs
.
VLLM_DISABLE_COMPILE_CACHE
if
disable_cache
:
if
disable_cache
:
logger
.
info
(
"vLLM's torch.compile cache is disabled."
)
logger
.
info
_once
(
"vLLM's torch.compile cache is disabled."
,
scope
=
"local"
)
else
:
else
:
logger
.
info
(
logger
.
info_once
(
"Using cache directory: %s for vLLM's torch.compile"
,
local_cache_dir
"Using cache directory: %s for vLLM's torch.compile"
,
local_cache_dir
,
scope
=
"local"
,
)
)
self
.
compiler_manager
.
initialize_cache
(
self
.
compiler_manager
.
initialize_cache
(
...
@@ -620,7 +631,9 @@ class VllmBackend:
...
@@ -620,7 +631,9 @@ class VllmBackend:
from
.monitor
import
torch_compile_start_time
from
.monitor
import
torch_compile_start_time
dynamo_time
=
time
.
time
()
-
torch_compile_start_time
dynamo_time
=
time
.
time
()
-
torch_compile_start_time
logger
.
info
(
"Dynamo bytecode transform time: %.2f s"
,
dynamo_time
)
logger
.
info_once
(
"Dynamo bytecode transform time: %.2f s"
,
dynamo_time
,
scope
=
"local"
)
self
.
compilation_config
.
compilation_time
+=
dynamo_time
self
.
compilation_config
.
compilation_time
+=
dynamo_time
# we control the compilation process, each instance can only be
# we control the compilation process, each instance can only be
...
@@ -672,7 +685,9 @@ class VllmBackend:
...
@@ -672,7 +685,9 @@ class VllmBackend:
with
open
(
graph_path
,
"w"
)
as
f
:
with
open
(
graph_path
,
"w"
)
as
f
:
f
.
write
(
src
)
f
.
write
(
src
)
logger
.
debug
(
"Computation graph saved to %s"
,
graph_path
)
logger
.
debug_once
(
"Computation graph saved to %s"
,
graph_path
,
scope
=
"local"
)
self
.
_called
=
True
self
.
_called
=
True
...
...
vllm/compilation/monitor.py
View file @
52efc34e
...
@@ -31,8 +31,10 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
...
@@ -31,8 +31,10 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
def
end_monitoring_torch_compile
(
vllm_config
:
VllmConfig
):
def
end_monitoring_torch_compile
(
vllm_config
:
VllmConfig
):
compilation_config
:
CompilationConfig
=
vllm_config
.
compilation_config
compilation_config
:
CompilationConfig
=
vllm_config
.
compilation_config
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
:
if
compilation_config
.
mode
==
CompilationMode
.
VLLM_COMPILE
:
logger
.
info
(
logger
.
info_once
(
"torch.compile takes %.2f s in total"
,
compilation_config
.
compilation_time
"torch.compile takes %.2f s in total"
,
compilation_config
.
compilation_time
,
scope
=
"local"
,
)
)
global
context_manager
global
context_manager
if
context_manager
is
not
None
:
if
context_manager
is
not
None
:
...
...
vllm/distributed/device_communicators/cuda_communicator.py
View file @
52efc34e
...
@@ -13,7 +13,6 @@ from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric
...
@@ -13,7 +13,6 @@ from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric
from
vllm.distributed.device_communicators.pynccl_allocator
import
(
from
vllm.distributed.device_communicators.pynccl_allocator
import
(
is_symmetric_memory_enabled
,
is_symmetric_memory_enabled
,
)
)
from
vllm.distributed.parallel_state
import
is_global_first_rank
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -118,10 +117,10 @@ class CudaCommunicator(DeviceCommunicatorBase):
...
@@ -118,10 +117,10 @@ class CudaCommunicator(DeviceCommunicatorBase):
else
:
else
:
raise
ValueError
(
f
"Unknown all2all backend:
{
self
.
all2all_backend
}
"
)
raise
ValueError
(
f
"Unknown all2all backend:
{
self
.
all2all_backend
}
"
)
if
is_global_first_rank
():
logger
.
info_once
(
logger
.
info
(
"Using %s all2all manager."
,
"Using %s all2all manager."
,
self
.
all2all_manager
.
__class__
.
__name__
,
self
.
all2all_manager
.
__class__
.
__name__
,
scope
=
"global"
,
)
)
def
all_reduce
(
self
,
input_
):
def
all_reduce
(
self
,
input_
):
...
...
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
52efc34e
...
@@ -34,7 +34,7 @@ def _can_p2p(rank: int, world_size: int) -> bool:
...
@@ -34,7 +34,7 @@ def _can_p2p(rank: int, world_size: int) -> bool:
if
i
==
rank
:
if
i
==
rank
:
continue
continue
if
envs
.
VLLM_SKIP_P2P_CHECK
:
if
envs
.
VLLM_SKIP_P2P_CHECK
:
logger
.
info
(
"Skipping P2P check and trusting the driver's P2P report."
)
logger
.
debug
(
"Skipping P2P check and trusting the driver's P2P report."
)
return
torch
.
cuda
.
can_device_access_peer
(
rank
,
i
)
return
torch
.
cuda
.
can_device_access_peer
(
rank
,
i
)
if
not
gpu_p2p_access_check
(
rank
,
i
):
if
not
gpu_p2p_access_check
(
rank
,
i
):
return
False
return
False
...
...
vllm/distributed/device_communicators/pynccl.py
View file @
52efc34e
...
@@ -108,7 +108,9 @@ class PyNcclCommunicator:
...
@@ -108,7 +108,9 @@ class PyNcclCommunicator:
if
self
.
rank
==
0
:
if
self
.
rank
==
0
:
# get the unique id from NCCL
# get the unique id from NCCL
self
.
unique_id
=
self
.
nccl
.
ncclGetUniqueId
()
self
.
unique_id
=
self
.
nccl
.
ncclGetUniqueId
()
logger
.
info
(
"vLLM is using nccl==%s"
,
self
.
nccl
.
ncclGetVersion
())
logger
.
info_once
(
"vLLM is using nccl==%s"
,
self
.
nccl
.
ncclGetVersion
(),
scope
=
"local"
)
else
:
else
:
# construct an empty unique id
# construct an empty unique id
self
.
unique_id
=
ncclUniqueId
()
self
.
unique_id
=
ncclUniqueId
()
...
...
vllm/distributed/device_communicators/shm_broadcast.py
View file @
52efc34e
...
@@ -312,7 +312,7 @@ class MessageQueue:
...
@@ -312,7 +312,7 @@ class MessageQueue:
remote_addr_ipv6
=
remote_addr_ipv6
,
remote_addr_ipv6
=
remote_addr_ipv6
,
)
)
logger
.
info
(
"vLLM message queue communication handle: %s"
,
self
.
handle
)
logger
.
debug
(
"vLLM message queue communication handle: %s"
,
self
.
handle
)
def
export_handle
(
self
)
->
Handle
:
def
export_handle
(
self
)
->
Handle
:
return
self
.
handle
return
self
.
handle
...
...
vllm/distributed/parallel_state.py
View file @
52efc34e
...
@@ -1157,7 +1157,7 @@ def init_distributed_environment(
...
@@ -1157,7 +1157,7 @@ def init_distributed_environment(
ip
=
parallel_config
.
data_parallel_master_ip
ip
=
parallel_config
.
data_parallel_master_ip
port
=
parallel_config
.
get_next_dp_init_port
()
port
=
parallel_config
.
get_next_dp_init_port
()
distributed_init_method
=
get_distributed_init_method
(
ip
,
port
)
distributed_init_method
=
get_distributed_init_method
(
ip
,
port
)
logger
.
info
(
logger
.
debug
(
"Adjusting world_size=%d rank=%d distributed_init_method=%s for DP"
,
"Adjusting world_size=%d rank=%d distributed_init_method=%s for DP"
,
world_size
,
world_size
,
rank
,
rank
,
...
@@ -1322,7 +1322,7 @@ def initialize_model_parallel(
...
@@ -1322,7 +1322,7 @@ def initialize_model_parallel(
group_ranks
,
get_world_group
().
local_rank
,
backend
,
group_name
=
"ep"
group_ranks
,
get_world_group
().
local_rank
,
backend
,
group_name
=
"ep"
)
)
logger
.
info
(
logger
.
info
_once
(
"rank %s in world size %s is assigned as "
"rank %s in world size %s is assigned as "
"DP rank %s, PP rank %s, TP rank %s, EP rank %s"
,
"DP rank %s, PP rank %s, TP rank %s, EP rank %s"
,
rank
,
rank
,
...
@@ -1625,6 +1625,29 @@ def is_global_first_rank() -> bool:
...
@@ -1625,6 +1625,29 @@ def is_global_first_rank() -> bool:
return
True
return
True
def
is_local_first_rank
()
->
bool
:
"""
Check if the current process is the first local rank (rank 0 on its node).
"""
try
:
# prefer the initialized world group if available
global
_WORLD
if
_WORLD
is
not
None
:
return
_WORLD
.
local_rank
==
0
if
not
torch
.
distributed
.
is_initialized
():
return
True
# fallback to environment-provided local rank if available
# note: envs.LOCAL_RANK is set when using env:// launchers (e.g., torchrun)
try
:
return
int
(
envs
.
LOCAL_RANK
)
==
0
# type: ignore[arg-type]
except
Exception
:
return
torch
.
distributed
.
get_rank
()
==
0
except
Exception
:
return
True
def
_node_count
(
pg
:
ProcessGroup
|
StatelessProcessGroup
)
->
int
:
def
_node_count
(
pg
:
ProcessGroup
|
StatelessProcessGroup
)
->
int
:
"""
"""
Returns the total number of nodes in the process group.
Returns the total number of nodes in the process group.
...
...
vllm/logger.py
View file @
52efc34e
...
@@ -13,7 +13,7 @@ from logging import Logger
...
@@ -13,7 +13,7 @@ from logging import Logger
from
logging.config
import
dictConfig
from
logging.config
import
dictConfig
from
os
import
path
from
os
import
path
from
types
import
MethodType
from
types
import
MethodType
from
typing
import
Any
,
cast
from
typing
import
Any
,
Literal
,
cast
import
vllm.envs
as
envs
import
vllm.envs
as
envs
...
@@ -59,20 +59,37 @@ DEFAULT_LOGGING_CONFIG = {
...
@@ -59,20 +59,37 @@ DEFAULT_LOGGING_CONFIG = {
@
lru_cache
@
lru_cache
def
_print_debug_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
_print_debug_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
# Set the stacklevel to
2
to print the original caller's line info
# Set the stacklevel to
3
to print the original caller's line info
logger
.
debug
(
msg
,
*
args
,
stacklevel
=
2
)
logger
.
debug
(
msg
,
*
args
,
stacklevel
=
3
)
@
lru_cache
@
lru_cache
def
_print_info_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
_print_info_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
# Set the stacklevel to
2
to print the original caller's line info
# Set the stacklevel to
3
to print the original caller's line info
logger
.
info
(
msg
,
*
args
,
stacklevel
=
2
)
logger
.
info
(
msg
,
*
args
,
stacklevel
=
3
)
@
lru_cache
@
lru_cache
def
_print_warning_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
_print_warning_once
(
logger
:
Logger
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
# Set the stacklevel to 2 to print the original caller's line info
# Set the stacklevel to 3 to print the original caller's line info
logger
.
warning
(
msg
,
*
args
,
stacklevel
=
2
)
logger
.
warning
(
msg
,
*
args
,
stacklevel
=
3
)
LogScope
=
Literal
[
"process"
,
"global"
,
"local"
]
def
_should_log_with_scope
(
scope
:
LogScope
)
->
bool
:
"""Decide whether to log based on scope"""
if
scope
==
"global"
:
from
vllm.distributed.parallel_state
import
is_global_first_rank
return
is_global_first_rank
()
if
scope
==
"local"
:
from
vllm.distributed.parallel_state
import
is_local_first_rank
return
is_local_first_rank
()
# default "process" scope: always log
return
True
class
_VllmLogger
(
Logger
):
class
_VllmLogger
(
Logger
):
...
@@ -84,33 +101,43 @@ class _VllmLogger(Logger):
...
@@ -84,33 +101,43 @@ class _VllmLogger(Logger):
`intel_extension_for_pytorch.utils._logger`.
`intel_extension_for_pytorch.utils._logger`.
"""
"""
def
debug_once
(
self
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
debug_once
(
self
,
msg
:
str
,
*
args
:
Hashable
,
scope
:
LogScope
=
"process"
)
->
None
:
"""
"""
As [`debug`][logging.Logger.debug], but subsequent calls with
As [`debug`][logging.Logger.debug], but subsequent calls with
the same message are silently dropped.
the same message are silently dropped.
"""
"""
if
not
_should_log_with_scope
(
scope
):
return
_print_debug_once
(
self
,
msg
,
*
args
)
_print_debug_once
(
self
,
msg
,
*
args
)
def
info_once
(
self
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
info_once
(
self
,
msg
:
str
,
*
args
:
Hashable
,
scope
:
LogScope
=
"process"
)
->
None
:
"""
"""
As [`info`][logging.Logger.info], but subsequent calls with
As [`info`][logging.Logger.info], but subsequent calls with
the same message are silently dropped.
the same message are silently dropped.
"""
"""
if
not
_should_log_with_scope
(
scope
):
return
_print_info_once
(
self
,
msg
,
*
args
)
_print_info_once
(
self
,
msg
,
*
args
)
def
warning_once
(
self
,
msg
:
str
,
*
args
:
Hashable
)
->
None
:
def
warning_once
(
self
,
msg
:
str
,
*
args
:
Hashable
,
scope
:
LogScope
=
"process"
)
->
None
:
"""
"""
As [`warning`][logging.Logger.warning], but subsequent calls with
As [`warning`][logging.Logger.warning], but subsequent calls with
the same message are silently dropped.
the same message are silently dropped.
"""
"""
if
not
_should_log_with_scope
(
scope
):
return
_print_warning_once
(
self
,
msg
,
*
args
)
_print_warning_once
(
self
,
msg
,
*
args
)
# Pre-defined methods mapping to avoid repeated dictionary creation
# Pre-defined methods mapping to avoid repeated dictionary creation
_METHODS_TO_PATCH
=
{
_METHODS_TO_PATCH
=
{
"debug_once"
:
_
print_
debug_once
,
"debug_once"
:
_
VllmLogger
.
debug_once
,
"info_once"
:
_
print_
info_once
,
"info_once"
:
_
VllmLogger
.
info_once
,
"warning_once"
:
_
print_
warning_once
,
"warning_once"
:
_
VllmLogger
.
warning_once
,
}
}
...
...
vllm/model_executor/layers/fused_moe/layer.py
View file @
52efc34e
...
@@ -368,11 +368,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
...
@@ -368,11 +368,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logger
.
info_once
(
logger
.
info_once
(
"FlashInfer CUTLASS MoE is available for EP"
"FlashInfer CUTLASS MoE is available for EP"
" but not enabled, consider setting"
" but not enabled, consider setting"
" VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it."
" VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it."
,
scope
=
"local"
,
)
)
elif
self
.
moe
.
moe_parallel_config
.
dp_size
>
1
:
elif
self
.
moe
.
moe_parallel_config
.
dp_size
>
1
:
logger
.
info_once
(
logger
.
info_once
(
"FlashInfer CUTLASS MoE is currently not available for DP."
"FlashInfer CUTLASS MoE is currently not available for DP."
,
scope
=
"local"
,
)
)
self
.
flashinfer_cutlass_moe
=
None
# type: ignore
self
.
flashinfer_cutlass_moe
=
None
# type: ignore
...
...
vllm/model_executor/model_loader/default_loader.py
View file @
52efc34e
...
@@ -311,9 +311,10 @@ class DefaultModelLoader(BaseModelLoader):
...
@@ -311,9 +311,10 @@ class DefaultModelLoader(BaseModelLoader):
loaded_weights
=
load_weights_and_online_quantize
(
self
,
model
,
model_config
)
loaded_weights
=
load_weights_and_online_quantize
(
self
,
model
,
model_config
)
self
.
counter_after_loading_weights
=
time
.
perf_counter
()
self
.
counter_after_loading_weights
=
time
.
perf_counter
()
logger
.
info
(
logger
.
info
_once
(
"Loading weights took %.2f seconds"
,
"Loading weights took %.2f seconds"
,
self
.
counter_after_loading_weights
-
self
.
counter_before_loading_weights
,
self
.
counter_after_loading_weights
-
self
.
counter_before_loading_weights
,
scope
=
"local"
,
)
)
# We only enable strict check for non-quantized models
# We only enable strict check for non-quantized models
# that have loaded weights tracking currently.
# that have loaded weights tracking currently.
...
...
vllm/model_executor/model_loader/weight_utils.py
View file @
52efc34e
...
@@ -416,7 +416,7 @@ def download_weights_from_hf(
...
@@ -416,7 +416,7 @@ def download_weights_from_hf(
e
,
e
,
)
)
logger
.
info
(
"Using model weights format %s"
,
allow_patterns
)
logger
.
debug
(
"Using model weights format %s"
,
allow_patterns
)
# Use file lock to prevent multiple processes from
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
# downloading the same model weights at the same time.
with
get_lock
(
model_name_or_path
,
cache_dir
):
with
get_lock
(
model_name_or_path
,
cache_dir
):
...
...
vllm/platforms/__init__.py
View file @
52efc34e
...
@@ -222,10 +222,12 @@ def resolve_current_platform_cls_qualname() -> str:
...
@@ -222,10 +222,12 @@ def resolve_current_platform_cls_qualname() -> str:
)
)
elif
len
(
activated_builtin_plugins
)
==
1
:
elif
len
(
activated_builtin_plugins
)
==
1
:
platform_cls_qualname
=
builtin_platform_plugins
[
activated_builtin_plugins
[
0
]]()
platform_cls_qualname
=
builtin_platform_plugins
[
activated_builtin_plugins
[
0
]]()
logger
.
info
(
"Automatically detected platform %s."
,
activated_builtin_plugins
[
0
])
logger
.
debug
(
"Automatically detected platform %s."
,
activated_builtin_plugins
[
0
]
)
else
:
else
:
platform_cls_qualname
=
"vllm.platforms.interface.UnspecifiedPlatform"
platform_cls_qualname
=
"vllm.platforms.interface.UnspecifiedPlatform"
logger
.
info
(
"No platform detected, vLLM is running on UnspecifiedPlatform"
)
logger
.
debug
(
"No platform detected, vLLM is running on UnspecifiedPlatform"
)
return
platform_cls_qualname
return
platform_cls_qualname
...
...
vllm/platforms/cuda.py
View file @
52efc34e
...
@@ -298,7 +298,9 @@ class CudaPlatformBase(Platform):
...
@@ -298,7 +298,9 @@ class CudaPlatformBase(Platform):
)
)
if
use_cutlassmla
:
if
use_cutlassmla
:
logger
.
info_once
(
"Using Cutlass MLA backend on V1 engine."
)
logger
.
info_once
(
"Using Cutlass MLA backend on V1 engine."
,
scope
=
"local"
)
return
"vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
return
"vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend"
if
use_flashinfermla
:
if
use_flashinfermla
:
from
vllm.v1.attention.backends.utils
import
set_kv_cache_layout
from
vllm.v1.attention.backends.utils
import
set_kv_cache_layout
...
...
vllm/utils/gc_utils.py
View file @
52efc34e
...
@@ -37,7 +37,7 @@ class GCDebugConfig:
...
@@ -37,7 +37,7 @@ class GCDebugConfig:
except
Exception
:
except
Exception
:
self
.
enabled
=
False
self
.
enabled
=
False
logger
.
error
(
"Failed to parse VLLM_GC_DEBUG(%s)"
,
envs
.
VLLM_GC_DEBUG
)
logger
.
error
(
"Failed to parse VLLM_GC_DEBUG(%s)"
,
envs
.
VLLM_GC_DEBUG
)
logger
.
info
(
"GC Debug Config. %s"
,
str
(
self
))
logger
.
debug
(
"GC Debug Config. %s"
,
str
(
self
))
def
__repr__
(
self
)
->
str
:
def
__repr__
(
self
)
->
str
:
return
f
"enabled:
{
self
.
enabled
}
,top_objects:
{
self
.
top_objects
}
"
return
f
"enabled:
{
self
.
enabled
}
,top_objects:
{
self
.
top_objects
}
"
...
...
vllm/v1/core/kv_cache_utils.py
View file @
52efc34e
...
@@ -1226,7 +1226,7 @@ def _report_kv_cache_config(
...
@@ -1226,7 +1226,7 @@ def _report_kv_cache_config(
vllm_config
.
parallel_config
.
decode_context_parallel_size
,
vllm_config
.
parallel_config
.
decode_context_parallel_size
,
)
)
num_tokens_str
=
f
"
{
num_tokens
:,
}
"
num_tokens_str
=
f
"
{
num_tokens
:,
}
"
logger
.
info
(
"GPU KV cache size: %s tokens"
,
num_tokens_str
)
logger
.
info
_once
(
"GPU KV cache size: %s tokens"
,
num_tokens_str
,
scope
=
"local"
)
max_model_len_str
=
f
"
{
vllm_config
.
model_config
.
max_model_len
:,
}
"
max_model_len_str
=
f
"
{
vllm_config
.
model_config
.
max_model_len
:,
}
"
max_concurrency
=
get_max_concurrency_for_kv_cache_config
(
max_concurrency
=
get_max_concurrency_for_kv_cache_config
(
vllm_config
,
kv_cache_config
vllm_config
,
kv_cache_config
...
...
vllm/v1/engine/core.py
View file @
52efc34e
...
@@ -19,7 +19,6 @@ import zmq
...
@@ -19,7 +19,6 @@ import zmq
from
vllm.config
import
ParallelConfig
,
VllmConfig
from
vllm.config
import
ParallelConfig
,
VllmConfig
from
vllm.distributed
import
stateless_destroy_torch_distributed_process_group
from
vllm.distributed
import
stateless_destroy_torch_distributed_process_group
from
vllm.distributed.parallel_state
import
is_global_first_rank
from
vllm.envs
import
enable_envs_cache
from
vllm.envs
import
enable_envs_cache
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.logging_utils.dump_input
import
dump_engine_exception
from
vllm.logging_utils.dump_input
import
dump_engine_exception
...
@@ -90,7 +89,7 @@ class EngineCore:
...
@@ -90,7 +89,7 @@ class EngineCore:
load_general_plugins
()
load_general_plugins
()
self
.
vllm_config
=
vllm_config
self
.
vllm_config
=
vllm_config
if
is_global_first_rank
()
:
if
vllm_config
.
parallel_config
.
data_parallel_rank
==
0
:
logger
.
info
(
logger
.
info
(
"Initializing a V1 LLM engine (v%s) with config: %s"
,
"Initializing a V1 LLM engine (v%s) with config: %s"
,
VLLM_VERSION
,
VLLM_VERSION
,
...
@@ -235,9 +234,10 @@ class EngineCore:
...
@@ -235,9 +234,10 @@ class EngineCore:
self
.
model_executor
.
initialize_from_config
(
kv_cache_configs
)
self
.
model_executor
.
initialize_from_config
(
kv_cache_configs
)
elapsed
=
time
.
time
()
-
start
elapsed
=
time
.
time
()
-
start
logger
.
info
(
logger
.
info
_once
(
(
"init engine (profile, create kv cache, warmup model) took %.2f seconds"
),
(
"init engine (profile, create kv cache, warmup model) took %.2f seconds"
),
elapsed
,
elapsed
,
scope
=
"local"
,
)
)
return
num_gpu_blocks
,
num_cpu_blocks
,
scheduler_kv_cache_config
return
num_gpu_blocks
,
num_cpu_blocks
,
scheduler_kv_cache_config
...
@@ -713,7 +713,7 @@ class EngineCoreProc(EngineCore):
...
@@ -713,7 +713,7 @@ class EngineCoreProc(EngineCore):
)
)
# Receive initialization message.
# Receive initialization message.
logger
.
info
(
"Waiting for init message from front-end."
)
logger
.
debug
(
"Waiting for init message from front-end."
)
if
not
handshake_socket
.
poll
(
timeout
=
HANDSHAKE_TIMEOUT_MINS
*
60_000
):
if
not
handshake_socket
.
poll
(
timeout
=
HANDSHAKE_TIMEOUT_MINS
*
60_000
):
raise
RuntimeError
(
raise
RuntimeError
(
"Did not receive response from front-end "
"Did not receive response from front-end "
...
...
vllm/v1/metrics/loggers.py
View file @
52efc34e
...
@@ -215,7 +215,7 @@ class LoggingStatLogger(StatLoggerBase):
...
@@ -215,7 +215,7 @@ class LoggingStatLogger(StatLoggerBase):
def
log_engine_initialized
(
self
):
def
log_engine_initialized
(
self
):
if
self
.
vllm_config
.
cache_config
.
num_gpu_blocks
:
if
self
.
vllm_config
.
cache_config
.
num_gpu_blocks
:
logger
.
info
(
logger
.
debug
(
"Engine %03d: vllm cache_config_info with initialization "
"Engine %03d: vllm cache_config_info with initialization "
"after num_gpu_blocks is: %d"
,
"after num_gpu_blocks is: %d"
,
self
.
engine_index
,
self
.
engine_index
,
...
...
vllm/v1/sample/ops/topk_topp_sampler.py
View file @
52efc34e
...
@@ -33,7 +33,10 @@ class TopKTopPSampler(nn.Module):
...
@@ -33,7 +33,10 @@ class TopKTopPSampler(nn.Module):
):
):
if
envs
.
VLLM_USE_FLASHINFER_SAMPLER
:
if
envs
.
VLLM_USE_FLASHINFER_SAMPLER
:
# Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
# Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
logger
.
info_once
(
"Using FlashInfer for top-p & top-k sampling."
)
logger
.
info_once
(
"Using FlashInfer for top-p & top-k sampling."
,
scope
=
"global"
,
)
self
.
forward
=
self
.
forward_cuda
self
.
forward
=
self
.
forward_cuda
else
:
else
:
logger
.
debug_once
(
logger
.
debug_once
(
...
...
vllm/v1/worker/dp_utils.py
View file @
52efc34e
...
@@ -6,7 +6,7 @@ import torch
...
@@ -6,7 +6,7 @@ import torch
import
torch.distributed
as
dist
import
torch.distributed
as
dist
from
vllm.config
import
ParallelConfig
from
vllm.config
import
ParallelConfig
from
vllm.distributed.parallel_state
import
get_dp_group
,
is_global_first_rank
from
vllm.distributed.parallel_state
import
get_dp_group
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.v1.worker.ubatch_utils
import
(
from
vllm.v1.worker.ubatch_utils
import
(
UBatchSlices
,
UBatchSlices
,
...
@@ -132,11 +132,11 @@ def _synchronize_dp_ranks(
...
@@ -132,11 +132,11 @@ def _synchronize_dp_ranks(
should_ubatch
=
_post_process_ubatch
(
tensor
)
should_ubatch
=
_post_process_ubatch
(
tensor
)
if
should_ubatch
and
not
should_dp_pad
:
if
should_ubatch
and
not
should_dp_pad
:
if
is_global_first_rank
():
logger
.
debug_once
(
logger
.
debug
(
"Microbatching has been triggered and requires DP padding. "
"Microbatching has been triggered and requires DP padding. "
"Enabling DP padding even though it has been explicitly "
"Enabling DP padding even though it has been explicitly "
"disabled."
"disabled."
,
scope
=
"global"
,
)
)
should_dp_pad
=
True
should_dp_pad
=
True
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
52efc34e
...
@@ -2850,7 +2850,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2850,7 +2850,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
Args:
Args:
eep_scale_up: the model loading is for elastic EP scale up.
eep_scale_up: the model loading is for elastic EP scale up.
"""
"""
logger
.
info
(
"Starting to load model %s..."
,
self
.
model_config
.
model
)
logger
.
info_once
(
"Starting to load model %s..."
,
self
.
model_config
.
model
,
scope
=
"global"
,
)
if
eep_scale_up
:
if
eep_scale_up
:
from
vllm.distributed.parallel_state
import
get_ep_group
from
vllm.distributed.parallel_state
import
get_ep_group
...
@@ -2911,10 +2915,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2911,10 +2915,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self
.
model
.
set_aux_hidden_state_layers
(
aux_layers
)
self
.
model
.
set_aux_hidden_state_layers
(
aux_layers
)
time_after_load
=
time
.
perf_counter
()
time_after_load
=
time
.
perf_counter
()
self
.
model_memory_usage
=
m
.
consumed_memory
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
logger
.
info
_once
(
"Model loading took %.4f GiB and %.6f seconds"
,
"Model loading took %.4f GiB and %.6f seconds"
,
self
.
model_memory_usage
/
GiB_bytes
,
self
.
model_memory_usage
/
GiB_bytes
,
time_after_load
-
time_before_load
,
time_after_load
-
time_before_load
,
scope
=
"local"
,
)
)
prepare_communication_buffer_for_model
(
self
.
model
)
prepare_communication_buffer_for_model
(
self
.
model
)
...
@@ -3838,10 +3843,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -3838,10 +3843,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
elapsed_time
=
end_time
-
start_time
elapsed_time
=
end_time
-
start_time
cuda_graph_size
=
start_free_gpu_memory
-
end_free_gpu_memory
cuda_graph_size
=
start_free_gpu_memory
-
end_free_gpu_memory
# This usually takes 5~20 seconds.
# This usually takes 5~20 seconds.
logger
.
info
(
logger
.
info
_once
(
"Graph capturing finished in %.0f secs, took %.2f GiB"
,
"Graph capturing finished in %.0f secs, took %.2f GiB"
,
elapsed_time
,
elapsed_time
,
cuda_graph_size
/
(
1
<<
30
),
cuda_graph_size
/
(
1
<<
30
),
scope
=
"local"
,
)
)
return
cuda_graph_size
return
cuda_graph_size
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment