Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a88081bf
Unverified
Commit
a88081bf
authored
Apr 26, 2024
by
SangBin Cho
Committed by
GitHub
Apr 26, 2024
Browse files
[CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by:
Danny Guinther
<
dguinther@neuralmagic.com
>
parent
2f30e7c7
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
116 additions
and
93 deletions
+116
-93
docs/source/conf.py
docs/source/conf.py
+3
-2
pyproject.toml
pyproject.toml
+1
-0
setup.py
setup.py
+4
-3
vllm/config.py
vllm/config.py
+8
-8
vllm/core/scheduler.py
vllm/core/scheduler.py
+6
-4
vllm/distributed/device_communicators/custom_all_reduce.py
vllm/distributed/device_communicators/custom_all_reduce.py
+4
-4
vllm/distributed/device_communicators/pynccl.py
vllm/distributed/device_communicators/pynccl.py
+8
-7
vllm/distributed/device_communicators/pynccl_utils.py
vllm/distributed/device_communicators/pynccl_utils.py
+2
-2
vllm/distributed/parallel_state.py
vllm/distributed/parallel_state.py
+4
-2
vllm/distributed/utils.py
vllm/distributed/utils.py
+2
-2
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+9
-9
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+36
-25
vllm/engine/metrics.py
vllm/engine/metrics.py
+13
-8
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+2
-2
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+5
-6
vllm/executor/cpu_executor.py
vllm/executor/cpu_executor.py
+1
-1
vllm/executor/gpu_executor.py
vllm/executor/gpu_executor.py
+2
-2
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_gpu_executor.py
+2
-2
vllm/executor/ray_utils.py
vllm/executor/ray_utils.py
+3
-3
vllm/logger.py
vllm/logger.py
+1
-1
No files found.
docs/source/conf.py
View file @
a88081bf
...
...
@@ -98,9 +98,10 @@ autodoc_mock_imports = [
for
mock_target
in
autodoc_mock_imports
:
if
mock_target
in
sys
.
modules
:
logger
.
info
(
f
"Potentially problematic mock target (
{
mock_target
}
) found; "
"Potentially problematic mock target (
%s
) found; "
"autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts."
)
"been loaded into sys.modules when the sphinx build starts."
,
mock_target
)
class
MockedClassDocumenter
(
autodoc
.
ClassDocumenter
):
...
...
pyproject.toml
View file @
a88081bf
...
...
@@ -32,6 +32,7 @@ select = [
"SIM"
,
# isort
# "I",
"G"
,
]
ignore
=
[
# star imports
...
...
setup.py
View file @
a88081bf
...
...
@@ -63,7 +63,7 @@ class cmake_build_ext(build_ext):
num_jobs
=
os
.
environ
.
get
(
"MAX_JOBS"
,
None
)
if
num_jobs
is
not
None
:
num_jobs
=
int
(
num_jobs
)
logger
.
info
(
f
"Using MAX_JOBS=
{
num_jobs
}
as the number of jobs."
)
logger
.
info
(
"Using MAX_JOBS=
%d
as the number of jobs."
,
num_jobs
)
else
:
try
:
# os.sched_getaffinity() isn't universally available, so fall
...
...
@@ -81,8 +81,9 @@ class cmake_build_ext(build_ext):
nvcc_threads
=
os
.
getenv
(
"NVCC_THREADS"
,
None
)
if
nvcc_threads
is
not
None
:
nvcc_threads
=
int
(
nvcc_threads
)
logger
.
info
(
f
"Using NVCC_THREADS=
{
nvcc_threads
}
as the number"
" of nvcc threads."
)
logger
.
info
(
"Using NVCC_THREADS=%d as the number of nvcc threads."
,
nvcc_threads
)
else
:
nvcc_threads
=
1
num_jobs
=
max
(
1
,
num_jobs
//
nvcc_threads
)
...
...
vllm/config.py
View file @
a88081bf
...
...
@@ -167,9 +167,9 @@ class ModelConfig:
f
"supported in ROCm."
)
if
self
.
quantization
!=
"marlin"
:
logger
.
warning
(
f
"
{
self
.
quantization
}
quantization is not fully "
"%s
quantization is not fully "
"optimized yet. The speed can be slower than "
"non-quantized models."
)
"non-quantized models."
,
self
.
quantization
)
def
_verify_cuda_graph
(
self
)
->
None
:
if
self
.
max_context_len_to_capture
is
None
:
...
...
@@ -360,7 +360,7 @@ class CacheConfig:
if
cpu_memory_usage
>
0.7
*
total_cpu_memory
:
raise
ValueError
(
"Too large swap space. "
+
msg
)
elif
cpu_memory_usage
>
0.4
*
total_cpu_memory
:
logger
.
warning
(
"Possibly too large swap space.
"
+
msg
)
logger
.
warning
(
"Possibly too large swap space.
%s"
,
msg
)
@
dataclass
...
...
@@ -898,8 +898,8 @@ class LoRAConfig:
"awq"
,
"gptq"
]:
# TODO support marlin and squeezellm
logger
.
warning
(
f
"
{
model_config
.
quantization
}
quantization is not "
"tested with LoRA yet."
)
logger
.
warning
(
"%s quantization is not tested with LoRA yet."
,
model_config
.
quantization
)
def
verify_with_scheduler_config
(
self
,
scheduler_config
:
SchedulerConfig
):
if
scheduler_config
.
max_num_batched_tokens
>
65528
:
...
...
@@ -1008,7 +1008,7 @@ def _get_and_verify_dtype(
pass
else
:
# Casting between float16 and bfloat16 is allowed with a warning.
logger
.
warning
(
f
"Casting
{
config_dtype
}
to
{
torch_dtype
}
."
)
logger
.
warning
(
"Casting
%s to %s."
,
config_dtype
,
torch_dtype
)
return
torch_dtype
...
...
@@ -1051,8 +1051,8 @@ def _get_and_verify_max_len(
logger
.
warning
(
"The model's config.json does not contain any of the following "
"keys to determine the original maximum length of the model: "
f
"
{
possible_keys
}
. Assuming the model's maximum length is
"
f
"
{
default_max_len
}
."
)
"%s
. Assuming the model's maximum length is
%d."
,
possible_keys
,
default_max_len
)
derived_max_model_len
=
default_max_len
rope_scaling
=
getattr
(
hf_config
,
"rope_scaling"
,
None
)
...
...
vllm/core/scheduler.py
View file @
a88081bf
...
...
@@ -617,8 +617,9 @@ class Scheduler:
if
num_new_tokens
>
self
.
prompt_limit
:
logger
.
warning
(
f
"Input prompt (
{
num_new_tokens
}
tokens) is too long"
f
" and exceeds limit of
{
self
.
prompt_limit
}
"
)
"Input prompt (%d tokens) is too long"
" and exceeds limit of %d"
,
num_new_tokens
,
self
.
prompt_limit
)
for
seq
in
waiting_seqs
:
seq
.
status
=
SequenceStatus
.
FINISHED_IGNORED
ignored_seq_groups
.
append
(
seq_group
)
...
...
@@ -631,8 +632,9 @@ class Scheduler:
break
elif
can_allocate
==
AllocStatus
.
NEVER
:
logger
.
warning
(
f
"Input prompt (
{
num_new_tokens
}
tokens) is too long"
f
" and exceeds the capacity of block_manager"
)
"Input prompt (%d tokens) is too long"
" and exceeds the capacity of block_manager"
,
num_new_tokens
)
for
seq
in
waiting_seqs
:
seq
.
status
=
SequenceStatus
.
FINISHED_IGNORED
ignored_seq_groups
.
append
(
seq_group
)
...
...
vllm/distributed/device_communicators/custom_all_reduce.py
View file @
a88081bf
...
...
@@ -37,7 +37,7 @@ def init_custom_ar() -> None:
return
if
world_size
not
in
_SUPPORTED_WORLD_SIZES
:
logger
.
warn
(
logger
.
warn
ing
(
"Custom allreduce is disabled due to an unsupported world size: "
"%d. Supported world sizes: %s. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly."
,
world_size
,
...
...
@@ -47,7 +47,7 @@ def init_custom_ar() -> None:
# note: num dev can be larger than world_size if we're only using
# first few GPUs
if
num_dev
<
world_size
:
logger
.
warn
(
logger
.
warn
ing
(
"Cannot test GPU P2P because not all GPUs are visible to the "
"current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
" is set."
)
...
...
@@ -62,7 +62,7 @@ def init_custom_ar() -> None:
# this checks hardware and driver support for NVLink
full_nvlink
=
_is_full_nvlink
(
device_ids
)
if
world_size
>
2
and
not
full_nvlink
:
logger
.
warn
(
logger
.
warn
ing
(
"Custom allreduce is disabled because it's not supported on more"
" than two PCIe-only GPUs. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly."
)
...
...
@@ -71,7 +71,7 @@ def init_custom_ar() -> None:
# this is expensive to compute at the first time
# then we cache the result
if
not
_can_p2p
(
rank
,
world_size
):
logger
.
warn
(
logger
.
warn
ing
(
"Custom allreduce is disabled because your platform lacks GPU P2P"
" capability or P2P test failed. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly."
)
...
...
vllm/distributed/device_communicators/pynccl.py
View file @
a88081bf
...
...
@@ -43,15 +43,16 @@ try:
nccl
=
ctypes
.
CDLL
(
so_file
)
except
Exception
as
e
:
logger
.
error
(
f
"Failed to load NCCL library from
{
so_file
}
."
"Failed to load NCCL library from
%s
. "
"It is expected if you are not running on NVIDIA/AMD GPUs. "
"Otherwise, the nccl library might not exist, be corrupted "
f
"or it does not support the current platform
{
platform
.
platform
()
}
."
f
"One solution is to download libnccl2 version 2.18 from "
f
"https://developer.download.nvidia.com/compute/cuda/repos/ "
f
"and extract the libnccl.so.2 file. If you already have the "
f
"library, please set the environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path."
)
"or it does not support the current platform %s. "
"One solution is to download libnccl2 version 2.18 from "
"https://developer.download.nvidia.com/compute/cuda/repos/ "
"and extract the libnccl.so.2 file. If you already have the "
"library, please set the environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path."
,
so_file
,
platform
.
platform
())
raise
e
# === export types and functions from nccl to Python ===
...
...
vllm/distributed/device_communicators/pynccl_utils.py
View file @
a88081bf
...
...
@@ -14,7 +14,7 @@ try:
except
Exception
as
e
:
# in non-NVIDIA environments, we can't import the nccl module
# e.g. when running on machines with AMD GPUs
logger
.
info
(
f
"Failed to import NCCL library:
{
e
}
"
)
logger
.
info
(
"Failed to import NCCL library:
%s"
,
e
)
logger
.
info
(
"It is expected if you are not running on NVIDIA GPUs."
)
pass
...
...
@@ -40,7 +40,7 @@ def set_pynccl_stream(stream: torch.cuda.Stream):
def
init_process_group
(
group
:
Optional
[
ProcessGroup
]
=
None
)
->
None
:
assert
not
is_initialized
()
global
comm
logger
.
info
(
f
"vLLM is using nccl==
{
ncclGetVersion
()
}
"
)
logger
.
info
(
"vLLM is using nccl==
%s"
,
ncclGetVersion
())
comm
=
NCCLCommunicator
(
group
=
group
)
...
...
vllm/distributed/parallel_state.py
View file @
a88081bf
...
...
@@ -57,8 +57,10 @@ def init_distributed_environment(
local_rank
:
int
=
-
1
,
backend
:
str
=
"nccl"
,
):
logger
.
debug
(
f
"
{
world_size
=
}
{
rank
=
}
{
local_rank
=
}
"
f
"
{
distributed_init_method
=
}
{
backend
=
}
"
)
logger
.
debug
(
"world_size=%d rank=%d local_rank=%d "
"distributed_init_method=%s backend=%s"
,
world_size
,
rank
,
local_rank
,
distributed_init_method
,
backend
)
if
not
torch
.
distributed
.
is_initialized
():
assert
distributed_init_method
is
not
None
,
(
"distributed_init_method must be provided when initializing "
...
...
vllm/distributed/utils.py
View file @
a88081bf
...
...
@@ -112,7 +112,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
and
(
not
os
.
path
.
exists
(
path
)):
# only the local master process (with local_rank == 0) can
# enter this block to calculate the cache
logger
.
info
(
f
"generating GPU P2P access cache for in
{
path
}
"
)
logger
.
info
(
"generating GPU P2P access cache for in
%s"
,
path
)
cache
=
{}
for
_i
in
range
(
num_dev
):
for
_j
in
range
(
num_dev
):
...
...
@@ -126,7 +126,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
if
is_distributed
:
cpu_world_group
=
get_cpu_world_group
()
dist
.
barrier
(
cpu_world_group
)
logger
.
info
(
f
"reading GPU P2P access cache from
{
path
}
"
)
logger
.
info
(
"reading GPU P2P access cache from
%s"
,
path
)
with
open
(
path
,
"r"
)
as
f
:
cache
=
json
.
load
(
f
)
_gpu_p2p_access_cache
=
cache
...
...
vllm/engine/async_llm_engine.py
View file @
a88081bf
...
...
@@ -117,7 +117,7 @@ class RequestTracker:
self
.
_request_streams
[
request_id
].
put
(
request_output
)
if
request_output
.
finished
:
if
verbose
:
logger
.
info
(
f
"Finished request
{
request_id
}
."
)
logger
.
info
(
"Finished request
%s."
,
request_id
)
self
.
abort_request
(
request_id
)
def
process_exception
(
self
,
...
...
@@ -128,7 +128,7 @@ class RequestTracker:
"""Propagate an exception from the engine."""
self
.
_request_streams
[
request_id
].
put
(
exception
)
if
verbose
:
logger
.
info
(
f
"Finished request
{
request_id
}
."
)
logger
.
info
(
"Finished request
%s."
,
request_id
)
self
.
abort_request
(
request_id
)
def
add_request
(
self
,
request_id
:
str
,
...
...
@@ -151,7 +151,7 @@ class RequestTracker:
def
abort_request
(
self
,
request_id
:
str
,
*
,
verbose
:
bool
=
False
)
->
None
:
"""Abort a request during next background loop iteration."""
if
verbose
:
logger
.
info
(
f
"Aborted request
{
request_id
}
."
)
logger
.
info
(
"Aborted request
%s."
,
request_id
)
self
.
_finished_requests
.
put_nowait
(
request_id
)
...
...
@@ -521,11 +521,11 @@ class AsyncLLMEngine:
if
shortened_token_ids
is
not
None
:
shortened_token_ids
=
shortened_token_ids
[:
self
.
max_log_len
]
logger
.
info
(
f
"Received request
{
request_id
}
: "
f
"prompt:
{
shortened_
prompt
!
r
}
, "
f
"sampling_params:
{
sampling_params
}
, "
f
"prompt_token_ids:
{
shortened_token_ids
}
, "
f
"lora_request:
{
lora_request
}
."
)
logger
.
info
(
"Received request %s:
prompt
: %r
, "
"sampling_params:
%s, prompt_token_ids: %s
, "
"lora_request: %s."
,
request_id
,
shortened_prompt
,
sampling_params
,
shortened_token_ids
,
lora_request
)
if
not
self
.
is_running
:
if
self
.
start_engine_loop
:
...
...
@@ -717,4 +717,4 @@ class AsyncLLMEngine:
raise
RuntimeError
(
"Engine is dead."
)
from
e
else
:
await
self
.
engine
.
check_health_async
()
logger
.
debug
(
f
"Health check took
{
time
.
perf_counter
()
-
t
}
s"
)
logger
.
debug
(
"Health check took
%fs"
,
time
.
perf_counter
()
-
t
)
vllm/engine/llm_engine.py
View file @
a88081bf
...
...
@@ -96,29 +96,38 @@ class LLMEngine:
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
,
)
->
None
:
logger
.
info
(
f
"Initializing an LLM engine (v
{
vllm
.
__version__
}
) with config: "
f
"model=
{
model_config
.
model
!
r
}
, "
f
"speculative_config=
{
speculative_config
!
r
}
, "
f
"tokenizer=
{
model_config
.
tokenizer
!
r
}
, "
f
"skip_tokenizer_init=
{
model_config
.
skip_tokenizer_init
}
, "
f
"tokenizer_mode=
{
model_config
.
tokenizer_mode
}
, "
f
"revision=
{
model_config
.
revision
}
, "
f
"tokenizer_revision=
{
model_config
.
tokenizer_revision
}
, "
f
"trust_remote_code=
{
model_config
.
trust_remote_code
}
, "
f
"dtype=
{
model_config
.
dtype
}
, "
f
"max_seq_len=
{
model_config
.
max_model_len
}
, "
f
"download_dir=
{
load_config
.
download_dir
!
r
}
, "
f
"load_format=
{
load_config
.
load_format
}
, "
f
"tensor_parallel_size=
{
parallel_config
.
tensor_parallel_size
}
, "
f
"disable_custom_all_reduce="
f
"
{
parallel_config
.
disable_custom_all_reduce
}
, "
f
"quantization=
{
model_config
.
quantization
}
, "
f
"enforce_eager=
{
model_config
.
enforce_eager
}
, "
f
"kv_cache_dtype=
{
cache_config
.
cache_dtype
}
, "
f
"quantization_param_path=
{
model_config
.
quantization_param_path
}
, "
f
"device_config=
{
device_config
.
device
}
, "
f
"decoding_config=
{
decoding_config
!
r
}
, "
f
"seed=
{
model_config
.
seed
}
)"
)
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
"tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
"max_seq_len=%d, download_dir=%r, load_format=%s, "
"tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
"quantization_param_path=%s, device_config=%s, "
"decoding_config=%r, seed=%d)"
,
vllm
.
__version__
,
model_config
.
model
,
speculative_config
,
model_config
.
tokenizer
,
model_config
.
skip_tokenizer_init
,
model_config
.
tokenizer_mode
,
model_config
.
revision
,
model_config
.
tokenizer_revision
,
model_config
.
trust_remote_code
,
model_config
.
dtype
,
model_config
.
max_model_len
,
load_config
.
download_dir
,
load_config
.
load_format
,
parallel_config
.
tensor_parallel_size
,
parallel_config
.
disable_custom_all_reduce
,
model_config
.
quantization
,
model_config
.
enforce_eager
,
cache_config
.
cache_dtype
,
model_config
.
quantization_param_path
,
device_config
.
device
,
decoding_config
,
model_config
.
seed
,
)
# TODO(woosuk): Print more configs in debug mode.
self
.
model_config
=
model_config
...
...
@@ -237,8 +246,10 @@ class LLMEngine:
if
self
.
cache_config
.
num_gpu_blocks_override
is
not
None
:
num_gpu_blocks_override
=
self
.
cache_config
.
num_gpu_blocks_override
logger
.
info
(
f
"Overriding
{
num_gpu_blocks
=
}
with "
f
"
{
num_gpu_blocks_override
=
}
"
)
logger
.
info
(
"Overriding num_gpu_blocks=%d with "
"num_gpu_blocks_override=%d"
,
num_gpu_blocks
,
num_gpu_blocks_override
)
num_gpu_blocks
=
num_gpu_blocks_override
self
.
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
...
...
vllm/engine/metrics.py
View file @
a88081bf
...
...
@@ -227,14 +227,19 @@ class StatLogger:
# Log to stdout.
logger
.
info
(
f
"Avg prompt throughput:
{
prompt_throughput
:.
1
f
}
tokens/s, "
f
"Avg generation throughput: "
f
"
{
generation_throughput
:.
1
f
}
tokens/s, "
f
"Running:
{
stats
.
num_running
}
reqs, "
f
"Swapped:
{
stats
.
num_swapped
}
reqs, "
f
"Pending:
{
stats
.
num_waiting
}
reqs, "
f
"GPU KV cache usage:
{
stats
.
gpu_cache_usage
*
100
:.
1
f
}
%, "
f
"CPU KV cache usage:
{
stats
.
cpu_cache_usage
*
100
:.
1
f
}
%"
)
"Avg prompt throughput: %.1f tokens/s, "
"Avg generation throughput: %.1f tokens/s, "
"Running: %d reqs, Swapped: %d reqs, "
"Pending: %d reqs, GPU KV cache usage: %.1f%%, "
"CPU KV cache usage: %.1f%%"
,
prompt_throughput
,
generation_throughput
,
stats
.
num_running
,
stats
.
num_swapped
,
stats
.
num_waiting
,
stats
.
gpu_cache_usage
*
100
,
stats
.
cpu_cache_usage
*
100
,
)
# Reset tracked stats for next interval.
self
.
num_prompt_tokens
=
[]
...
...
vllm/entrypoints/openai/api_server.py
View file @
a88081bf
...
...
@@ -148,8 +148,8 @@ if __name__ == "__main__":
raise
ValueError
(
f
"Invalid middleware
{
middleware
}
. "
f
"Must be a function or a class."
)
logger
.
info
(
f
"vLLM API server version
{
vllm
.
__version__
}
"
)
logger
.
info
(
f
"args:
{
args
}
"
)
logger
.
info
(
"vLLM API server version
%s"
,
vllm
.
__version__
)
logger
.
info
(
"args:
%s"
,
args
)
if
args
.
served_model_name
is
not
None
:
served_model_names
=
args
.
served_model_name
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
a88081bf
...
...
@@ -57,8 +57,7 @@ class OpenAIServingChat(OpenAIServing):
tokenize
=
False
,
add_generation_prompt
=
request
.
add_generation_prompt
)
except
Exception
as
e
:
logger
.
error
(
f
"Error in applying chat template from request:
{
str
(
e
)
}
"
)
logger
.
error
(
"Error in applying chat template from request: %s"
,
e
)
return
self
.
create_error_response
(
str
(
e
))
request_id
=
f
"cmpl-
{
random_uuid
()
}
"
...
...
@@ -338,11 +337,11 @@ class OpenAIServingChat(OpenAIServing):
tokenizer
.
chat_template
=
codecs
.
decode
(
chat_template
,
"unicode_escape"
)
logger
.
info
(
f
"Using supplied chat template:
\n
{
tokenizer
.
chat_template
}
"
)
logger
.
info
(
"Using supplied chat template:
\n
%s"
,
tokenizer
.
chat_template
)
elif
tokenizer
.
chat_template
is
not
None
:
logger
.
info
(
f
"Using default chat template:
\n
{
tokenizer
.
chat_template
}
"
)
logger
.
info
(
"Using default chat template:
\n
%s"
,
tokenizer
.
chat_template
)
else
:
logger
.
warning
(
"No chat template provided. Chat API will not work."
)
vllm/executor/cpu_executor.py
View file @
a88081bf
...
...
@@ -69,7 +69,7 @@ class CPUExecutor(ExecutorBase):
# NOTE: `cpu block` for CPU backend is located on CPU memory but is
# referred as `gpu block`. Because we want to reuse the existing block
# management procedure.
logger
.
info
(
f
"# CPU blocks:
{
num_gpu_blocks
}
"
)
logger
.
info
(
"# CPU blocks:
%d"
,
num_gpu_blocks
)
self
.
driver_worker
.
initialize_cache
(
num_gpu_blocks
,
num_cpu_blocks
)
def
execute_model
(
self
,
...
...
vllm/executor/gpu_executor.py
View file @
a88081bf
...
...
@@ -116,8 +116,8 @@ class GPUExecutor(ExecutorBase):
# NOTE: This is logged in the executor because there can be >1 worker
# with other executors. We could log in the engine level, but work
# remains to abstract away the device for non-GPU configurations.
logger
.
info
(
f
"# GPU blocks:
{
num_gpu_blocks
}
, "
f
"# CPU blocks:
{
num_cpu_blocks
}
"
)
logger
.
info
(
"# GPU blocks:
%d, # CPU blocks: %d"
,
num_gpu_blocks
,
num_cpu_blocks
)
self
.
driver_worker
.
initialize_cache
(
num_gpu_blocks
,
num_cpu_blocks
)
...
...
vllm/executor/ray_gpu_executor.py
View file @
a88081bf
...
...
@@ -214,8 +214,8 @@ class RayGPUExecutor(ExecutorBase):
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
logger
.
info
(
f
"# GPU blocks:
{
num_gpu_blocks
}
, "
f
"# CPU blocks:
{
num_cpu_blocks
}
"
)
logger
.
info
(
"# GPU blocks:
%d, # CPU blocks: %d"
,
num_gpu_blocks
,
num_cpu_blocks
)
self
.
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
self
.
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
...
...
vllm/executor/ray_utils.py
View file @
a88081bf
...
...
@@ -43,9 +43,9 @@ try:
return
output
except
ImportError
as
e
:
logger
.
warning
(
f
"Failed to import Ray with
{
e
!
r
}
. "
"
For distributed inference,
please install Ray with
"
"
`pip install ray`."
)
logger
.
warning
(
"Failed to import Ray with %r.
For distributed inference, "
"please install Ray with
`pip install ray`."
,
e
)
ray
=
None
# type: ignore
RayWorkerWrapper
=
None
# type: ignore
...
...
vllm/logger.py
View file @
a88081bf
...
...
@@ -126,7 +126,7 @@ def enable_trace_function_call(log_file_path: str,
"VLLM_TRACE_FUNCTION is enabled. It will record every"
" function executed by Python. This will slow down the code. It "
"is suggested to be used for debugging hang or crashes only."
)
logger
.
info
(
f
"Trace frame log is saved to
{
log_file_path
}
"
)
logger
.
info
(
"Trace frame log is saved to
%s"
,
log_file_path
)
if
root_dir
is
None
:
# by default, this is the vllm root directory
root_dir
=
os
.
path
.
dirname
(
os
.
path
.
dirname
(
__file__
))
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment