Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a88081bf
Unverified
Commit
a88081bf
authored
Apr 26, 2024
by
SangBin Cho
Committed by
GitHub
Apr 26, 2024
Browse files
[CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by:
Danny Guinther
<
dguinther@neuralmagic.com
>
parent
2f30e7c7
Changes
31
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
60 additions
and
56 deletions
+60
-56
vllm/lora/models.py
vllm/lora/models.py
+3
-3
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+2
-2
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer.py
+4
-4
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/weight_utils.py
+7
-7
vllm/model_executor/models/__init__.py
vllm/model_executor/models/__init__.py
+5
-5
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+3
-3
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+2
-1
vllm/transformers_utils/configs/dbrx.py
vllm/transformers_utils/configs/dbrx.py
+7
-6
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+2
-3
vllm/utils.py
vllm/utils.py
+11
-9
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+14
-13
No files found.
vllm/lora/models.py
View file @
a88081bf
...
...
@@ -345,8 +345,8 @@ class LoRAModelManager:
index
,
_
=
first_free_slot
self
.
_active_loras
[
lora_id
]
=
None
lora_model
=
self
.
_registered_loras
[
lora_id
]
logger
.
debug
(
f
"Activating LoRA. int id:
{
lora_model
.
id
}
,
slot index:
{
index
}
"
)
logger
.
debug
(
"Activating LoRA. int id: %d, slot index: %d"
,
lora_model
.
id
,
index
)
self
.
lora_index_to_id
[
index
]
=
lora_model
.
id
for
module_name
,
module
in
self
.
modules
.
items
():
module_lora
=
lora_model
.
get_lora
(
module_name
)
...
...
@@ -567,7 +567,7 @@ class LoRALRUCache(LRUCache[LoRAModel]):
self
.
deactivate_lora_fn
=
deactivate_lora_fn
def
_on_remove
(
self
,
key
:
int
,
value
:
LoRAModel
):
logger
.
debug
(
f
"Removing LoRA. int id:
{
key
}
"
)
logger
.
debug
(
"Removing LoRA. int id:
%d"
,
key
)
self
.
deactivate_lora_fn
(
key
)
return
super
().
_on_remove
(
key
,
value
)
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
a88081bf
...
...
@@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int,
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
)),
"configs"
,
json_file_name
)
if
os
.
path
.
exists
(
config_file_path
):
with
open
(
config_file_path
)
as
f
:
logger
.
info
(
f
"Using configuration from
{
config_file_path
}
for MoE layer."
)
logger
.
info
(
"Using configuration from %s for MoE layer."
,
config_file_path
)
# If a configuration has been found, return it
return
{
int
(
key
):
val
for
key
,
val
in
json
.
load
(
f
).
items
()}
...
...
vllm/model_executor/model_loader/tensorizer.py
View file @
a88081bf
...
...
@@ -334,10 +334,10 @@ class TensorizerAgent:
per_second
=
convert_bytes
(
deserializer
.
total_tensor_bytes
/
duration
)
after_mem
=
get_mem_usage
()
deserializer
.
close
()
logger
.
info
(
f
"Deserialized
{
total_bytes_str
}
in "
f
"
{
end
-
start
:
0.2
f
}
s
,
{
per_second
}
/s"
)
logger
.
info
(
f
"Memory usage before:
{
before_mem
}
"
)
logger
.
info
(
f
"Memory usage after:
{
after_mem
}
"
)
logger
.
info
(
"Deserialized
%s in %0.2fs, %f/s"
,
total_bytes_str
,
end
-
start
,
per_second
)
logger
.
info
(
"Memory usage before:
%s"
,
before_mem
)
logger
.
info
(
"Memory usage after:
%s"
,
after_mem
)
self
.
_check_tensors_on_meta_device
()
self
.
_resize_lora_embeddings
()
...
...
vllm/model_executor/model_loader/weight_utils.py
View file @
a88081bf
...
...
@@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str,
allow_patterns
=
[
pattern
]
break
logger
.
info
(
f
"Using model weights format
{
allow_patterns
}
"
)
logger
.
info
(
"Using model weights format
%s"
,
allow_patterns
)
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with
get_lock
(
model_name_or_path
,
cache_dir
):
...
...
@@ -310,17 +310,17 @@ def kv_cache_scales_loader(
return
layer_scales_map
.
items
()
except
FileNotFoundError
:
logger
.
error
(
f
"File or directory '
{
filename
}
' not found."
)
logger
.
error
(
"File or directory '
%s
' not found."
,
filename
)
except
json
.
JSONDecodeError
:
logger
.
error
(
f
"Error decoding JSON in file '
{
filename
}
'."
)
logger
.
error
(
"Error decoding JSON in file '
%s'."
,
filename
)
except
Exception
as
e
:
logger
.
error
(
f
"An error occurred while reading '
{
filename
}
':
{
e
}
"
)
logger
.
error
(
"An error occurred while reading '
%s': %s"
,
filename
,
e
)
# This section is reached if and only if any of the excepts are hit
# Return an empty iterable (list) => no KV cache scales are loaded
# which ultimately defaults to 1.0 scales
logger
.
warning
(
"Defaulting to KV cache scaling factors = 1.0 "
f
"for all layers in TP rank
{
tp_rank
}
"
"
as an error occurred during loading."
)
logger
.
warning
(
"Defaulting to KV cache scaling factors = 1.0 for all
"
"layers in TP rank %d
as an error occurred during loading."
,
tp_rank
)
return
[]
...
...
vllm/model_executor/models/__init__.py
View file @
a88081bf
...
...
@@ -91,8 +91,8 @@ class ModelRegistry:
"ROCm for now."
)
if
model_arch
in
_ROCM_PARTIALLY_SUPPORTED_MODELS
:
logger
.
warning
(
f
"Model architecture
{
model_arch
}
is partially supported
"
"by ROCm: "
+
_ROCM_PARTIALLY_SUPPORTED_MODELS
[
model_arch
])
"Model architecture
%s
is partially supported
by ROCm: %s"
,
model_arch
,
_ROCM_PARTIALLY_SUPPORTED_MODELS
[
model_arch
])
module_name
,
model_cls_name
=
_MODELS
[
model_arch
]
module
=
importlib
.
import_module
(
...
...
@@ -107,9 +107,9 @@ class ModelRegistry:
def
register_model
(
model_arch
:
str
,
model_cls
:
Type
[
nn
.
Module
]):
if
model_arch
in
_MODELS
:
logger
.
warning
(
f
"Model architecture
{
model_arch
}
is already registered, "
"
and will be
overwritten by the new model
"
f
"class
{
model_cls
.
__name__
}
."
)
"Model architecture
%s
is already registered,
and will be
"
"overwritten by the new model
class %s."
,
model_arch
,
model_cls
.
__name__
)
global
_OOT_MODELS
_OOT_MODELS
[
model_arch
]
=
model_cls
...
...
vllm/model_executor/models/gemma.py
View file @
a88081bf
...
...
@@ -55,10 +55,10 @@ def _get_gemma_act_fn(
"in the config JSON file when it was initially released. "
"Changing the activation function to approximate GeLU "
"(`gelu_pytorch_tanh`). If you want to use the legacy "
f
"`
{
hidden_act
}
`, edit the config JSON to set "
f
"`hidden_activation=
{
hidden_act
}
` instead of `hidden_act`. "
"`
%s
`, edit the config JSON to set "
"`hidden_activation=
%s
` instead of `hidden_act`. "
"See https://github.com/huggingface/transformers/pull/29402 "
"for more details."
)
"for more details."
,
hidden_act
,
hidden_act
)
return
GeluAndMul
(
approximate
=
"tanh"
)
elif
hidden_activation
==
"gelu_pytorch_tanh"
:
return
GeluAndMul
(
approximate
=
"tanh"
)
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
a88081bf
...
...
@@ -183,7 +183,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
"speculative decoding "
"requires non-None seq_group_metadata_list"
)
logger
.
info
(
f
"spec_decode_worker.execute_model
{
num_lookahead_slots
=
}
"
)
logger
.
info
(
"spec_decode_worker.execute_model num_lookahead_slots=%d"
,
num_lookahead_slots
)
# If no spec tokens, call the proposer and scorer workers normally.
# Used for prefill.
...
...
vllm/transformers_utils/configs/dbrx.py
View file @
a88081bf
...
...
@@ -72,9 +72,10 @@ class DbrxAttentionConfig(PretrainedConfig):
and
config_dict
[
"model_type"
]
!=
cls
.
model_type
):
logger
.
warning
(
f
"You are using a model of type
{
config_dict
[
'model_type'
]
}
to instantiate a model of type "
+
f
"
{
cls
.
model_type
}
. This is not supported for all configurations of models and can yield errors."
)
"You are using a model of type %s to instantiate a model of "
"type %s. This is not supported for all configurations of "
"models and can yield errors."
,
config_dict
[
"model_type"
],
cls
.
model_type
)
return
cls
.
from_dict
(
config_dict
,
**
kwargs
)
...
...
@@ -151,9 +152,9 @@ class DbrxFFNConfig(PretrainedConfig):
and
config_dict
[
"model_type"
]
!=
cls
.
model_type
):
logger
.
warning
(
f
"You are using a model of type
{
config_dict
[
'model_type'
]
}
to instantiate a model of
type
"
+
f
"
{
cls
.
model_
type
}
. This is not supported for all
configurations of models and can yield errors.
"
)
"You are using a model of type
%s
to instantiate a model of "
"
type
%s
. This is not supported for all "
"configurations of models and can yield errors."
,
config_dict
[
"model_type"
],
cls
.
model_type
)
return
cls
.
from_dict
(
config_dict
,
**
kwargs
)
...
...
vllm/transformers_utils/tokenizer.py
View file @
a88081bf
...
...
@@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
# No tokenizer was found in the LoRA folder,
# use base model tokenizer
logger
.
warning
(
f
"No tokenizer found in
{
lora_request
.
lora_local_path
}
, "
"using base model tokenizer instead. "
f
"(Exception:
{
str
(
e
)
}
)"
)
"No tokenizer found in %s, using base model tokenizer instead. "
"(Exception: %s)"
,
lora_request
.
lora_local_path
,
e
)
tokenizer
=
None
return
tokenizer
...
...
vllm/utils.py
View file @
a88081bf
...
...
@@ -289,8 +289,9 @@ def get_open_port() -> int:
def
update_environment_variables
(
envs
:
Dict
[
str
,
str
]):
for
k
,
v
in
envs
.
items
():
if
k
in
os
.
environ
and
os
.
environ
[
k
]
!=
v
:
logger
.
warning
(
f
"Overwriting environment variable
{
k
}
"
f
"from '
{
os
.
environ
[
k
]
}
' to '
{
v
}
'"
)
logger
.
warning
(
"Overwriting environment variable %s "
"from '%s' to '%s'"
,
k
,
os
.
environ
[
k
],
v
)
os
.
environ
[
k
]
=
v
...
...
@@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if
not
cuda_home
:
cuda_home
=
'/usr/local/cuda'
if
os
.
path
.
isfile
(
cuda_home
+
'/bin/nvcc'
):
logger
.
info
(
f
'CUDA_HOME is not found in the environment. '
f
'Using
{
cuda_home
}
as CUDA_HOME.'
)
logger
.
info
(
'CUDA_HOME is not found in the environment. '
'Using %s as CUDA_HOME.'
,
cuda_home
)
else
:
logger
.
warning
(
f
'Not found nvcc in
{
cuda_home
}
. Skip cuda version check!'
)
logger
.
warning
(
'Not found nvcc in %s. Skip cuda version check!'
,
cuda_home
)
return
None
nvcc_output
=
subprocess
.
check_output
([
cuda_home
+
"/bin/nvcc"
,
"-V"
],
universal_newlines
=
True
)
...
...
@@ -599,8 +601,8 @@ def find_nccl_library():
# manually load the nccl library
if
so_file
:
logger
.
info
(
f
"Found nccl from environment variable VLLM_NCCL_SO_PATH=
{
so_file
}
"
)
"Found nccl from environment variable VLLM_NCCL_SO_PATH=
%s"
,
so_file
)
else
:
if
torch
.
version
.
cuda
is
not
None
:
so_file
=
vllm_nccl_path
or
find_library
(
"libnccl.so.2"
)
...
...
@@ -608,7 +610,7 @@ def find_nccl_library():
so_file
=
find_library
(
"librccl.so.1"
)
else
:
raise
ValueError
(
"NCCL only supports CUDA and ROCm backends."
)
logger
.
info
(
f
"Found nccl from library
{
so_file
}
"
)
logger
.
info
(
"Found nccl from library
%s"
,
so_file
)
return
so_file
...
...
vllm/worker/model_runner.py
View file @
a88081bf
...
...
@@ -170,8 +170,8 @@ class ModelRunner:
)
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
f
"Loading model weights took
"
f
"
{
self
.
model_memory_usage
/
float
(
2
**
30
)
:.
4
f
}
GB"
)
logger
.
info
(
"Loading model weights took
%.4f GB"
,
self
.
model_memory_usage
/
float
(
2
**
30
))
if
self
.
lora_config
:
assert
hasattr
(
self
.
model
,
"supported_lora_modules"
...
...
@@ -196,18 +196,19 @@ class ModelRunner:
self
.
model
.
load_kv_cache_scales
(
self
.
model_config
.
quantization_param_path
)
else
:
raise
RuntimeError
(
"Using FP8 KV cache and scaling "
"
factors provided but
model
"
f
"
{
self
.
model
.
__class__
}
does not "
"support loading scaling factors."
)
raise
RuntimeError
(
"Using FP8 KV cache and scaling
factors provided but "
"model %s does not support loading scaling factors."
,
self
.
model
.
__class__
)
else
:
logger
.
warn
(
"Using FP8 KV cache but no scaling factors "
"provided. Defaulting to scaling factors of 1.0. "
"This may lead to less accurate results!"
)
logger
.
warning
(
"Using FP8 KV cache but no scaling factors "
"provided. Defaulting to scaling factors of 1.0. "
"This may lead to less accurate results!"
)
elif
self
.
model_config
.
quantization_param_path
is
not
None
:
logger
.
warn
(
"KV cache scaling factors provided, "
"but the KV cache data type is not FP8. "
"KV cache scaling factors will not be used."
)
logger
.
warn
ing
(
"KV cache scaling factors provided, "
"but the KV cache data type is not FP8. "
"KV cache scaling factors will not be used."
)
def
set_block_size
(
self
,
block_size
:
int
)
->
None
:
self
.
block_size
=
block_size
...
...
@@ -1054,7 +1055,7 @@ class ModelRunner:
end_time
=
time
.
perf_counter
()
elapsed_time
=
end_time
-
start_time
# This usually takes < 10 seconds.
logger
.
info
(
f
"Graph capturing finished in
{
elapsed_time
:.
0
f
}
secs."
)
logger
.
info
(
"Graph capturing finished in
%.0f secs."
,
elapsed_time
)
def
__del__
(
self
)
->
None
:
# Delete the CUDA graphs before deleting the pynccl communicator.
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment