Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2de41182
Unverified
Commit
2de41182
authored
Apr 01, 2025
by
yihong
Committed by
GitHub
Mar 31, 2025
Browse files
fix: change GB to GiB in logging close #14979 (#15807)
Signed-off-by:
yihong0618
<
zouzou0208@gmail.com
>
parent
239b7bef
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
11 additions
and
11 deletions
+11
-11
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+2
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+4
-4
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+2
-2
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+3
-3
No files found.
vllm/v1/core/kv_cache_utils.py
View file @
2de41182
...
...
@@ -488,9 +488,9 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
if
needed_memory
>
available_memory
:
raise
ValueError
(
f
"To serve at least one request with the models's max seq len "
f
"(
{
max_model_len
}
), (
{
needed_memory
/
1024
/
1024
/
1024
:.
2
f
}
GB KV "
f
"(
{
max_model_len
}
), (
{
needed_memory
/
1024
/
1024
/
1024
:.
2
f
}
G
i
B KV "
f
"cache is needed, which is larger than the available KV cache "
f
"memory (
{
available_memory
/
1024
/
1024
/
1024
:.
2
f
}
GB). Try "
f
"memory (
{
available_memory
/
1024
/
1024
/
1024
:.
2
f
}
G
i
B). Try "
f
"increasing `gpu_memory_utilization` or decreasing "
f
"`max_model_len` when initializing the engine."
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
2de41182
...
...
@@ -24,8 +24,8 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
from
vllm.sampling_params
import
SamplingType
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
DeviceMemoryProfiler
,
LayerBlockType
,
LazyLoader
,
cdiv
,
check_use_alibi
,
is_pin_memory_available
)
GiB_bytes
,
LayerBlockType
,
LazyLoader
,
cdiv
,
check_use_alibi
,
is_pin_memory_available
)
from
vllm.v1.attention.backends.flash_attn
import
FlashAttentionMetadata
from
vllm.v1.core.encoder_cache_manager
import
compute_encoder_budget
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
...
...
@@ -1206,8 +1206,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
device
)
time_after_load
=
time
.
perf_counter
()
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
"Model loading took %.4f GB and %.6f seconds"
,
self
.
model_memory_usage
/
float
(
2
**
30
)
,
logger
.
info
(
"Model loading took %.4f G
i
B and %.6f seconds"
,
self
.
model_memory_usage
/
GiB_bytes
,
time_after_load
-
time_before_load
)
def
_get_prompt_logprobs_dict
(
...
...
vllm/worker/model_runner.py
View file @
2de41182
...
...
@@ -1143,8 +1143,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
time_after_load
=
time
.
perf_counter
()
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
"Model loading took %.4f GB and %.6f seconds"
,
self
.
model_memory_usage
/
float
(
2
**
30
)
,
logger
.
info
(
"Model loading took %.4f G
i
B and %.6f seconds"
,
self
.
model_memory_usage
/
GiB_bytes
,
time_after_load
-
time_before_load
)
if
self
.
prompt_adapter_config
:
self
.
prompt_adapter_manager
=
LRUCacheWorkerPromptAdapterManager
(
...
...
vllm/worker/xpu_model_runner.py
View file @
2de41182
...
...
@@ -25,7 +25,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalRegistry
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
DeviceMemoryProfiler
,
make_tensor_with_pad
from
vllm.utils
import
DeviceMemoryProfiler
,
GiB_bytes
,
make_tensor_with_pad
from
vllm.worker.model_runner
import
AttentionMetadata
,
SamplingMetadata
from
vllm.worker.model_runner_base
import
(
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerInputBuilderBase
,
...
...
@@ -422,8 +422,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
self
.
model
=
get_model
(
vllm_config
=
self
.
vllm_config
)
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
"Loading model weights took %.4f GB"
,
self
.
model_memory_usage
/
float
(
2
**
30
)
)
logger
.
info
(
"Loading model weights took %.4f G
i
B"
,
self
.
model_memory_usage
/
GiB_bytes
)
def
get_model
(
self
)
->
nn
.
Module
:
return
self
.
model
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment