Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
48cb2109
Unverified
Commit
48cb2109
authored
Apr 25, 2025
by
Daniel Li
Committed by
GitHub
Apr 25, 2025
Browse files
[V1] Move usage stats to worker and start logging TPU hardware (#16211)
parent
a5450f11
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
22 additions
and
10 deletions
+22
-10
vllm/usage/usage_lib.py
vllm/usage/usage_lib.py
+9
-0
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+0
-4
vllm/v1/engine/llm_engine.py
vllm/v1/engine/llm_engine.py
+0
-4
vllm/v1/utils.py
vllm/v1/utils.py
+3
-1
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+5
-0
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/tpu_worker.py
+5
-1
No files found.
vllm/usage/usage_lib.py
View file @
48cb2109
...
...
@@ -174,6 +174,15 @@ class UsageMessage:
cuda_get_device_properties
(
0
,
(
"name"
,
"total_memory"
)))
if
current_platform
.
is_cuda
():
self
.
cuda_runtime
=
torch
.
version
.
cuda
if
current_platform
.
is_tpu
():
try
:
import
torch_xla
self
.
gpu_count
=
torch_xla
.
runtime
.
world_size
()
self
.
gpu_type
=
torch_xla
.
tpu
.
get_tpu_type
()
self
.
gpu_memory_per_device
=
(
torch_xla
.
core
.
xla_model
.
get_memory_info
()[
"bytes_limit"
])
except
Exception
:
pass
self
.
provider
=
_detect_cloud_provider
()
self
.
architecture
=
platform
.
machine
()
self
.
platform
=
platform
.
platform
()
...
...
vllm/v1/engine/async_llm.py
View file @
48cb2109
...
...
@@ -36,7 +36,6 @@ from vllm.v1.executor.abstract import Executor
from
vllm.v1.metrics.loggers
import
(
LoggingStatLogger
,
PrometheusStatLogger
,
StatLoggerBase
)
from
vllm.v1.metrics.stats
import
IterationStats
,
SchedulerStats
from
vllm.v1.utils
import
report_usage_stats
logger
=
init_logger
(
__name__
)
...
...
@@ -113,9 +112,6 @@ class AsyncLLM(EngineClient):
except
RuntimeError
:
pass
# If usage stat is enabled, collect relevant info.
report_usage_stats
(
vllm_config
,
usage_context
)
@
classmethod
def
from_vllm_config
(
cls
,
...
...
vllm/v1/engine/llm_engine.py
View file @
48cb2109
...
...
@@ -28,7 +28,6 @@ from vllm.v1.engine.output_processor import OutputProcessor
from
vllm.v1.engine.parallel_sampling
import
ParentRequest
from
vllm.v1.engine.processor
import
Processor
from
vllm.v1.executor.abstract
import
Executor
from
vllm.v1.utils
import
report_usage_stats
logger
=
init_logger
(
__name__
)
...
...
@@ -97,9 +96,6 @@ class LLMEngine:
# for v0 compatibility
self
.
model_executor
=
self
.
engine_core
.
engine_core
.
model_executor
# type: ignore
# If usage stat is enabled, collect relevant info.
report_usage_stats
(
vllm_config
,
usage_context
)
@
classmethod
def
from_vllm_config
(
cls
,
...
...
vllm/v1/utils.py
View file @
48cb2109
...
...
@@ -205,7 +205,9 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor,
return
to_tensor
[:
length
].
copy_
(
from_tensor
[:
length
],
non_blocking
=
True
)
def
report_usage_stats
(
vllm_config
,
usage_context
:
UsageContext
)
->
None
:
def
report_usage_stats
(
vllm_config
,
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
)
->
None
:
"""Report usage statistics if enabled."""
if
not
is_usage_stats_enabled
():
...
...
vllm/v1/worker/gpu_worker.py
View file @
48cb2109
...
...
@@ -23,6 +23,7 @@ from vllm.platforms import current_platform
from
vllm.utils
import
GiB_bytes
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
,
KVCacheSpec
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.utils
import
report_usage_stats
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
from
vllm.v1.worker.worker_base
import
WorkerBase
...
...
@@ -141,6 +142,10 @@ class Worker(WorkerBase):
self
.
model_runner
:
GPUModelRunner
=
GPUModelRunner
(
self
.
vllm_config
,
self
.
device
)
if
self
.
rank
==
0
:
# If usage stat is enabled, collect relevant info.
report_usage_stats
(
self
.
vllm_config
)
# FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
# to hijack tensor allocation.
def
load_model
(
self
)
->
None
:
...
...
vllm/v1/worker/tpu_worker.py
View file @
48cb2109
...
...
@@ -21,7 +21,7 @@ from vllm.v1.core.sched.output import SchedulerOutput
from
vllm.v1.kv_cache_interface
import
(
AttentionSpec
,
KVCacheConfig
,
KVCacheSpec
)
from
vllm.v1.outputs
import
ModelRunnerOutput
from
vllm.v1.utils
import
bind_kv_cache
from
vllm.v1.utils
import
bind_kv_cache
,
report_usage_stats
from
vllm.v1.worker.tpu_model_runner
import
TPUModelRunner
logger
=
init_logger
(
__name__
)
...
...
@@ -133,6 +133,10 @@ class TPUWorker:
# Init ModelRunner here, so that we have access to self.device.
self
.
model_runner
=
TPUModelRunner
(
self
.
vllm_config
,
self
.
device
)
if
rank
==
0
:
# If usage stat is enabled, collect relevant info.
report_usage_stats
(
self
.
vllm_config
)
def
determine_available_memory
(
self
)
->
int
:
kv_caches
:
dict
[
str
,
torch
.
Tensor
]
=
{}
kv_cache_spec
=
self
.
model_runner
.
get_kv_cache_spec
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment