Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
8 deletions
+15
-8
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+10
-3
vllm/worker/worker.py
vllm/worker/worker.py
+2
-2
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+3
-3
No files found.
vllm/worker/tpu_worker.py
View file @
fcfc474d
...
...
@@ -93,9 +93,16 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
# can have slightly different XLA graphs.
world_size
=
self
.
parallel_config
.
world_size
rank
=
xr
.
global_ordinal
()
per_rank_path
=
os
.
path
.
join
(
envs
.
VLLM_XLA_CACHE_PATH
,
f
"tp
{
world_size
}
_rank
{
rank
}
"
)
xr
.
initialize_cache
(
per_rank_path
,
readonly
=
False
)
# The PyTorch/XLA compilation cache uses the Torch IR to generate keys.
# Consequently, changes in optimization flags, which affect compilation
# results, don't change the cache key. This can result in the wrong
# compilation being used. To prevent this, disabling the XLA compilation
# cache during development is recommended.We can disable it by
# `export VLLM_XLA_CACHE_PATH=`
if
envs
.
VLLM_XLA_CACHE_PATH
:
per_rank_path
=
os
.
path
.
join
(
envs
.
VLLM_XLA_CACHE_PATH
,
f
"tp
{
world_size
}
_rank
{
rank
}
"
)
xr
.
initialize_cache
(
per_rank_path
,
readonly
=
False
)
self
.
profiler
=
None
if
envs
.
VLLM_TORCH_PROFILER_DIR
and
self
.
rank
<
1
:
...
...
vllm/worker/worker.py
View file @
fcfc474d
...
...
@@ -135,9 +135,9 @@ class Worker(LocalOrDistributedWorkerBase):
"%.2f GiB memory is still in use."
,
freed_bytes
/
GiB_bytes
,
used_bytes
/
GiB_bytes
)
def
wake_up
(
self
)
->
None
:
def
wake_up
(
self
,
tags
:
Optional
[
list
[
str
]]
=
None
)
->
None
:
allocator
=
CuMemAllocator
.
get_instance
()
allocator
.
wake_up
()
allocator
.
wake_up
(
tags
=
tags
)
def
init_device
(
self
)
->
None
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
...
...
vllm/worker/xpu_model_runner.py
View file @
fcfc474d
...
...
@@ -25,7 +25,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
MultiModalRegistry
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
DeviceMemoryProfiler
,
make_tensor_with_pad
from
vllm.utils
import
DeviceMemoryProfiler
,
GiB_bytes
,
make_tensor_with_pad
from
vllm.worker.model_runner
import
AttentionMetadata
,
SamplingMetadata
from
vllm.worker.model_runner_base
import
(
ModelRunnerBase
,
ModelRunnerInputBase
,
ModelRunnerInputBuilderBase
,
...
...
@@ -422,8 +422,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
self
.
model
=
get_model
(
vllm_config
=
self
.
vllm_config
)
self
.
model_memory_usage
=
m
.
consumed_memory
logger
.
info
(
"Loading model weights took %.4f GB"
,
self
.
model_memory_usage
/
float
(
2
**
30
)
)
logger
.
info
(
"Loading model weights took %.4f G
i
B"
,
self
.
model_memory_usage
/
GiB_bytes
)
def
get_model
(
self
)
->
nn
.
Module
:
return
self
.
model
...
...
Prev
1
…
22
23
24
25
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment