Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4eabe123
Commit
4eabe123
authored
May 28, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori
parents
45840cd2
58738772
Changes
670
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
50 additions
and
37 deletions
+50
-37
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+7
-6
vllm/worker/multi_step_neuron_model_runner.py
vllm/worker/multi_step_neuron_model_runner.py
+5
-2
vllm/worker/multi_step_neuronx_distributed_model_runner.py
vllm/worker/multi_step_neuronx_distributed_model_runner.py
+5
-2
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_model_runner.py
+10
-6
vllm/worker/pooling_model_runner.py
vllm/worker/pooling_model_runner.py
+7
-3
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+1
-2
vllm/worker/utils.py
vllm/worker/utils.py
+1
-1
vllm/worker/worker.py
vllm/worker/worker.py
+4
-6
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+6
-3
vllm/worker/xpu_worker.py
vllm/worker/xpu_worker.py
+4
-6
No files found.
vllm/worker/multi_step_model_runner.py
View file @
4eabe123
...
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
...
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
logprobs_tensor
:
Optional
[
torch
.
Tensor
],
logprobs_tensor
:
Optional
[
torch
.
Tensor
],
cache
:
Optional
[
PythonizationCache
],
cache
:
Optional
[
PythonizationCache
],
)
->
None
:
)
->
None
:
""" This function is only called when the output tensors are ready.
""" This function is only called when the output tensors are ready.
See
{class}`
ModelOutput
`.
See
[`ModelOutput`][vllm.worker.multi_step_model_runner.
ModelOutput
].
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure
adding a Pythonized output data structure
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
for each [`SequenceGroup`][vllm.sequence.SequenceGroup].
Args:
Args:
model_input
model_input
...
@@ -824,7 +825,7 @@ def _pythonize_sampler_output(
...
@@ -824,7 +825,7 @@ def _pythonize_sampler_output(
for
sgdx
,
(
seq_group
,
for
sgdx
,
(
seq_group
,
sample_result
)
in
enumerate
(
zip
(
seq_groups
,
samples_list
)):
sample_result
)
in
enumerate
(
zip
(
seq_groups
,
samples_list
)):
# Reminder: Please update docs/
source/
features/compatibility_matrix.md
# Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo become valid
# If the feature combo become valid
# (Check for Guided Decoding)
# (Check for Guided Decoding)
if
seq_group
.
sampling_params
.
logits_processors
:
if
seq_group
.
sampling_params
.
logits_processors
:
...
...
vllm/worker/multi_step_neuron_model_runner.py
View file @
4eabe123
...
@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
...
@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
input_ids
=
model_input
.
input_tokens
,
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
**
MultiModalKwargs
.
as_kwargs
(
device
=
self
.
device
),
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
)
output
=
self
.
model
.
sample
(
output
=
self
.
model
.
sample
(
...
...
vllm/worker/multi_step_neuronx_distributed_model_runner.py
View file @
4eabe123
...
@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
...
@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
**
MultiModalKwargs
.
as_kwargs
(
device
=
self
.
device
),
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
)
output
=
self
.
model
.
sample
(
output
=
self
.
model
.
sample
(
...
...
vllm/worker/neuron_model_runner.py
View file @
4eabe123
...
@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
...
@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
**
MultiModalKwargs
.
as_kwargs
(
or
{},
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
)
elif
current_platform
.
use_transformers_neuronx
():
elif
current_platform
.
use_transformers_neuronx
():
# [TODO] validate on-device sampling
# [TODO] validate on-device sampling
...
@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
...
@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
input_ids
=
model_input
.
input_tokens
,
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
**
MultiModalKwargs
.
as_kwargs
(
or
{},
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
)
# Compute the logits only if the on-device sampling is turned off as
# Compute the logits only if the on-device sampling is turned off as
...
...
vllm/worker/pooling_model_runner.py
View file @
4eabe123
...
@@ -119,10 +119,14 @@ class PoolingModelRunner(
...
@@ -119,10 +119,14 @@ class PoolingModelRunner(
input_ids
=
model_input
.
input_tokens
,
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
**
MultiModalKwargs
.
as_kwargs
(
device
=
self
.
device
),
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
cross_enc_kwargs
,
**
cross_enc_kwargs
,
**
seqlen_agnostic_kwargs
)
**
seqlen_agnostic_kwargs
,
)
if
(
self
.
observability_config
is
not
None
if
(
self
.
observability_config
is
not
None
and
self
.
observability_config
.
collect_model_forward_time
):
and
self
.
observability_config
.
collect_model_forward_time
):
...
...
vllm/worker/tpu_worker.py
View file @
4eabe123
...
@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
...
@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
)
)
ensure_model_parallel_initialized
(
ensure_model_parallel_initialized
(
self
.
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
)
self
.
parallel_config
.
enable_expert_parallel
)
# Device initialization should happen after initializing the distributed
# Device initialization should happen after initializing the distributed
# runtime.
# runtime.
...
...
vllm/worker/utils.py
View file @
4eabe123
...
@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
...
@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
a supported scenario.
a supported scenario.
'''
'''
# Reminder: Please update docs/
source/
features/compatibility_matrix.md
# Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo become valid
# If the feature combo become valid
if
enc_dec_mr
.
cache_config
.
enable_prefix_caching
:
if
enc_dec_mr
.
cache_config
.
enable_prefix_caching
:
...
...
vllm/worker/worker.py
View file @
4eabe123
...
@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculate the maximum possible number of GPU and CPU blocks
Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
that can be allocated with the remaining free memory.
:::{tip}
Tip:
You may limit the usage of GPU memory
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
"""
# Profile the memory usage of the model and get the maximum number of
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
# cache blocks that can be allocated with the remaining free memory.
...
@@ -530,8 +529,7 @@ def init_worker_distributed_environment(
...
@@ -530,8 +529,7 @@ def init_worker_distributed_environment(
init_distributed_environment
(
parallel_config
.
world_size
,
rank
,
init_distributed_environment
(
parallel_config
.
world_size
,
rank
,
distributed_init_method
,
local_rank
)
distributed_init_method
,
local_rank
)
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
enable_expert_parallel
)
ensure_kv_transfer_initialized
(
vllm_config
)
ensure_kv_transfer_initialized
(
vllm_config
)
...
...
vllm/worker/xpu_model_runner.py
View file @
4eabe123
...
@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
...
@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
input_ids
=
model_input
.
input_tokens
,
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
**
MultiModalKwargs
.
as_kwargs
(
or
{},
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
))
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
# Compute the logits in the last pipeline stage.
# Compute the logits in the last pipeline stage.
if
not
get_pp_group
().
is_last_rank
:
if
not
get_pp_group
().
is_last_rank
:
return
hidden_or_intermediate_states
return
hidden_or_intermediate_states
...
...
vllm/worker/xpu_worker.py
View file @
4eabe123
...
@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
...
@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
Then, it calculate the maximum possible number of GPU and CPU blocks
Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
that can be allocated with the remaining free memory.
:::{tip}
Tip:
You may limit the usage of GPU memory
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
"""
# Profile the memory usage of the model and get the maximum number of
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
# cache blocks that can be allocated with the remaining free memory.
...
@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
...
@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
ensure_model_parallel_initialized
(
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
enable_expert_parallel
)
# global all_reduce needed for overall oneccl warm up
# global all_reduce needed for overall oneccl warm up
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
xpu
())
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
xpu
())
...
...
Prev
1
…
30
31
32
33
34
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment