Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4eabe123
Commit
4eabe123
authored
May 28, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori
parents
45840cd2
58738772
Changes
670
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
50 additions
and
37 deletions
+50
-37
vllm/worker/multi_step_model_runner.py
vllm/worker/multi_step_model_runner.py
+7
-6
vllm/worker/multi_step_neuron_model_runner.py
vllm/worker/multi_step_neuron_model_runner.py
+5
-2
vllm/worker/multi_step_neuronx_distributed_model_runner.py
vllm/worker/multi_step_neuronx_distributed_model_runner.py
+5
-2
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_model_runner.py
+10
-6
vllm/worker/pooling_model_runner.py
vllm/worker/pooling_model_runner.py
+7
-3
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+1
-2
vllm/worker/utils.py
vllm/worker/utils.py
+1
-1
vllm/worker/worker.py
vllm/worker/worker.py
+4
-6
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+6
-3
vllm/worker/xpu_worker.py
vllm/worker/xpu_worker.py
+4
-6
No files found.
vllm/worker/multi_step_model_runner.py
View file @
4eabe123
...
...
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
logprobs_tensor
:
Optional
[
torch
.
Tensor
],
cache
:
Optional
[
PythonizationCache
],
)
->
None
:
""" This function is only called when the output tensors are ready.
See
{class}`
ModelOutput
`.
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
""" This function is only called when the output tensors are ready.
See
[`ModelOutput`][vllm.worker.multi_step_model_runner.
ModelOutput
].
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
for each [`SequenceGroup`][vllm.sequence.SequenceGroup].
Args:
model_input
...
...
@@ -824,7 +825,7 @@ def _pythonize_sampler_output(
for
sgdx
,
(
seq_group
,
sample_result
)
in
enumerate
(
zip
(
seq_groups
,
samples_list
)):
# Reminder: Please update docs/
source/
features/compatibility_matrix.md
# Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo becomes valid
# (Check for Guided Decoding)
if
seq_group
.
sampling_params
.
logits_processors
:
...
...
vllm/worker/multi_step_neuron_model_runner.py
View file @
4eabe123
...
...
@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
output
=
self
.
model
.
sample
(
...
...
vllm/worker/multi_step_neuronx_distributed_model_runner.py
View file @
4eabe123
...
...
@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
output
=
self
.
model
.
sample
(
...
...
vllm/worker/neuron_model_runner.py
View file @
4eabe123
...
...
@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
elif
current_platform
.
use_transformers_neuronx
():
# [TODO] validate on-device sampling
...
...
@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
# Compute the logits only if the on-device sampling is turned off as
...
...
vllm/worker/pooling_model_runner.py
View file @
4eabe123
...
...
@@ -119,10 +119,14 @@ class PoolingModelRunner(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
cross_enc_kwargs
,
**
seqlen_agnostic_kwargs
)
**
seqlen_agnostic_kwargs
,
)
if
(
self
.
observability_config
is
not
None
and
self
.
observability_config
.
collect_model_forward_time
):
...
...
vllm/worker/tpu_worker.py
View file @
4eabe123
...
...
@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
)
ensure_model_parallel_initialized
(
self
.
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
,
self
.
parallel_config
.
enable_expert_parallel
)
self
.
parallel_config
.
pipeline_parallel_size
)
# Device initialization should happen after initializing the distributed
# runtime.
...
...
vllm/worker/utils.py
View file @
4eabe123
...
...
@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
a supported scenario.
'''
# Reminder: Please update docs/
source/
features/compatibility_matrix.md
# Reminder: Please update docs/features/compatibility_matrix.md
# If the feature combo becomes valid
if
enc_dec_mr
.
cache_config
.
enable_prefix_caching
:
...
...
vllm/worker/worker.py
View file @
4eabe123
...
...
@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
Tip:
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
...
...
@@ -530,8 +529,7 @@ def init_worker_distributed_environment(
init_distributed_environment
(
parallel_config
.
world_size
,
rank
,
distributed_init_method
,
local_rank
)
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
parallel_config
.
pipeline_parallel_size
)
ensure_kv_transfer_initialized
(
vllm_config
)
...
...
vllm/worker/xpu_model_runner.py
View file @
4eabe123
...
...
@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
))
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
# Compute the logits in the last pipeline stage.
if
not
get_pp_group
().
is_last_rank
:
return
hidden_or_intermediate_states
...
...
vllm/worker/xpu_worker.py
View file @
4eabe123
...
...
@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
Tip:
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
...
...
@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
parallel_config
.
pipeline_parallel_size
)
# global all_reduce needed for overall oneccl warm up
torch
.
distributed
.
all_reduce
(
torch
.
zeros
(
1
).
xpu
())
...
...
Prev
1
…
30
31
32
33
34
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment