Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
10 changed files
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
    logprobs_tensor: Optional[torch.Tensor],
    cache: Optional[PythonizationCache],
 ) -> None:
-    """ This function is only called when the output tensors are ready. 
-    See {class}`ModelOutput`. 
-    
-    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, 
+    """ This function is only called when the output tensors are ready.
+    See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput].
+
+    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
    adding a Pythonized output data structure
-    ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
+    ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
+    for each [`SequenceGroup`][vllm.sequence.SequenceGroup].

    Args:
      model_input
@@ -824,7 +825,7 @@ def _pythonize_sampler_output(

    for sgdx, (seq_group,
               sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
        # If the feature combo become valid
        # (Check for Guided Decoding)
        if seq_group.sampling_params.logits_processors:

--- a/vllm/worker/multi_step_neuron_model_runner.py
+++ b/vllm/worker/multi_step_neuron_model_runner.py
@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
            input_ids=model_input.input_tokens,
            positions=model_input.input_positions,
            input_block_ids=model_input.input_block_ids,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
        )

        output = self.model.sample(

--- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py
+++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
            positions=model_input.input_positions,
            input_block_ids=model_input.input_block_ids,
            sampling_params=sampling_params,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
        )

        output = self.model.sample(

--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                positions=model_input.input_positions,
                input_block_ids=model_input.input_block_ids,
                sampling_params=sampling_params,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
            )
        elif current_platform.use_transformers_neuronx():
            # [TODO] validate on-device sampling
@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
                input_ids=model_input.input_tokens,
                positions=model_input.input_positions,
                input_block_ids=model_input.input_block_ids,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
            )

        # Compute the logits only if the on-device sampling is turned off as

--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -119,10 +119,14 @@ class PoolingModelRunner(
                input_ids=model_input.input_tokens,
                positions=model_input.input_positions,
                intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    multi_modal_kwargs,
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
                **cross_enc_kwargs,
-                **seqlen_agnostic_kwargs)
+                **seqlen_agnostic_kwargs,
+            )

        if (self.observability_config is not None
                and self.observability_config.collect_model_forward_time):

--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -76,8 +76,7 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        )
        ensure_model_parallel_initialized(
            self.parallel_config.tensor_parallel_size,
-            self.parallel_config.pipeline_parallel_size,
-            self.parallel_config.enable_expert_parallel)
+            self.parallel_config.pipeline_parallel_size)

        # Device initialization should happen after initializing the distributed
        # runtime.

--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
    a supported scenario.
    '''

-    # Reminder: Please update docs/source/features/compatibility_matrix.md
+    # Reminder: Please update docs/features/compatibility_matrix.md
    # If the feature combo become valid

    if enc_dec_mr.cache_config.enable_prefix_caching:

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
        Then, it calculate the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
@@ -530,8 +529,7 @@ def init_worker_distributed_environment(
    init_distributed_environment(parallel_config.world_size, rank,
                                 distributed_init_method, local_rank)
    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)

    ensure_kv_transfer_initialized(vllm_config)


--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
                input_ids=model_input.input_tokens,
                positions=model_input.input_positions,
                intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device))
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
+            )
        # Compute the logits in the last pipeline stage.
        if not get_pp_group().is_last_rank:
            return hidden_or_intermediate_states

--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
        Then, it calculate the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
@@ -176,8 +175,7 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):

        ensure_model_parallel_initialized(
            parallel_config.tensor_parallel_size,
-            parallel_config.pipeline_parallel_size,
-            parallel_config.enable_expert_parallel)
+            parallel_config.pipeline_parallel_size)
        # global all_reduce needed for overall oneccl warm up
        torch.distributed.all_reduce(torch.zeros(1).xpu())