roll back unused change

29e922ac · lizhigong · 0ee425a6 · 29e922ac · 29e922ac · 29e922ac
Commit 29e922ac authored May 09, 2025 by lizhigong
4 changed files
--- a/vllm/executor/mp_distributed_executor.py
+++ b/vllm/executor/mp_distributed_executor.py
@@ -48,13 +48,13 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase):
                f"is less than than max local gpu count ({cuda_device_count})")

        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
-        # if "CUDA_VISIBLE_DEVICES" or "HIP_VISIBLE_DEVICES" not in os.environ:
-        #     update_environment_variables({
-        #         "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-        #     })
-        #     update_environment_variables({
-        #         "HIP_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
-        #     })
+        # Only install the default device list when the user has not already
+        # restricted visibility through either variable. The previous test
+        # (`"CUDA_VISIBLE_DEVICES" or "HIP_VISIBLE_DEVICES" not in os.environ`)
+        # was always true, because a non-empty string literal is truthy, so
+        # user-provided values were unconditionally overwritten.
+        # NOTE(review): assumes "neither variable set" is the intended
+        # condition for applying defaults -- confirm for ROCm deployments.
+        if ("CUDA_VISIBLE_DEVICES" not in os.environ
+                and "HIP_VISIBLE_DEVICES" not in os.environ):
+            visible_devices = ",".join(map(str, range(world_size)))
+            update_environment_variables({
+                "CUDA_VISIBLE_DEVICES": visible_devices,
+                "HIP_VISIBLE_DEVICES": visible_devices,
+            })

    def _init_executor(self) -> None:


--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -699,7 +699,7 @@ def _sample_with_torch(
        if sampling_type == SamplingType.GREEDY:
            greedy_samples = torch.argmax(logprobs[long_sample_indices],
                                          dim=-1)
-            sampled_token_ids_ = greedy_samples.unsqueeze(-1)
+
            if sampled_token_ids_tensor is not None:
                # Store sampled tokens in output tensor.
                sampled_token_ids_tensor[
@@ -736,8 +736,7 @@ def _sample_with_torch(
                    probs[long_sample_indices],
                    max_n_in_batch,
                    seq_groups=seq_groups_arg)
-            sampled_token_ids_ = \
-                    multinomial_samples[sampling_type].to(torch.long)
+
            if sampled_token_ids_tensor is not None:
                # Store sampled tokens in output tensor.
                sampled_token_ids_tensor[long_sample_indices] = \

--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -11,7 +11,6 @@ from vllm.platforms import current_platform
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                           PromptLogprobs, SequenceGroupMetadata,
                           SequenceOutput)
-from vllm.zero_overhead.utils import is_zero_overhead

 SeqId = int

@@ -140,6 +139,7 @@ def split_batch_by_proposal_len(
    zero or not. We should remove this once vLLM supports per-sequence proposal
    lens in a batch.
    """
+
    nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
    zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
    for i, (seq_group, proposal_len) in enumerate(

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -987,7 +987,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
        ]
        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)

-        ret = self.model_input_cls(
+        return self.model_input_cls(
            input_tokens=input_tokens_tensor,
            input_positions=input_positions_tensor,
            token_types=token_types_tensor,
@@ -1002,8 +1002,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
            prompt_adapter_mapping=prompt_adapter_mapping,
            prompt_adapter_requests=prompt_adapter_requests)

-        return ret
-

 class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
    """