Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/embedding.py
+++ b/examples/offline_inference/embedding.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM
 # Sample prompts.

--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
+# SPDX-License-Identifier: Apache-2.0
 '''
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART

--- a/examples/offline_inference/florence2_inference.py
+++ b/examples/offline_inference/florence2_inference.py
+# SPDX-License-Identifier: Apache-2.0
 '''
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically Florence-2

--- a/examples/offline_inference/gguf_inference.py
+++ b/examples/offline_inference/gguf_inference.py
+# SPDX-License-Identifier: Apache-2.0
 from huggingface_hub import hf_hub_download
 from vllm import LLM, SamplingParams

--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
+# SPDX-License-Identifier: Apache-2.0
 import argparse
 from typing import List, Tuple

--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use LoRA with different quantization techniques
 for offline inference.

--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
+# SPDX-License-Identifier: Apache-2.0
 import gc
 import time
 from typing import List
@@ -49,7 +51,7 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
-        speculative_model="ibm-fms/llama-13b-accelerator",
+        speculative_model="ibm-ai-platform/llama-13b-accelerator",
    )
    print("With speculation")

--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use the multi-LoRA functionality
 for offline inference.

--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 # Sample prompts.

--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 from vllm import LLM, SamplingParams

--- a/examples/offline_inference/pixtral.py
+++ b/examples/offline_inference/pixtral.py
+# SPDX-License-Identifier: Apache-2.0
 # ruff: noqa
 import argparse

--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 from vllm.distributed import cleanup_dist_env_and_memory

--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
+# SPDX-License-Identifier: Apache-2.0
 import inspect
 import json
 import os

--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
+# SPDX-License-Identifier: Apache-2.0
 import argparse
 import dataclasses
 import os

--- a/examples/offline_inference/ray_placement.py
+++ b/examples/offline_inference/ray_placement.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+A simple demonstration of how to control
+the placement of the vLLM workers with Ray.
+The key is to set VLLM_RAY_PER_WORKER_GPUS and
+VLLM_RAY_BUNDLE_INDICES properly.
+"""
+import os
+import ray
+from ray.util.placement_group import placement_group
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from vllm import LLM
+from vllm.worker.worker import Worker
+class MyWorker(Worker):
+    """Worker subclass that can report the device it is running on."""
+
+    def report_device_id(self) -> str:
+        """Return the platform UUID for this worker's device index."""
+        from vllm.platforms import current_platform
+
+        device_index = self.device.index
+        return current_platform.get_device_uuid(device_index)
+class MyLLM(LLM):
+    """LLM subclass that selects the placement-group bundles for its workers.
+
+    Before delegating to ``LLM.__init__`` it sets VLLM_RAY_PER_WORKER_GPUS
+    and VLLM_RAY_BUNDLE_INDICES in this process's environment.
+    """
+
+    def __init__(self, *args, bundle_indices: list, **kwargs):
+        """Configure the Ray placement env vars, then build the engine.
+
+        Args:
+            *args: forwarded unchanged to ``LLM.__init__``.
+            bundle_indices: placement-group bundle indices; joined with
+                commas into VLLM_RAY_BUNDLE_INDICES.
+            **kwargs: forwarded unchanged to ``LLM.__init__``.
+        """
+        # a hack to make the script work.
+        # stop ray from manipulating CUDA_VISIBLE_DEVICES
+        # at the top-level
+        # pop() rather than del: the variable may be absent in this
+        # process, and a KeyError here would abort engine creation.
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        # every worker will use 0.4 GPU, so that we can schedule
+        # 2 instances on the same GPUs.
+        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
+        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
+            map(str, bundle_indices))
+        print(f"creating LLM with bundle_indices={bundle_indices}")
+        super().__init__(*args, **kwargs)
+class RayTrainingActor:
+    """Stand-in training actor that reports which GPU it was given."""
+
+    def report_device_id(self) -> str:
+        """Return the UUID of visible device 0 for this actor."""
+        from vllm.platforms import current_platform
+
+        # get_device_uuid takes an index into the visible devices;
+        # ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this
+        # actor, so index 0 is used here.
+        first_visible = 0
+        return current_platform.get_device_uuid(first_visible)
+# ray manages 4 GPUs
+os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
+ray.init()
+# we want to co-locate vLLM instance and the training actor
+# on the same set of GPUs.
+# the placement plan is as follows:
+# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
+# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
+# four identical bundles, each requesting 1 GPU and 0 CPU
+pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
+# wait for the placement group before scheduling anything into it
+ray.get(pg.ready())
+print(f"placement group has bundles {pg.bundle_specs=}")
+# parallel lists: entry k of the *_device_ids list belongs to entry k
+# of the corresponding actor/engine list
+training_actors = []
+training_actor_device_ids = []
+inference_engines = []
+inference_engine_device_ids = []
+# one training actor per bundle; each requests 0.4 GPU and is pinned
+# to its bundle through placement_group_bundle_index
+for bundle_index in [0, 1, 2, 3]:
+    training_actor = ray.remote(
+        num_cpus=0,
+        num_gpus=0.4,
+        scheduling_strategy=PlacementGroupSchedulingStrategy(
+            placement_group=pg,
+            placement_group_capture_child_tasks=True,
+            placement_group_bundle_index=bundle_index,
+        ),
+    )(RayTrainingActor).remote()
+    training_actors.append(training_actor)
+    # ray.get blocks here until this actor has reported its device id
+    device_id = ray.get(training_actor.report_device_id.remote())
+    print(f"training actor {bundle_index} is on {device_id}")
+    training_actor_device_ids.append(device_id)
+# one engine actor per pair of bundles; the actor itself requests no
+# CPU/GPU, and the bundle pair is passed to MyLLM as bundle_indices.
+# NOTE(review): the loop index i is not used in this loop body.
+for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
+    # IMPORTANT: when creating vLLM instances, we need to
+    # make sure there are no GPU activities on the target GPUs,
+    # otherwise, they will interfere with the vLLM memory profiling,
+    # and cause unexpected behaviors.
+    llm = ray.remote(
+        num_cpus=0,
+        num_gpus=0,
+        scheduling_strategy=PlacementGroupSchedulingStrategy(
+            placement_group=pg,
+            placement_group_capture_child_tasks=True,
+        ),
+    )(MyLLM).remote(
+        model="facebook/opt-125m",
+        enforce_eager=True,
+        worker_cls=MyWorker,
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        gpu_memory_utilization=0.4,
+        bundle_indices=bundle_indices,
+    )
+    inference_engines.append(llm)
+    # don't call any method on the inference engine here,
+    # otherwise it will block until the vLLM instance is created.
+# only now query the engines, after both have been launched.
+# presumably collective_rpc returns one result per worker, since the
+# assertions below compare it against a two-element list -- TODO confirm
+for i, llm in enumerate(inference_engines):
+    inference_engine_device_ids.append(
+        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())))
+    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
+# check the placement
+# the first two training actors should be
+# on the same GPUs as the first inference engine
+assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
+# the last two training actors should be
+# on the same GPUs as the second inference engine
+assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
+# SPDX-License-Identifier: Apache-2.0
 """
 a simple demonstration of RLHF with vLLM, inspired by
 the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .

--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Saves each worker's model state dict directly to a checkpoint, which enables a
 fast load path for large tensor-parallel models where each worker only needs to

--- a/examples/offline_inference/scoring.py
+++ b/examples/offline_inference/scoring.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM
 # Sample prompts.

--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import time

--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
+# SPDX-License-Identifier: Apache-2.0
 from enum import Enum
 from pydantic import BaseModel