Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+# SPDX-License-Identifier: Apache-2.0
 """
 Simple KV Cache Connector for Distributed Machine Learning Inference


--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This file contains a new class `KVLookupBufferBase` that allows developers to 
 think of KV cache operations as inserting new KV cache entries (`insert`) 

--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+# SPDX-License-Identifier: Apache-2.0
 """
    Implements a distributed key-value (KV) cache transfer mechanism.


--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This file defines an interface `KVPipeBase`
 that provides an abstraction for sending and receiving tensors, or None, via

--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 import pickle

--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+# SPDX-License-Identifier: Apache-2.0
 """
    This module implements a PyNccl pipe for sending and receiving 
    Optional[torch.Tensor] between distributed ranks with advanced 

--- a/vllm/distributed/kv_transfer/kv_transfer_agent.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py
+# SPDX-License-Identifier: Apache-2.0
 """A centralized entrypoint to perform distributed KV cache transfer.

 This implementation is a shim wrapper on two APIs exposed by `kv_connector`:

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
@@ -327,9 +329,17 @@ class GroupCoordinator:
            return input_

        if input_.is_cpu:
-            import intel_extension_for_pytorch as ipex
-            ipex.distributed.all_reduce(input_, group=self.device_group)
-            return input_
+            try:
+                import intel_extension_for_pytorch as ipex
+                ipex.distributed.all_reduce(input_, group=self.device_group)
+                return input_
+            except ImportError:
+                """
+                Intel IPEX not found. Falling back to PyTorch native 
+                all_reduce for CPU
+                """
+                torch.distributed.all_reduce(input_, group=self.device_group)
+                return input_

        if self.tpu_communicator is not None and \
            not self.tpu_communicator.disabled:

--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 # Copyright 2023 The vLLM team.
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import dataclasses
 import json
@@ -11,10 +13,10 @@ import vllm.envs as envs
 from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
                         DecodingConfig, DeviceConfig, HfOverrides,
                         KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
-                         ModelConfig, ObservabilityConfig, ParallelConfig,
-                         PoolerConfig, PromptAdapterConfig, SchedulerConfig,
-                         SpeculativeConfig, TaskOption, TokenizerPoolConfig,
-                         VllmConfig)
+                         ModelConfig, ModelImpl, ObservabilityConfig,
+                         ParallelConfig, PoolerConfig, PromptAdapterConfig,
+                         SchedulerConfig, SpeculativeConfig, TaskOption,
+                         TokenizerPoolConfig, VllmConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -200,6 +202,7 @@ class EngineArgs:
    generation_config: Optional[str] = None
    override_generation_config: Optional[Dict[str, Any]] = None
    enable_sleep_mode: bool = False
+    model_impl: str = "auto"

    calculate_kv_scales: Optional[bool] = None

@@ -379,6 +382,18 @@ class EngineArgs:
            'qualified names that can be passed with the `logits_processors` '
            'extra completion argument. Defaults to None, which allows no '
            'processors.')
+        parser.add_argument(
+            '--model-impl',
+            type=str,
+            default=EngineArgs.model_impl,
+            choices=[f.value for f in ModelImpl],
+            help='Which implementation of the model to use.\n\n'
+            '* "auto" will try to use the vLLM implementation if it exists '
+            'and fall back to the Transformers implementation if no vLLM '
+            'implementation is available.\n'
+            '* "vllm" will use the vLLM model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            'implementation.\n')
        # Parallel arguments
        parser.add_argument(
            '--distributed-executor-backend',
@@ -1032,6 +1047,7 @@ class EngineArgs:
            generation_config=self.generation_config,
            override_generation_config=self.override_generation_config,
            enable_sleep_mode=self.enable_sleep_mode,
+            model_impl=self.model_impl,
        )

    def create_load_config(self) -> LoadConfig:

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import copy
 import time

--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
+# SPDX-License-Identifier: Apache-2.0
+
 # Workaround for https://github.com/python/cpython/issues/86296
 #
 # From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import copy
 import time

--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
+# SPDX-License-Identifier: Apache-2.0
+
 import time
 from typing import TYPE_CHECKING
 from typing import Counter as CollectionsCounter

--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
+# SPDX-License-Identifier: Apache-2.0
 """
 These types are defined in this file to avoid importing vllm.engine.metrics
 and therefore importing prometheus_client.

--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum

--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
+# SPDX-License-Identifier: Apache-2.0
+
 import asyncio
 import copy
 import pickle

--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
+# SPDX-License-Identifier: Apache-2.0
+
 import pickle
 import signal
 from contextlib import contextmanager

--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Callable, List


--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 from typing import Callable, List, cast