Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
"""
Simple KV Cache Connector for Distributed Machine Learning Inference
......
# SPDX-License-Identifier: Apache-2.0
"""
This file contains a new class `KVLookupBufferBase` that allows developers to
think of KV cache operations as inserting new KV cache entries (`insert`)
......
# SPDX-License-Identifier: Apache-2.0
"""
Implements a distributed key-value (KV) cache transfer mechanism.
......
# SPDX-License-Identifier: Apache-2.0
"""
This file defines an interface `KVPipeBase`
that provides an abstraction for sending and receiving tensors, or None, via
......
# SPDX-License-Identifier: Apache-2.0
import json
import os
import pickle
......
# SPDX-License-Identifier: Apache-2.0
"""
This module implements a PyNccl pipe for sending and receiving
Optional[torch.Tensor] between distributed ranks with advanced
......
# SPDX-License-Identifier: Apache-2.0
"""A centralized entrypoint to perform distributed KV cache transfer.
This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
......
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
......@@ -327,9 +329,17 @@ class GroupCoordinator:
return input_
if input_.is_cpu:
import intel_extension_for_pytorch as ipex
ipex.distributed.all_reduce(input_, group=self.device_group)
return input_
try:
import intel_extension_for_pytorch as ipex
ipex.distributed.all_reduce(input_, group=self.device_group)
return input_
except ImportError:
"""
Intel IPEX not found. Falling back to PyTorch native
all_reduce for CPU
"""
torch.distributed.all_reduce(input_, group=self.device_group)
return input_
if self.tpu_communicator is not None and \
not self.tpu_communicator.disabled:
......
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import dataclasses
import json
......@@ -11,10 +13,10 @@ import vllm.envs as envs
from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
DecodingConfig, DeviceConfig, HfOverrides,
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
ModelConfig, ObservabilityConfig, ParallelConfig,
PoolerConfig, PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig, TaskOption, TokenizerPoolConfig,
VllmConfig)
ModelConfig, ModelImpl, ObservabilityConfig,
ParallelConfig, PoolerConfig, PromptAdapterConfig,
SchedulerConfig, SpeculativeConfig, TaskOption,
TokenizerPoolConfig, VllmConfig)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
......@@ -200,6 +202,7 @@ class EngineArgs:
generation_config: Optional[str] = None
override_generation_config: Optional[Dict[str, Any]] = None
enable_sleep_mode: bool = False
model_impl: str = "auto"
calculate_kv_scales: Optional[bool] = None
......@@ -379,6 +382,18 @@ class EngineArgs:
'qualified names that can be passed with the `logits_processors` '
'extra completion argument. Defaults to None, which allows no '
'processors.')
parser.add_argument(
'--model-impl',
type=str,
default=EngineArgs.model_impl,
choices=[f.value for f in ModelImpl],
help='Which implementation of the model to use.\n\n'
'* "auto" will try to use the vLLM implementation if it exists '
'and fall back to the Transformers implementation if no vLLM '
'implementation is available.\n'
'* "vllm" will use the vLLM model implementation.\n'
'* "transformers" will use the Transformers model '
'implementation.\n')
# Parallel arguments
parser.add_argument(
'--distributed-executor-backend',
......@@ -1032,6 +1047,7 @@ class EngineArgs:
generation_config=self.generation_config,
override_generation_config=self.override_generation_config,
enable_sleep_mode=self.enable_sleep_mode,
model_impl=self.model_impl,
)
def create_load_config(self) -> LoadConfig:
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import copy
import time
......
# SPDX-License-Identifier: Apache-2.0
# Workaround for https://github.com/python/cpython/issues/86296
#
# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py
......
# SPDX-License-Identifier: Apache-2.0
import os
import copy
import time
......
# SPDX-License-Identifier: Apache-2.0
import time
from typing import TYPE_CHECKING
from typing import Counter as CollectionsCounter
......
# SPDX-License-Identifier: Apache-2.0
"""
These types are defined in this file to avoid importing vllm.engine.metrics
and therefore importing prometheus_client.
......
# SPDX-License-Identifier: Apache-2.0
import uuid
from dataclasses import dataclass, field
from enum import Enum
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import copy
import pickle
......
# SPDX-License-Identifier: Apache-2.0
import pickle
import signal
from contextlib import contextmanager
......
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod
from typing import Callable, List
......
# SPDX-License-Identifier: Apache-2.0
import functools
from typing import Callable, List, cast
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment