Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 import dataclasses
 from contextlib import contextmanager

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
+# SPDX-License-Identifier: Apache-2.0
+
 import inspect
 from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
 from unittest.mock import patch

--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 from typing import Dict, Iterable, List, Optional, Tuple, Union


--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Callable, Dict, List, NamedTuple, Optional, Tuple

 import torch

--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 import operator
 from typing import Iterable, Optional


--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
+# SPDX-License-Identifier: Apache-2.0
+
 import hashlib
 import inspect
 import types

--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import time


--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
+# SPDX-License-Identifier: Apache-2.0
+
 import abc
 import operator
 from abc import abstractmethod

--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Dict, List

 from torch import fx as fx

--- a/vllm/compilation/reshapes.py
+++ b/vllm/compilation/reshapes.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Union

 import torch.fx

--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
+# SPDX-License-Identifier: Apache-2.0
+
 import time

 import torch

--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 from abc import abstractmethod

--- a/vllm/config.py
+++ b/vllm/config.py
+# SPDX-License-Identifier: Apache-2.0
+
 import ast
 import copy
 import enum
@@ -81,6 +83,12 @@ class SupportsHash(Protocol):
        ...


+class ModelImpl(str, enum.Enum):
+    AUTO = "auto"
+    VLLM = "vllm"
+    TRANSFORMERS = "transformers"
+
+
 class ModelConfig:
    """Configuration for the model.

@@ -165,6 +173,12 @@ class ModelConfig:
            `logits_processors` extra completion argument. Defaults to None,
            which allows no processors.
        generation_config: Configuration parameter file for generation.
+        model_impl: Which implementation of the model to use:
+            "auto" will try to use the vLLM implementation if it exists and
+                fall back to the Transformers implementation if no vLLM
+                implementation is available.
+            "vllm" will use the vLLM model implementation.
+            "transformers" will use the Transformers model implementation.
        override_generation_config: Override the generation config with the
            given config.
    """
@@ -228,6 +242,7 @@ class ModelConfig:
        generation_config: Optional[str] = None,
        enable_sleep_mode: bool = False,
        override_generation_config: Optional[Dict[str, Any]] = None,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
@@ -239,6 +254,7 @@ class ModelConfig:
        self.code_revision = code_revision
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
+        self.model_impl = model_impl

        if hf_overrides is None:
            hf_overrides = {}
@@ -738,7 +754,6 @@ class ModelConfig:

    @property
    def is_deepseek_mla(self) -> bool:
-        # TODO add deepseek_v3
        return (hasattr(self.hf_text_config, "model_type")) \
                and (self.hf_text_config.model_type in \
                    ('deepseek_v2', 'deepseek_v3'))\
@@ -970,6 +985,9 @@ class ModelConfig:

    @property
    def use_mla(self) -> bool:
+        if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
+            return False
+
        if self.quantization is not None and self.quantization not in [\
            "fp8", "compressed-tensors"]:
            logger.warning(
@@ -981,8 +999,9 @@ class ModelConfig:
        # have fp8 for both weights and activations.
        if self.quantization == "compressed-tensors":
            quant_config = self._parse_quant_hf_config()
-            for group_name, cfg in quant_config.get("config_groups",
-                                                    ("", {})).items():
+            for group_name, cfg in quant_config.get("config_groups", {
+                    "": {}
+            }).items():
                act_cfg = cfg.get("input_activations", {})
                act_type = None if act_cfg is None else act_cfg.get("type", "")
                w_cfg = cfg.get("weights", {})
@@ -996,8 +1015,7 @@ class ModelConfig:
                        quant_config)
                    return False

-        use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
-        return use_mla
+        return True

    @property
    def supported_runner_types(self) -> Set[RunnerType]:

--- a/vllm/connections.py
+++ b/vllm/connections.py
+# SPDX-License-Identifier: Apache-2.0
+
 from pathlib import Path
 from typing import Mapping, MutableMapping, Optional
 from urllib.parse import urlparse

--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
+# SPDX-License-Identifier: Apache-2.0
+
 import math
 from typing import List, Optional


--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import deque
 from dataclasses import dataclass
 from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple

--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Dict, FrozenSet, List, Optional, Tuple

 from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,

--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple


--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
+# SPDX-License-Identifier: Apache-2.0
+
 from collections import deque
 from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union


--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
+# SPDX-License-Identifier: Apache-2.0
 """Token blocks."""
 import sys
 from bisect import bisect_left
@@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator):
            from 0 to num_blocks - 1.
    """

+    # Note that we use the string 'None' here instead of None because, as of
+    # Python 3.12, hash(None) returns a constant, predictable value, which
+    # could make hash collisions easier to find and exploit. The hash of the
+    # string 'None' differs between processes (unless PYTHONHASHSEED is set
+    # to a fixed value) but is consistent within a process, which matches
+    # the behavior of hash(None) prior to Python 3.12.
+    _none_hash: int = hash('None')
+
+    # Implements Block.Factory.
    def __init__(
        self,
        num_blocks: int,
@@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator):

        self.metric_data = CacheMetricData()

-    # Implements Block.Factory.
    def _create_block(
        self,
        prev_block: Optional[Block],
@@ -736,6 +745,14 @@ class PrefixCachingBlock(Block):
            such as adapters that influence the block, apart from the token_ids.
    """

+    # Note that we use the string 'None' here instead of None because, as of
+    # Python 3.12, hash(None) returns a constant, predictable value, which
+    # could make hash collisions easier to find and exploit. The hash of the
+    # string 'None' differs between processes (unless PYTHONHASHSEED is set
+    # to a fixed value) but is consistent within a process, which matches
+    # the behavior of hash(None) prior to Python 3.12.
+    _none_hash: int = hash('None')
+
    def __init__(
        self,
        prev_block: Optional[Block],
@@ -890,13 +907,13 @@ class PrefixCachingBlock(Block):

        is_first_block = self._prev_block is None
        prev_block_hash = (
-            None if is_first_block else
+            self._none_hash if is_first_block else
            self._prev_block.content_hash  # type: ignore
        )

        # Previous block exists but does not yet have a hash.
        # Return no hash in this case.
-        if prev_block_hash is None and not is_first_block:
+        if prev_block_hash == self._none_hash and not is_first_block:
            return None

        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
@@ -906,8 +923,9 @@ class PrefixCachingBlock(Block):
            extra_hash=self._extra_hash)
        return self._cached_content_hash

-    @staticmethod
-    def hash_block_tokens(is_first_block: bool,
+    @classmethod
+    def hash_block_tokens(cls,
+                          is_first_block: bool,
                          prev_block_hash: Optional[int],
                          cur_block_token_ids: List[int],
                          extra_hash: Optional[int] = None) -> int:
@@ -928,7 +946,8 @@ class PrefixCachingBlock(Block):
        Returns:
        - int: The computed hash value for the block.
        """
-        assert (prev_block_hash is None) == is_first_block
+        if is_first_block and prev_block_hash is None:
+            prev_block_hash = cls._none_hash
        return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
                     extra_hash))

@@ -948,6 +967,14 @@ class ComputedBlocksTracker:
    cached block hashes in the allocator.
    """

+    # Note that we use the string 'None' here instead of None because, as of
+    # Python 3.12, hash(None) returns a constant, predictable value, which
+    # could make hash collisions easier to find and exploit. The hash of the
+    # string 'None' differs between processes (unless PYTHONHASHSEED is set
+    # to a fixed value) but is consistent within a process, which matches
+    # the behavior of hash(None) prior to Python 3.12.
+    _none_hash: int = hash('None')
+
    def __init__(
        self,
        allocator: DeviceAwareBlockAllocator,
@@ -993,7 +1020,7 @@ class ComputedBlocksTracker:
        # We need to know the hash of the previous block to compute the hash of
        # the current block so that blocks could be uniquely identified across
        # sequences of prefixes.
-        prev_block_hash = (None if cur_num_blocks_recorded == 0 else
+        prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
                           block_hashes_recorded[-1])
        # Only update the computed block hashes for the new blocks
        for i in range(cur_num_blocks_recorded, num_computed_blocks):
@@ -1008,7 +1035,7 @@ class ComputedBlocksTracker:
            # This has to be kept in sync with the allocator's hash
            # calculation.
            block_hash = PrefixCachingBlock.hash_block_tokens(
-                is_first_block=prev_block_hash is None,
+                is_first_block=prev_block_hash == self._none_hash,
                prev_block_hash=prev_block_hash,
                cur_block_token_ids=block_token_ids,
                extra_hash=extra_hash,