Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import copy import copy
import dataclasses import dataclasses
from contextlib import contextmanager from contextlib import contextmanager
......
# SPDX-License-Identifier: Apache-2.0
import inspect import inspect
from typing import Callable, Dict, List, Optional, TypeVar, Union, overload from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
from unittest.mock import patch from unittest.mock import patch
......
# SPDX-License-Identifier: Apache-2.0
import operator import operator
from typing import Dict, Iterable, List, Optional, Tuple, Union from typing import Dict, Iterable, List, Optional, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
import operator import operator
from typing import Iterable, Optional from typing import Iterable, Optional
......
# SPDX-License-Identifier: Apache-2.0
import hashlib import hashlib
import inspect import inspect
import types import types
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import time import time
......
# SPDX-License-Identifier: Apache-2.0
import abc import abc
import operator import operator
from abc import abstractmethod from abc import abstractmethod
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List from typing import Any, Dict, List
from torch import fx as fx from torch import fx as fx
......
# SPDX-License-Identifier: Apache-2.0
from typing import Union from typing import Union
import torch.fx import torch.fx
......
# SPDX-License-Identifier: Apache-2.0
import time import time
import torch import torch
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import sys import sys
from abc import abstractmethod from abc import abstractmethod
......
# SPDX-License-Identifier: Apache-2.0
import ast import ast
import copy import copy
import enum import enum
...@@ -81,6 +83,12 @@ class SupportsHash(Protocol): ...@@ -81,6 +83,12 @@ class SupportsHash(Protocol):
... ...
class ModelImpl(str, enum.Enum):
    """Which implementation of a model should be used.

    The members mix in ``str``, so each one compares equal to its plain
    string value (``ModelImpl.AUTO == "auto"``) and can be recovered from
    that string with ``ModelImpl("auto")``.
    """

    # Use the vLLM implementation if it exists, otherwise fall back to the
    # Transformers implementation.
    AUTO = "auto"
    # Use the vLLM model implementation.
    VLLM = "vllm"
    # Use the Transformers model implementation.
    TRANSFORMERS = "transformers"
class ModelConfig: class ModelConfig:
"""Configuration for the model. """Configuration for the model.
...@@ -165,6 +173,12 @@ class ModelConfig: ...@@ -165,6 +173,12 @@ class ModelConfig:
`logits_processors` extra completion argument. Defaults to None, `logits_processors` extra completion argument. Defaults to None,
which allows no processors. which allows no processors.
generation_config: Configuration parameter file for generation. generation_config: Configuration parameter file for generation.
model_impl: Which implementation of the model to use:
"auto" will try to use the vLLM implementation if it exists and
fall back to the Transformers implementation if no vLLM
implementation is available.
"vllm" will use the vLLM model implementation.
"transformers" will use the Transformers model implementation.
override_generation_config: Override the generation config with the override_generation_config: Override the generation config with the
given config. given config.
""" """
...@@ -228,6 +242,7 @@ class ModelConfig: ...@@ -228,6 +242,7 @@ class ModelConfig:
generation_config: Optional[str] = None, generation_config: Optional[str] = None,
enable_sleep_mode: bool = False, enable_sleep_mode: bool = False,
override_generation_config: Optional[Dict[str, Any]] = None, override_generation_config: Optional[Dict[str, Any]] = None,
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
) -> None: ) -> None:
self.model = model self.model = model
self.tokenizer = tokenizer self.tokenizer = tokenizer
...@@ -239,6 +254,7 @@ class ModelConfig: ...@@ -239,6 +254,7 @@ class ModelConfig:
self.code_revision = code_revision self.code_revision = code_revision
self.rope_scaling = rope_scaling self.rope_scaling = rope_scaling
self.rope_theta = rope_theta self.rope_theta = rope_theta
self.model_impl = model_impl
if hf_overrides is None: if hf_overrides is None:
hf_overrides = {} hf_overrides = {}
...@@ -738,7 +754,6 @@ class ModelConfig: ...@@ -738,7 +754,6 @@ class ModelConfig:
@property @property
def is_deepseek_mla(self) -> bool: def is_deepseek_mla(self) -> bool:
# TODO add deepseek_v3
return (hasattr(self.hf_text_config, "model_type")) \ return (hasattr(self.hf_text_config, "model_type")) \
and (self.hf_text_config.model_type in \ and (self.hf_text_config.model_type in \
('deepseek_v2', 'deepseek_v3'))\ ('deepseek_v2', 'deepseek_v3'))\
...@@ -970,6 +985,9 @@ class ModelConfig: ...@@ -970,6 +985,9 @@ class ModelConfig:
@property @property
def use_mla(self) -> bool: def use_mla(self) -> bool:
if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
return False
if self.quantization is not None and self.quantization not in [\ if self.quantization is not None and self.quantization not in [\
"fp8", "compressed-tensors"]: "fp8", "compressed-tensors"]:
logger.warning( logger.warning(
...@@ -981,8 +999,9 @@ class ModelConfig: ...@@ -981,8 +999,9 @@ class ModelConfig:
# have fp8 for both weights and activations. # have fp8 for both weights and activations.
if self.quantization == "compressed-tensors": if self.quantization == "compressed-tensors":
quant_config = self._parse_quant_hf_config() quant_config = self._parse_quant_hf_config()
for group_name, cfg in quant_config.get("config_groups", for group_name, cfg in quant_config.get("config_groups", {
("", {})).items(): "": {}
}).items():
act_cfg = cfg.get("input_activations", {}) act_cfg = cfg.get("input_activations", {})
act_type = None if act_cfg is None else act_cfg.get("type", "") act_type = None if act_cfg is None else act_cfg.get("type", "")
w_cfg = cfg.get("weights", {}) w_cfg = cfg.get("weights", {})
...@@ -996,8 +1015,7 @@ class ModelConfig: ...@@ -996,8 +1015,7 @@ class ModelConfig:
quant_config) quant_config)
return False return False
use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE) return True
return use_mla
@property @property
def supported_runner_types(self) -> Set[RunnerType]: def supported_runner_types(self) -> Set[RunnerType]:
......
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path from pathlib import Path
from typing import Mapping, MutableMapping, Optional from typing import Mapping, MutableMapping, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
......
# SPDX-License-Identifier: Apache-2.0
import math import math
from typing import List, Optional from typing import List, Optional
......
# SPDX-License-Identifier: Apache-2.0
from collections import deque from collections import deque
from dataclasses import dataclass from dataclasses import dataclass
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, FrozenSet, List, Optional, Tuple from typing import Dict, FrozenSet, List, Optional, Tuple
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
......
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
......
# SPDX-License-Identifier: Apache-2.0
from collections import deque from collections import deque
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
"""Token blocks.""" """Token blocks."""
import sys import sys
from bisect import bisect_left from bisect import bisect_left
...@@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator): ...@@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator):
from 0 to num_blocks - 1. from 0 to num_blocks - 1.
""" """
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
# Implements Block.Factory.
def __init__( def __init__(
self, self,
num_blocks: int, num_blocks: int,
...@@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator): ...@@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData() self.metric_data = CacheMetricData()
# Implements Block.Factory.
def _create_block( def _create_block(
self, self,
prev_block: Optional[Block], prev_block: Optional[Block],
...@@ -736,6 +745,14 @@ class PrefixCachingBlock(Block): ...@@ -736,6 +745,14 @@ class PrefixCachingBlock(Block):
such as adapters that influence the block, apart from the token_ids. such as adapters that influence the block, apart from the token_ids.
""" """
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
def __init__( def __init__(
self, self,
prev_block: Optional[Block], prev_block: Optional[Block],
...@@ -890,13 +907,13 @@ class PrefixCachingBlock(Block): ...@@ -890,13 +907,13 @@ class PrefixCachingBlock(Block):
is_first_block = self._prev_block is None is_first_block = self._prev_block is None
prev_block_hash = ( prev_block_hash = (
None if is_first_block else self._none_hash if is_first_block else
self._prev_block.content_hash # type: ignore self._prev_block.content_hash # type: ignore
) )
# Previous block exists but does not yet have a hash. # Previous block exists but does not yet have a hash.
# Return no hash in this case. # Return no hash in this case.
if prev_block_hash is None and not is_first_block: if prev_block_hash == self._none_hash and not is_first_block:
return None return None
self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
...@@ -906,8 +923,9 @@ class PrefixCachingBlock(Block): ...@@ -906,8 +923,9 @@ class PrefixCachingBlock(Block):
extra_hash=self._extra_hash) extra_hash=self._extra_hash)
return self._cached_content_hash return self._cached_content_hash
@staticmethod @classmethod
def hash_block_tokens(is_first_block: bool, def hash_block_tokens(cls,
is_first_block: bool,
prev_block_hash: Optional[int], prev_block_hash: Optional[int],
cur_block_token_ids: List[int], cur_block_token_ids: List[int],
extra_hash: Optional[int] = None) -> int: extra_hash: Optional[int] = None) -> int:
...@@ -928,7 +946,8 @@ class PrefixCachingBlock(Block): ...@@ -928,7 +946,8 @@ class PrefixCachingBlock(Block):
Returns: Returns:
- int: The computed hash value for the block. - int: The computed hash value for the block.
""" """
assert (prev_block_hash is None) == is_first_block if is_first_block and prev_block_hash is None:
prev_block_hash = cls._none_hash
return hash((is_first_block, prev_block_hash, *cur_block_token_ids, return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
extra_hash)) extra_hash))
...@@ -948,6 +967,14 @@ class ComputedBlocksTracker: ...@@ -948,6 +967,14 @@ class ComputedBlocksTracker:
cached block hashes in the allocator. cached block hashes in the allocator.
""" """
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
def __init__( def __init__(
self, self,
allocator: DeviceAwareBlockAllocator, allocator: DeviceAwareBlockAllocator,
...@@ -993,7 +1020,7 @@ class ComputedBlocksTracker: ...@@ -993,7 +1020,7 @@ class ComputedBlocksTracker:
# We need to know the hash of the previous block to compute the hash of # We need to know the hash of the previous block to compute the hash of
# the current block so that blocks could be uniquely identified across # the current block so that blocks could be uniquely identified across
# sequences of prefixes. # sequences of prefixes.
prev_block_hash = (None if cur_num_blocks_recorded == 0 else prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
block_hashes_recorded[-1]) block_hashes_recorded[-1])
# Only update the computed block hashes for the new blocks # Only update the computed block hashes for the new blocks
for i in range(cur_num_blocks_recorded, num_computed_blocks): for i in range(cur_num_blocks_recorded, num_computed_blocks):
...@@ -1008,7 +1035,7 @@ class ComputedBlocksTracker: ...@@ -1008,7 +1035,7 @@ class ComputedBlocksTracker:
# This has to be kept in sync with the allocator's hash # This has to be kept in sync with the allocator's hash
# calculation. # calculation.
block_hash = PrefixCachingBlock.hash_block_tokens( block_hash = PrefixCachingBlock.hash_block_tokens(
is_first_block=prev_block_hash is None, is_first_block=prev_block_hash == self._none_hash,
prev_block_hash=prev_block_hash, prev_block_hash=prev_block_hash,
cur_block_token_ids=block_token_ids, cur_block_token_ids=block_token_ids,
extra_hash=extra_hash, extra_hash=extra_hash,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment