Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import copy
import dataclasses
from contextlib import contextmanager
......
# SPDX-License-Identifier: Apache-2.0
import inspect
from typing import Callable, Dict, List, Optional, TypeVar, Union, overload
from unittest.mock import patch
......
# SPDX-License-Identifier: Apache-2.0
import operator
from typing import Dict, Iterable, List, Optional, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
import torch
......
# SPDX-License-Identifier: Apache-2.0
import operator
from typing import Iterable, Optional
......
# SPDX-License-Identifier: Apache-2.0
import hashlib
import inspect
import types
......
# SPDX-License-Identifier: Apache-2.0
import os
import time
......
# SPDX-License-Identifier: Apache-2.0
import abc
import operator
from abc import abstractmethod
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List
from torch import fx as fx
......
# SPDX-License-Identifier: Apache-2.0
from typing import Union
import torch.fx
......
# SPDX-License-Identifier: Apache-2.0
import time
import torch
......
# SPDX-License-Identifier: Apache-2.0
import os
import sys
from abc import abstractmethod
......
# SPDX-License-Identifier: Apache-2.0
import ast
import copy
import enum
......@@ -81,6 +83,12 @@ class SupportsHash(Protocol):
...
class ModelImpl(str, enum.Enum):
    """Choice of which model implementation to use.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``ModelImpl.AUTO == "auto"``).
    """

    # Try the vLLM implementation if it exists and fall back to the
    # Transformers implementation if no vLLM implementation is available.
    AUTO = "auto"
    # Use the vLLM model implementation.
    VLLM = "vllm"
    # Use the Transformers model implementation.
    TRANSFORMERS = "transformers"
class ModelConfig:
"""Configuration for the model.
......@@ -165,6 +173,12 @@ class ModelConfig:
`logits_processors` extra completion argument. Defaults to None,
which allows no processors.
generation_config: Configuration parameter file for generation.
model_impl: Which implementation of the model to use:
"auto" will try to use the vLLM implementation if it exists and
fall back to the Transformers implementation if no vLLM
implementation is available.
"vllm" will use the vLLM model implementation.
"transformers" will use the Transformers model implementation.
override_generation_config: Override the generation config with the
given config.
"""
......@@ -228,6 +242,7 @@ class ModelConfig:
generation_config: Optional[str] = None,
enable_sleep_mode: bool = False,
override_generation_config: Optional[Dict[str, Any]] = None,
model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
) -> None:
self.model = model
self.tokenizer = tokenizer
......@@ -239,6 +254,7 @@ class ModelConfig:
self.code_revision = code_revision
self.rope_scaling = rope_scaling
self.rope_theta = rope_theta
self.model_impl = model_impl
if hf_overrides is None:
hf_overrides = {}
......@@ -738,7 +754,6 @@ class ModelConfig:
@property
def is_deepseek_mla(self) -> bool:
# Matches both the deepseek_v2 and deepseek_v3 model types.
return (hasattr(self.hf_text_config, "model_type")) \
and (self.hf_text_config.model_type in \
('deepseek_v2', 'deepseek_v3'))\
......@@ -970,6 +985,9 @@ class ModelConfig:
@property
def use_mla(self) -> bool:
if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE:
return False
if self.quantization is not None and self.quantization not in [\
"fp8", "compressed-tensors"]:
logger.warning(
......@@ -981,8 +999,9 @@ class ModelConfig:
# have fp8 for both weights and activations.
if self.quantization == "compressed-tensors":
quant_config = self._parse_quant_hf_config()
for group_name, cfg in quant_config.get("config_groups",
("", {})).items():
for group_name, cfg in quant_config.get("config_groups", {
"": {}
}).items():
act_cfg = cfg.get("input_activations", {})
act_type = None if act_cfg is None else act_cfg.get("type", "")
w_cfg = cfg.get("weights", {})
......@@ -996,8 +1015,7 @@ class ModelConfig:
quant_config)
return False
use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE)
return use_mla
return True
@property
def supported_runner_types(self) -> Set[RunnerType]:
......
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from typing import Mapping, MutableMapping, Optional
from urllib.parse import urlparse
......
# SPDX-License-Identifier: Apache-2.0
import math
from typing import List, Optional
......
# SPDX-License-Identifier: Apache-2.0
from collections import deque
from dataclasses import dataclass
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, FrozenSet, List, Optional, Tuple
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
......
# SPDX-License-Identifier: Apache-2.0
from abc import ABC, abstractmethod
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
......
# SPDX-License-Identifier: Apache-2.0
from collections import deque
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
......
# SPDX-License-Identifier: Apache-2.0
"""Token blocks."""
import sys
from bisect import bisect_left
......@@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator):
from 0 to num_blocks - 1.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
# Implements Block.Factory.
def __init__(
self,
num_blocks: int,
......@@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData()
# Implements Block.Factory.
def _create_block(
self,
prev_block: Optional[Block],
......@@ -736,6 +745,14 @@ class PrefixCachingBlock(Block):
such as adapters that influence the block, apart from the token_ids.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
def __init__(
self,
prev_block: Optional[Block],
......@@ -890,13 +907,13 @@ class PrefixCachingBlock(Block):
is_first_block = self._prev_block is None
prev_block_hash = (
None if is_first_block else
self._none_hash if is_first_block else
self._prev_block.content_hash # type: ignore
)
# Previous block exists but does not yet have a hash.
# Return no hash in this case.
if prev_block_hash is None and not is_first_block:
if prev_block_hash == self._none_hash and not is_first_block:
return None
self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
......@@ -906,8 +923,9 @@ class PrefixCachingBlock(Block):
extra_hash=self._extra_hash)
return self._cached_content_hash
@staticmethod
def hash_block_tokens(is_first_block: bool,
@classmethod
def hash_block_tokens(cls,
is_first_block: bool,
prev_block_hash: Optional[int],
cur_block_token_ids: List[int],
extra_hash: Optional[int] = None) -> int:
......@@ -928,7 +946,8 @@ class PrefixCachingBlock(Block):
Returns:
- int: The computed hash value for the block.
"""
assert (prev_block_hash is None) == is_first_block
if is_first_block and prev_block_hash is None:
prev_block_hash = cls._none_hash
return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
extra_hash))
......@@ -948,6 +967,14 @@ class ComputedBlocksTracker:
cached block hashes in the allocator.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash: int = hash('None')
def __init__(
self,
allocator: DeviceAwareBlockAllocator,
......@@ -993,7 +1020,7 @@ class ComputedBlocksTracker:
# We need to know the hash of the previous block to compute the hash of
# the current block so that blocks could be uniquely identified across
# sequences of prefixes.
prev_block_hash = (None if cur_num_blocks_recorded == 0 else
prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
block_hashes_recorded[-1])
# Only update the computed block hashes for the new blocks
for i in range(cur_num_blocks_recorded, num_computed_blocks):
......@@ -1008,7 +1035,7 @@ class ComputedBlocksTracker:
# This has to be kept in sync with the allocator's hash
# calculation.
block_hash = PrefixCachingBlock.hash_block_tokens(
is_first_block=prev_block_hash is None,
is_first_block=prev_block_hash == self._none_hash,
prev_block_hash=prev_block_hash,
cur_block_token_ids=block_token_ids,
extra_hash=extra_hash,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment