Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66b809cc
Commit
66b809cc
authored
Feb 08, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.2' into v0.7.2-dev
parents
37b63c24
0408efc6
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
94 additions
and
13 deletions
+94
-13
vllm/compilation/counter.py
vllm/compilation/counter.py
+2
-0
vllm/compilation/decorators.py
vllm/compilation/decorators.py
+2
-0
vllm/compilation/fix_functionalization.py
vllm/compilation/fix_functionalization.py
+2
-0
vllm/compilation/fusion.py
vllm/compilation/fusion.py
+2
-0
vllm/compilation/fx_utils.py
vllm/compilation/fx_utils.py
+2
-0
vllm/compilation/inductor_pass.py
vllm/compilation/inductor_pass.py
+2
-0
vllm/compilation/monitor.py
vllm/compilation/monitor.py
+2
-0
vllm/compilation/multi_output_match.py
vllm/compilation/multi_output_match.py
+2
-0
vllm/compilation/pass_manager.py
vllm/compilation/pass_manager.py
+2
-0
vllm/compilation/reshapes.py
vllm/compilation/reshapes.py
+2
-0
vllm/compilation/vllm_inductor_pass.py
vllm/compilation/vllm_inductor_pass.py
+2
-0
vllm/compilation/wrapper.py
vllm/compilation/wrapper.py
+2
-0
vllm/config.py
vllm/config.py
+23
-5
vllm/connections.py
vllm/connections.py
+2
-0
vllm/core/block/block_table.py
vllm/core/block/block_table.py
+2
-0
vllm/core/block/common.py
vllm/core/block/common.py
+2
-0
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/cpu_gpu_block_allocator.py
+2
-0
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+2
-0
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+2
-0
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+35
-8
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
vllm/compilation/counter.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
copy
import
dataclasses
from
contextlib
import
contextmanager
...
...
vllm/compilation/decorators.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
inspect
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
TypeVar
,
Union
,
overload
from
unittest.mock
import
patch
...
...
vllm/compilation/fix_functionalization.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
operator
from
typing
import
Dict
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
...
...
vllm/compilation/fusion.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Callable
,
Dict
,
List
,
NamedTuple
,
Optional
,
Tuple
import
torch
...
...
vllm/compilation/fx_utils.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
operator
from
typing
import
Iterable
,
Optional
...
...
vllm/compilation/inductor_pass.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
hashlib
import
inspect
import
types
...
...
vllm/compilation/monitor.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
time
...
...
vllm/compilation/multi_output_match.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
abc
import
operator
from
abc
import
abstractmethod
...
...
vllm/compilation/pass_manager.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Any
,
Dict
,
List
from
torch
import
fx
as
fx
...
...
vllm/compilation/reshapes.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Union
import
torch.fx
...
...
vllm/compilation/vllm_inductor_pass.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
time
import
torch
...
...
vllm/compilation/wrapper.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
os
import
sys
from
abc
import
abstractmethod
...
...
vllm/config.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
ast
import
copy
import
enum
...
...
@@ -81,6 +83,12 @@ class SupportsHash(Protocol):
...
class
ModelImpl
(
str
,
enum
.
Enum
):
AUTO
=
"auto"
VLLM
=
"vllm"
TRANSFORMERS
=
"transformers"
class
ModelConfig
:
"""Configuration for the model.
...
...
@@ -165,6 +173,12 @@ class ModelConfig:
`logits_processors` extra completion argument. Defaults to None,
which allows no processors.
generation_config: Configuration parameter file for generation.
model_impl: Which implementation of the model to use:
"auto" will try to use the vLLM implementation if it exists and
fall back to the Transformers implementation if no vLLM
implementation is available.
"vllm" will use the vLLM model implementation.
"transformers" will use the Transformers model implementation.
override_generation_config: Override the generation config with the
given config.
"""
...
...
@@ -228,6 +242,7 @@ class ModelConfig:
generation_config
:
Optional
[
str
]
=
None
,
enable_sleep_mode
:
bool
=
False
,
override_generation_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
model_impl
:
Union
[
str
,
ModelImpl
]
=
ModelImpl
.
AUTO
,
)
->
None
:
self
.
model
=
model
self
.
tokenizer
=
tokenizer
...
...
@@ -239,6 +254,7 @@ class ModelConfig:
self
.
code_revision
=
code_revision
self
.
rope_scaling
=
rope_scaling
self
.
rope_theta
=
rope_theta
self
.
model_impl
=
model_impl
if
hf_overrides
is
None
:
hf_overrides
=
{}
...
...
@@ -738,7 +754,6 @@ class ModelConfig:
@
property
def
is_deepseek_mla
(
self
)
->
bool
:
# TODO add deepseek_v3
return
(
hasattr
(
self
.
hf_text_config
,
"model_type"
))
\
and
(
self
.
hf_text_config
.
model_type
in
\
(
'deepseek_v2'
,
'deepseek_v3'
))
\
...
...
@@ -970,6 +985,9 @@ class ModelConfig:
@
property
def
use_mla
(
self
)
->
bool
:
if
not
self
.
is_deepseek_mla
or
envs
.
VLLM_MLA_DISABLE
:
return
False
if
self
.
quantization
is
not
None
and
self
.
quantization
not
in
[
\
"fp8"
,
"compressed-tensors"
]:
logger
.
warning
(
...
...
@@ -981,8 +999,9 @@ class ModelConfig:
# have fp8 for both weights and activations.
if
self
.
quantization
==
"compressed-tensors"
:
quant_config
=
self
.
_parse_quant_hf_config
()
for
group_name
,
cfg
in
quant_config
.
get
(
"config_groups"
,
(
""
,
{})).
items
():
for
group_name
,
cfg
in
quant_config
.
get
(
"config_groups"
,
{
""
:
{}
}).
items
():
act_cfg
=
cfg
.
get
(
"input_activations"
,
{})
act_type
=
None
if
act_cfg
is
None
else
act_cfg
.
get
(
"type"
,
""
)
w_cfg
=
cfg
.
get
(
"weights"
,
{})
...
...
@@ -996,8 +1015,7 @@ class ModelConfig:
quant_config
)
return
False
use_mla
=
(
self
.
is_deepseek_mla
and
not
envs
.
VLLM_MLA_DISABLE
)
return
use_mla
return
True
@
property
def
supported_runner_types
(
self
)
->
Set
[
RunnerType
]:
...
...
vllm/connections.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
pathlib
import
Path
from
typing
import
Mapping
,
MutableMapping
,
Optional
from
urllib.parse
import
urlparse
...
...
vllm/core/block/block_table.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
import
math
from
typing
import
List
,
Optional
...
...
vllm/core/block/common.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
collections
import
deque
from
dataclasses
import
dataclass
from
typing
import
Deque
,
Dict
,
Iterable
,
List
,
Optional
,
Protocol
,
Tuple
...
...
vllm/core/block/cpu_gpu_block_allocator.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
FrozenSet
,
List
,
Optional
,
Tuple
from
vllm.core.block.interfaces
import
(
Block
,
BlockAllocator
,
BlockId
,
...
...
vllm/core/block/interfaces.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
abc
import
ABC
,
abstractmethod
from
typing
import
Dict
,
FrozenSet
,
List
,
Optional
,
Protocol
,
Tuple
...
...
vllm/core/block/naive_block.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
from
collections
import
deque
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
...
...
vllm/core/block/prefix_caching_block.py
View file @
66b809cc
# SPDX-License-Identifier: Apache-2.0
"""Token blocks."""
import
sys
from
bisect
import
bisect_left
...
...
@@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator):
from 0 to num_blocks - 1.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash
:
int
=
hash
(
'None'
)
# Implements Block.Factory.
def
__init__
(
self
,
num_blocks
:
int
,
...
...
@@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self
.
metric_data
=
CacheMetricData
()
# Implements Block.Factory.
def
_create_block
(
self
,
prev_block
:
Optional
[
Block
],
...
...
@@ -736,6 +745,14 @@ class PrefixCachingBlock(Block):
such as adapters that influence the block, apart from the token_ids.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash
:
int
=
hash
(
'None'
)
def
__init__
(
self
,
prev_block
:
Optional
[
Block
],
...
...
@@ -890,13 +907,13 @@ class PrefixCachingBlock(Block):
is_first_block
=
self
.
_prev_block
is
None
prev_block_hash
=
(
None
if
is_first_block
else
self
.
_none_hash
if
is_first_block
else
self
.
_prev_block
.
content_hash
# type: ignore
)
# Previous block exists but does not yet have a hash.
# Return no hash in this case.
if
prev_block_hash
is
None
and
not
is_first_block
:
if
prev_block_hash
==
self
.
_none_hash
and
not
is_first_block
:
return
None
self
.
_cached_content_hash
=
PrefixCachingBlock
.
hash_block_tokens
(
...
...
@@ -906,8 +923,9 @@ class PrefixCachingBlock(Block):
extra_hash
=
self
.
_extra_hash
)
return
self
.
_cached_content_hash
@
staticmethod
def
hash_block_tokens
(
is_first_block
:
bool
,
@
classmethod
def
hash_block_tokens
(
cls
,
is_first_block
:
bool
,
prev_block_hash
:
Optional
[
int
],
cur_block_token_ids
:
List
[
int
],
extra_hash
:
Optional
[
int
]
=
None
)
->
int
:
...
...
@@ -928,7 +946,8 @@ class PrefixCachingBlock(Block):
Returns:
- int: The computed hash value for the block.
"""
assert
(
prev_block_hash
is
None
)
==
is_first_block
if
is_first_block
and
prev_block_hash
is
None
:
prev_block_hash
=
cls
.
_none_hash
return
hash
((
is_first_block
,
prev_block_hash
,
*
cur_block_token_ids
,
extra_hash
))
...
...
@@ -948,6 +967,14 @@ class ComputedBlocksTracker:
cached block hashes in the allocator.
"""
# Note that we use 'None' as a string here instead of None because
# as of Python 3.12, hash(None) returns a constant predictable value.
# This could possibly make it easier to find and exploit hash
# collisions. 'None' as a string will be hashed differently per process,
# but consistently within the same process. This is the same as the
# behavior of None prior to Python 3.12.
_none_hash
:
int
=
hash
(
'None'
)
def
__init__
(
self
,
allocator
:
DeviceAwareBlockAllocator
,
...
...
@@ -993,7 +1020,7 @@ class ComputedBlocksTracker:
# We need to know the hash of the previous block to compute the hash of
# the current block so that blocks could be uniquely identified across
# sequences of prefixes.
prev_block_hash
=
(
None
if
cur_num_blocks_recorded
==
0
else
prev_block_hash
=
(
self
.
_none_hash
if
cur_num_blocks_recorded
==
0
else
block_hashes_recorded
[
-
1
])
# Only update the computed block hashes for the new blocks
for
i
in
range
(
cur_num_blocks_recorded
,
num_computed_blocks
):
...
...
@@ -1008,7 +1035,7 @@ class ComputedBlocksTracker:
# This has to be kept in sync with the allocator's hash
# calculation.
block_hash
=
PrefixCachingBlock
.
hash_block_tokens
(
is_first_block
=
prev_block_hash
is
None
,
is_first_block
=
prev_block_hash
==
self
.
_none_hash
,
prev_block_hash
=
prev_block_hash
,
cur_block_token_ids
=
block_token_ids
,
extra_hash
=
extra_hash
,
...
...
Prev
1
…
24
25
26
27
28
29
30
31
32
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment