Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
00730fc6
Unverified
Commit
00730fc6
authored
Apr 16, 2025
by
ptarasiewiczNV
Committed by
GitHub
Apr 16, 2025
Browse files
feat: bump vLLM version to v0.8.4 (#690)
parent
48733546
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
5410 additions
and
224 deletions
+5410
-224
container/Dockerfile.vllm
container/Dockerfile.vllm
+6
-6
container/deps/vllm/tests/test_patch_install.py
container/deps/vllm/tests/test_patch_install.py
+1
-1
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
+756
-217
container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
+4647
-0
No files found.
container/Dockerfile.vllm
View file @
00730fc6
...
...
@@ -181,16 +181,12 @@ RUN mkdir /opt/dynamo && \
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="0.
7.2
"
ARG VLLM_REF="0.
8.4
"
ARG VLLM_PATCH="vllm_v${VLLM_REF}-dynamo-kv-disagg-patch.patch"
ARG VLLM_PATCHED_PACKAGE_NAME="ai_dynamo_vllm"
ARG VLLM_PATCHED_PACKAGE_VERSION="0.
7.2.post1
"
ARG VLLM_PATCHED_PACKAGE_VERSION="0.
8.4
"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
mkdir /tmp/vllm && \
uv pip install pip wheel && \
...
...
@@ -211,6 +207,10 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
wheel pack . --dest-dir /workspace/dist && \
uv pip install /workspace/dist/${VLLM_PATCHED_PACKAGE_NAME}-*.whl
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
...
...
container/deps/vllm/tests/test_patch_install.py
View file @
00730fc6
...
...
@@ -27,4 +27,4 @@ pytestmark = pytest.mark.pre_merge
@
pytest
.
mark
.
skipif
(
vllm
is
None
,
reason
=
"Skipping vllm tests, vllm not installed"
)
def
test_version
():
# Verify that the image has the patched version of vllm
assert
vllm
.
__version__
.
endswith
(
"0.
7.2
"
)
# type: ignore
assert
vllm
.
__version__
.
endswith
(
"0.
8.4
"
)
# type: ignore
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
View file @
00730fc6
diff --git a/vllm/config.py b/vllm/config.py
index 9ba49757..
a4df0019
100644
index 9ba49757
6
..
db2dc002f
100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2620,6 +2620,9 @@
class KVTransferConfig(BaseModel):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import ast
import copy
@@ -2620,6 +2633,9 @@
class KVTransferConfig(BaseModel):
# The KV connector for vLLM to transmit KV caches between vLLM instances.
kv_connector: Optional[str] = None
...
...
@@ -12,7 +30,7 @@ index 9ba49757..a4df0019 100644
# The device used by kv connector to buffer the KV cache.
# Currently only support 'cuda'.
kv_buffer_device: Optional[str] = "cuda"
@@ -2629,7 +26
32
,7 @@
class KVTransferConfig(BaseModel):
@@ -2629,7 +26
45
,7 @@
class KVTransferConfig(BaseModel):
kv_buffer_size: float = 1e9
# Whether this vLLM instance produces, consumes KV cache, or both. Choices
...
...
@@ -21,7 +39,7 @@ index 9ba49757..a4df0019 100644
kv_role: Optional[str] = None
# The rank of this vLLM instance in the KV cache transfer. Typical value:
@@ -2647,6 +26
50
,14 @@
class KVTransferConfig(BaseModel):
@@ -2647,6 +26
63
,14 @@
class KVTransferConfig(BaseModel):
# The KV connector port, used to build distributed connection
kv_port: int = 14579
...
...
@@ -36,7 +54,7 @@ index 9ba49757..a4df0019 100644
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
@@ -2680,11 +2
691
,16 @@
class KVTransferConfig(BaseModel):
@@ -2680,11 +2
704
,16 @@
class KVTransferConfig(BaseModel):
f"Supported roles are `kv_producer`, `kv_consumer`, "
f"and `kv_both`")
...
...
@@ -54,7 +72,7 @@ index 9ba49757..a4df0019 100644
@property
def is_kv_transfer_instance(self) -> bool:
return self.kv_connector is not None and \
@@ -2694,6 +27
10
,8 @@
class KVTransferConfig(BaseModel):
@@ -2694,6 +27
23
,8 @@
class KVTransferConfig(BaseModel):
def need_kv_parallel_group(self) -> bool:
# for those database-based connector, vLLM does not need to create
# parallel group, and in that case the kv parallel size will be 1.
...
...
@@ -63,7 +81,7 @@ index 9ba49757..a4df0019 100644
return self.kv_connector is not None and self.kv_parallel_size > 1
@property
@@ -2706,6 +27
24
,18 @@
class KVTransferConfig(BaseModel):
@@ -2706,6 +27
37
,18 @@
class KVTransferConfig(BaseModel):
return self.kv_connector is not None and \
self.kv_role in ["kv_consumer", "kv_both"]
...
...
@@ -83,10 +101,28 @@ index 9ba49757..a4df0019 100644
class CompilationLevel:
# constants for the levels of the compilation process
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 359b5b26..
d52ee05
0 100644
index 359b5b26
3
..
7bac45ff
0 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -6,6 +6,7 @@
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from typing import Dict, FrozenSet, List, Optional, Tuple
@@ -6,6 +19,7 @@
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
...
...
@@ -94,7 +130,7 @@ index 359b5b26..d52ee050 100644
from vllm.platforms import current_platform
from vllm.utils import Device
@@ -28,6 +2
9
,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -28,6 +
4
2,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_gpu_blocks: int,
num_cpu_blocks: int,
block_size: int,
...
...
@@ -102,7 +138,7 @@ index 359b5b26..d52ee050 100644
) -> DeviceAwareBlockAllocator:
"""Creates a CpuGpuBlockAllocator instance with the specified
configuration.
@@ -64,6 +
66
,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -64,6 +
79
,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
cpu_block_ids = block_ids[num_gpu_blocks:]
if allocator_type == "naive":
...
...
@@ -110,7 +146,7 @@ index 359b5b26..d52ee050 100644
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
create_block=NaiveBlock, # type: ignore
num_blocks=num_gpu_blocks,
@@ -82,12 +8
5
,14 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -82,12 +
9
8,14 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_blocks=num_gpu_blocks,
block_size=block_size,
block_ids=gpu_block_ids,
...
...
@@ -125,7 +161,7 @@ index 359b5b26..d52ee050 100644
)
else:
raise ValueError(f"Unknown allocator type {allocator_type=}")
@@ -95,10 +1
00
,12 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -95,10 +1
13
,12 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
return CpuGpuBlockAllocator(
cpu_block_allocator=cpu_allocator,
gpu_block_allocator=gpu_allocator,
...
...
@@ -139,7 +175,7 @@ index 359b5b26..d52ee050 100644
assert not (
cpu_block_allocator.all_block_ids
& gpu_block_allocator.all_block_ids
@@ -108,6 +1
15
,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -108,6 +1
28
,7 @@
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Device.CPU: cpu_block_allocator,
Device.GPU: gpu_block_allocator,
}
...
...
@@ -148,10 +184,24 @@ index 359b5b26..d52ee050 100644
self._swap_mapping: Dict[int, int] = {}
self._null_block: Optional[Block] = None
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index c388366b..3
1ed7aa4
100644
index c388366b
8
..3
c223b519
100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -2,7 +2,7 @@
@@ -1,8 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from collections import deque
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
...
...
@@ -160,7 +210,7 @@ index c388366b..31ed7aa4 100644
from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
get_all_blocks_recursively)
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
@@ -38,7 +
38
,7 @@
class NaiveBlockAllocator(BlockAllocator):
@@ -38,7 +
51
,7 @@
class NaiveBlockAllocator(BlockAllocator):
if block_ids is None:
block_ids = range(num_blocks)
...
...
@@ -169,7 +219,7 @@ index c388366b..31ed7aa4 100644
self._all_block_indices = frozenset(block_ids)
assert len(self._all_block_indices) == num_blocks
@@ -134,7 +1
3
4,8 @@
class NaiveBlockAllocator(BlockAllocator):
@@ -134,7 +14
7
,8 @@
class NaiveBlockAllocator(BlockAllocator):
if not self._free_block_indices:
raise BlockAllocator.NoFreeBlocksError()
...
...
@@ -179,7 +229,7 @@ index c388366b..31ed7aa4 100644
self._refcounter.incr(block_id)
return block_id
@@ -148,7 +1
49
,7 @@
class NaiveBlockAllocator(BlockAllocator):
@@ -148,7 +1
62
,7 @@
class NaiveBlockAllocator(BlockAllocator):
refcount = self._refcounter.decr(block_id)
if refcount == 0:
...
...
@@ -189,10 +239,26 @@ index c388366b..31ed7aa4 100644
def free(self, block: Block, keep_block_object: bool = False) -> None:
# Release the physical block id
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1ca9e49d..
cd780f69
100644
index 1ca9e49d
a
..
26fabb243
100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -4,7 +4,7 @@
import sys
@@ -1,10 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""Token blocks."""
import sys
from bisect import bisect_left
from os.path import commonprefix
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
...
...
@@ -201,7 +267,7 @@ index 1ca9e49d..cd780f69 100644
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
get_all_blocks_recursively)
@@ -23,6 +
2
3,9 @@
PrefixHash = int
@@ -23,6 +3
6
,9 @@
PrefixHash = int
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME = -1
...
...
@@ -211,7 +277,7 @@ index 1ca9e49d..cd780f69 100644
logger = init_logger(__name__)
@@ -80,6 +
83
,7 @@
class PrefixCachingBlockAllocator(BlockAllocator):
@@ -80,6 +
96
,7 @@
class PrefixCachingBlockAllocator(BlockAllocator):
block_size: int,
block_ids: Optional[Iterable[int]] = None,
eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
...
...
@@ -219,7 +285,7 @@ index 1ca9e49d..cd780f69 100644
):
if block_ids is None:
block_ids = range(num_blocks)
@@ -131,6 +1
35
,9 @@
class PrefixCachingBlockAllocator(BlockAllocator):
@@ -131,6 +1
48
,9 @@
class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData()
...
...
@@ -229,7 +295,7 @@ index 1ca9e49d..cd780f69 100644
def _create_block(
self,
prev_block: Optional[Block],
@@ -337,6 +3
44
,9 @@
class PrefixCachingBlockAllocator(BlockAllocator):
@@ -337,6 +3
57
,9 @@
class PrefixCachingBlockAllocator(BlockAllocator):
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id
...
...
@@ -239,7 +305,7 @@ index 1ca9e49d..cd780f69 100644
self._cached_blocks.pop(content_hash_to_evict)
self._refcounter.incr(block_id)
@@ -513,6 +5
2
3,10 @@
class PrefixCachingBlockAllocator(BlockAllocator):
@@ -513,6 +53
6
,10 @@
class PrefixCachingBlockAllocator(BlockAllocator):
# Mark this block as touched so that it can be marked as
# computed after the entire batch of sequences are scheduled.
self._touched_blocks.add(block.block_id)
...
...
@@ -250,7 +316,7 @@ index 1ca9e49d..cd780f69 100644
return block.block_id
# Reuse the cached content hash
@@ -579,9 +
593
,11 @@
class PrefixCachingBlockAllocator(BlockAllocator):
@@ -579,9 +
606
,11 @@
class PrefixCachingBlockAllocator(BlockAllocator):
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
# Mark all touched blocks as computed.
...
...
@@ -266,10 +332,28 @@ index 1ca9e49d..cd780f69 100644
def _track_block_id(self, block_id: Optional[BlockId],
computed: bool) -> None:
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index c5b3b04f..
21fe0fc
8 100644
index c5b3b04f
3
..
d3a4b77f
8 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -10,7 +10,10 @@
from vllm.core.block.interfaces import Block
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""A block manager that manages token blocks."""
from typing import Dict, List, Optional
from typing import Sequence as GenericSequence
@@ -10,7 +23,10 @@
from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
...
...
@@ -280,7 +364,7 @@ index c5b3b04f..21fe0fc8 100644
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
@@ -60,6 +6
3
,7 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
@@ -60,6 +
7
6,7 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
def __init__(
self,
...
...
@@ -288,7 +372,7 @@ index c5b3b04f..21fe0fc8 100644
block_size: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
@@ -91,11 +
95
,29 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
@@ -91,11 +
108
,29 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
self.watermark_blocks = int(watermark * num_gpu_blocks)
...
...
@@ -318,7 +402,7 @@ index c5b3b04f..21fe0fc8 100644
)
self.block_tables: Dict[SeqId, BlockTable] = {}
@@ -108,7 +13
0
,8 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
@@ -108,7 +1
4
3,8 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
def can_allocate(self,
seq_group: SequenceGroup,
...
...
@@ -328,7 +412,7 @@ index c5b3b04f..21fe0fc8 100644
# FIXME(woosuk): Here we assume that all sequences in the group share
# the same prompt. This may not be true for preempted sequences.
@@ -121,6 +1
44
,10 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
@@ -121,6 +1
57
,10 @@
class SelfAttnBlockSpaceManager(BlockSpaceManager):
num_lookahead_slots=num_lookahead_slots,
)
...
...
@@ -341,11 +425,24 @@ index c5b3b04f..21fe0fc8 100644
assert encoder_seq is not None
diff --git a/vllm/core/event_manager.py b/vllm/core/event_manager.py
new file mode 100644
index 00000000..
a27af580
index 00000000
0
..
79eb8db67
--- /dev/null
+++ b/vllm/core/event_manager.py
@@ -0,0 +1,108 @@
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import ctypes
+import logging
+import uuid
...
...
@@ -454,10 +551,26 @@ index 00000000..a27af580
+
+ self.event_id_counter += 1
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index f507847a..
170a359f
100644
index f507847a
d
..
3f3cba766
100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -4,22 +4,22 @@
import enum
@@ -1,25 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import enum
import os
import random
import time
...
...
@@ -484,7 +597,7 @@ index f507847a..170a359f 100644
logger = init_logger(__name__)
# Test-only. If configured, decode is preempted with
@@ -285,6 +28
5
,7 @@
class SchedulerPrefillOutputs:
@@ -285,6 +2
9
8,7 @@
class SchedulerPrefillOutputs:
# Ignored sequence groups.
ignored_seq_groups: List[SequenceGroup]
num_lookahead_slots: int
...
...
@@ -492,7 +605,7 @@ index f507847a..170a359f 100644
@classmethod
def create_empty(cls) -> "SchedulerPrefillOutputs":
@@ -292,6 +
293
,7 @@
class SchedulerPrefillOutputs:
@@ -292,6 +
306
,7 @@
class SchedulerPrefillOutputs:
seq_groups=[],
ignored_seq_groups=[],
num_lookahead_slots=0,
...
...
@@ -500,7 +613,7 @@ index f507847a..170a359f 100644
)
@@ -325,12 +3
27
,14 @@
class Scheduler:
@@ -325,12 +3
40
,14 @@
class Scheduler:
def __init__(
self,
...
...
@@ -515,7 +628,7 @@ index f507847a..170a359f 100644
self.scheduler_config = scheduler_config
self.cache_config = cache_config
# Note for LoRA scheduling: the current policy is extremely
@@ -356,6 +3
60
,7 @@
class Scheduler:
@@ -356,6 +3
73
,7 @@
class Scheduler:
# Create the block space manager.
self.block_manager = BlockSpaceManagerImpl(
...
...
@@ -523,7 +636,7 @@ index f507847a..170a359f 100644
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
@@ -371,6 +3
76
,16 @@
class Scheduler:
@@ -371,6 +3
89
,16 @@
class Scheduler:
# Sequence groups in the SWAPPED state.
# Contain decode requests that are swapped out.
self.swapped: Deque[SequenceGroup] = deque()
...
...
@@ -540,7 +653,7 @@ index f507847a..170a359f 100644
# Sequence groups finished requests ids since last step iteration.
# It lets the model know that any state associated with these requests
# can and must be released after the current step.
@@ -501,7 +5
16
,7 @@
class Scheduler:
@@ -501,7 +5
29
,7 @@
class Scheduler:
def has_unfinished_seqs(self) -> bool:
return len(self.waiting) != 0 or len(self.running) != 0 or len(
...
...
@@ -549,7 +662,7 @@ index f507847a..170a359f 100644
def get_prefix_cache_hit_rate(self, device: Device) -> float:
return self.block_manager.get_prefix_cache_hit_rate(device)
@@ -523,6 +5
38
,8 @@
class Scheduler:
@@ -523,6 +5
51
,8 @@
class Scheduler:
budget: SchedulingBudget,
curr_loras: Optional[Set[int]],
enable_chunking: bool = False,
...
...
@@ -558,7 +671,7 @@ index f507847a..170a359f 100644
) -> SchedulerRunningOutputs:
"""Schedule sequence groups that are running.
@@ -537,6 +5
54
,8 @@
class Scheduler:
@@ -537,6 +5
67
,8 @@
class Scheduler:
chunked number of tokens are scheduled if
`budget.num_batched_tokens` has not enough capacity to schedule
all tokens.
...
...
@@ -567,7 +680,7 @@ index f507847a..170a359f 100644
Returns:
SchedulerRunningOutputs.
@@ -566,6 +58
5
,38 @@
class Scheduler:
@@ -566,6 +5
9
8,38 @@
class Scheduler:
preempted: List[SequenceGroup] = ret.preempted
swapped_out: List[SequenceGroup] = ret.swapped_out
...
...
@@ -606,7 +719,7 @@ index f507847a..170a359f 100644
running_queue = self.running
assert len(self._async_stopped) == 0
while running_queue:
@@ -925,6 +9
76
,7 @@
class Scheduler:
@@ -925,6 +9
89
,7 @@
class Scheduler:
seq_groups: List[ScheduledSequenceGroup] = []
waiting_queue = self.waiting
...
...
@@ -614,7 +727,7 @@ index f507847a..170a359f 100644
leftover_waiting_sequences: Deque[SequenceGroup] = deque()
while self._passed_delay(time.time()) and waiting_queue:
@@ -961,8 +10
13
,10 @@
class Scheduler:
@@ -961,8 +10
26
,10 @@
class Scheduler:
True, enable_chunking)
# If the sequence group cannot be allocated, stop.
...
...
@@ -626,7 +739,7 @@ index f507847a..170a359f 100644
if can_allocate == AllocStatus.LATER:
break
elif can_allocate == AllocStatus.NEVER:
@@ -1008,7 +10
62
,18 @@
class Scheduler:
@@ -1008,7 +10
75
,18 @@
class Scheduler:
if curr_loras is not None and lora_int_id > 0:
curr_loras.add(lora_int_id)
waiting_queue.popleft()
...
...
@@ -646,7 +759,7 @@ index f507847a..170a359f 100644
if enable_chunking and self.scheduler_config.is_multi_step:
blocks_to_copy: List[Tuple[int, int]] = []
@@ -1046,9 +11
11
,11 @@
class Scheduler:
@@ -1046,9 +11
24
,11 @@
class Scheduler:
seq_groups=seq_groups,
ignored_seq_groups=ignored_seq_groups,
num_lookahead_slots=self._get_num_lookahead_slots(
...
...
@@ -660,7 +773,7 @@ index f507847a..170a359f 100644
"""Schedule queued requests.
The current policy is designed to optimize the throughput. First,
@@ -1066,9 +11
33
,13 @@
class Scheduler:
@@ -1066,9 +11
46
,13 @@
class Scheduler:
for seq_group in self.running:
budget.add_num_seqs(seq_group.request_id,
seq_group.get_max_num_running_seqs())
...
...
@@ -676,7 +789,7 @@ index f507847a..170a359f 100644
prefills = SchedulerPrefillOutputs.create_empty()
running_scheduled = SchedulerRunningOutputs.create_empty()
@@ -1090,7 +11
61
,9 @@
class Scheduler:
@@ -1090,7 +11
74
,9 @@
class Scheduler:
if len(prefills.seq_groups) == 0:
running_scheduled = self._schedule_running(budget,
curr_loras,
...
...
@@ -687,7 +800,7 @@ index f507847a..170a359f 100644
# If any sequence group is preempted, do not swap in any sequence
# group. because it means there's no slot for new running requests.
@@ -1106,7 +11
7
9,12 @@
class Scheduler:
@@ -1106,7 +119
2
,12 @@
class Scheduler:
self.waiting.extendleft(running_scheduled.preempted)
# Update new running requests.
if len(prefills.seq_groups) > 0:
...
...
@@ -701,7 +814,7 @@ index f507847a..170a359f 100644
self.running.extend(running_scheduled.decode_seq_groups_list)
@@ -1248,12 +13
26
,14 @@
class Scheduler:
@@ -1248,12 +13
39
,14 @@
class Scheduler:
len(running_scheduled.swapped_out)),
)
...
...
@@ -718,7 +831,7 @@ index f507847a..170a359f 100644
def _can_append_slots(self, seq_group: SequenceGroup,
enable_chunking: bool) -> bool:
@@ -1287,14 +13
67
,16 @@
class Scheduler:
@@ -1287,14 +13
80
,16 @@
class Scheduler:
return no_single_seq
def schedule(
...
...
@@ -738,7 +851,7 @@ index f507847a..170a359f 100644
now = time.time()
if not self.cache_config.enable_prefix_caching:
@@ -1333,7 +14
15
,8 @@
class Scheduler:
@@ -1333,7 +14
28
,8 @@
class Scheduler:
encoder_seq_data = None
cross_block_table = None
...
...
@@ -748,7 +861,7 @@ index f507847a..170a359f 100644
seq_id = seq.seq_id
seq_data[seq_id] = seq.data
block_tables[seq_id] = self.block_manager.get_block_table(seq)
@@ -1342,7 +14
25
,9 @@
class Scheduler:
@@ -1342,7 +14
38
,9 @@
class Scheduler:
if self.cache_config.enable_prefix_caching:
common_computed_block_nums = (
self.block_manager.get_common_computed_block_ids(
...
...
@@ -759,7 +872,7 @@ index f507847a..170a359f 100644
do_sample = True
is_prompt = seq_group.is_prefill()
@@ -1364,9 +14
49
,30 @@
class Scheduler:
@@ -1364,9 +14
62
,30 @@
class Scheduler:
< seqs[0].data.get_len()):
do_sample = False
...
...
@@ -790,7 +903,7 @@ index f507847a..170a359f 100644
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=is_prompt,
@@ -1392,6 +1
498
,7 @@
class Scheduler:
@@ -1392,6 +1
511
,7 @@
class Scheduler:
if scheduler_outputs.num_prefill_groups > 0 else None,
mm_processor_kwargs=seq_group.mm_processor_kwargs,
prompt_adapter_request=seq_group.prompt_adapter_request,
...
...
@@ -798,7 +911,7 @@ index f507847a..170a359f 100644
)
else:
# When SPMD mode is enabled, we only send delta data except for
@@ -1490,11 +1
597
,17 @@
class Scheduler:
@@ -1490,11 +1
610
,17 @@
class Scheduler:
self._async_stopped.clear()
...
...
@@ -821,10 +934,25 @@ index f507847a..170a359f 100644
blocks_to_copy: List[Tuple[int, int]],
diff --git a/vllm/distributed/device_communicators/kv_rearrange.py b/vllm/distributed/device_communicators/kv_rearrange.py
new file mode 100644
index 00000000..
b9485bd5
index 00000000
0
..
a2f9ce99e
--- /dev/null
+++ b/vllm/distributed/device_communicators/kv_rearrange.py
@@ -0,0 +1,110 @@
@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import triton
+import triton.language as tl
...
...
@@ -938,10 +1066,25 @@ index 00000000..b9485bd5
\
No newline at end of file
diff --git a/vllm/distributed/device_communicators/nixl.py b/vllm/distributed/device_communicators/nixl.py
new file mode 100644
index 00000000..
a8bd202f
index 00000000
0
..
136a0bd37
--- /dev/null
+++ b/vllm/distributed/device_communicators/nixl.py
@@ -0,0 +1,379 @@
@@ -0,0 +1,394 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from typing import List, Tuple
+from vllm.config import VllmConfig
...
...
@@ -1323,11 +1466,24 @@ index 00000000..a8bd202f
+ return done_req_ids
diff --git a/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py b/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py
new file mode 100644
index 00000000..
7b3344f8
index 00000000
0
..
418fc7154
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py
@@ -0,0 +1,350 @@
@@ -0,0 +1,363 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Simple KV Cache Connector for Distributed Machine Learning Inference
+
...
...
@@ -1678,10 +1834,28 @@ index 00000000..7b3344f8
+ self.config.kv_consumers_pipeline_parallel_size = kv_config_enhanced["kv_consumers_pipeline_parallel_size"]
+ self.config.kv_producers_parallel_size = kv_config_enhanced["kv_producers_parallel_size"]
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index fe480533..
c82fda80
100644
index fe480533
4
..
0e16f0b31
100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -27,13 +27,13 @@
class KVConnectorFactory:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import importlib
from typing import TYPE_CHECKING, Callable, Dict, Type
@@ -27,13 +40,13 @@
class KVConnectorFactory:
@classmethod
def create_connector(cls, rank: int, local_rank: int,
...
...
@@ -1697,7 +1871,7 @@ index fe480533..c82fda80 100644
# Register various connectors here.
@@ -48,3 +
48
,8 @@
KVConnectorFactory.register_connector(
@@ -48,3 +
61
,8 @@
KVConnectorFactory.register_connector(
"MooncakeConnector",
"vllm.distributed.kv_transfer.kv_connector.simple_connector",
"SimpleConnector")
...
...
@@ -1707,10 +1881,28 @@ index fe480533..c82fda80 100644
+ "vllm.distributed.kv_transfer.kv_connector.dynamo_connector",
+ "DynamoConnector")
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index 2033e976..
ddebb68e
100644
index 2033e976
2
..
983bc69a3
100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -8,13 +8,15 @@
MooncakePipe.
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
Simple KV Cache Connector for Distributed Machine Learning Inference
@@ -8,13 +21,15 @@
MooncakePipe.
But the logic can be extended to support other pipe and lookup buffer.
"""
...
...
@@ -1727,7 +1919,7 @@ index 2033e976..ddebb68e 100644
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import (
SimpleBuffer)
from vllm.logger import init_logger
@@ -33,6 +
35
,7 @@
class SimpleConnector(KVConnectorBase):
@@ -33,6 +
48
,7 @@
class SimpleConnector(KVConnectorBase):
rank: int,
local_rank: int,
config: VllmConfig,
...
...
@@ -1735,7 +1927,7 @@ index 2033e976..ddebb68e 100644
):
self.config = config.kv_transfer_config
@@ -71,20 +7
4
,31 @@
class SimpleConnector(KVConnectorBase):
@@ -71,20 +
8
7,31 @@
class SimpleConnector(KVConnectorBase):
self.producer_signal_pipe: Union[PyNcclPipe, MooncakePipe]
self.consumer_signal_pipe: Union[PyNcclPipe, MooncakePipe]
...
...
@@ -1768,7 +1960,7 @@ index 2033e976..ddebb68e 100644
local_rank=local_rank,
config=self.config,
port_offset=port_offset_base + 1,
@@ -108,11 +1
22
,13 @@
class SimpleConnector(KVConnectorBase):
@@ -108,11 +1
35
,13 @@
class SimpleConnector(KVConnectorBase):
# its recv pipe to the send pipe of KV producder
if self.config.kv_connector == "PyNcclConnector":
self.consumer_data_pipe = PyNcclPipe(
...
...
@@ -1782,7 +1974,7 @@ index 2033e976..ddebb68e 100644
local_rank=local_rank,
config=self.config,
port_offset=port_offset_base + 1,
@@ -131,21 +1
47
,25 @@
class SimpleConnector(KVConnectorBase):
@@ -131,21 +1
60
,25 @@
class SimpleConnector(KVConnectorBase):
self.config.kv_buffer_size,
)
...
...
@@ -1812,7 +2004,7 @@ index 2033e976..ddebb68e 100644
def send_kv_caches_and_hidden_states(
self,
@@ -161,12 +1
81
,20 @@
class SimpleConnector(KVConnectorBase):
@@ -161,12 +1
94
,20 @@
class SimpleConnector(KVConnectorBase):
slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten()
start_layer = model_executable.model.start_layer
end_layer = model_executable.model.end_layer
...
...
@@ -1837,7 +2029,7 @@ index 2033e976..ddebb68e 100644
# query_lens contains new KV caches that are added to vLLM.
# so we will send them to decode instance
@@ -175,27 +2
03
,40 @@
class SimpleConnector(KVConnectorBase):
@@ -175,27 +2
16
,40 @@
class SimpleConnector(KVConnectorBase):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
current_tokens = input_tokens_tensor[start_pos:end_pos]
...
...
@@ -1892,7 +2084,7 @@ index 2033e976..ddebb68e 100644
logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank())
@@ -215,6 +2
5
6,7 @@
class SimpleConnector(KVConnectorBase):
@@ -215,6 +26
9
,7 @@
class SimpleConnector(KVConnectorBase):
input_tokens_tensor = model_input.input_tokens
seq_lens = model_input.attn_metadata.seq_lens
slot_mapping = model_input.attn_metadata.slot_mapping.flatten()
...
...
@@ -1900,7 +2092,7 @@ index 2033e976..ddebb68e 100644
hidden_or_intermediate_states_for_one_req = []
@@ -222,6 +2
64
,9 @@
class SimpleConnector(KVConnectorBase):
@@ -222,6 +2
77
,9 @@
class SimpleConnector(KVConnectorBase):
num_computed_tokens_list = []
start_pos_list = []
...
...
@@ -1910,7 +2102,7 @@ index 2033e976..ddebb68e 100644
# enumerate different requests
# FIXME(Kuntai): This impl assumes that all requests are prefill.
for idx, slen in enumerate(seq_lens):
@@ -229,13 +27
4
,15 @@
class SimpleConnector(KVConnectorBase):
@@ -229,13 +2
8
7,15 @@
class SimpleConnector(KVConnectorBase):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
current_tokens = input_tokens_tensor[start_pos:end_pos]
...
...
@@ -1927,7 +2119,7 @@ index 2033e976..ddebb68e 100644
torch.ones_like(current_tokens, dtype=bool))
if ret[0] is None:
# didn't find any match.
@@ -267,19 +3
14
,25 @@
class SimpleConnector(KVConnectorBase):
@@ -267,19 +3
27
,25 @@
class SimpleConnector(KVConnectorBase):
kv_cache = kv_caches[i - model_executable.model.start_layer]
layer = model_executable.model.layers[i]
...
...
@@ -1966,7 +2158,7 @@ index 2033e976..ddebb68e 100644
hidden_or_intermediate_states_for_one_req.append(hidden)
@@ -312,3 +3
65
,77 @@
class SimpleConnector(KVConnectorBase):
@@ -312,3 +3
78
,77 @@
class SimpleConnector(KVConnectorBase):
# MooncakePipe reuses data_pipe for signal_pipe, so we only have to
# close the data_pipe.
pass
...
...
@@ -2045,10 +2237,28 @@ index 2033e976..ddebb68e 100644
+ self.config.kv_consumers_pipeline_parallel_size = kv_config_enhanced["kv_consumers_pipeline_parallel_size"]
+ self.config.kv_producers_parallel_size = kv_config_enhanced["kv_producers_parallel_size"]
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index 5e1b6235..b4
506877
100644
index 5e1b6235
2
..
7
b4
cb406e
100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -12,7 +12,8 @@
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
Implements a distributed key-value (KV) cache transfer mechanism.
@@ -12,7 +25,8 @@
import threading
import time
from collections import deque
...
...
@@ -2058,7 +2268,7 @@ index 5e1b6235..b4506877 100644
import torch
@@ -46,7 +
47
,7 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -46,7 +
60
,7 @@
class SimpleBuffer(KVLookupBufferBase):
self.buffer_lock = threading.Lock()
self.signal_pipe = signal_pipe
self.data_pipe = data_pipe
...
...
@@ -2067,7 +2277,7 @@ index 5e1b6235..b4506877 100644
self.normal_signal = torch.tensor([0], device="cpu")
self.end_signal = None
@@ -57,10 +
58
,16 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -57,10 +
71
,16 @@
class SimpleBuffer(KVLookupBufferBase):
# tokens_roi_sender: tokens and roi of the producer (in the buffer)
# tokens_roi_recver: tokens and roi of the consumer (query)
...
...
@@ -2088,7 +2298,7 @@ index 5e1b6235..b4506877 100644
if tokens_recver is None:
# consumer sends an empty request
@@ -80,14 +
87
,14 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -80,14 +
100
,14 @@
class SimpleBuffer(KVLookupBufferBase):
return 0
...
...
@@ -2106,7 +2316,7 @@ index 5e1b6235..b4506877 100644
def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]):
@@ -100,7 +10
7
,7 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -100,7 +1
2
0,7 @@
class SimpleBuffer(KVLookupBufferBase):
raise AssertionError(f"Unknown data type {type(data)}")
...
...
@@ -2115,7 +2325,7 @@ index 5e1b6235..b4506877 100644
key: torch.Tensor, value: torch.Tensor,
hidden: torch.Tensor):
@@ -115,7 +1
22
,7 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -115,7 +1
35
,7 @@
class SimpleBuffer(KVLookupBufferBase):
if isinstance(hidden, torch.Tensor):
hidden = hidden.clone()
...
...
@@ -2124,7 +2334,7 @@ index 5e1b6235..b4506877 100644
with self.buffer_lock:
for data in buffer_item:
@@ -125,53 +1
32
,54 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -125,53 +1
45
,54 @@
class SimpleBuffer(KVLookupBufferBase):
def _is_end_signal(self, signal):
return signal is None
...
...
@@ -2223,7 +2433,7 @@ index 5e1b6235..b4506877 100644
except RuntimeError as e:
if 'Connection closed by peer' not in str(e):
@@ -180,10 +
188
,10 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -180,10 +
201
,10 @@
class SimpleBuffer(KVLookupBufferBase):
logger.debug("Closing drop_select_handler")
def drop_select(
...
...
@@ -2236,7 +2446,7 @@ index 5e1b6235..b4506877 100644
"drop_select should be called by the KV cache consumer "\
"(e.g. the decode vLLM instance)"
@@ -192,26 +2
00
,28 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -192,26 +2
13
,28 @@
class SimpleBuffer(KVLookupBufferBase):
if isinstance(roi, torch.Tensor):
roi = roi.clone().float()
...
...
@@ -2274,7 +2484,7 @@ index 5e1b6235..b4506877 100644
key: torch.Tensor, value: torch.Tensor,
hidden: torch.Tensor) -> None:
@@ -222,20 +2
32
,19 @@
class SimpleBuffer(KVLookupBufferBase):
@@ -222,20 +2
45
,19 @@
class SimpleBuffer(KVLookupBufferBase):
while self.buffer_size > self.buffer_size_threshold:
self.full_handler()
...
...
@@ -2302,10 +2512,28 @@ index 5e1b6235..b4506877 100644
else:
# TODO: have a explicit close signal and have a explicit way to
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 40589fb3..
da2829cf
100644
index 40589fb3
e
..
a3991c39d
100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -23,7 +23,7 @@
class KVPipeBase(ABC):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
This file defines an interface `KVPipeBase`
that provides an abstraction for sending and receiving tensors, or None, via
@@ -23,7 +36,7 @@
class KVPipeBase(ABC):
"""
@abstractmethod
...
...
@@ -2314,7 +2542,7 @@ index 40589fb3..da2829cf 100644
"""Send a tensor, or None, via the pipe.
Need to support sending None -- important for error handling.
@@ -41,7 +4
1
,7 @@
class KVPipeBase(ABC):
@@ -41,7 +
5
4,7 @@
class KVPipeBase(ABC):
raise NotImplementedError
@abstractmethod
...
...
@@ -2325,10 +2553,25 @@ index 40589fb3..da2829cf 100644
Returns:
diff --git a/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
new file mode 100644
index 00000000..
3ee0fa78
index 00000000
0
..
ca5345359
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
@@ -0,0 +1,124 @@
@@ -0,0 +1,139 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import threading
+import typing
...
...
@@ -2454,10 +2697,28 @@ index 00000000..3ee0fa78
+ # dtype = getattr(torch, dtype)
+ self._receive_tensor(tensor_id, rank)
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 7aa53d07..
f5dd50b7
100644
index 7aa53d07
a
..
8fb256aff
100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -45,33 +45,33 @@
class PyNcclPipe(KVPipeBase):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
This module implements a PyNccl pipe for sending and receiving
Optional[torch.Tensor] between distributed ranks with advanced
@@ -45,33 +58,33 @@
class PyNcclPipe(KVPipeBase):
METADATA_DTYPE = torch.int64
def __init__(self,
...
...
@@ -2497,7 +2758,7 @@ index 7aa53d07..f5dd50b7 100644
# transportation-related variables
self.transport_thread: Optional[ThreadPoolExecutor] = None
@@ -145,16 +1
4
5,16 @@
class PyNcclPipe(KVPipeBase):
@@ -145,16 +15
8
,16 @@
class PyNcclPipe(KVPipeBase):
dtype=metadata["dtype"],
device=self.device)
...
...
@@ -2517,7 +2778,7 @@ index 7aa53d07..f5dd50b7 100644
"""
Receive the metadata dictionary from the target rank.
@@ -162,9 +1
62
,9 @@
class PyNcclPipe(KVPipeBase):
@@ -162,9 +1
75
,9 @@
class PyNcclPipe(KVPipeBase):
- metadata: A dictionary with keys "dtype" and "shape" describing
the tensor.
"""
...
...
@@ -2529,7 +2790,7 @@ index 7aa53d07..f5dd50b7 100644
"""
The actual implementation of sending the tensor and its metadata to the
target rank.
@@ -174,12 +17
4
,12 @@
class PyNcclPipe(KVPipeBase):
@@ -174,12 +1
8
7,12 @@
class PyNcclPipe(KVPipeBase):
being sent.
"""
metadata = self._make_metadata(tensor)
...
...
@@ -2545,7 +2806,7 @@ index 7aa53d07..f5dd50b7 100644
"""
The actual implementation of receiving a tensor and its metadata from
the target rank.
@@ -187,21 +
187
,22 @@
class PyNcclPipe(KVPipeBase):
@@ -187,21 +
200
,22 @@
class PyNcclPipe(KVPipeBase):
Returns:
- buffer: The received tensor, or None if no tensor is received.
"""
...
...
@@ -2572,7 +2833,7 @@ index 7aa53d07..f5dd50b7 100644
with self.buffer_size_lock:
self.buffer_size -= tensor_size
@@ -220,7 +2
21
,7 @@
class PyNcclPipe(KVPipeBase):
@@ -220,7 +2
34
,7 @@
class PyNcclPipe(KVPipeBase):
logger.debug("KV cache transfer pipe is full. Waiting...")
time.sleep(0.05)
...
...
@@ -2581,7 +2842,7 @@ index 7aa53d07..f5dd50b7 100644
"""
Sends a tensor and its metadata to the destination rank in a
non-blocking way.
@@ -228,6 +22
9
,7 @@
class PyNcclPipe(KVPipeBase):
@@ -228,6 +2
4
2,7 @@
class PyNcclPipe(KVPipeBase):
Parameters:
- tensor: The tensor to send, or None if no tensor is being sent.
"""
...
...
@@ -2589,7 +2850,7 @@ index 7aa53d07..f5dd50b7 100644
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
@@ -241,32 +2
43
,39 @@
class PyNcclPipe(KVPipeBase):
@@ -241,32 +2
56
,39 @@
class PyNcclPipe(KVPipeBase):
with self.buffer_size_lock:
self.buffer_size += tensor_size
...
...
@@ -2644,10 +2905,28 @@ index 7aa53d07..f5dd50b7 100644
def close(self):
"""
diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py
index 1e80e0bd..
cd90206f
100644
index 1e80e0bd
7
..
f06c7a5f6
100644
--- a/vllm/distributed/kv_transfer/kv_transfer_agent.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py
@@ -35,6 +35,7 @@
class KVTransferAgent:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""A centralized entrypoint to perform distributed KV cache transfer.
This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
@@ -35,6 +48,7 @@
class KVTransferAgent:
rank: int,
local_rank: int,
config: "VllmConfig",
...
...
@@ -2655,7 +2934,7 @@ index 1e80e0bd..cd90206f 100644
):
self.config = config
@@ -47,7 +
48
,7 @@
class KVTransferAgent:
@@ -47,7 +
61
,7 @@
class KVTransferAgent:
"TransferAgent should only be used when kv_connector is set."
self.connector = KVConnectorFactory.create_connector(
...
...
@@ -2665,10 +2944,28 @@ index 1e80e0bd..cd90206f 100644
def send_kv_caches_and_hidden_states(
self,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 321902d1..
b8937ef8
100644
index 321902d1
1
..
03409899e
100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1085,7 +1085,8 @@
def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Copyright 2023 The vLLM team.
# Adapted from
@@ -1085,7 +1098,8 @@
def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
_KV_TRANSFER = kv_transfer.KVTransferAgent(
rank=get_world_group().rank,
local_rank=get_world_group().local_rank,
...
...
@@ -2679,10 +2976,24 @@ index 321902d1..b8937ef8 100644
def ensure_model_parallel_initialized(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d82d9ad9..
03896aa6
100644
index d82d9ad9
d
..
61c1e429d
100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2,13 +2,17 @@
@@ -1,14 +1,31 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import copy
import time
...
...
@@ -2701,7 +3012,7 @@ index d82d9ad9..03896aa6 100644
from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload
@@ -60,6 +
64
,9 @@
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
@@ -60,6 +
77
,9 @@
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
usage_message)
from vllm.utils import Counter, Device, deprecate_kwargs, weak_bind
from vllm.version import __version__ as VLLM_VERSION
...
...
@@ -2711,7 +3022,7 @@ index d82d9ad9..03896aa6 100644
logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5
@@ -90,7 +
97
,7 @@
class OutputData(NamedTuple):
@@ -90,7 +
110
,7 @@
class OutputData(NamedTuple):
# outputs from multiple steps.
is_first_step_output: Optional[bool]
skip: List[int]
...
...
@@ -2720,7 +3031,7 @@ index d82d9ad9..03896aa6 100644
class SchedulerContext:
@@ -104,11 +1
11
,14 @@
class SchedulerContext:
@@ -104,11 +1
24
,14 @@
class SchedulerContext:
self.multi_step_stream_outputs: bool = multi_step_stream_outputs
...
...
@@ -2736,7 +3047,7 @@ index d82d9ad9..03896aa6 100644
self.output_queue.append(
OutputData(outputs=outputs,
seq_group_metadata_list=seq_group_metadata_list,
@@ -116,7 +1
26
,9 @@
class SchedulerContext:
@@ -116,7 +1
39
,9 @@
class SchedulerContext:
is_async=is_async,
is_last_step=is_last_step,
is_first_step_output=is_first_step_output,
...
...
@@ -2747,7 +3058,7 @@ index d82d9ad9..03896aa6 100644
class LLMEngine:
@@ -348,7 +3
60
,7 @@
class LLMEngine:
@@ -348,7 +3
73
,7 @@
class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
self.scheduler = [
Scheduler(
...
...
@@ -2756,7 +3067,7 @@ index d82d9ad9..03896aa6 100644
self.parallel_config.pipeline_parallel_size,
self.async_callbacks[v_id]
if self.model_config.use_async_output_proc else None)
@@ -405,6 +4
17
,40 @@
class LLMEngine:
@@ -405,6 +4
30
,40 @@
class LLMEngine:
self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
...
...
@@ -2797,7 +3108,7 @@ index d82d9ad9..03896aa6 100644
def _initialize_kv_caches(self) -> None:
"""Initialize the KV cache in the worker(s).
@@ -500,6 +5
46
,8 @@
class LLMEngine:
@@ -500,6 +5
59
,8 @@
class LLMEngine:
# Shutdown model executor when engine is garbage collected
# Use getattr since __init__ can fail before the field is set
if model_executor := getattr(self, "model_executor", None):
...
...
@@ -2806,7 +3117,7 @@ index d82d9ad9..03896aa6 100644
model_executor.shutdown()
def get_tokenizer_group(
@@ -552,11 +6
00
,14 @@
class LLMEngine:
@@ -552,11 +6
13
,14 @@
class LLMEngine:
prompt_adapter_request: Optional[PromptAdapterRequest],
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
...
...
@@ -2821,7 +3132,7 @@ index d82d9ad9..03896aa6 100644
ParallelSampleSequenceGroup.add_request(
request_id,
self,
@@ -574,6 +6
25
,8 @@
class LLMEngine:
@@ -574,6 +6
38
,8 @@
class LLMEngine:
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
...
...
@@ -2830,7 +3141,7 @@ index d82d9ad9..03896aa6 100644
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
if is_encoder_decoder_inputs(processed_inputs):
@@ -584,7 +6
37
,7 @@
class LLMEngine:
@@ -584,7 +6
50
,7 @@
class LLMEngine:
encoder_inputs = None
seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
...
...
@@ -2839,7 +3150,7 @@ index d82d9ad9..03896aa6 100644
encoder_seq = (None if encoder_inputs is None else Sequence(
seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
@@ -601,8 +6
54
,12 @@
class LLMEngine:
@@ -601,8 +6
67
,12 @@
class LLMEngine:
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
...
...
@@ -2853,7 +3164,7 @@ index d82d9ad9..03896aa6 100644
seq_group = self._create_sequence_group_with_pooling(
request_id,
seq,
@@ -673,6 +73
0
,7 @@
class LLMEngine:
@@ -673,6 +7
4
3,7 @@
class LLMEngine:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
...
@@ -2861,7 +3172,7 @@ index d82d9ad9..03896aa6 100644
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
@@ -765,6 +8
2
3,7 @@
class LLMEngine:
@@ -765,6 +83
6
,7 @@
class LLMEngine:
prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
priority=priority,
...
...
@@ -2869,7 +3180,7 @@ index d82d9ad9..03896aa6 100644
)
def _validate_token_prompt(self, prompt: PromptType,
@@ -799,6 +8
58
,7 @@
class LLMEngine:
@@ -799,6 +8
71
,7 @@
class LLMEngine:
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
encoder_seq: Optional[Sequence] = None,
priority: int = 0,
...
...
@@ -2877,7 +3188,7 @@ index d82d9ad9..03896aa6 100644
) -> SequenceGroup:
"""Creates a SequenceGroup with SamplingParams."""
max_logprobs = self.get_model_config().max_logprobs
@@ -829,7 +
889
,9 @@
class LLMEngine:
@@ -829,7 +
902
,9 @@
class LLMEngine:
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
...
...
@@ -2888,7 +3199,7 @@ index d82d9ad9..03896aa6 100644
return seq_group
@@ -995,11 +10
5
7,11 @@
class LLMEngine:
@@ -995,11 +107
0
,11 @@
class LLMEngine:
# When we process only one request, no pop is required
# (since later we will process all of the rest)
(outputs, seq_group_metadata_list, scheduler_outputs, is_async,
...
...
@@ -2902,7 +3213,7 @@ index d82d9ad9..03896aa6 100644
# Sanity check
assert len(seq_group_metadata_list) == len(
@@ -1325,15 +1
387
,55 @@
class LLMEngine:
@@ -1325,15 +1
400
,55 @@
class LLMEngine:
# Clear outputs for each new scheduler iteration
ctx.request_outputs.clear()
...
...
@@ -2960,7 +3271,7 @@ index d82d9ad9..03896aa6 100644
ctx.seq_group_metadata_list = seq_group_metadata_list
ctx.scheduler_outputs = scheduler_outputs
@@ -1383,9 +148
5
,46 @@
class LLMEngine:
@@ -1383,9 +14
9
8,46 @@
class LLMEngine:
execute_model_req.async_callback = self.async_callbacks[
virtual_engine]
...
...
@@ -3009,7 +3320,7 @@ index d82d9ad9..03896aa6 100644
# We need to do this here so that last step's sampled_token_ids can
# be passed to the next iteration for PP.
if self.scheduler_config.is_multi_step:
@@ -1396,7 +15
35
,26 @@
class LLMEngine:
@@ -1396,7 +15
48
,26 @@
class LLMEngine:
if len(ctx.output_queue) > 0:
self._process_model_outputs(ctx=ctx)
# No outputs in this case
...
...
@@ -3037,7 +3348,7 @@ index d82d9ad9..03896aa6 100644
# Finish the current step for all the sequence groups.
if self.scheduler_config.is_multi_step:
@@ -1456,7 +16
14
,7 @@
class LLMEngine:
@@ -1456,7 +16
27
,7 @@
class LLMEngine:
# queued control plane messages, such as add/remove lora adapters.
logger.debug("Stopping remote worker execution loop.")
self.model_executor.stop_remote_worker_execution_loop()
...
...
@@ -3047,10 +3358,28 @@ index d82d9ad9..03896aa6 100644
def _has_remaining_steps(
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 3cf1850e..
ae006579
100644
index 3cf1850e
e
..
d20a5f20b
100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -14,13 +14,17 @@
from vllm.outputs import RequestOutput
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import uuid
from dataclasses import dataclass, field
@@ -14,13 +27,17 @@
from vllm.outputs import RequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.utils import deprecate_kwargs
...
...
@@ -3069,7 +3398,7 @@ index 3cf1850e..ae006579 100644
class MQEngineDeadError(RuntimeError):
@@ -36,6 +
40
,7 @@
class RPCProcessRequest:
@@ -36,6 +
53
,7 @@
class RPCProcessRequest:
trace_headers: Optional[Mapping[str, str]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
priority: int = 0
...
...
@@ -3077,7 +3406,7 @@ index 3cf1850e..ae006579 100644
@overload
def __init__(
@@ -78,6 +
83
,7 @@
class RPCProcessRequest:
@@ -78,6 +
96
,7 @@
class RPCProcessRequest:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
...
@@ -3085,7 +3414,7 @@ index 3cf1850e..ae006579 100644
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
@@ -95,7 +1
0
1,7 @@
class RPCProcessRequest:
@@ -95,7 +11
4
,7 @@
class RPCProcessRequest:
self.trace_headers = trace_headers
self.prompt_adapter_request = prompt_adapter_request
self.priority = priority
...
...
@@ -3094,7 +3423,7 @@ index 3cf1850e..ae006579 100644
@dataclass
class RPCError:
@@ -116,7 +1
22
,7 @@
class RPCStartupRequest(Enum):
@@ -116,7 +1
35
,7 @@
class RPCStartupRequest(Enum):
@dataclass
class RPCStartupResponse:
tracing_enabled: bool
...
...
@@ -3103,7 +3432,7 @@ index 3cf1850e..ae006579 100644
class RPCUProfileRequest(Enum):
START_PROFILE = 1
@@ -157,3 +16
3
,13 @@
def ENGINE_DEAD_ERROR(
@@ -157,3 +1
7
6,13 @@
def ENGINE_DEAD_ERROR(
return MQEngineDeadError(
"Engine loop is not running. Inspect the stacktrace to "
f"find the original error: {repr(error)}.")
...
...
@@ -3118,10 +3447,28 @@ index 3cf1850e..ae006579 100644
+ gpu_cache_usage_perc: float
+ gpu_prefix_cache_hit_rate: float
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 85b5f31e..
05030292
100644
index 85b5f31e
3
..
c53b9eced
100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -8,6 +8,7 @@
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import asyncio
import copy
@@ -8,6 +21,7 @@
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
Optional, Union, cast, overload)
import cloudpickle
...
...
@@ -3129,7 +3476,7 @@ index 85b5f31e..05030292 100644
import psutil
import zmq
import zmq.asyncio
@@ -19,20 +
20
,23 @@
from vllm import PoolingParams
@@ -19,20 +
33
,23 @@
from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
...
...
@@ -3156,7 +3503,7 @@ index 85b5f31e..05030292 100644
from vllm.engine.protocol import EngineClient
# yapf: enable
from vllm.envs import VLLM_RPC_TIMEOUT
@@ -46,6 +
50
,8 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -46,6 +
63
,8 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import deprecate_kwargs
...
...
@@ -3165,7 +3512,7 @@ index 85b5f31e..05030292 100644
logger = init_logger(__name__)
@@ -91,6 +
97
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -91,6 +
110
,7 @@
class MQLLMEngineClient(EngineClient):
self._errored_with: Optional[BaseException] = None
# Get the configs.
...
...
@@ -3173,7 +3520,7 @@ index 85b5f31e..05030292 100644
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
@@ -115,6 +1
22
,10 @@
class MQLLMEngineClient(EngineClient):
@@ -115,6 +1
35
,10 @@
class MQLLMEngineClient(EngineClient):
self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL)
self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}")
...
...
@@ -3184,7 +3531,7 @@ index 85b5f31e..05030292 100644
# IPC path for the data socket.
self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}"
@@ -129,8 +1
40
,27 @@
class MQLLMEngineClient(EngineClient):
@@ -129,8 +1
53
,27 @@
class MQLLMEngineClient(EngineClient):
# Loop to check health of the LLMEngine periodically.
# Started after the MQLLMEngine is ready.
self.health_loop: Optional[asyncio.Task] = None
...
...
@@ -3212,7 +3559,7 @@ index 85b5f31e..05030292 100644
@staticmethod
def is_unsupported_config(engine_args: AsyncEngineArgs):
# Pipeline parallel not yet supported
@@ -180,6 +2
10
,61 @@
class MQLLMEngineClient(EngineClient):
@@ -180,6 +2
23
,61 @@
class MQLLMEngineClient(EngineClient):
except Exception as e:
self._set_errored(e)
...
...
@@ -3274,7 +3621,7 @@ index 85b5f31e..05030292 100644
async def run_output_handler_loop(self):
"""Get RequestOutputs from Engine and stream to Request Queues"""
@@ -278,12 +36
3
,26 @@
class MQLLMEngineClient(EngineClient):
@@ -278,12 +3
7
6,26 @@
class MQLLMEngineClient(EngineClient):
# Wait until server is ready.
response = await self._wait_for_server_rpc(socket)
...
...
@@ -3301,7 +3648,7 @@ index 85b5f31e..05030292 100644
def close(self):
"""Destroy the ZeroMQ Context."""
@@ -293,6 +
392
,8 @@
class MQLLMEngineClient(EngineClient):
@@ -293,6 +
405
,8 @@
class MQLLMEngineClient(EngineClient):
# Cancel background tasks.
if self.health_loop is not None:
self.health_loop.cancel()
...
...
@@ -3310,7 +3657,7 @@ index 85b5f31e..05030292 100644
if self.output_loop is not None:
self.output_loop.cancel()
@@ -415,6 +5
16
,9 @@
class MQLLMEngineClient(EngineClient):
@@ -415,6 +5
29
,9 @@
class MQLLMEngineClient(EngineClient):
"""
if self._errored_with is not None:
raise self._errored_with
...
...
@@ -3320,7 +3667,7 @@ index 85b5f31e..05030292 100644
@property
def is_running(self) -> bool:
@@ -473,6 +5
77
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -473,6 +5
90
,7 @@
class MQLLMEngineClient(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
...
@@ -3328,7 +3675,7 @@ index 85b5f31e..05030292 100644
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[RequestOutput, None]:
@@ -502,7 +60
7
,8 @@
class MQLLMEngineClient(EngineClient):
@@ -502,7 +6
2
0,8 @@
class MQLLMEngineClient(EngineClient):
return self._process_request(prompt, sampling_params, request_id,
lora_request, trace_headers,
...
...
@@ -3338,7 +3685,7 @@ index 85b5f31e..05030292 100644
@overload
def encode(
@@ -586,6 +
692
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -586,6 +
705
,7 @@
class MQLLMEngineClient(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
...
@@ -3346,7 +3693,7 @@ index 85b5f31e..05030292 100644
) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[
PoolingRequestOutput, None]]:
"""Send an RPCGenerateRequest to the RPCServer and stream responses."""
@@ -630,6 +7
37
,12 @@
class MQLLMEngineClient(EngineClient):
@@ -630,6 +7
50
,12 @@
class MQLLMEngineClient(EngineClient):
else:
lp_bytes = None
...
...
@@ -3359,7 +3706,7 @@ index 85b5f31e..05030292 100644
request_bytes = pickle.dumps(
RPCProcessRequest(
prompt=prompt,
@@ -639,11 +75
2
,11 @@
class MQLLMEngineClient(EngineClient):
@@ -639,11 +7
6
5,11 @@
class MQLLMEngineClient(EngineClient):
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
priority=priority,
...
...
@@ -3373,7 +3720,7 @@ index 85b5f31e..05030292 100644
await self.input_socket.send_multipart(parts, copy=False)
# 4) Stream the RequestOutputs from the output queue. Note
@@ -705,3 +81
8
,6 @@
class MQLLMEngineClient(EngineClient):
@@ -705,3 +8
3
1,6 @@
class MQLLMEngineClient(EngineClient):
# Raise on error, otherwise happily return None
if isinstance(request_output, BaseException):
raise request_output
...
...
@@ -3381,10 +3728,25 @@ index 85b5f31e..05030292 100644
+ def set_metrics_publisher(self, metrics_publisher):
+ self.metrics_publisher = metrics_publisher
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index a0dd7958..
c82bc15b
100644
index a0dd7958
6
..
ea0d2cd68
100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -3,35 +3,115 @@
@@ -1,37 +1,130 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import pickle
import signal
from contextlib import contextmanager
...
...
@@ -3505,7 +3867,7 @@ index a0dd7958..c82bc15b 100644
class MQLLMEngine:
"""A multiprocessing wrapper for :class:`LLMEngine`.
@@ -94,12 +17
4
,37 @@
class MQLLMEngine:
@@ -94,12 +1
8
7,37 @@
class MQLLMEngine:
self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH)
self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}")
...
...
@@ -3543,7 +3905,7 @@ index a0dd7958..c82bc15b 100644
@property
def dead_error(self) -> BaseException:
if self._errored_with is not None:
@@ -171,8 +2
76
,17 @@
class MQLLMEngine:
@@ -171,8 +2
89
,17 @@
class MQLLMEngine:
# Handle the query from the Client.
if request == RPCStartupRequest.IS_SERVER_READY:
tracing_enabled = self.engine.is_tracing_enabled()
...
...
@@ -3563,7 +3925,7 @@ index a0dd7958..c82bc15b 100644
except Exception as e:
response = e
@@ -185,6 +
299
,7 @@
class MQLLMEngine:
@@ -185,6 +
312
,7 @@
class MQLLMEngine:
while True:
if not self.engine.has_unfinished_requests():
...
...
@@ -3571,7 +3933,7 @@ index a0dd7958..c82bc15b 100644
# Poll until there is work to do.
while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
# When there's no work, check on engine health and send
@@ -220,6 +3
35
,13 @@
class MQLLMEngine:
@@ -220,6 +3
48
,13 @@
class MQLLMEngine:
def handle_new_input(self):
"""Handle new input from the socket"""
try:
...
...
@@ -3585,7 +3947,7 @@ index a0dd7958..c82bc15b 100644
while self.input_socket.poll(timeout=0) != 0:
frames = self.input_socket.recv_multipart(copy=False)
request = pickle.loads(frames[0].buffer)
@@ -262,6 +3
84
,11 @@
class MQLLMEngine:
@@ -262,6 +3
97
,11 @@
class MQLLMEngine:
self._send_outputs(rpc_err)
try:
...
...
@@ -3597,7 +3959,7 @@ index a0dd7958..c82bc15b 100644
self.engine.add_request(
request_id=request_id,
prompt=request.prompt,
@@ -269,7 +
396
,9 @@
class MQLLMEngine:
@@ -269,7 +
409
,9 @@
class MQLLMEngine:
lora_request=request.lora_request,
trace_headers=request.trace_headers,
prompt_adapter_request=request.prompt_adapter_request,
...
...
@@ -3609,10 +3971,28 @@ index a0dd7958..c82bc15b 100644
if self.log_requests:
logger.info("Added request %s.", request.request_id)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 107220d5..
c716f75f
100644
index 107220d5
4
..
e0e0590b6
100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -34,6 +34,7 @@
from vllm.sampling_params import BeamSearchParams, SamplingParams
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import asyncio
import json
@@ -34,6 +47,7 @@
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls
...
...
@@ -3620,7 +4000,7 @@ index 107220d5..c716f75f 100644
logger = init_logger(__name__)
@@ -112,6 +1
13
,7 @@
class OpenAIServingChat(OpenAIServing):
@@ -112,6 +1
26
,7 @@
class OpenAIServingChat(OpenAIServing):
self,
request: ChatCompletionRequest,
raw_request: Optional[Request] = None,
...
...
@@ -3628,7 +4008,7 @@ index 107220d5..c716f75f 100644
) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
ErrorResponse]:
"""
@@ -243,6 +2
4
5,7 @@
class OpenAIServingChat(OpenAIServing):
@@ -243,6 +25
8
,7 @@
class OpenAIServingChat(OpenAIServing):
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
priority=request.priority,
...
...
@@ -3637,10 +4017,28 @@ index 107220d5..c716f75f 100644
generators.append(generator)
diff --git a/vllm/envs.py b/vllm/envs.py
index 745b068b..0
ae63d9
b 100644
index 745b068b
7
..0
f1a022f
b 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,10 @@
if TYPE_CHECKING:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import os
import tempfile
@@ -87,6 +100,10 @@
if TYPE_CHECKING:
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
...
...
@@ -3651,7 +4049,7 @@ index 745b068b..0ae63d9b 100644
def get_default_cache_root():
@@ -572,6 +5
76
,21 @@
environment_variables: Dict[str, Callable[[], Any]] = {
@@ -572,6 +5
89
,21 @@
environment_variables: Dict[str, Callable[[], Any]] = {
# models the alignment is already naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
...
...
@@ -3674,10 +4072,28 @@ index 745b068b..0ae63d9b 100644
# end-env-vars-definition
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 773f5abe..3
eefd266
100644
index 773f5abe
7
..3
65685e13
100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -585,6 +585,8 @@
class DeepseekV2Model(nn.Module):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
@@ -585,6 +598,8 @@
class DeepseekV2Model(nn.Module):
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
...
...
@@ -3687,10 +4103,28 @@ index 773f5abe..3eefd266 100644
self.vocab_size = config.vocab_size
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 786380c3..
56a7cf89
100644
index 786380c3
7
..
e9c3a5e16
100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -6,16 +6,16 @@
from typing import Dict, Generic, List, MutableSequence, Optional
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import time
from dataclasses import dataclass
@@ -6,16 +19,16 @@
from typing import Dict, Generic, List, MutableSequence, Optional
from typing import Sequence as GenericSequence
from typing import Union
...
...
@@ -3711,10 +4145,25 @@ index 786380c3..56a7cf89 100644
"""The output data of one completion output of a request.
diff --git a/vllm/remote_prefill.py b/vllm/remote_prefill.py
new file mode 100644
index 00000000..3f
9711ef
index 00000000
0
..
8
3f
6cd575
--- /dev/null
+++ b/vllm/remote_prefill.py
@@ -0,0 +1,67 @@
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Callable, Optional, List
+from enum import Enum
...
...
@@ -3784,10 +4233,29 @@ index 00000000..3f9711ef
+ remote_prefill_request_callback: Optional[RemotePrefillRequestCallback] = None
\
No newline at end of file
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 97f9e212..
1bb97b00
100644
index 97f9e212
9
..
5849befba
100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -83,7 +83,7 @@
class RequestOutputKind(Enum):
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Sampling parameters for text generation."""
import copy
from dataclasses import dataclass
@@ -83,7 +97,7 @@
class RequestOutputKind(Enum):
DELTA = 1
# Do not return intermediate RequestOuputs
FINAL_ONLY = 2
...
...
@@ -3797,10 +4265,29 @@ index 97f9e212..1bb97b00 100644
class SamplingParams(
msgspec.Struct,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 534b9e60..
18675d2f
100644
index 534b9e60
6
..
c33bbde1c
100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -20,6 +20,7 @@
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Sequence and its related classes."""
import copy
import enum
@@ -20,6 +34,7 @@
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
...
...
@@ -3808,7 +4295,7 @@ index 534b9e60..18675d2f 100644
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -59,13 +
60
,14 @@
class SequenceStatus(enum.IntEnum):
@@ -59,13 +
74
,14 @@
class SequenceStatus(enum.IntEnum):
"""Status of a sequence."""
WAITING = 0
RUNNING = 1
...
...
@@ -3829,7 +4316,7 @@ index 534b9e60..18675d2f 100644
@staticmethod
def is_finished(status: "SequenceStatus") -> bool:
@@ -409,6 +4
11
,7 @@
class Sequence:
@@ -409,6 +4
25
,7 @@
class Sequence:
eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
...
...
@@ -3837,7 +4324,7 @@ index 534b9e60..18675d2f 100644
) -> None:
self.seq_id = seq_id
self.inputs = SingletonInputsAdapter(inputs)
@@ -416,7 +4
19
,7 @@
class Sequence:
@@ -416,7 +4
33
,7 @@
class Sequence:
self.eos_token_id = eos_token_id
self.lora_request = lora_request
self.prompt_adapter_request = prompt_adapter_request
...
...
@@ -3846,7 +4333,7 @@ index 534b9e60..18675d2f 100644
self.data = SequenceData.from_seqs(self.prompt_token_ids)
self.output_logprobs: SampleLogprobs = []
self.output_text = ""
@@ -639,6 +6
42
,7 @@
class SequenceGroup:
@@ -639,6 +6
56
,7 @@
class SequenceGroup:
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
...
...
@@ -3854,7 +4341,7 @@ index 534b9e60..18675d2f 100644
"""
def __init__(
@@ -654,6 +6
58
,7 @@
class SequenceGroup:
@@ -654,6 +6
72
,7 @@
class SequenceGroup:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
...
@@ -3862,7 +4349,7 @@ index 534b9e60..18675d2f 100644
) -> None:
self.request_id = request_id
self.seqs = seqs
@@ -678,7 +6
83
,7 @@
class SequenceGroup:
@@ -678,7 +6
97
,7 @@
class SequenceGroup:
self.encoder_seq = encoder_seq
self.trace_headers = trace_headers
self.priority = priority
...
...
@@ -3871,7 +4358,7 @@ index 534b9e60..18675d2f 100644
self.cached_request_output = None
@property
@@ -927,6 +9
32
,9 @@
class SequenceGroupMetadata(
@@ -927,6 +9
46
,9 @@
class SequenceGroupMetadata(
query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence).
None if chunking is not required.
...
...
@@ -3881,7 +4368,7 @@ index 534b9e60..18675d2f 100644
lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed,
used in prefix caching.
@@ -966,6 +9
74
,9 @@
class SequenceGroupMetadata(
@@ -966,6 +9
88
,9 @@
class SequenceGroupMetadata(
cross_block_table: Optional[List[int]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
token_chunk_size: Optional[int] = None
...
...
@@ -3891,7 +4378,7 @@ index 534b9e60..18675d2f 100644
### Stateful fields that are lazily defined. ###
# The number of speculative tokens adopted in this request.
@@ -1310,6 +13
21
,8 @@
class ExecuteModelRequest(
@@ -1310,6 +13
35
,8 @@
class ExecuteModelRequest(
last_sampled_token_ids: Optional[torch.Tensor] = None
# Async callback
async_callback: Optional[Callable] = None
...
...
@@ -3901,10 +4388,28 @@ index 534b9e60..18675d2f 100644
@property
def is_first_multi_step(self) -> bool:
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 12baecde..
a3f2c464
100644
index 12baecde
6
..
11034b391
100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1824,6 +1824,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import dataclasses
import gc
@@ -1824,6 +1837,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
if self.vllm_config.kv_transfer_config is None:
return False
...
...
@@ -3914,7 +4419,7 @@ index 12baecde..a3f2c464 100644
prefill_meta = model_input.attn_metadata.prefill_metadata
@@ -1849,6 +185
2
,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
@@ -1849,6 +18
6
5,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
if self.vllm_config.kv_transfer_config is None:
return False
...
...
@@ -3925,10 +4430,25 @@ index 12baecde..a3f2c464 100644
prefill_meta = model_input.attn_metadata.prefill_metadata
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 582aa460..
876329d6
100644
index 582aa460
e
..
0be784a40
100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -2,7 +2,7 @@
@@ -1,8 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""A GPU worker class."""
import gc
import os
...
...
@@ -3937,7 +4457,7 @@ index 582aa460..876329d6 100644
import torch
import torch.distributed
@@ -31,6 +
31
,9 @@
from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -31,6 +
45
,9 @@
from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
from vllm.worker.pooling_model_runner import PoolingModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
WorkerInput)
...
...
@@ -3947,7 +4467,7 @@ index 582aa460..876329d6 100644
logger = init_logger(__name__)
@@ -306,6 +3
09
,46 @@
class Worker(LocalOrDistributedWorkerBase):
@@ -306,6 +3
23
,46 @@
class Worker(LocalOrDistributedWorkerBase):
self._init_cache_engine()
self._warm_up_model()
...
...
@@ -3994,7 +4514,7 @@ index 582aa460..876329d6 100644
def _init_cache_engine(self):
assert self.cache_config.num_gpu_blocks is not None
self.cache_engine = [
@@ -367,6 +4
10
,8 @@
class Worker(LocalOrDistributedWorkerBase):
@@ -367,6 +4
24
,8 @@
class Worker(LocalOrDistributedWorkerBase):
blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
device=self.device,
dtype=torch.int64).view(-1, 2)
...
...
@@ -4003,7 +4523,7 @@ index 582aa460..876329d6 100644
return WorkerInput(
num_seq_groups=num_seq_groups,
@@ -375,6 +4
20
,12 @@
class Worker(LocalOrDistributedWorkerBase):
@@ -375,6 +4
34
,12 @@
class Worker(LocalOrDistributedWorkerBase):
blocks_to_copy=blocks_to_copy,
virtual_engine=virtual_engine,
num_steps=num_steps,
...
...
@@ -4017,10 +4537,29 @@ index 582aa460..876329d6 100644
@torch.inference_mode()
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 819b81fb..
2891854b
100644
index 819b81fb
f
..
7d1b1836d
100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -9,6 +9,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import dataclasses
import os
@@ -9,6 +23,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
import cloudpickle
import torch
import torch.nn as nn
...
...
@@ -4028,7 +4567,7 @@ index 819b81fb..2891854b 100644
from vllm.config import (ObservabilityConfig, VllmConfig,
set_current_vllm_config)
@@ -23,6 +
24
,9 @@
from vllm.utils import (enable_trace_function_call_for_thread,
@@ -23,6 +
38
,9 @@
from vllm.utils import (enable_trace_function_call_for_thread,
from vllm.worker.model_runner_base import (BroadcastableModelInput,
ModelRunnerBase,
ModelRunnerInputBase)
...
...
@@ -4038,7 +4577,7 @@ index 819b81fb..2891854b 100644
logger = init_logger(__name__)
@@ -53,6 +
5
7,8 @@
class WorkerBase(ABC):
@@ -53,6 +7
1
,8 @@
class WorkerBase(ABC):
from vllm.platforms import current_platform
self.current_platform = current_platform
...
...
@@ -4047,7 +4586,7 @@ index 819b81fb..2891854b 100644
@abstractmethod
def init_device(self) -> None:
"""Initialize device state, such as loading the model or other on-device
@@ -216,6 +2
22
,13 @@
class WorkerInput:
@@ -216,6 +2
36
,13 @@
class WorkerInput:
virtual_engine: int = 0
num_steps: int = 1
...
...
@@ -4061,7 +4600,7 @@ index 819b81fb..2891854b 100644
@classmethod
def from_broadcasted_tensor_dict(
cls: Type["WorkerInput"],
@@ -232,6 +2
4
5,12 @@
class WorkerInput:
@@ -232,6 +25
9
,12 @@
class WorkerInput:
blocks_to_copy=tensor_dict.pop("blocks_to_copy"),
virtual_engine=tensor_dict["virtual_engine"],
num_steps=tensor_dict.pop("num_steps"),
...
...
@@ -4074,7 +4613,7 @@ index 819b81fb..2891854b 100644
)
def as_broadcastable_tensor_dict(
@@ -246,6 +2
65
,12 @@
class WorkerInput:
@@ -246,6 +2
79
,12 @@
class WorkerInput:
"blocks_to_copy": self.blocks_to_copy,
"virtual_engine": self.virtual_engine,
"num_steps": self.num_steps,
...
...
@@ -4087,7 +4626,7 @@ index 819b81fb..2891854b 100644
}
return tensor_dict
@@ -316,13 +3
41
,16 @@
class LocalOrDistributedWorkerBase(WorkerBase):
@@ -316,13 +3
55
,16 @@
class LocalOrDistributedWorkerBase(WorkerBase):
return None
worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data)
...
...
@@ -4109,7 +4648,7 @@ index 819b81fb..2891854b 100644
def _get_driver_input_and_broadcast(
self, execute_model_req: ExecuteModelRequest
@@ -396,49 +4
24
,88 @@
class LocalOrDistributedWorkerBase(WorkerBase):
@@ -396,49 +4
38
,88 @@
class LocalOrDistributedWorkerBase(WorkerBase):
self.execute_worker(worker_input)
# If there is no input, we don't need to execute the model.
...
...
@@ -4161,7 +4700,12 @@ index 819b81fb..2891854b 100644
- and self.observability_config.collect_model_execute_time):
- orig_model_execute_time = intermediate_tensors.tensors.get(
- "model_execute_time", torch.tensor(0)).item()
-
+ and self.observability_config.collect_model_execute_time
+ and output is not None):
+ for o in output:
+ o.model_execute_time = (orig_model_execute_time +
+ model_execute_time)
- output = self.model_runner.execute_model(
- model_input=model_input,
- kv_caches=self.kv_cache[worker_input.virtual_engine]
...
...
@@ -4170,12 +4714,7 @@ index 819b81fb..2891854b 100644
- num_steps=num_steps,
- **kwargs,
- )
+ and self.observability_config.collect_model_execute_time
+ and output is not None):
+ for o in output:
+ o.model_execute_time = (orig_model_execute_time +
+ model_execute_time)
-
- model_execute_time = time.perf_counter() - start_time
- if not get_pp_group().is_last_rank:
- # output is IntermediateTensors
...
...
container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
0 → 100644
View file @
00730fc6
This source diff could not be displayed because it is too large. You can
view the blob
instead.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment