OpenDAS / dynamo · Commits · 00730fc6

Unverified commit 00730fc6, authored Apr 16, 2025 by ptarasiewiczNV, committed by GitHub on Apr 16, 2025

feat: bump vLLM version to v0.8.4 (#690)

parent 48733546

Showing 4 changed files with 5410 additions and 224 deletions (+5410 -224)
container/Dockerfile.vllm  +6 -6
container/deps/vllm/tests/test_patch_install.py  +1 -1
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch  +756 -217
container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch  +4647 -0
container/Dockerfile.vllm
...
@@ -181,16 +181,12 @@ RUN mkdir /opt/dynamo && \
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
-# Common dependencies
-RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
-    uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
-ARG VLLM_REF="0.7.2"
+ARG VLLM_REF="0.8.4"
ARG VLLM_PATCH="vllm_v${VLLM_REF}-dynamo-kv-disagg-patch.patch"
ARG VLLM_PATCHED_PACKAGE_NAME="ai_dynamo_vllm"
-ARG VLLM_PATCHED_PACKAGE_VERSION="0.7.2.post1"
+ARG VLLM_PATCHED_PACKAGE_VERSION="0.8.4"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    mkdir /tmp/vllm && \
    uv pip install pip wheel && \
...
@@ -211,6 +207,10 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    wheel pack . --dest-dir /workspace/dist && \
    uv pip install /workspace/dist/${VLLM_PATCHED_PACKAGE_NAME}-*.whl
+# Common dependencies
+RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
+    uv pip install --requirement /tmp/requirements.txt
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
...
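A quick way to confirm the bump took effect inside a built image is to query the installed distribution. This is only an illustrative sketch, not part of the commit: it assumes the patched wheel is installed under the distribution name ai_dynamo_vllm (per VLLM_PATCHED_PACKAGE_NAME above) while remaining importable as vllm.

# Hypothetical sanity check for the built image, not code from this commit.
from importlib.metadata import version

import vllm

print(version("ai_dynamo_vllm"))  # expected: 0.8.4 (VLLM_PATCHED_PACKAGE_VERSION)
print(vllm.__version__)           # expected to end with "0.8.4", as asserted in test_patch_install.py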
container/deps/vllm/tests/test_patch_install.py
...
@@ -27,4 +27,4 @@ pytestmark = pytest.mark.pre_merge
@pytest.mark.skipif(vllm is None, reason="Skipping vllm tests, vllm not installed")
def test_version():
    # Verify that the image has the patched version of vllm
-    assert vllm.__version__.endswith("0.7.2")  # type: ignore
+    assert vllm.__version__.endswith("0.8.4")  # type: ignore
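The skipif guard above relies on vllm being bound to None when the import fails; the test module presumably uses a defensive import along these lines (the import block is not shown in this hunk, so treat it as an assumption):

# Assumed import guard that makes `vllm is None` meaningful in the skipif above.
try:
    import vllm
except ImportError:
    vllm = None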
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
diff --git a/vllm/config.py b/vllm/config.py
-index 9ba49757..a4df0019 100644
+index 9ba497576..db2dc002f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
-@@ -2620,6 +2620,9 @@ class KVTransferConfig(BaseModel):
+@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import ast
import copy
+@@ -2620,6 +2633,9 @@ class KVTransferConfig(BaseModel):
# The KV connector for vLLM to transmit KV caches between vLLM instances.
kv_connector: Optional[str] = None
...
@@ -12,7 +30,7 @@ index 9ba49757..a4df0019 100644
# The device used by kv connector to buffer the KV cache.
# Currently only support 'cuda'.
kv_buffer_device: Optional[str] = "cuda"
-@@ -2629,7 +2632,7 @@ class KVTransferConfig(BaseModel):
+@@ -2629,7 +2645,7 @@ class KVTransferConfig(BaseModel):
kv_buffer_size: float = 1e9
# Whether this vLLM instance produces, consumes KV cache, or both. Choices
...
@@ -21,7 +39,7 @@ index 9ba49757..a4df0019 100644
kv_role: Optional[str] = None
# The rank of this vLLM instance in the KV cache transfer. Typical value:
-@@ -2647,6 +2650,14 @@ class KVTransferConfig(BaseModel):
+@@ -2647,6 +2663,14 @@ class KVTransferConfig(BaseModel):
# The KV connector port, used to build distributed connection
kv_port: int = 14579
...
@@ -36,7 +54,7 @@ index 9ba49757..a4df0019 100644
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
-@@ -2680,11 +2691,16 @@ class KVTransferConfig(BaseModel):
+@@ -2680,11 +2704,16 @@ class KVTransferConfig(BaseModel):
f"Supported roles are `kv_producer`, `kv_consumer`, "
f"and `kv_both`")
...
@@ -54,7 +72,7 @@ index 9ba49757..a4df0019 100644
@property
def is_kv_transfer_instance(self) -> bool:
return self.kv_connector is not None and \
-@@ -2694,6 +2710,8 @@ class KVTransferConfig(BaseModel):
+@@ -2694,6 +2723,8 @@ class KVTransferConfig(BaseModel):
def need_kv_parallel_group(self) -> bool:
# for those database-based connector, vLLM does not need to create
# parallel group, and in that case the kv parallel size will be 1.
...
@@ -63,7 +81,7 @@ index 9ba49757..a4df0019 100644
return self.kv_connector is not None and self.kv_parallel_size > 1
@property
-@@ -2706,6 +2724,18 @@ class KVTransferConfig(BaseModel):
+@@ -2706,6 +2737,18 @@ class KVTransferConfig(BaseModel):
return self.kv_connector is not None and \
self.kv_role in ["kv_consumer", "kv_both"]
...
@@ -83,10 +101,28 @@ index 9ba49757..a4df0019 100644
class CompilationLevel:
# constants for the levels of the compilation process
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
-index 359b5b26..d52ee050 100644
+index 359b5b263..7bac45ff0 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
-@@ -6,6 +6,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
+@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from typing import Dict, FrozenSet, List, Optional, Tuple
+@@ -6,6 +19,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
...
@@ -94,7 +130,7 @@ index 359b5b26..d52ee050 100644
from vllm.platforms import current_platform
from vllm.utils import Device
-@@ -28,6 +29,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
+@@ -28,6 +42,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_gpu_blocks: int,
num_cpu_blocks: int,
block_size: int,
...
@@ -102,7 +138,7 @@ index 359b5b26..d52ee050 100644
) -> DeviceAwareBlockAllocator:
"""Creates a CpuGpuBlockAllocator instance with the specified
configuration.
-@@ -64,6 +66,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
+@@ -64,6 +79,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
cpu_block_ids = block_ids[num_gpu_blocks:]
if allocator_type == "naive":
...
@@ -110,7 +146,7 @@ index 359b5b26..d52ee050 100644
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
create_block=NaiveBlock, # type: ignore
num_blocks=num_gpu_blocks,
-@@ -82,12 +85,14 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
+@@ -82,12 +98,14 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_blocks=num_gpu_blocks,
block_size=block_size,
block_ids=gpu_block_ids,
...
@@ -125,7 +161,7 @@ index 359b5b26..d52ee050 100644
)
else:
raise ValueError(f"Unknown allocator type {allocator_type=}")
-@@ -95,10 +100,12 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
+@@ -95,10 +113,12 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
return CpuGpuBlockAllocator(
cpu_block_allocator=cpu_allocator,
gpu_block_allocator=gpu_allocator,
...
@@ -139,7 +175,7 @@ index 359b5b26..d52ee050 100644
assert not (
cpu_block_allocator.all_block_ids
& gpu_block_allocator.all_block_ids
-@@ -108,6 +115,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
+@@ -108,6 +128,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Device.CPU: cpu_block_allocator,
Device.GPU: gpu_block_allocator,
}
...
@@ -148,10 +184,24 @@ index 359b5b26..d52ee050 100644
self._swap_mapping: Dict[int, int] = {}
self._null_block: Optional[Block] = None
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
-index c388366b..31ed7aa4 100644
+index c388366b8..3c223b519 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
-@@ -2,7 +2,7 @@
+@@ -1,8 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
from collections import deque
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
...
@@ -160,7 +210,7 @@ index c388366b..31ed7aa4 100644
from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
get_all_blocks_recursively)
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
-@@ -38,7 +38,7 @@ class NaiveBlockAllocator(BlockAllocator):
+@@ -38,7 +51,7 @@ class NaiveBlockAllocator(BlockAllocator):
if block_ids is None:
block_ids = range(num_blocks)
...
@@ -169,7 +219,7 @@ index c388366b..31ed7aa4 100644
self._all_block_indices = frozenset(block_ids)
assert len(self._all_block_indices) == num_blocks
-@@ -134,7 +134,8 @@ class NaiveBlockAllocator(BlockAllocator):
+@@ -134,7 +147,8 @@ class NaiveBlockAllocator(BlockAllocator):
if not self._free_block_indices:
raise BlockAllocator.NoFreeBlocksError()
...
@@ -179,7 +229,7 @@ index c388366b..31ed7aa4 100644
self._refcounter.incr(block_id)
return block_id
-@@ -148,7 +149,7 @@ class NaiveBlockAllocator(BlockAllocator):
+@@ -148,7 +162,7 @@ class NaiveBlockAllocator(BlockAllocator):
refcount = self._refcounter.decr(block_id)
if refcount == 0:
...
@@ -189,10 +239,26 @@ index c388366b..31ed7aa4 100644
def free(self, block: Block, keep_block_object: bool = False) -> None:
# Release the physical block id
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
-index 1ca9e49d..cd780f69 100644
+index 1ca9e49da..26fabb243 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
-@@ -4,7 +4,7 @@ import sys
+@@ -1,10 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""Token blocks."""
import sys
from bisect import bisect_left
from os.path import commonprefix
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
...
@@ -201,7 +267,7 @@ index 1ca9e49d..cd780f69 100644
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
get_all_blocks_recursively)
-@@ -23,6 +23,9 @@ PrefixHash = int
+@@ -23,6 +36,9 @@ PrefixHash = int
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME = -1
...
@@ -211,7 +277,7 @@ index 1ca9e49d..cd780f69 100644
logger = init_logger(__name__)
-@@ -80,6 +83,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
+@@ -80,6 +96,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
block_size: int,
block_ids: Optional[Iterable[int]] = None,
eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
...
@@ -219,7 +285,7 @@ index 1ca9e49d..cd780f69 100644
):
if block_ids is None:
block_ids = range(num_blocks)
-@@ -131,6 +135,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
+@@ -131,6 +148,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData()
...
@@ -229,7 +295,7 @@ index 1ca9e49d..cd780f69 100644
def _create_block(
self,
prev_block: Optional[Block],
-@@ -337,6 +344,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
+@@ -337,6 +357,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id
...
@@ -239,7 +305,7 @@ index 1ca9e49d..cd780f69 100644
self._cached_blocks.pop(content_hash_to_evict)
self._refcounter.incr(block_id)
-@@ -513,6 +523,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
+@@ -513,6 +536,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# Mark this block as touched so that it can be marked as
# computed after the entire batch of sequences are scheduled.
self._touched_blocks.add(block.block_id)
...
@@ -250,7 +316,7 @@ index 1ca9e49d..cd780f69 100644
return block.block_id
# Reuse the cached content hash
-@@ -579,9 +593,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
+@@ -579,9 +606,11 @@ class PrefixCachingBlockAllocator(BlockAllocator):
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
# Mark all touched blocks as computed.
...
@@ -266,10 +332,28 @@ index 1ca9e49d..cd780f69 100644
def _track_block_id(self, block_id: Optional[BlockId],
computed: bool) -> None:
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
-index c5b3b04f..21fe0fc8 100644
+index c5b3b04f3..d3a4b77f8 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
-@@ -10,7 +10,10 @@ from vllm.core.block.interfaces import Block
+@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""A block manager that manages token blocks."""
from typing import Dict, List, Optional
from typing import Sequence as GenericSequence
+@@ -10,7 +23,10 @@ from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
...
@@ -280,7 +364,7 @@ index c5b3b04f..21fe0fc8 100644
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
-@@ -60,6 +63,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
+@@ -60,6 +76,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def __init__(
self,
...
@@ -288,7 +372,7 @@ index c5b3b04f..21fe0fc8 100644
block_size: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
-@@ -91,11 +95,29 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
+@@ -91,11 +108,29 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
self.watermark_blocks = int(watermark * num_gpu_blocks)
...
@@ -318,7 +402,7 @@ index c5b3b04f..21fe0fc8 100644
)
self.block_tables: Dict[SeqId, BlockTable] = {}
-@@ -108,7 +130,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
+@@ -108,7 +143,8 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def can_allocate(self,
seq_group: SequenceGroup,
...
@@ -328,7 +412,7 @@ index c5b3b04f..21fe0fc8 100644
# FIXME(woosuk): Here we assume that all sequences in the group share
# the same prompt. This may not be true for preempted sequences.
-@@ -121,6 +144,10 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
+@@ -121,6 +157,10 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
num_lookahead_slots=num_lookahead_slots,
)
...
@@ -341,11 +425,24 @@ index c5b3b04f..21fe0fc8 100644
assert encoder_seq is not None
diff --git a/vllm/core/event_manager.py b/vllm/core/event_manager.py
new file mode 100644
-index 00000000..a27af580
+index 000000000..79eb8db67
--- /dev/null
+++ b/vllm/core/event_manager.py
-@@ -0,0 +1,108 @@
+@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import ctypes
+import logging
+import uuid
...
@@ -454,10 +551,26 @@ index 00000000..a27af580
+
+ self.event_id_counter += 1
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
-index f507847a..170a359f 100644
+index f507847ad..3f3cba766 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
-@@ -4,22 +4,22 @@ import enum
+@@ -1,25 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import enum
import os
import random
import time
...
@@ -484,7 +597,7 @@ index f507847a..170a359f 100644
logger = init_logger(__name__)
# Test-only. If configured, decode is preempted with
-@@ -285,6 +285,7 @@ class SchedulerPrefillOutputs:
+@@ -285,6 +298,7 @@ class SchedulerPrefillOutputs:
# Ignored sequence groups.
ignored_seq_groups: List[SequenceGroup]
num_lookahead_slots: int
...
@@ -492,7 +605,7 @@ index f507847a..170a359f 100644
@classmethod
def create_empty(cls) -> "SchedulerPrefillOutputs":
-@@ -292,6 +293,7 @@ class SchedulerPrefillOutputs:
+@@ -292,6 +306,7 @@ class SchedulerPrefillOutputs:
seq_groups=[],
ignored_seq_groups=[],
num_lookahead_slots=0,
...
@@ -500,7 +613,7 @@ index f507847a..170a359f 100644
)
-@@ -325,12 +327,14 @@ class Scheduler:
+@@ -325,12 +340,14 @@ class Scheduler:
def __init__(
self,
...
@@ -515,7 +628,7 @@ index f507847a..170a359f 100644
self.scheduler_config = scheduler_config
self.cache_config = cache_config
# Note for LoRA scheduling: the current policy is extremely
-@@ -356,6 +360,7 @@ class Scheduler:
+@@ -356,6 +373,7 @@ class Scheduler:
# Create the block space manager.
self.block_manager = BlockSpaceManagerImpl(
...
@@ -523,7 +636,7 @@ index f507847a..170a359f 100644
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
-@@ -371,6 +376,16 @@ class Scheduler:
+@@ -371,6 +389,16 @@ class Scheduler:
# Sequence groups in the SWAPPED state.
# Contain decode requests that are swapped out.
self.swapped: Deque[SequenceGroup] = deque()
...
@@ -540,7 +653,7 @@ index f507847a..170a359f 100644
# Sequence groups finished requests ids since last step iteration.
# It lets the model know that any state associated with these requests
# can and must be released after the current step.
-@@ -501,7 +516,7 @@ class Scheduler:
+@@ -501,7 +529,7 @@ class Scheduler:
def has_unfinished_seqs(self) -> bool:
return len(self.waiting) != 0 or len(self.running) != 0 or len(
...
@@ -549,7 +662,7 @@ index f507847a..170a359f 100644
def get_prefix_cache_hit_rate(self, device: Device) -> float:
return self.block_manager.get_prefix_cache_hit_rate(device)
-@@ -523,6 +538,8 @@ class Scheduler:
+@@ -523,6 +551,8 @@ class Scheduler:
budget: SchedulingBudget,
curr_loras: Optional[Set[int]],
enable_chunking: bool = False,
...
@@ -558,7 +671,7 @@ index f507847a..170a359f 100644
) -> SchedulerRunningOutputs:
"""Schedule sequence groups that are running.
-@@ -537,6 +554,8 @@ class Scheduler:
+@@ -537,6 +567,8 @@ class Scheduler:
chunked number of tokens are scheduled if
`budget.num_batched_tokens` has not enough capacity to schedule
all tokens.
...
@@ -567,7 +680,7 @@ index f507847a..170a359f 100644
Returns:
SchedulerRunningOutputs.
-@@ -566,6 +585,38 @@ class Scheduler:
+@@ -566,6 +598,38 @@ class Scheduler:
preempted: List[SequenceGroup] = ret.preempted
swapped_out: List[SequenceGroup] = ret.swapped_out
...
@@ -606,7 +719,7 @@ index f507847a..170a359f 100644
running_queue = self.running
assert len(self._async_stopped) == 0
while running_queue:
-@@ -925,6 +976,7 @@ class Scheduler:
+@@ -925,6 +989,7 @@ class Scheduler:
seq_groups: List[ScheduledSequenceGroup] = []
waiting_queue = self.waiting
...
@@ -614,7 +727,7 @@ index f507847a..170a359f 100644
leftover_waiting_sequences: Deque[SequenceGroup] = deque()
while self._passed_delay(time.time()) and waiting_queue:
-@@ -961,8 +1013,10 @@ class Scheduler:
+@@ -961,8 +1026,10 @@ class Scheduler:
True, enable_chunking)
# If the sequence group cannot be allocated, stop.
...
@@ -626,7 +739,7 @@ index f507847a..170a359f 100644
if can_allocate == AllocStatus.LATER:
break
elif can_allocate == AllocStatus.NEVER:
-@@ -1008,7 +1062,18 @@ class Scheduler:
+@@ -1008,7 +1075,18 @@ class Scheduler:
if curr_loras is not None and lora_int_id > 0:
curr_loras.add(lora_int_id)
waiting_queue.popleft()
...
@@ -646,7 +759,7 @@ index f507847a..170a359f 100644
if enable_chunking and self.scheduler_config.is_multi_step:
blocks_to_copy: List[Tuple[int, int]] = []
-@@ -1046,9 +1111,11 @@ class Scheduler:
+@@ -1046,9 +1124,11 @@ class Scheduler:
seq_groups=seq_groups,
ignored_seq_groups=ignored_seq_groups,
num_lookahead_slots=self._get_num_lookahead_slots(
...
@@ -660,7 +773,7 @@ index f507847a..170a359f 100644
"""Schedule queued requests.
The current policy is designed to optimize the throughput. First,
-@@ -1066,9 +1133,13 @@ class Scheduler:
+@@ -1066,9 +1146,13 @@ class Scheduler:
for seq_group in self.running:
budget.add_num_seqs(seq_group.request_id,
seq_group.get_max_num_running_seqs())
...
@@ -676,7 +789,7 @@ index f507847a..170a359f 100644
prefills = SchedulerPrefillOutputs.create_empty()
running_scheduled = SchedulerRunningOutputs.create_empty()
-@@ -1090,7 +1161,9 @@ class Scheduler:
+@@ -1090,7 +1174,9 @@ class Scheduler:
if len(prefills.seq_groups) == 0:
running_scheduled = self._schedule_running(budget,
curr_loras,
...
@@ -687,7 +800,7 @@ index f507847a..170a359f 100644
# If any sequence group is preempted, do not swap in any sequence
# group. because it means there's no slot for new running requests.
-@@ -1106,7 +1179,12 @@ class Scheduler:
+@@ -1106,7 +1192,12 @@ class Scheduler:
self.waiting.extendleft(running_scheduled.preempted)
# Update new running requests.
if len(prefills.seq_groups) > 0:
...
@@ -701,7 +814,7 @@ index f507847a..170a359f 100644
self.running.extend(running_scheduled.decode_seq_groups_list)
-@@ -1248,12 +1326,14 @@ class Scheduler:
+@@ -1248,12 +1339,14 @@ class Scheduler:
len(running_scheduled.swapped_out)),
)
...
@@ -718,7 +831,7 @@ index f507847a..170a359f 100644
def _can_append_slots(self, seq_group: SequenceGroup,
enable_chunking: bool) -> bool:
-@@ -1287,14 +1367,16 @@ class Scheduler:
+@@ -1287,14 +1380,16 @@ class Scheduler:
return no_single_seq
def schedule(
...
@@ -738,7 +851,7 @@ index f507847a..170a359f 100644
now = time.time()
if not self.cache_config.enable_prefix_caching:
-@@ -1333,7 +1415,8 @@ class Scheduler:
+@@ -1333,7 +1428,8 @@ class Scheduler:
encoder_seq_data = None
cross_block_table = None
...
@@ -748,7 +861,7 @@ index f507847a..170a359f 100644
seq_id = seq.seq_id
seq_data[seq_id] = seq.data
block_tables[seq_id] = self.block_manager.get_block_table(seq)
-@@ -1342,7 +1425,9 @@ class Scheduler:
+@@ -1342,7 +1438,9 @@ class Scheduler:
if self.cache_config.enable_prefix_caching:
common_computed_block_nums = (
self.block_manager.get_common_computed_block_ids(
...
@@ -759,7 +872,7 @@ index f507847a..170a359f 100644
do_sample = True
is_prompt = seq_group.is_prefill()
-@@ -1364,9 +1449,30 @@ class Scheduler:
+@@ -1364,9 +1462,30 @@ class Scheduler:
< seqs[0].data.get_len()):
do_sample = False
...
@@ -790,7 +903,7 @@ index f507847a..170a359f 100644
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=is_prompt,
-@@ -1392,6 +1498,7 @@ class Scheduler:
+@@ -1392,6 +1511,7 @@ class Scheduler:
if scheduler_outputs.num_prefill_groups > 0 else None,
mm_processor_kwargs=seq_group.mm_processor_kwargs,
prompt_adapter_request=seq_group.prompt_adapter_request,
...
@@ -798,7 +911,7 @@ index f507847a..170a359f 100644
)
else:
# When SPMD mode is enabled, we only send delta data except for
-@@ -1490,11 +1597,17 @@ class Scheduler:
+@@ -1490,11 +1610,17 @@ class Scheduler:
self._async_stopped.clear()
...
@@ -821,10 +934,25 @@ index f507847a..170a359f 100644
blocks_to_copy: List[Tuple[int, int]],
diff --git a/vllm/distributed/device_communicators/kv_rearrange.py b/vllm/distributed/device_communicators/kv_rearrange.py
new file mode 100644
-index 00000000..b9485bd5
+index 000000000..a2f9ce99e
--- /dev/null
+++ b/vllm/distributed/device_communicators/kv_rearrange.py
-@@ -0,0 +1,110 @@
+@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import triton
+import triton.language as tl
...
@@ -938,10 +1066,25 @@ index 00000000..b9485bd5
\ No newline at end of file
diff --git a/vllm/distributed/device_communicators/nixl.py b/vllm/distributed/device_communicators/nixl.py
new file mode 100644
-index 00000000..a8bd202f
+index 000000000..136a0bd37
--- /dev/null
+++ b/vllm/distributed/device_communicators/nixl.py
-@@ -0,0 +1,379 @@
+@@ -0,0 +1,394 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from typing import List, Tuple
+from vllm.config import VllmConfig
...
@@ -1323,11 +1466,24 @@ index 00000000..a8bd202f
+ return done_req_ids
diff --git a/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py b/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py
new file mode 100644
-index 00000000..7b3344f8
+index 000000000..418fc7154
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/dynamo_connector.py
-@@ -0,0 +1,350 @@
+@@ -0,0 +1,363 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Simple KV Cache Connector for Distributed Machine Learning Inference
+
...
@@ -1678,10 +1834,28 @@ index 00000000..7b3344f8
+ self.config.kv_consumers_pipeline_parallel_size = kv_config_enhanced["kv_consumers_pipeline_parallel_size"]
+ self.config.kv_producers_parallel_size = kv_config_enhanced["kv_producers_parallel_size"]
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
-index fe480533..c82fda80 100644
+index fe4805334..0e16f0b31 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
-@@ -27,13 +27,13 @@ class KVConnectorFactory:
+@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import importlib
from typing import TYPE_CHECKING, Callable, Dict, Type
+@@ -27,13 +40,13 @@ class KVConnectorFactory:
@classmethod
def create_connector(cls, rank: int, local_rank: int,
...
@@ -1697,7 +1871,7 @@ index fe480533..c82fda80 100644
# Register various connectors here.
-@@ -48,3 +48,8 @@ KVConnectorFactory.register_connector(
+@@ -48,3 +61,8 @@ KVConnectorFactory.register_connector(
"MooncakeConnector",
"vllm.distributed.kv_transfer.kv_connector.simple_connector",
"SimpleConnector")
...
@@ -1707,10 +1881,28 @@ index fe480533..c82fda80 100644
+ "vllm.distributed.kv_transfer.kv_connector.dynamo_connector",
+ "DynamoConnector")
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
-index 2033e976..ddebb68e 100644
+index 2033e9762..983bc69a3 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
-@@ -8,13 +8,15 @@ MooncakePipe.
+@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
Simple KV Cache Connector for Distributed Machine Learning Inference
+@@ -8,13 +21,15 @@ MooncakePipe.
But the logic can be extended to support other pipe and lookup buffer.
"""
...
@@ -1727,7 +1919,7 @@ index 2033e976..ddebb68e 100644
from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import (
SimpleBuffer)
from vllm.logger import init_logger
-@@ -33,6 +35,7 @@ class SimpleConnector(KVConnectorBase):
+@@ -33,6 +48,7 @@ class SimpleConnector(KVConnectorBase):
rank: int,
local_rank: int,
config: VllmConfig,
...
@@ -1735,7 +1927,7 @@ index 2033e976..ddebb68e 100644
):
self.config = config.kv_transfer_config
-@@ -71,20 +74,31 @@ class SimpleConnector(KVConnectorBase):
+@@ -71,20 +87,31 @@ class SimpleConnector(KVConnectorBase):
self.producer_signal_pipe: Union[PyNcclPipe, MooncakePipe]
self.consumer_signal_pipe: Union[PyNcclPipe, MooncakePipe]
...
@@ -1768,7 +1960,7 @@ index 2033e976..ddebb68e 100644
local_rank=local_rank,
config=self.config,
port_offset=port_offset_base + 1,
-@@ -108,11 +122,13 @@ class SimpleConnector(KVConnectorBase):
+@@ -108,11 +135,13 @@ class SimpleConnector(KVConnectorBase):
# its recv pipe to the send pipe of KV producder
if self.config.kv_connector == "PyNcclConnector":
self.consumer_data_pipe = PyNcclPipe(
...
@@ -1782,7 +1974,7 @@ index 2033e976..ddebb68e 100644
local_rank=local_rank,
config=self.config,
port_offset=port_offset_base + 1,
-@@ -131,21 +147,25 @@ class SimpleConnector(KVConnectorBase):
+@@ -131,21 +160,25 @@ class SimpleConnector(KVConnectorBase):
self.config.kv_buffer_size,
)
...
@@ -1812,7 +2004,7 @@ index 2033e976..ddebb68e 100644
def send_kv_caches_and_hidden_states(
self,
-@@ -161,12 +181,20 @@ class SimpleConnector(KVConnectorBase):
+@@ -161,12 +194,20 @@ class SimpleConnector(KVConnectorBase):
slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten()
start_layer = model_executable.model.start_layer
end_layer = model_executable.model.end_layer
...
@@ -1837,7 +2029,7 @@ index 2033e976..ddebb68e 100644
# query_lens contains new KV caches that are added to vLLM.
# so we will send them to decode instance
-@@ -175,27 +203,40 @@ class SimpleConnector(KVConnectorBase):
+@@ -175,27 +216,40 @@ class SimpleConnector(KVConnectorBase):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
current_tokens = input_tokens_tensor[start_pos:end_pos]
...
@@ -1892,7 +2084,7 @@ index 2033e976..ddebb68e 100644
logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank())
-@@ -215,6 +256,7 @@ class SimpleConnector(KVConnectorBase):
+@@ -215,6 +269,7 @@ class SimpleConnector(KVConnectorBase):
input_tokens_tensor = model_input.input_tokens
seq_lens = model_input.attn_metadata.seq_lens
slot_mapping = model_input.attn_metadata.slot_mapping.flatten()
...
@@ -1900,7 +2092,7 @@ index 2033e976..ddebb68e 100644
hidden_or_intermediate_states_for_one_req = []
-@@ -222,6 +264,9 @@ class SimpleConnector(KVConnectorBase):
+@@ -222,6 +277,9 @@ class SimpleConnector(KVConnectorBase):
num_computed_tokens_list = []
start_pos_list = []
...
@@ -1910,7 +2102,7 @@ index 2033e976..ddebb68e 100644
# enumerate different requests
# FIXME(Kuntai): This impl assumes that all requests are prefill.
for idx, slen in enumerate(seq_lens):
-@@ -229,13 +274,15 @@ class SimpleConnector(KVConnectorBase):
+@@ -229,13 +287,15 @@ class SimpleConnector(KVConnectorBase):
start_pos = sum(seq_lens[:idx])
end_pos = start_pos + slen
current_tokens = input_tokens_tensor[start_pos:end_pos]
...
@@ -1927,7 +2119,7 @@ index 2033e976..ddebb68e 100644
torch.ones_like(current_tokens, dtype=bool))
if ret[0] is None:
# didn't find any match.
-@@ -267,19 +314,25 @@ class SimpleConnector(KVConnectorBase):
+@@ -267,19 +327,25 @@ class SimpleConnector(KVConnectorBase):
kv_cache = kv_caches[i - model_executable.model.start_layer]
layer = model_executable.model.layers[i]
...
@@ -1966,7 +2158,7 @@ index 2033e976..ddebb68e 100644
hidden_or_intermediate_states_for_one_req.append(hidden)
-@@ -312,3 +365,77 @@ class SimpleConnector(KVConnectorBase):
+@@ -312,3 +378,77 @@ class SimpleConnector(KVConnectorBase):
# MooncakePipe reuses data_pipe for signal_pipe, so we only have to
# close the data_pipe.
pass
...
@@ -2045,10 +2237,28 @@ index 2033e976..ddebb68e 100644
+ self.config.kv_consumers_pipeline_parallel_size = kv_config_enhanced["kv_consumers_pipeline_parallel_size"]
+ self.config.kv_producers_parallel_size = kv_config_enhanced["kv_producers_parallel_size"]
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index 5e1b6235..b4
506877
100644
index 5e1b6235
2
..
7
b4
cb406e
100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -12,7 +12,8 @@
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
Implements a distributed key-value (KV) cache transfer mechanism.
@@ -12,7 +25,8 @@
import threading
import time
from collections import deque
...
@@ -2058,7 +2268,7 @@ index 5e1b6235..b4506877 100644
import torch
@@ -46,7 +47,7 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -46,7 +60,7 @@ class SimpleBuffer(KVLookupBufferBase):
self.buffer_lock = threading.Lock()
self.signal_pipe = signal_pipe
self.data_pipe = data_pipe
...
@@ -2067,7 +2277,7 @@ index 5e1b6235..b4506877 100644
self.normal_signal = torch.tensor([0], device="cpu")
self.end_signal = None
@@ -57,10 +58,16 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -57,10 +71,16 @@ class SimpleBuffer(KVLookupBufferBase):
# tokens_roi_sender: tokens and roi of the producer (in the buffer)
# tokens_roi_recver: tokens and roi of the consumer (query)
...
@@ -2088,7 +2298,7 @@ index 5e1b6235..b4506877 100644
if tokens_recver is None:
# consumer sends an empty request
@@ -80,14 +87,14 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -80,14 +100,14 @@ class SimpleBuffer(KVLookupBufferBase):
return 0
...
@@ -2106,7 +2316,7 @@ index 5e1b6235..b4506877 100644
def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]):
@@ -100,7 +107,7 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -100,7 +120,7 @@ class SimpleBuffer(KVLookupBufferBase):
raise AssertionError(f"Unknown data type {type(data)}")
...
@@ -2115,7 +2325,7 @@ index 5e1b6235..b4506877 100644
key: torch.Tensor, value: torch.Tensor,
hidden: torch.Tensor):
@@ -115,7 +122,7 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -115,7 +135,7 @@ class SimpleBuffer(KVLookupBufferBase):
if isinstance(hidden, torch.Tensor):
hidden = hidden.clone()
...
@@ -2124,7 +2334,7 @@ index 5e1b6235..b4506877 100644
with self.buffer_lock:
for data in buffer_item:
@@ -125,53 +132,54 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -125,53 +145,54 @@ class SimpleBuffer(KVLookupBufferBase):
def _is_end_signal(self, signal):
return signal is None
...
@@ -2223,7 +2433,7 @@ index 5e1b6235..b4506877 100644
except RuntimeError as e:
if 'Connection closed by peer' not in str(e):
@@ -180,10 +188,10 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -180,10 +201,10 @@ class SimpleBuffer(KVLookupBufferBase):
logger.debug("Closing drop_select_handler")
def drop_select(
...
@@ -2236,7 +2446,7 @@ index 5e1b6235..b4506877 100644
"drop_select should be called by the KV cache consumer "\
"(e.g. the decode vLLM instance)"
@@ -192,26 +200,28 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -192,26 +213,28 @@ class SimpleBuffer(KVLookupBufferBase):
if isinstance(roi, torch.Tensor):
roi = roi.clone().float()
...
@@ -2274,7 +2484,7 @@ index 5e1b6235..b4506877 100644
key: torch.Tensor, value: torch.Tensor,
hidden: torch.Tensor) -> None:
@@ -222,20 +232,19 @@ class SimpleBuffer(KVLookupBufferBase):
@@ -222,20 +245,19 @@ class SimpleBuffer(KVLookupBufferBase):
while self.buffer_size > self.buffer_size_threshold:
self.full_handler()
...
@@ -2302,10 +2512,28 @@ index 5e1b6235..b4506877 100644
else:
# TODO: have a explicit close signal and have a explicit way to
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index 40589fb3..da2829cf 100644
index 40589fb3e..a3991c39d 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -23,7 +23,7 @@
class KVPipeBase(ABC):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
This file defines an interface `KVPipeBase`
that provides an abstraction for sending and receiving tensors, or None, via
@@ -23,7 +36,7 @@
class KVPipeBase(ABC):
"""
"""
@abstractmethod
@abstractmethod
...
@@ -2314,7 +2542,7 @@ index 40589fb3..da2829cf 100644
...
@@ -2314,7 +2542,7 @@ index 40589fb3..da2829cf 100644
"""Send a tensor, or None, via the pipe.
"""Send a tensor, or None, via the pipe.
Need to support sending None -- important for error handling.
Need to support sending None -- important for error handling.
@@ -41,7 +4
1
,7 @@
class KVPipeBase(ABC):
@@ -41,7 +
5
4,7 @@
class KVPipeBase(ABC):
raise NotImplementedError
raise NotImplementedError
@abstractmethod
@abstractmethod
...
@@ -2325,10 +2553,25 @@ index 40589fb3..da2829cf 100644
...
@@ -2325,10 +2553,25 @@ index 40589fb3..da2829cf 100644
Returns:
Returns:
diff --git a/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
diff --git a/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
new file mode 100644
new file mode 100644
index 00000000..
3ee0fa78
index 00000000
0
..
ca5345359
--- /dev/null
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/dynamo_nccl_pipe.py
@@ -0,0 +1,124 @@
@@ -0,0 +1,139 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import threading
+import typing
...
@@ -2454,10 +2697,28 @@ index 00000000..3ee0fa78
+ # dtype = getattr(torch, dtype)
+ self._receive_tensor(tensor_id, rank)
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index 7aa53d07..f5dd50b7 100644
index 7aa53d07a..8fb256aff 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -45,33 +45,33 @@
class PyNcclPipe(KVPipeBase):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
This module implements a PyNccl pipe for sending and receiving
Optional[torch.Tensor] between distributed ranks with advanced
@@ -45,33 +58,33 @@
class PyNcclPipe(KVPipeBase):
METADATA_DTYPE = torch.int64
def __init__(self,
...
@@ -2497,7 +2758,7 @@ index 7aa53d07..f5dd50b7 100644
# transportation-related variables
self.transport_thread: Optional[ThreadPoolExecutor] = None
@@ -145,16 +145,16 @@ class PyNcclPipe(KVPipeBase):
@@ -145,16 +158,16 @@ class PyNcclPipe(KVPipeBase):
dtype=metadata["dtype"],
device=self.device)
...
@@ -2517,7 +2778,7 @@ index 7aa53d07..f5dd50b7 100644
"""
Receive the metadata dictionary from the target rank.
@@ -162,9 +162,9 @@ class PyNcclPipe(KVPipeBase):
@@ -162,9 +175,9 @@ class PyNcclPipe(KVPipeBase):
- metadata: A dictionary with keys "dtype" and "shape" describing
the tensor.
"""
...
@@ -2529,7 +2790,7 @@ index 7aa53d07..f5dd50b7 100644
"""
The actual implementation of sending the tensor and its metadata to the
target rank.
@@ -174,12 +174,12 @@ class PyNcclPipe(KVPipeBase):
@@ -174,12 +187,12 @@ class PyNcclPipe(KVPipeBase):
being sent.
"""
metadata = self._make_metadata(tensor)
...
@@ -2545,7 +2806,7 @@ index 7aa53d07..f5dd50b7 100644
"""
The actual implementation of receiving a tensor and its metadata from
the target rank.
@@ -187,21 +187,22 @@ class PyNcclPipe(KVPipeBase):
@@ -187,21 +200,22 @@ class PyNcclPipe(KVPipeBase):
Returns:
- buffer: The received tensor, or None if no tensor is received.
"""
...
@@ -2572,7 +2833,7 @@ index 7aa53d07..f5dd50b7 100644
with self.buffer_size_lock:
self.buffer_size -= tensor_size
@@ -220,7 +221,7 @@ class PyNcclPipe(KVPipeBase):
@@ -220,7 +234,7 @@ class PyNcclPipe(KVPipeBase):
logger.debug("KV cache transfer pipe is full. Waiting...")
time.sleep(0.05)
...
@@ -2581,7 +2842,7 @@ index 7aa53d07..f5dd50b7 100644
"""
Sends a tensor and its metadata to the destination rank in a
non-blocking way.
@@ -228,6 +229,7 @@ class PyNcclPipe(KVPipeBase):
@@ -228,6 +242,7 @@ class PyNcclPipe(KVPipeBase):
Parameters:
- tensor: The tensor to send, or None if no tensor is being sent.
"""
...
@@ -2589,7 +2850,7 @@ index 7aa53d07..f5dd50b7 100644
if self.transport_thread is None:
self.transport_thread = ThreadPoolExecutor(max_workers=1)
@@ -241,32 +243,39 @@ class PyNcclPipe(KVPipeBase):
@@ -241,32 +256,39 @@ class PyNcclPipe(KVPipeBase):
with self.buffer_size_lock:
self.buffer_size += tensor_size
...
@@ -2644,10 +2905,28 @@ index 7aa53d07..f5dd50b7 100644
def close(self):
"""
diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py
index 1e80e0bd..cd90206f 100644
index 1e80e0bd7..f06c7a5f6 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_agent.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py
@@ -35,6 +35,7 @@
class KVTransferAgent:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""A centralized entrypoint to perform distributed KV cache transfer.
This implementation is a shim wrapper on two APIs exposed by `kv_connector`:
@@ -35,6 +48,7 @@
class KVTransferAgent:
rank: int,
local_rank: int,
config: "VllmConfig",
...
@@ -2655,7 +2934,7 @@ index 1e80e0bd..cd90206f 100644
):
self.config = config
@@ -47,7 +48,7 @@ class KVTransferAgent:
@@ -47,7 +61,7 @@ class KVTransferAgent:
"TransferAgent should only be used when kv_connector is set."
self.connector = KVConnectorFactory.create_connector(
...
@@ -2665,10 +2944,28 @@ index 1e80e0bd..cd90206f 100644
def send_kv_caches_and_hidden_states(
self,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 321902d1..b8937ef8 100644
index 321902d11..03409899e 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -1085,7 +1085,8 @@
def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Copyright 2023 The vLLM team.
# Adapted from
@@ -1085,7 +1098,8 @@
def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None:
_KV_TRANSFER = kv_transfer.KVTransferAgent(
rank=get_world_group().rank,
local_rank=get_world_group().local_rank,
...
@@ -2679,10 +2976,24 @@ index 321902d1..b8937ef8 100644
def ensure_model_parallel_initialized(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d82d9ad9..03896aa6 100644
index d82d9ad9d..61c1e429d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -2,13 +2,17 @@
@@ -1,14 +1,31 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import copy
import time
...
@@ -2701,7 +3012,7 @@ index d82d9ad9..03896aa6 100644
from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload
@@ -60,6 +64,9 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
@@ -60,6 +77,9 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
usage_message)
from vllm.utils import Counter, Device, deprecate_kwargs, weak_bind
from vllm.version import __version__ as VLLM_VERSION
...
@@ -2711,7 +3022,7 @@ index d82d9ad9..03896aa6 100644
logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5
@@ -90,7 +97,7 @@ class OutputData(NamedTuple):
@@ -90,7 +110,7 @@ class OutputData(NamedTuple):
# outputs from multiple steps.
is_first_step_output: Optional[bool]
skip: List[int]
...
@@ -2720,7 +3031,7 @@ index d82d9ad9..03896aa6 100644
class SchedulerContext:
@@ -104,11 +111,14 @@ class SchedulerContext:
@@ -104,11 +124,14 @@ class SchedulerContext:
self.multi_step_stream_outputs: bool = multi_step_stream_outputs
...
@@ -2736,7 +3047,7 @@ index d82d9ad9..03896aa6 100644
self.output_queue.append(
OutputData(outputs=outputs,
seq_group_metadata_list=seq_group_metadata_list,
@@ -116,7 +126,9 @@ class SchedulerContext:
@@ -116,7 +139,9 @@ class SchedulerContext:
is_async=is_async,
is_last_step=is_last_step,
is_first_step_output=is_first_step_output,
...
@@ -2747,7 +3058,7 @@ index d82d9ad9..03896aa6 100644
class LLMEngine:
@@ -348,7 +360,7 @@ class LLMEngine:
@@ -348,7 +373,7 @@ class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
self.scheduler = [
Scheduler(
...
@@ -2756,7 +3067,7 @@ index d82d9ad9..03896aa6 100644
self.parallel_config.pipeline_parallel_size,
self.async_callbacks[v_id]
if self.model_config.use_async_output_proc else None)
@@ -405,6 +417,40 @@ class LLMEngine:
@@ -405,6 +430,40 @@ class LLMEngine:
self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
...
@@ -2797,7 +3108,7 @@ index d82d9ad9..03896aa6 100644
def _initialize_kv_caches(self) -> None:
"""Initialize the KV cache in the worker(s).
@@ -500,6 +546,8 @@ class LLMEngine:
@@ -500,6 +559,8 @@ class LLMEngine:
# Shutdown model executor when engine is garbage collected
# Use getattr since __init__ can fail before the field is set
if model_executor := getattr(self, "model_executor", None):
...
@@ -2806,7 +3117,7 @@ index d82d9ad9..03896aa6 100644
model_executor.shutdown()
def get_tokenizer_group(
@@ -552,11 +600,14 @@ class LLMEngine:
@@ -552,11 +613,14 @@ class LLMEngine:
prompt_adapter_request: Optional[PromptAdapterRequest],
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
...
@@ -2821,7 +3132,7 @@ index d82d9ad9..03896aa6 100644
ParallelSampleSequenceGroup.add_request(
request_id,
self,
@@ -574,6 +625,8 @@ class LLMEngine:
@@ -574,6 +638,8 @@ class LLMEngine:
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
...
@@ -2830,7 +3141,7 @@ index d82d9ad9..03896aa6 100644
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
if is_encoder_decoder_inputs(processed_inputs):
@@ -584,7 +637,7 @@ class LLMEngine:
@@ -584,7 +650,7 @@ class LLMEngine:
encoder_inputs = None
seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
...
@@ -2839,7 +3150,7 @@ index d82d9ad9..03896aa6 100644
encoder_seq = (None if encoder_inputs is None else Sequence(
seq_id, encoder_inputs, block_size, eos_token_id, lora_request,
@@ -601,8 +654,12 @@ class LLMEngine:
@@ -601,8 +667,12 @@ class LLMEngine:
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
...
@@ -2853,7 +3164,7 @@ index d82d9ad9..03896aa6 100644
seq_group = self._create_sequence_group_with_pooling(
request_id,
seq,
@@ -673,6 +730,7 @@ class LLMEngine:
@@ -673,6 +743,7 @@ class LLMEngine:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
@@ -2861,7 +3172,7 @@ index d82d9ad9..03896aa6 100644
*,
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
@@ -765,6 +823,7 @@ class LLMEngine:
@@ -765,6 +836,7 @@ class LLMEngine:
prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
priority=priority,
...
@@ -2869,7 +3180,7 @@ index d82d9ad9..03896aa6 100644
)
def _validate_token_prompt(self, prompt: PromptType,
@@ -799,6 +858,7 @@ class LLMEngine:
@@ -799,6 +871,7 @@ class LLMEngine:
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
encoder_seq: Optional[Sequence] = None,
priority: int = 0,
...
@@ -2877,7 +3188,7 @@ index d82d9ad9..03896aa6 100644
) -> SequenceGroup:
"""Creates a SequenceGroup with SamplingParams."""
max_logprobs = self.get_model_config().max_logprobs
@@ -829,7 +889,9 @@ class LLMEngine:
@@ -829,7 +902,9 @@ class LLMEngine:
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
encoder_seq=encoder_seq,
...
@@ -2888,7 +3199,7 @@ index d82d9ad9..03896aa6 100644
return seq_group
@@ -995,11 +1057,11 @@ class LLMEngine:
@@ -995,11 +1070,11 @@ class LLMEngine:
# When we process only one request, no pop is required
# (since later we will process all of the rest)
(outputs, seq_group_metadata_list, scheduler_outputs, is_async,
...
@@ -2902,7 +3213,7 @@ index d82d9ad9..03896aa6 100644
# Sanity check
assert len(seq_group_metadata_list) == len(
@@ -1325,15 +1387,55 @@ class LLMEngine:
@@ -1325,15 +1400,55 @@ class LLMEngine:
# Clear outputs for each new scheduler iteration
ctx.request_outputs.clear()
...
@@ -2960,7 +3271,7 @@ index d82d9ad9..03896aa6 100644
ctx.seq_group_metadata_list = seq_group_metadata_list
ctx.scheduler_outputs = scheduler_outputs
@@ -1383,9 +1485,46 @@ class LLMEngine:
@@ -1383,9 +1498,46 @@ class LLMEngine:
execute_model_req.async_callback = self.async_callbacks[
virtual_engine]
...
@@ -3009,7 +3320,7 @@ index d82d9ad9..03896aa6 100644
# We need to do this here so that last step's sampled_token_ids can
# be passed to the next iteration for PP.
if self.scheduler_config.is_multi_step:
@@ -1396,7 +1535,26 @@ class LLMEngine:
@@ -1396,7 +1548,26 @@ class LLMEngine:
if len(ctx.output_queue) > 0:
self._process_model_outputs(ctx=ctx)
# No outputs in this case
...
@@ -3037,7 +3348,7 @@ index d82d9ad9..03896aa6 100644
# Finish the current step for all the sequence groups.
if self.scheduler_config.is_multi_step:
@@ -1456,7 +1614,7 @@ class LLMEngine:
@@ -1456,7 +1627,7 @@ class LLMEngine:
# queued control plane messages, such as add/remove lora adapters.
logger.debug("Stopping remote worker execution loop.")
self.model_executor.stop_remote_worker_execution_loop()
...
@@ -3047,10 +3358,28 @@ index d82d9ad9..03896aa6 100644
def _has_remaining_steps(
diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
index 3cf1850e..ae006579 100644
index 3cf1850ee..d20a5f20b 100644
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
@@ -14,13 +14,17 @@
from vllm.outputs import RequestOutput
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import uuid
from dataclasses import dataclass, field
@@ -14,13 +27,17 @@
from vllm.outputs import RequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.utils import deprecate_kwargs
...
@@ -3069,7 +3398,7 @@ index 3cf1850e..ae006579 100644
class MQEngineDeadError(RuntimeError):
class MQEngineDeadError(RuntimeError):
@@ -36,6 +
40
,7 @@
class RPCProcessRequest:
@@ -36,6 +
53
,7 @@
class RPCProcessRequest:
trace_headers: Optional[Mapping[str, str]] = None
trace_headers: Optional[Mapping[str, str]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
priority: int = 0
priority: int = 0
...
@@ -3077,7 +3406,7 @@ index 3cf1850e..ae006579 100644
...
@@ -3077,7 +3406,7 @@ index 3cf1850e..ae006579 100644
@overload
@overload
def __init__(
def __init__(
@@ -78,6 +
83
,7 @@
class RPCProcessRequest:
@@ -78,6 +
96
,7 @@
class RPCProcessRequest:
trace_headers: Optional[Mapping[str, str]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
priority: int = 0,
...
@@ -3085,7 +3414,7 @@ index 3cf1850e..ae006579 100644
...
@@ -3085,7 +3414,7 @@ index 3cf1850e..ae006579 100644
*,
*,
inputs: Optional[PromptType] = None, # DEPRECATED
inputs: Optional[PromptType] = None, # DEPRECATED
) -> None:
) -> None:
@@ -95,7 +1
0
1,7 @@
class RPCProcessRequest:
@@ -95,7 +11
4
,7 @@
class RPCProcessRequest:
self.trace_headers = trace_headers
self.trace_headers = trace_headers
self.prompt_adapter_request = prompt_adapter_request
self.prompt_adapter_request = prompt_adapter_request
self.priority = priority
self.priority = priority
...
@@ -3094,7 +3423,7 @@ index 3cf1850e..ae006579 100644
...
@@ -3094,7 +3423,7 @@ index 3cf1850e..ae006579 100644
@dataclass
@dataclass
class RPCError:
class RPCError:
@@ -116,7 +1
22
,7 @@
class RPCStartupRequest(Enum):
@@ -116,7 +1
35
,7 @@
class RPCStartupRequest(Enum):
@dataclass
@dataclass
class RPCStartupResponse:
class RPCStartupResponse:
tracing_enabled: bool
tracing_enabled: bool
...
@@ -3103,7 +3432,7 @@ index 3cf1850e..ae006579 100644
...
@@ -3103,7 +3432,7 @@ index 3cf1850e..ae006579 100644
class RPCUProfileRequest(Enum):
class RPCUProfileRequest(Enum):
START_PROFILE = 1
START_PROFILE = 1
@@ -157,3 +16
3
,13 @@
def ENGINE_DEAD_ERROR(
@@ -157,3 +1
7
6,13 @@
def ENGINE_DEAD_ERROR(
return MQEngineDeadError(
return MQEngineDeadError(
"Engine loop is not running. Inspect the stacktrace to "
"Engine loop is not running. Inspect the stacktrace to "
f"find the original error: {repr(error)}.")
f"find the original error: {repr(error)}.")
...
@@ -3118,10 +3447,28 @@ index 3cf1850e..ae006579 100644
...
@@ -3118,10 +3447,28 @@ index 3cf1850e..ae006579 100644
+ gpu_cache_usage_perc: float
+ gpu_cache_usage_perc: float
+ gpu_prefix_cache_hit_rate: float
+ gpu_prefix_cache_hit_rate: float
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 85b5f31e..
05030292
100644
index 85b5f31e
3
..
c53b9eced
100644
--- a/vllm/engine/multiprocessing/client.py
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -8,6 +8,7 @@
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import asyncio
import copy
@@ -8,6 +21,7 @@
from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping,
Optional, Union, cast, overload)
Optional, Union, cast, overload)
import cloudpickle
import cloudpickle
...
@@ -3129,7 +3476,7 @@ index 85b5f31e..05030292 100644
...
@@ -3129,7 +3476,7 @@ index 85b5f31e..05030292 100644
import psutil
import psutil
import zmq
import zmq
import zmq.asyncio
import zmq.asyncio
@@ -19,20 +
20
,23 @@
from vllm import PoolingParams
@@ -19,20 +
33
,23 @@
from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.arg_utils import AsyncEngineArgs
...
@@ -3156,7 +3503,7 @@ index 85b5f31e..05030292 100644
...
@@ -3156,7 +3503,7 @@ index 85b5f31e..05030292 100644
from vllm.engine.protocol import EngineClient
from vllm.engine.protocol import EngineClient
# yapf: enable
# yapf: enable
from vllm.envs import VLLM_RPC_TIMEOUT
from vllm.envs import VLLM_RPC_TIMEOUT
@@ -46,6 +
50
,8 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -46,6 +
63
,8 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import deprecate_kwargs
from vllm.utils import deprecate_kwargs
...
@@ -3165,7 +3512,7 @@ index 85b5f31e..05030292 100644
...
@@ -3165,7 +3512,7 @@ index 85b5f31e..05030292 100644
logger = init_logger(__name__)
logger = init_logger(__name__)
@@ -91,6 +
97
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -91,6 +
110
,7 @@
class MQLLMEngineClient(EngineClient):
self._errored_with: Optional[BaseException] = None
self._errored_with: Optional[BaseException] = None
# Get the configs.
# Get the configs.
...
@@ -3173,7 +3520,7 @@ index 85b5f31e..05030292 100644
...
@@ -3173,7 +3520,7 @@ index 85b5f31e..05030292 100644
self.model_config = engine_config.model_config
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
self.decoding_config = engine_config.decoding_config
@@ -115,6 +1
22
,10 @@
class MQLLMEngineClient(EngineClient):
@@ -115,6 +1
35
,10 @@
class MQLLMEngineClient(EngineClient):
self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL)
self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL)
self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}")
self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}")
...
@@ -3184,7 +3531,7 @@ index 85b5f31e..05030292 100644
...
@@ -3184,7 +3531,7 @@ index 85b5f31e..05030292 100644
# IPC path for the data socket.
# IPC path for the data socket.
self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}"
self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}"
@@ -129,8 +1
40
,27 @@
class MQLLMEngineClient(EngineClient):
@@ -129,8 +1
53
,27 @@
class MQLLMEngineClient(EngineClient):
# Loop to check health of the LLMEngine periodically.
# Loop to check health of the LLMEngine periodically.
# Started after the MQLLMEngine is ready.
# Started after the MQLLMEngine is ready.
self.health_loop: Optional[asyncio.Task] = None
self.health_loop: Optional[asyncio.Task] = None
...
@@ -3212,7 +3559,7 @@ index 85b5f31e..05030292 100644
...
@@ -3212,7 +3559,7 @@ index 85b5f31e..05030292 100644
@staticmethod
@staticmethod
def is_unsupported_config(engine_args: AsyncEngineArgs):
def is_unsupported_config(engine_args: AsyncEngineArgs):
# Pipeline parallel not yet supported
# Pipeline parallel not yet supported
@@ -180,6 +2
10
,61 @@
class MQLLMEngineClient(EngineClient):
@@ -180,6 +2
23
,61 @@
class MQLLMEngineClient(EngineClient):
except Exception as e:
except Exception as e:
self._set_errored(e)
self._set_errored(e)
...
@@ -3274,7 +3621,7 @@ index 85b5f31e..05030292 100644
...
@@ -3274,7 +3621,7 @@ index 85b5f31e..05030292 100644
async def run_output_handler_loop(self):
async def run_output_handler_loop(self):
"""Get RequestOutputs from Engine and stream to Request Queues"""
"""Get RequestOutputs from Engine and stream to Request Queues"""
@@ -278,12 +36
3
,26 @@
class MQLLMEngineClient(EngineClient):
@@ -278,12 +3
7
6,26 @@
class MQLLMEngineClient(EngineClient):
# Wait until server is ready.
# Wait until server is ready.
response = await self._wait_for_server_rpc(socket)
response = await self._wait_for_server_rpc(socket)
...
@@ -3301,7 +3648,7 @@ index 85b5f31e..05030292 100644
...
@@ -3301,7 +3648,7 @@ index 85b5f31e..05030292 100644
def close(self):
def close(self):
"""Destroy the ZeroMQ Context."""
"""Destroy the ZeroMQ Context."""
@@ -293,6 +
392
,8 @@
class MQLLMEngineClient(EngineClient):
@@ -293,6 +
405
,8 @@
class MQLLMEngineClient(EngineClient):
# Cancel background tasks.
# Cancel background tasks.
if self.health_loop is not None:
if self.health_loop is not None:
self.health_loop.cancel()
self.health_loop.cancel()
...
@@ -3310,7 +3657,7 @@ index 85b5f31e..05030292 100644
...
@@ -3310,7 +3657,7 @@ index 85b5f31e..05030292 100644
if self.output_loop is not None:
if self.output_loop is not None:
self.output_loop.cancel()
self.output_loop.cancel()
@@ -415,6 +5
16
,9 @@
class MQLLMEngineClient(EngineClient):
@@ -415,6 +5
29
,9 @@
class MQLLMEngineClient(EngineClient):
"""
"""
if self._errored_with is not None:
if self._errored_with is not None:
raise self._errored_with
raise self._errored_with
...
@@ -3320,7 +3667,7 @@ index 85b5f31e..05030292 100644
...
@@ -3320,7 +3667,7 @@ index 85b5f31e..05030292 100644
@property
@property
def is_running(self) -> bool:
def is_running(self) -> bool:
@@ -473,6 +5
77
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -473,6 +5
90
,7 @@
class MQLLMEngineClient(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
priority: int = 0,
...
@@ -3328,7 +3675,7 @@ index 85b5f31e..05030292 100644
...
@@ -3328,7 +3675,7 @@ index 85b5f31e..05030292 100644
*,
*,
inputs: Optional[PromptType] = None # DEPRECATED
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[RequestOutput, None]:
) -> AsyncGenerator[RequestOutput, None]:
@@ -502,7 +60
7
,8 @@
class MQLLMEngineClient(EngineClient):
@@ -502,7 +6
2
0,8 @@
class MQLLMEngineClient(EngineClient):
return self._process_request(prompt, sampling_params, request_id,
return self._process_request(prompt, sampling_params, request_id,
lora_request, trace_headers,
lora_request, trace_headers,
...
@@ -3338,7 +3685,7 @@ index 85b5f31e..05030292 100644
...
@@ -3338,7 +3685,7 @@ index 85b5f31e..05030292 100644
@overload
@overload
def encode(
def encode(
@@ -586,6 +
692
,7 @@
class MQLLMEngineClient(EngineClient):
@@ -586,6 +
705
,7 @@
class MQLLMEngineClient(EngineClient):
trace_headers: Optional[Mapping[str, str]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
priority: int = 0,
...
@@ -3346,7 +3693,7 @@ index 85b5f31e..05030292 100644
...
@@ -3346,7 +3693,7 @@ index 85b5f31e..05030292 100644
) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[
) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[
PoolingRequestOutput, None]]:
PoolingRequestOutput, None]]:
"""Send an RPCGenerateRequest to the RPCServer and stream responses."""
"""Send an RPCGenerateRequest to the RPCServer and stream responses."""
@@ -630,6 +7
37
,12 @@
class MQLLMEngineClient(EngineClient):
@@ -630,6 +7
50
,12 @@
class MQLLMEngineClient(EngineClient):
else:
else:
lp_bytes = None
lp_bytes = None
...
@@ -3359,7 +3706,7 @@ index 85b5f31e..05030292 100644
...
@@ -3359,7 +3706,7 @@ index 85b5f31e..05030292 100644
request_bytes = pickle.dumps(
request_bytes = pickle.dumps(
RPCProcessRequest(
RPCProcessRequest(
prompt=prompt,
prompt=prompt,
@@ -639,11 +75
2
,11 @@
class MQLLMEngineClient(EngineClient):
@@ -639,11 +7
6
5,11 @@
class MQLLMEngineClient(EngineClient):
trace_headers=trace_headers,
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
prompt_adapter_request=prompt_adapter_request,
priority=priority,
priority=priority,
...
@@ -3373,7 +3720,7 @@ index 85b5f31e..05030292 100644
...
@@ -3373,7 +3720,7 @@ index 85b5f31e..05030292 100644
await self.input_socket.send_multipart(parts, copy=False)
await self.input_socket.send_multipart(parts, copy=False)
# 4) Stream the RequestOutputs from the output queue. Note
# 4) Stream the RequestOutputs from the output queue. Note
@@ -705,3 +81
8
,6 @@
class MQLLMEngineClient(EngineClient):
@@ -705,3 +8
3
1,6 @@
class MQLLMEngineClient(EngineClient):
# Raise on error, otherwise happily return None
# Raise on error, otherwise happily return None
if isinstance(request_output, BaseException):
if isinstance(request_output, BaseException):
raise request_output
raise request_output
...
@@ -3381,10 +3728,25 @@ index 85b5f31e..05030292 100644
...
@@ -3381,10 +3728,25 @@ index 85b5f31e..05030292 100644
+ def set_metrics_publisher(self, metrics_publisher):
+ def set_metrics_publisher(self, metrics_publisher):
+ self.metrics_publisher = metrics_publisher
+ self.metrics_publisher = metrics_publisher
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index a0dd7958..c82bc15b 100644
index a0dd79586..ea0d2cd68 100644
--- a/vllm/engine/multiprocessing/engine.py
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -3,35 +3,115 @@
@@ -1,37 +1,130 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import pickle
import pickle
import signal
import signal
from contextlib import contextmanager
from contextlib import contextmanager
...
@@ -3505,7 +3867,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3505,7 +3867,7 @@ index a0dd7958..c82bc15b 100644
class MQLLMEngine:
class MQLLMEngine:
"""A multiprocessing wrapper for :class:`LLMEngine`.
"""A multiprocessing wrapper for :class:`LLMEngine`.
@@ -94,12 +17
4
,37 @@
class MQLLMEngine:
@@ -94,12 +1
8
7,37 @@
class MQLLMEngine:
self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH)
self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH)
self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}")
self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}")
...
@@ -3543,7 +3905,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3543,7 +3905,7 @@ index a0dd7958..c82bc15b 100644
@property
@property
def dead_error(self) -> BaseException:
def dead_error(self) -> BaseException:
if self._errored_with is not None:
if self._errored_with is not None:
@@ -171,8 +2
76
,17 @@
class MQLLMEngine:
@@ -171,8 +2
89
,17 @@
class MQLLMEngine:
# Handle the query from the Client.
# Handle the query from the Client.
if request == RPCStartupRequest.IS_SERVER_READY:
if request == RPCStartupRequest.IS_SERVER_READY:
tracing_enabled = self.engine.is_tracing_enabled()
tracing_enabled = self.engine.is_tracing_enabled()
...
@@ -3563,7 +3925,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3563,7 +3925,7 @@ index a0dd7958..c82bc15b 100644
except Exception as e:
except Exception as e:
response = e
response = e
@@ -185,6 +
299
,7 @@
class MQLLMEngine:
@@ -185,6 +
312
,7 @@
class MQLLMEngine:
while True:
while True:
if not self.engine.has_unfinished_requests():
if not self.engine.has_unfinished_requests():
...
@@ -3571,7 +3933,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3571,7 +3933,7 @@ index a0dd7958..c82bc15b 100644
# Poll until there is work to do.
# Poll until there is work to do.
while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
# When there's no work, check on engine health and send
# When there's no work, check on engine health and send
@@ -220,6 +3
35
,13 @@
class MQLLMEngine:
@@ -220,6 +3
48
,13 @@
class MQLLMEngine:
def handle_new_input(self):
def handle_new_input(self):
"""Handle new input from the socket"""
"""Handle new input from the socket"""
try:
try:
...
@@ -3585,7 +3947,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3585,7 +3947,7 @@ index a0dd7958..c82bc15b 100644
while self.input_socket.poll(timeout=0) != 0:
while self.input_socket.poll(timeout=0) != 0:
frames = self.input_socket.recv_multipart(copy=False)
frames = self.input_socket.recv_multipart(copy=False)
request = pickle.loads(frames[0].buffer)
request = pickle.loads(frames[0].buffer)
@@ -262,6 +3
84
,11 @@
class MQLLMEngine:
@@ -262,6 +3
97
,11 @@
class MQLLMEngine:
self._send_outputs(rpc_err)
self._send_outputs(rpc_err)
try:
try:
...
@@ -3597,7 +3959,7 @@ index a0dd7958..c82bc15b 100644
...
@@ -3597,7 +3959,7 @@ index a0dd7958..c82bc15b 100644
self.engine.add_request(
self.engine.add_request(
request_id=request_id,
request_id=request_id,
prompt=request.prompt,
prompt=request.prompt,
@@ -269,7 +
396
,9 @@
class MQLLMEngine:
@@ -269,7 +
409
,9 @@
class MQLLMEngine:
lora_request=request.lora_request,
lora_request=request.lora_request,
trace_headers=request.trace_headers,
trace_headers=request.trace_headers,
prompt_adapter_request=request.prompt_adapter_request,
prompt_adapter_request=request.prompt_adapter_request,
...
@@ -3609,10 +3971,28 @@ index a0dd7958..c82bc15b 100644
...
@@ -3609,10 +3971,28 @@ index a0dd7958..c82bc15b 100644
if self.log_requests:
if self.log_requests:
logger.info("Added request %s.", request.request_id)
logger.info("Added request %s.", request.request_id)
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 107220d5..c716f75f 100644
index 107220d54..e0e0590b6 100644
--- a/vllm/entrypoints/openai/serving_chat.py
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -34,6 +34,7 @@
from vllm.sampling_params import BeamSearchParams, SamplingParams
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import asyncio
import json
@@ -34,6 +47,7 @@
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.sequence import Logprob
from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls
from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls
...
@@ -3620,7 +4000,7 @@ index 107220d5..c716f75f 100644
...
@@ -3620,7 +4000,7 @@ index 107220d5..c716f75f 100644
logger = init_logger(__name__)
logger = init_logger(__name__)
@@ -112,6 +1
13
,7 @@
class OpenAIServingChat(OpenAIServing):
@@ -112,6 +1
26
,7 @@
class OpenAIServingChat(OpenAIServing):
self,
self,
request: ChatCompletionRequest,
request: ChatCompletionRequest,
raw_request: Optional[Request] = None,
raw_request: Optional[Request] = None,
...
@@ -3628,7 +4008,7 @@ index 107220d5..c716f75f 100644
...
@@ -3628,7 +4008,7 @@ index 107220d5..c716f75f 100644
) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
) -> Union[AsyncGenerator[str, None], ChatCompletionResponse,
ErrorResponse]:
ErrorResponse]:
"""
"""
@@ -243,6 +2
4
5,7 @@
class OpenAIServingChat(OpenAIServing):
@@ -243,6 +25
8
,7 @@
class OpenAIServingChat(OpenAIServing):
trace_headers=trace_headers,
trace_headers=trace_headers,
prompt_adapter_request=prompt_adapter_request,
prompt_adapter_request=prompt_adapter_request,
priority=request.priority,
priority=request.priority,
...
@@ -3637,10 +4017,28 @@ index 107220d5..c716f75f 100644
...
@@ -3637,10 +4017,28 @@ index 107220d5..c716f75f 100644
generators.append(generator)
generators.append(generator)
diff --git a/vllm/envs.py b/vllm/envs.py
diff --git a/vllm/envs.py b/vllm/envs.py
index 745b068b..0ae63d9b 100644
index 745b068b7..0f1a022fb 100644
--- a/vllm/envs.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,10 @@
if TYPE_CHECKING:
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import os
import tempfile
@@ -87,6 +100,10 @@
if TYPE_CHECKING:
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
...
@@ -3651,7 +4049,7 @@ index 745b068b..0ae63d9b 100644
def get_default_cache_root():
@@ -572,6 +576,21 @@ environment_variables: Dict[str, Callable[[], Any]] = {
@@ -572,6 +589,21 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# models the alignment is already naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
...
@@ -3674,10 +4072,28 @@ index 745b068b..0ae63d9b 100644
# end-env-vars-definition
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 773f5abe..3eefd266 100644
index 773f5abe7..365685e13 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -585,6 +585,8 @@
class DeepseekV2Model(nn.Module):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
@@ -585,6 +598,8 @@
class DeepseekV2Model(nn.Module):
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
...
@@ -3687,10 +4103,28 @@ index 773f5abe..3eefd266 100644
self.vocab_size = config.vocab_size
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 786380c3..56a7cf89 100644
index 786380c37..e9c3a5e16 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -6,16 +6,16 @@
from typing import Dict, Generic, List, MutableSequence, Optional
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import time
from dataclasses import dataclass
@@ -6,16 +19,16 @@
from typing import Dict, Generic, List, MutableSequence, Optional
from typing import Sequence as GenericSequence
from typing import Union
...
@@ -3711,10 +4145,25 @@ index 786380c3..56a7cf89 100644
"""The output data of one completion output of a request.
diff --git a/vllm/remote_prefill.py b/vllm/remote_prefill.py
new file mode 100644
index 00000000..3f9711ef
index 000000000..83f6cd575
--- /dev/null
+++ b/vllm/remote_prefill.py
@@ -0,0 +1,67 @@
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Callable, Optional, List
+from enum import Enum
...
@@ -3784,10 +4233,29 @@ index 00000000..3f9711ef
+ remote_prefill_request_callback: Optional[RemotePrefillRequestCallback] = None
\ No newline at end of file
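Most of the new vllm/remote_prefill.py module is elided in this view; only its imports and the trailing remote_prefill_request_callback field are visible above. As a rough orientation only, a callback-carrying params object of the kind those fragments suggest might look like the sketch below; the enum values, the RemotePrefillRequest fields, and the extra flag are illustrative assumptions, not the patched code.

# Illustrative sketch only -- not the contents of the patched
# vllm/remote_prefill.py, whose body is truncated in this diff view.
# Names mirror the visible fragments; everything else is assumed.
from dataclasses import dataclass, field
from enum import Enum
from typing import Callable, List, Optional


class RemotePrefillState(Enum):
    # Hypothetical lifecycle states for a remote prefill request.
    PENDING = 1
    RUNNING = 2
    DONE = 3


@dataclass
class RemotePrefillRequest:
    # Hypothetical payload handed to a remote prefill worker.
    request_id: str
    prompt_token_ids: List[int] = field(default_factory=list)


# Callback the engine invokes when a request should be prefilled remotely.
RemotePrefillRequestCallback = Callable[[RemotePrefillRequest], None]


@dataclass
class RemotePrefillParams:
    # Hypothetical per-request options; only the callback field appears above.
    is_remote_prefill: bool = False
    remote_prefill_request_callback: Optional[RemotePrefillRequestCallback] = None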
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 97f9e212..1bb97b00 100644
index 97f9e2129..5849befba 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -83,7 +83,7 @@
class RequestOutputKind(Enum):
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Sampling parameters for text generation."""
import copy
from dataclasses import dataclass
@@ -83,7 +97,7 @@
class RequestOutputKind(Enum):
DELTA = 1
# Do not return intermediate RequestOuputs
FINAL_ONLY = 2
...
@@ -3797,10 +4265,29 @@ index 97f9e212..1bb97b00 100644
class SamplingParams(
msgspec.Struct,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 534b9e60..18675d2f 100644
index 534b9e606..c33bbde1c 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -20,6 +20,7 @@
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""Sequence and its related classes."""
import copy
import enum
@@ -20,6 +34,7 @@
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
...
@@ -3808,7 +4295,7 @@ index 534b9e60..18675d2f 100644
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -59,13 +60,14 @@ class SequenceStatus(enum.IntEnum):
@@ -59,13 +74,14 @@ class SequenceStatus(enum.IntEnum):
"""Status of a sequence."""
"""Status of a sequence."""
WAITING = 0
WAITING = 0
RUNNING = 1
RUNNING = 1
...
@@ -3829,7 +4316,7 @@ index 534b9e60..18675d2f 100644
...
@@ -3829,7 +4316,7 @@ index 534b9e60..18675d2f 100644
@staticmethod
@staticmethod
def is_finished(status: "SequenceStatus") -> bool:
def is_finished(status: "SequenceStatus") -> bool:
@@ -409,6 +411,7 @@ class Sequence:
@@ -409,6 +425,7 @@ class Sequence:
eos_token_id: Optional[int] = None,
lora_request: Optional[LoRARequest] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
...
@@ -3837,7 +4324,7 @@ index 534b9e60..18675d2f 100644
) -> None:
self.seq_id = seq_id
self.inputs = SingletonInputsAdapter(inputs)
@@ -416,7 +419,7 @@ class Sequence:
@@ -416,7 +433,7 @@ class Sequence:
self.eos_token_id = eos_token_id
self.lora_request = lora_request
self.prompt_adapter_request = prompt_adapter_request
...
@@ -3846,7 +4333,7 @@ index 534b9e60..18675d2f 100644
self.data = SequenceData.from_seqs(self.prompt_token_ids)
self.output_logprobs: SampleLogprobs = []
self.output_text = ""
@@ -639,6 +642,7 @@ class SequenceGroup:
@@ -639,6 +656,7 @@ class SequenceGroup:
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
...
@@ -3854,7 +4341,7 @@ index 534b9e60..18675d2f 100644
"""
def __init__(
@@ -654,6 +658,7 @@ class SequenceGroup:
@@ -654,6 +672,7 @@ class SequenceGroup:
trace_headers: Optional[Mapping[str, str]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
priority: int = 0,
...
@@ -3862,7 +4349,7 @@ index 534b9e60..18675d2f 100644
) -> None:
self.request_id = request_id
self.seqs = seqs
@@ -678,7 +683,7 @@ class SequenceGroup:
@@ -678,7 +697,7 @@ class SequenceGroup:
self.encoder_seq = encoder_seq
self.trace_headers = trace_headers
self.priority = priority
...
@@ -3871,7 +4358,7 @@ index 534b9e60..18675d2f 100644
self.cached_request_output = None
@property
@@ -927,6 +932,9 @@ class SequenceGroupMetadata(
@@ -927,6 +946,9 @@ class SequenceGroupMetadata(
query tokens for prefill, we don't need sampling.
token_chunk_size: The number of tokens to be processed (per sequence).
None if chunking is not required.
...
@@ -3881,7 +4368,7 @@ index 534b9e60..18675d2f 100644
lora_request: LoRA request.
computed_block_nums: The block numbers that are already computed,
used in prefix caching.
@@ -966,6 +974,9 @@ class SequenceGroupMetadata(
@@ -966,6 +988,9 @@ class SequenceGroupMetadata(
cross_block_table: Optional[List[int]] = None
prompt_adapter_request: Optional[PromptAdapterRequest] = None
token_chunk_size: Optional[int] = None
...
@@ -3891,7 +4378,7 @@ index 534b9e60..18675d2f 100644
### Stateful fields that are lazily defined. ###
# The number of speculative tokens adopted in this request.
@@ -1310,6 +1321,8 @@ class ExecuteModelRequest(
@@ -1310,6 +1335,8 @@ class ExecuteModelRequest(
last_sampled_token_ids: Optional[torch.Tensor] = None
# Async callback
async_callback: Optional[Callable] = None
...
@@ -3901,10 +4388,28 @@ index 534b9e60..18675d2f 100644
@property
def is_first_multi_step(self) -> bool:
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 12baecde..a3f2c464 100644
index 12baecde6..11034b391 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1824,6 +1824,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
@@ -1,4 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import dataclasses
import gc
@@ -1824,6 +1837,9 @@
class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
if self.vllm_config.kv_transfer_config is None:
return False
...
@@ -3914,7 +4419,7 @@ index 12baecde..a3f2c464 100644
prefill_meta = model_input.attn_metadata.prefill_metadata
@@ -1849,6 +1852,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
@@ -1849,6 +1865,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
if self.vllm_config.kv_transfer_config is None:
return False
...
@@ -3925,10 +4430,25 @@ index 12baecde..a3f2c464 100644
prefill_meta = model_input.attn_metadata.prefill_metadata
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 582aa460..876329d6 100644
index 582aa460e..0be784a40 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -2,7 +2,7 @@
@@ -1,8 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
"""A GPU worker class."""
"""A GPU worker class."""
import gc
import gc
import os
import os
...
@@ -3937,7 +4457,7 @@ index 582aa460..876329d6 100644
...
@@ -3937,7 +4457,7 @@ index 582aa460..876329d6 100644
import torch
import torch
import torch.distributed
import torch.distributed
@@ -31,6 +31,9 @@ from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -31,6 +45,9 @@ from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
from vllm.worker.pooling_model_runner import PoolingModelRunner
from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase,
WorkerInput)
...
@@ -3947,7 +4467,7 @@ index 582aa460..876329d6 100644
logger = init_logger(__name__)
@@ -306,6 +309,46 @@ class Worker(LocalOrDistributedWorkerBase):
@@ -306,6 +323,46 @@ class Worker(LocalOrDistributedWorkerBase):
self._init_cache_engine()
self._warm_up_model()
...
@@ -3994,7 +4514,7 @@ index 582aa460..876329d6 100644
def _init_cache_engine(self):
assert self.cache_config.num_gpu_blocks is not None
self.cache_engine = [
@@ -367,6 +410,8 @@ class Worker(LocalOrDistributedWorkerBase):
@@ -367,6 +424,8 @@ class Worker(LocalOrDistributedWorkerBase):
blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy,
device=self.device,
dtype=torch.int64).view(-1, 2)
...
@@ -4003,7 +4523,7 @@ index 582aa460..876329d6 100644
return WorkerInput(
num_seq_groups=num_seq_groups,
@@ -375,6 +420,12 @@ class Worker(LocalOrDistributedWorkerBase):
@@ -375,6 +434,12 @@ class Worker(LocalOrDistributedWorkerBase):
blocks_to_copy=blocks_to_copy,
virtual_engine=virtual_engine,
num_steps=num_steps,
...
@@ -4017,10 +4537,29 @@ index 582aa460..876329d6 100644
@torch.inference_mode()
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index 819b81fb..2891854b 100644
index 819b81fbf..7d1b1836d 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -9,6 +9,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
@@ -1,4 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import dataclasses
import os
@@ -9,6 +23,7 @@
from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
import cloudpickle
import torch
import torch.nn as nn
...
@@ -4028,7 +4567,7 @@ index 819b81fb..2891854b 100644
from vllm.config import (ObservabilityConfig, VllmConfig,
set_current_vllm_config)
@@ -23,6 +24,9 @@ from vllm.utils import (enable_trace_function_call_for_thread,
@@ -23,6 +38,9 @@ from vllm.utils import (enable_trace_function_call_for_thread,
from vllm.worker.model_runner_base import (BroadcastableModelInput,
ModelRunnerBase,
ModelRunnerInputBase)
...
@@ -4038,7 +4577,7 @@ index 819b81fb..2891854b 100644
logger = init_logger(__name__)
@@ -53,6 +57,8 @@ class WorkerBase(ABC):
@@ -53,6 +71,8 @@ class WorkerBase(ABC):
from vllm.platforms import current_platform
self.current_platform = current_platform
...
@@ -4047,7 +4586,7 @@ index 819b81fb..2891854b 100644
@abstractmethod
def init_device(self) -> None:
"""Initialize device state, such as loading the model or other on-device
@@ -216,6 +222,13 @@ class WorkerInput:
@@ -216,6 +236,13 @@ class WorkerInput:
virtual_engine: int = 0
num_steps: int = 1
...
@@ -4061,7 +4600,7 @@ index 819b81fb..2891854b 100644
@classmethod
def from_broadcasted_tensor_dict(
cls: Type["WorkerInput"],
@@ -232,6 +245,12 @@ class WorkerInput:
@@ -232,6 +259,12 @@ class WorkerInput:
blocks_to_copy=tensor_dict.pop("blocks_to_copy"),
blocks_to_copy=tensor_dict.pop("blocks_to_copy"),
virtual_engine=tensor_dict["virtual_engine"],
virtual_engine=tensor_dict["virtual_engine"],
num_steps=tensor_dict.pop("num_steps"),
num_steps=tensor_dict.pop("num_steps"),
...
@@ -4074,7 +4613,7 @@ index 819b81fb..2891854b 100644
...
@@ -4074,7 +4613,7 @@ index 819b81fb..2891854b 100644
)
)
def as_broadcastable_tensor_dict(
def as_broadcastable_tensor_dict(
@@ -246,6 +265,12 @@ class WorkerInput:
@@ -246,6 +279,12 @@ class WorkerInput:
"blocks_to_copy": self.blocks_to_copy,
"blocks_to_copy": self.blocks_to_copy,
"virtual_engine": self.virtual_engine,
"virtual_engine": self.virtual_engine,
"num_steps": self.num_steps,
"num_steps": self.num_steps,
...
@@ -4087,7 +4626,7 @@ index 819b81fb..2891854b 100644
...
@@ -4087,7 +4626,7 @@ index 819b81fb..2891854b 100644
}
}
return tensor_dict
return tensor_dict
@@ -316,13 +341,16 @@ class LocalOrDistributedWorkerBase(WorkerBase):
@@ -316,13 +355,16 @@ class LocalOrDistributedWorkerBase(WorkerBase):
return None
worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data)
...
@@ -4109,7 +4648,7 @@ index 819b81fb..2891854b 100644
def _get_driver_input_and_broadcast(
self, execute_model_req: ExecuteModelRequest
@@ -396,49 +424,88 @@ class LocalOrDistributedWorkerBase(WorkerBase):
@@ -396,49 +438,88 @@ class LocalOrDistributedWorkerBase(WorkerBase):
self.execute_worker(worker_input)
# If there is no input, we don't need to execute the model.
...
@@ -4161,7 +4700,12 @@ index 819b81fb..2891854b 100644
- and self.observability_config.collect_model_execute_time):
- orig_model_execute_time = intermediate_tensors.tensors.get(
- "model_execute_time", torch.tensor(0)).item()
-
+ and self.observability_config.collect_model_execute_time
+ and output is not None):
+ for o in output:
+ o.model_execute_time = (orig_model_execute_time +
+ model_execute_time)
- output = self.model_runner.execute_model(
- model_input=model_input,
- kv_caches=self.kv_cache[worker_input.virtual_engine]
...
@@ -4170,12 +4714,7 @@ index 819b81fb..2891854b 100644
- num_steps=num_steps,
- **kwargs,
- )
+ and self.observability_config.collect_model_execute_time
-
+ and output is not None):
+ for o in output:
+ o.model_execute_time = (orig_model_execute_time +
+ model_execute_time)
- model_execute_time = time.perf_counter() - start_time
- if not get_pp_group().is_last_rank:
- # output is IntermediateTensors
...
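The worker_base.py hunks above rearrange the execute-time bookkeeping in execute_model: the time measured on this worker is added to the time already accumulated upstream and attached to each output, and only when outputs actually exist. A minimal sketch of that accumulation, assuming a stand-in output type because the real sampler output class is not shown in this view, is:

import time
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _Output:
    # Stand-in for the sampler output objects that carry this field when
    # collect_model_execute_time is enabled; the real class is not shown here.
    model_execute_time: float = 0.0


def attach_execute_time(outputs: Optional[List[_Output]],
                        orig_model_execute_time: float,
                        start_time: float) -> None:
    # Time spent by this worker on the current execute_model call.
    model_execute_time = time.perf_counter() - start_time
    if outputs is not None:
        for o in outputs:
            # Total attributed time = upstream time + local time.
            o.model_execute_time = orig_model_execute_time + model_execute_time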
container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch
0 → 100644
View file @ 00730fc6
This source diff could not be displayed because it is too large. You can view the blob instead.