Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dcecc47d
Commit
dcecc47d
authored
Mar 07, 2025
by
GuanLuo
Committed by
GitHub
Mar 07, 2025
Browse files
test: add tests for kv bindings (#35)
parent
6705d483
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
217 additions
and
121 deletions
+217
-121
container/Dockerfile
container/Dockerfile
+1
-1
examples/python_rs/llm/vllm/kv_router/metrics_router.py
examples/python_rs/llm/vllm/kv_router/metrics_router.py
+0
-120
lib/bindings/python/tests/test_kv_bindings.py
lib/bindings/python/tests/test_kv_bindings.py
+216
-0
No files found.
container/Dockerfile
View file @
dcecc47d
...
@@ -106,7 +106,7 @@ ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
...
@@ -106,7 +106,7 @@ ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV
VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV
VLLM_KV_CAPI_PATH="/opt/dynemo/
llm_
binding/lib/libdynemo_llm_capi.so"
ENV
VLLM_KV_CAPI_PATH="/opt/dynemo/binding
s
/lib/libdynemo_llm_capi.so"
ENV
PYTHONUNBUFFERED=1
ENV
PYTHONUNBUFFERED=1
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
...
...
examples/python_rs/llm/vllm/kv_router/metrics_router.py
deleted
100644 → 0
View file @
6705d483
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
uvloop
from
common.protocol
import
Request
,
Response
from
vllm.logger
import
logger
as
vllm_logger
from
dynemo.llm
import
KvRouter
from
dynemo.runtime
import
DistributedRuntime
,
dynemo_endpoint
,
dynemo_worker
class Router:
    """Route incoming generate requests to vLLM workers.

    Uses the KV router to pick a worker for each request; if scheduling
    fails (or returns no worker) the request is sent to a random worker
    instead.
    """

    def __init__(self, router, workers_client):
        # KvRouter used for KV-cache-aware worker selection.
        self.router = router
        # Client over the workers' "generate" endpoints.
        self.workers_client = workers_client

    @dynemo_endpoint(Request, Response)
    async def generate(self, request):
        lora_id = 0
        worker_id = None
        # NOTE(review): tokens are hard-coded ([3] * 64) rather than taken
        # from the request — presumably placeholder scheduling input; confirm.
        tokens = [3] * 64
        try:
            worker_id = await self.router.schedule(tokens, lora_id)
        # [NOTE][TODO] Now that the scheduler may return more error messages,
        # now we are catching all exceptions and logging them. Should have
        # catch specific router exceptions once we have dedicated types.
        except Exception as e:
            vllm_logger.info(f"got exception of type {type(e)}: {e}")
            worker_id = None
            vllm_logger.exception(f"Error during worker selection: {e}")

        vllm_logger.info(f"Scheduling to worker_id: {worker_id}")

        if worker_id is None:
            vllm_logger.info("randomly select worker")
            stream = await self.workers_client.random(request.model_dump_json())
        else:
            vllm_logger.info(f"directly select worker: {worker_id}")
            stream = await self.workers_client.direct(
                request.model_dump_json(), worker_id
            )

        # Unwrap transport envelopes when present, otherwise forward as-is.
        async for item in stream:
            if hasattr(item, "data"):
                yield item.data()
            else:
                yield item

    @dynemo_endpoint(Request, Response)
    async def mock_generate(self, request):
        """Trivial stand-in handler for testing the endpoint plumbing."""
        print(f"Received request: {request}")
        yield "Hello, World!"
# NOTE(review): appears unused within this file — presumably a flag for
# serving the route on the router process itself; confirm before removing.
ROUTE_SELF = True
@dynemo_worker()
async def worker(runtime: DistributedRuntime):
    """Set up KV routing and serve the frontend ``generate`` endpoint.

    Creates a client over the vllm workers' ``generate`` endpoint, builds a
    KvRouter over the ``dynemo/vllm`` component, then serves
    ``Router.generate`` on the ``dynemo/frontend`` component.
    """
    workers_client = (
        await runtime.namespace("dynemo")
        .component("vllm")
        .endpoint("generate")
        .client()
    )
    # Fetch the id list once (was queried twice); `wid` avoids shadowing
    # the builtin `id`.
    worker_ids = workers_client.endpoint_ids()
    vllm_logger.info(
        f"Have number of workers ({len(worker_ids)}) are ready:\n"
        + "\n".join(f"id: {wid}" for wid in worker_ids)
    )

    # [TODO] Collect endpoint implementation expects services to provide
    # ForwardPassMetrics as part of stats handling and it will panic if
    # otherwise. This needs to be fixed so that non-providing endpoints will
    # simply be ignored, but before that, we will make sure that the services
    # of the same namespace::component are created via KvMetricsPublisher,
    # if it is also used to create endpoints.
    kv_listener = runtime.namespace("dynemo").component("vllm")
    await kv_listener.create_service()
    router = KvRouter(runtime, kv_listener)

    # i.e. below will cause panic
    # endpoint = kv_listener.endpoint("generate")
    # await endpoint.serve_endpoint(
    #     Router(router, workers_client).mock_generate
    # )

    # Serve the routing handler on a separate "frontend" component.
    router_component = runtime.namespace("dynemo").component("frontend")
    await router_component.create_service()
    endpoint = router_component.endpoint("generate")
    await endpoint.serve_endpoint(Router(router, workers_client).generate)
if __name__ == "__main__":
    # Install uvloop's event-loop policy before asyncio.run creates the loop.
    uvloop.install()
    asyncio.run(worker())
examples/python_rs/llm/vllm/kv_router/mock_worker
.py
→
lib/bindings/python/tests/test_kv_bindings
.py
View file @
dcecc47d
...
@@ -16,39 +16,97 @@
...
@@ -16,39 +16,97 @@
import
asyncio
import
asyncio
import
ctypes
import
ctypes
import
os
import
subprocess
from
ctypes
import
c_char_p
,
c_int64
,
c_uint32
from
ctypes
import
c_char_p
,
c_int64
,
c_uint32
from
time
import
sleep
from
typing
import
List
import
uvloop
import
pytest
from
common.protocol
import
Request
,
Response
from
vllm.logger
import
logger
as
vllm_logger
from
dynemo.llm
import
KvMetricsPublisher
from
dynemo.llm
import
KvIndexer
,
KvMetricsAggregator
,
KvMetricsPublisher
from
dynemo.runtime
import
DistributedRuntime
,
dynemo_endpoint
,
dynemo_worker
from
dynemo.runtime
import
DistributedRuntime
pytestmark
=
pytest
.
mark
.
pre_merge
runtime
=
None
@
pytest
.
fixture
(
scope
=
"module"
,
autouse
=
True
)
def
setup_and_teardown
():
# Setup code
nats_server
=
subprocess
.
Popen
([
"nats-server"
,
"-js"
])
etcd
=
subprocess
.
Popen
([
"etcd"
])
print
(
"Setting up resources"
)
sleep
(
5
)
# wait for nats-server and etcd to start
yield
# Teardown code
print
(
"Tearing down resources"
)
nats_server
.
terminate
()
nats_server
.
wait
()
etcd
.
terminate
()
etcd
.
wait
()
async
def
test_event_handler
():
global
runtime
if
runtime
is
None
:
loop
=
asyncio
.
get_running_loop
()
runtime
=
DistributedRuntime
(
loop
)
namespace
=
"kv_test"
component
=
"event"
# publisher
worker_id
=
233
event_publisher
=
EventPublisher
(
namespace
,
component
,
worker_id
)
# indexer
kv_listener
=
runtime
.
namespace
(
namespace
).
component
(
component
)
await
kv_listener
.
create_service
()
indexer
=
KvIndexer
(
kv_listener
)
test_token
=
[
3
]
*
64
lora_id
=
0
# lora_id is not used in the indexer
scores
=
await
indexer
.
find_matches_for_request
(
test_token
,
lora_id
)
assert
not
scores
.
scores
event_publisher
.
store_event
(
test_token
,
lora_id
)
# wait for the event to be processed as it is sent asynchronously
await
asyncio
.
sleep
(
1
)
scores
=
await
indexer
.
find_matches_for_request
(
test_token
,
lora_id
)
assert
scores
.
scores
assert
worker_id
in
scores
.
scores
assert
scores
.
scores
[
worker_id
]
==
1
# remove event
event_publisher
.
remove_event
()
await
asyncio
.
sleep
(
1
)
scores
=
await
indexer
.
find_matches_for_request
(
test_token
,
lora_id
)
assert
not
scores
.
scores
# KV events
class
DynemoResult
:
class
DynemoResult
:
OK
=
0
OK
=
0
ERR
=
1
ERR
=
1
class
MockEngine
:
class
EventPublisher
:
"""
def
__init__
(
self
,
namespace
:
str
,
component
:
str
,
worker_id
:
int
):
Request handler for the generate endpoint
self
.
event_id_counter
=
0
"""
self
.
block_ids
:
List
[
int
]
=
[]
def
__init__
(
self
,
metrics_publisher
,
worker_id
):
# load event publisher library
self
.
worker_id
=
worker_id
self
.
lib
=
ctypes
.
CDLL
(
os
.
environ
[
"VLLM_KV_CAPI_PATH"
])
# KV events
self
.
lib
=
ctypes
.
CDLL
(
"/opt/dynemo/llm_binding/lib/libdynemo_llm_capi.so"
)
self
.
lib
.
dynemo_llm_init
.
argtypes
=
[
c_char_p
,
c_char_p
,
c_int64
]
self
.
lib
.
dynemo_llm_init
.
argtypes
=
[
c_char_p
,
c_char_p
,
c_int64
]
self
.
lib
.
dynemo_llm_init
.
restype
=
c_uint32
self
.
lib
.
dynemo_llm_init
.
restype
=
c_uint32
result
=
self
.
lib
.
dynemo_llm_init
(
"dynemo"
.
encode
(),
"vllm"
.
encode
(),
worker_id
)
result
=
self
.
lib
.
dynemo_llm_init
(
if
result
==
DynemoResult
.
OK
:
namespace
.
encode
(),
component
.
encode
(),
worker_id
vllm_logger
.
info
(
)
"KVCacheEventManager initialized successfully. Ready to publish KV Cache Events"
assert
result
==
DynemoResult
.
OK
)
else
:
vllm_logger
.
info
(
"KVCacheEventManager initialization failed!"
)
self
.
lib
.
dynemo_kv_event_publish_stored
.
argtypes
=
[
self
.
lib
.
dynemo_kv_event_publish_stored
.
argtypes
=
[
ctypes
.
c_uint64
,
# event_id
ctypes
.
c_uint64
,
# event_id
ctypes
.
POINTER
(
ctypes
.
c_uint32
),
# token_ids
ctypes
.
POINTER
(
ctypes
.
c_uint32
),
# token_ids
...
@@ -71,41 +129,7 @@ class MockEngine:
...
@@ -71,41 +129,7 @@ class MockEngine:
ctypes
.
c_uint32
ctypes
.
c_uint32
)
# dynemo_llm_result_t
)
# dynemo_llm_result_t
# KV metrics
def
store_event
(
self
,
tokens
,
lora_id
):
self
.
metrics_publisher
=
metrics_publisher
self
.
request_active_slots
=
0
self
.
request_total_slots
=
4
self
.
kv_active_block
=
0
self
.
kv_total_blocks
=
4
# [NOTE] Now that the component must has proper metrics reported
# to be properly selected by the router
self
.
metrics_publisher
.
publish
(
self
.
request_active_slots
,
self
.
request_total_slots
,
self
.
kv_active_block
,
self
.
kv_total_blocks
,
)
self
.
event_id_counter
=
0
self
.
tokens
=
[
3
]
*
64
@
dynemo_endpoint
(
Request
,
Response
)
async
def
generate
(
self
,
request
):
print
(
f
"Received request:
{
request
}
"
)
self
.
request_active_slots
=
min
(
self
.
request_active_slots
+
1
,
self
.
request_total_slots
)
self
.
kv_active_block
=
min
(
self
.
kv_active_block
+
1
,
self
.
kv_total_blocks
)
self
.
metrics_publisher
.
publish
(
self
.
request_active_slots
,
self
.
request_total_slots
,
self
.
kv_active_block
,
self
.
kv_total_blocks
,
)
self
.
store_event
()
yield
"Hello, World!"
def
store_event
(
self
):
parent_hash
=
(
parent_hash
=
(
(
ctypes
.
c_uint64
*
1
)(
self
.
event_id_counter
)
(
ctypes
.
c_uint64
*
1
)(
self
.
event_id_counter
)
if
self
.
event_id_counter
>
0
if
self
.
event_id_counter
>
0
...
@@ -113,57 +137,80 @@ class MockEngine:
...
@@ -113,57 +137,80 @@ class MockEngine:
)
)
result
=
self
.
lib
.
dynemo_kv_event_publish_stored
(
result
=
self
.
lib
.
dynemo_kv_event_publish_stored
(
self
.
event_id_counter
,
# uint64_t event_id
self
.
event_id_counter
,
# uint64_t event_id
(
ctypes
.
c_uint32
*
len
(
self
.
tokens
))(
(
ctypes
.
c_uint32
*
len
(
tokens
))(
*
tokens
),
# const uint32_t *token_ids
*
self
.
tokens
(
ctypes
.
c_size_t
*
1
)(
len
(
tokens
)),
# const uintptr_t *num_block_tokens
),
# const uint32_t *token_ids
(
ctypes
.
c_size_t
*
1
)(
len
(
self
.
tokens
)
),
# const uintptr_t *num_block_tokens
(
ctypes
.
c_uint64
*
1
)(
self
.
event_id_counter
),
# const uint64_t *block_ids
(
ctypes
.
c_uint64
*
1
)(
self
.
event_id_counter
),
# const uint64_t *block_ids
1
,
# uintptr_t num_blocks
1
,
# uintptr_t num_blocks
parent_hash
,
# const uint64_t *parent_hash
parent_hash
,
# const uint64_t *parent_hash
0
,
# uint64_t lora_id
lora_id
,
# uint64_t lora_id
)
)
self
.
block_ids
.
append
(
self
.
event_id_counter
)
self
.
event_id_counter
+=
1
self
.
event_id_counter
+=
1
if
result
==
DynemoResult
.
OK
:
assert
result
==
DynemoResult
.
OK
vllm_logger
.
debug
(
f
"Store - Published KV Event:
{
self
.
event_id_counter
}
"
)
else
:
vllm_logger
.
debug
(
f
"Store - Failed to Publish KV Event:
{
self
.
event_id_counter
}
"
)
async
def
cooldown
(
self
):
while
True
:
await
asyncio
.
sleep
(
5
)
self
.
request_active_slots
=
max
(
0
,
self
.
request_active_slots
-
1
)
self
.
kv_active_block
=
max
(
0
,
self
.
kv_active_block
-
1
)
self
.
metrics_publisher
.
publish
(
self
.
request_active_slots
,
self
.
request_total_slots
,
self
.
kv_active_block
,
self
.
kv_total_blocks
,
)
@
dynemo_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
"""
Instantiate a `backend` component and serve the `generate` endpoint
A `Component` can serve multiple endpoints
"""
component
=
runtime
.
namespace
(
"dynemo"
).
component
(
"vllm"
)
metrics_publisher
=
KvMetricsPublisher
()
await
metrics_publisher
.
create_service
(
component
)
endpoint
=
component
.
endpoint
(
"generate"
)
engine
=
MockEngine
(
metrics_publisher
,
endpoint
.
lease_id
())
await
asyncio
.
gather
(
engine
.
cooldown
(),
endpoint
.
serve_endpoint
(
engine
.
generate
),
)
def
remove_event
(
self
):
result
=
self
.
lib
.
dynemo_kv_event_publish_removed
(
self
.
event_id_counter
,
# uint64_t event_id
(
ctypes
.
c_uint64
*
1
)(
self
.
block_ids
[
-
1
]),
# const uint64_t *block_ids
1
,
# uintptr_t num_blocks
)
self
.
event_id_counter
+=
1
if
__name__
==
"__main__"
:
assert
result
==
DynemoResult
.
OK
uvloop
.
install
()
asyncio
.
run
(
worker
())
async
def
test_metrics_aggregator
():
global
runtime
if
runtime
is
None
:
loop
=
asyncio
.
get_running_loop
()
runtime
=
DistributedRuntime
(
loop
)
namespace
=
"kv_test"
component
=
"metrics"
kv_listener
=
runtime
.
namespace
(
namespace
).
component
(
component
)
await
kv_listener
.
create_service
()
# aggregator
metrics_aggregator
=
KvMetricsAggregator
(
kv_listener
)
# has nothing to aggregate as worker has not started
metrics
=
await
metrics_aggregator
.
get_metrics
()
assert
not
metrics
.
endpoints
expected_metrics
=
{
"request_active_slots"
:
0
,
"request_total_slots"
:
1024
,
"kv_active_blocks"
:
523
,
"kv_total_blocks"
:
777
,
}
# need 'create_task' to put publisher task in the background
asyncio
.
create_task
(
metrics_publisher
(
kv_listener
,
expected_metrics
))
# needs time for publisher to spawn up
for
i
in
range
(
10
):
await
asyncio
.
sleep
(
1
)
metrics
=
await
metrics_aggregator
.
get_metrics
()
if
metrics
.
endpoints
:
break
assert
metrics
.
endpoints
for
endpoint
in
metrics
.
endpoints
:
# [TODO] not really checking id for now, can't get it as create_endpoint()
# create and serve the endpoint internally
assert
endpoint
.
worker_id
!=
0
assert
endpoint
.
request_active_slots
==
expected_metrics
[
"request_active_slots"
]
assert
endpoint
.
request_total_slots
==
expected_metrics
[
"request_total_slots"
]
assert
endpoint
.
kv_active_blocks
==
expected_metrics
[
"kv_active_blocks"
]
assert
endpoint
.
kv_total_blocks
==
expected_metrics
[
"kv_total_blocks"
]
async
def
metrics_publisher
(
kv_listener
,
expected_metrics
):
metrics_publisher
=
KvMetricsPublisher
()
metrics_publisher
.
publish
(
expected_metrics
[
"request_active_slots"
],
expected_metrics
[
"request_total_slots"
],
expected_metrics
[
"kv_active_blocks"
],
expected_metrics
[
"kv_total_blocks"
],
)
await
metrics_publisher
.
create_endpoint
(
kv_listener
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment