Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2d39ded6
Unverified
Commit
2d39ded6
authored
Apr 30, 2025
by
Yan Ru Pei
Committed by
GitHub
Apr 30, 2025
Browse files
chore: unified logging, added informative warnings for KV router example (#912)
parent
942a0fb9
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
9 additions
and
6 deletions
+9
-6
examples/llm/components/kv_router.py
examples/llm/components/kv_router.py
+9
-6
No files found.
examples/llm/components/kv_router.py
View file @
2d39ded6
...
...
@@ -23,7 +23,6 @@ from typing import AsyncIterator
from
components.worker
import
VllmWorker
from
utils.logging
import
check_required_workers
from
utils.protocol
import
Tokens
from
vllm.logger
import
logger
as
vllm_logger
from
dynamo.llm
import
AggregatedMetrics
,
KvIndexer
,
KvMetricsAggregator
,
OverlapScores
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
dynamo_endpoint
,
service
...
...
@@ -83,7 +82,7 @@ class Router:
worker
=
depends
(
VllmWorker
)
def
__init__
(
self
):
vllm_
logger
.
info
(
"Initializing Custom Router"
)
logger
.
info
(
"Initializing Custom Router"
)
self
.
args
=
parse_args
(
self
.
__class__
.
__name__
,
""
)
self
.
default_metrics
=
{
...
...
@@ -141,6 +140,8 @@ class Router:
worker_scores
[
worker_id
]
=
(
score
*
self
.
indexer
.
block_size
()
/
token_length
)
else
:
logger
.
warning
(
"Cannot get KV scores"
)
worker_metrics
=
{}
max_waiting
=
0.0
...
...
@@ -154,6 +155,8 @@ class Router:
max_waiting
=
max
(
max_waiting
,
worker_metrics
[
worker_id
][
"num_requests_waiting"
]
)
else
:
logger
.
warning
(
"Cannot get metrics"
)
# Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
# and we want all workers to be considered in the logit calculation
...
...
@@ -175,7 +178,7 @@ class Router:
# Have 1 metric that weights towards cache hit
# 2 metrics that penalize overloaded worker and queuing
worker_logits
[
worker_id
]
=
2
*
score
-
gpu_cache_usage
-
normalized_waiting
vllm_
logger
.
info
(
logger
.
info
(
f
"Formula for
{
worker_id
}
:
{
worker_logits
[
worker_id
]:.
3
f
}
= 2.0 *
{
score
:.
3
f
}
-
{
gpu_cache_usage
:.
3
f
}
-
{
normalized_waiting
:.
3
f
}
"
)
...
...
@@ -204,7 +207,7 @@ class Router:
# Log to vllm_logger
for
message
in
log_messages
:
vllm_
logger
.
info
(
message
)
logger
.
info
(
message
)
return
best_worker_id
,
worker_scores
.
get
(
best_worker_id
,
0.0
)
...
...
@@ -217,14 +220,14 @@ class Router:
)
except
Exception
as
e
:
scores
=
{}
vllm_
logger
.
exception
(
f
"Error finding matches:
{
e
}
"
)
logger
.
exception
(
f
"Error finding matches:
{
e
}
"
)
metrics
=
await
self
.
metrics_aggregator
.
get_metrics
()
worker_id
,
prefix_hit_rate
=
self
.
_cost_function
(
scores
,
metrics
,
len
(
request
.
tokens
)
)
vllm_
logger
.
info
(
logger
.
info
(
f
"Scheduling to worker_id:
{
worker_id
}
with estimated prefix hit rate:
{
prefix_hit_rate
}
"
)
yield
f
"
{
worker_id
}
_
{
prefix_hit_rate
}
"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment