Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
80cac7c1
Unverified
Commit
80cac7c1
authored
Feb 23, 2026
by
Tzu-Ling Kan
Committed by
GitHub
Feb 23, 2026
Browse files
feat: Remove Component from public (#6403)
Signed-off-by:
tzulingk@nvidia.com
<
tzulingk@nvidia.com
>
parent
cb55766c
Changes
42
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
104 additions
and
182 deletions
+104
-182
components/src/dynamo/trtllm/workers/llm_worker.py
components/src/dynamo/trtllm/workers/llm_worker.py
+3
-6
components/src/dynamo/trtllm/workers/video_diffusion_worker.py
...nents/src/dynamo/trtllm/workers/video_diffusion_worker.py
+2
-2
components/src/dynamo/vllm/handlers.py
components/src/dynamo/vllm/handlers.py
+0
-6
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+23
-25
components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
.../vllm/multimodal_handlers/multimodal_pd_worker_handler.py
+1
-3
components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
...nts/src/dynamo/vllm/multimodal_handlers/worker_handler.py
+0
-2
components/src/dynamo/vllm/omni/omni_handler.py
components/src/dynamo/vllm/omni/omni_handler.py
+0
-2
components/src/dynamo/vllm/publisher.py
components/src/dynamo/vllm/publisher.py
+13
-16
components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
...imodal_handlers/test_vllm_multimodal_pd_worker_handler.py
+0
-1
components/src/dynamo/vllm/tests/test_vllm_worker_factory.py
components/src/dynamo/vllm/tests/test_vllm_worker_factory.py
+1
-0
components/src/dynamo/vllm/worker_factory.py
components/src/dynamo/vllm/worker_factory.py
+14
-8
examples/multimodal/components/publisher.py
examples/multimodal/components/publisher.py
+7
-7
examples/multimodal/components/worker.py
examples/multimodal/components/worker.py
+12
-15
lib/bindings/python/examples/error_handling/server.py
lib/bindings/python/examples/error_handling/server.py
+2
-2
lib/bindings/python/examples/hello_world/server.py
lib/bindings/python/examples/hello_world/server.py
+2
-2
lib/bindings/python/examples/typed/server.py
lib/bindings/python/examples/typed/server.py
+2
-2
lib/bindings/python/rust/lib.rs
lib/bindings/python/rust/lib.rs
+0
-30
lib/bindings/python/rust/llm/kv.rs
lib/bindings/python/rust/llm/kv.rs
+12
-9
lib/bindings/python/src/dynamo/_core.pyi
lib/bindings/python/src/dynamo/_core.pyi
+10
-42
lib/bindings/python/src/dynamo/runtime/__init__.py
lib/bindings/python/src/dynamo/runtime/__init__.py
+0
-2
No files found.
components/src/dynamo/trtllm/workers/llm_worker.py
View file @
80cac7c1
...
@@ -336,7 +336,7 @@ async def init_llm_worker(
...
@@ -336,7 +336,7 @@ async def init_llm_worker(
endpoint
=
runtime
.
endpoint
(
endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
endpoint
.
component
()
if
shutdown_endpoints
is
not
None
:
if
shutdown_endpoints
is
not
None
:
shutdown_endpoints
[:]
=
[
endpoint
]
shutdown_endpoints
[:]
=
[
endpoint
]
...
@@ -419,7 +419,6 @@ async def init_llm_worker(
...
@@ -419,7 +419,6 @@ async def init_llm_worker(
# publisher will be set later if publishing is enabled.
# publisher will be set later if publishing is enabled.
handler_config
=
RequestHandlerConfig
(
handler_config
=
RequestHandlerConfig
(
component
=
component
,
engine
=
engine
,
engine
=
engine
,
default_sampling_params
=
default_sampling_params
,
default_sampling_params
=
default_sampling_params
,
publisher
=
None
,
publisher
=
None
,
...
@@ -456,7 +455,6 @@ async def init_llm_worker(
...
@@ -456,7 +455,6 @@ async def init_llm_worker(
if
config
.
publish_events_and_metrics
:
if
config
.
publish_events_and_metrics
:
# Initialize and pass in the publisher to the request handler to
# Initialize and pass in the publisher to the request handler to
# publish events and metrics.
# publish events and metrics.
kv_listener
=
endpoint
.
component
()
# Use model as fallback if served_model_name is not provided
# Use model as fallback if served_model_name is not provided
model_name_for_metrics
=
config
.
served_model_name
or
config
.
model
model_name_for_metrics
=
config
.
served_model_name
or
config
.
model
metrics_labels
=
[
metrics_labels
=
[
...
@@ -476,7 +474,7 @@ async def init_llm_worker(
...
@@ -476,7 +474,7 @@ async def init_llm_worker(
if
consolidator_output_endpoint
:
if
consolidator_output_endpoint
:
# Use the connect endpoint directly (already provided by get_consolidator_endpoints)
# Use the connect endpoint directly (already provided by get_consolidator_endpoints)
consolidator_publisher
=
KvEventPublisher
(
consolidator_publisher
=
KvEventPublisher
(
compone
nt
,
endpoint
=
endpoi
nt
,
kv_block_size
=
config
.
kv_block_size
,
kv_block_size
=
config
.
kv_block_size
,
zmq_endpoint
=
consolidator_output_connect_endpoint
,
zmq_endpoint
=
consolidator_output_connect_endpoint
,
zmq_topic
=
""
,
zmq_topic
=
""
,
...
@@ -487,9 +485,8 @@ async def init_llm_worker(
...
@@ -487,9 +485,8 @@ async def init_llm_worker(
)
)
async
with
get_publisher
(
async
with
get_publisher
(
compone
nt
,
endpoi
nt
,
engine
,
engine
,
kv_listener
,
int
(
endpoint
.
connection_id
()),
int
(
endpoint
.
connection_id
()),
config
.
kv_block_size
,
config
.
kv_block_size
,
metrics_labels
,
metrics_labels
,
...
...
components/src/dynamo/trtllm/workers/video_diffusion_worker.py
View file @
80cac7c1
...
@@ -91,7 +91,7 @@ async def init_video_diffusion_worker(
...
@@ -91,7 +91,7 @@ async def init_video_diffusion_worker(
endpoint
=
runtime
.
endpoint
(
endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
endpoint
.
component
()
if
shutdown_endpoints
is
not
None
:
if
shutdown_endpoints
is
not
None
:
shutdown_endpoints
[:]
=
[
endpoint
]
shutdown_endpoints
[:]
=
[
endpoint
]
...
@@ -100,7 +100,7 @@ async def init_video_diffusion_worker(
...
@@ -100,7 +100,7 @@ async def init_video_diffusion_worker(
await
engine
.
initialize
()
await
engine
.
initialize
()
# Create the request handler
# Create the request handler
handler
=
VideoGenerationHandler
(
component
,
engine
,
diffusion_config
)
handler
=
VideoGenerationHandler
(
engine
,
diffusion_config
)
# Register the model with Dynamo's discovery system
# Register the model with Dynamo's discovery system
model_name
=
config
.
served_model_name
or
config
.
model
model_name
=
config
.
served_model_name
or
config
.
model
...
...
components/src/dynamo/vllm/handlers.py
View file @
80cac7c1
...
@@ -268,7 +268,6 @@ class BaseWorkerHandler(ABC):
...
@@ -268,7 +268,6 @@ class BaseWorkerHandler(ABC):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
,
engine
,
engine
,
default_sampling_params
,
default_sampling_params
,
model_max_len
:
int
|
None
=
None
,
model_max_len
:
int
|
None
=
None
,
...
@@ -280,7 +279,6 @@ class BaseWorkerHandler(ABC):
...
@@ -280,7 +279,6 @@ class BaseWorkerHandler(ABC):
enable_frontend_decoding
:
bool
=
False
,
enable_frontend_decoding
:
bool
=
False
,
):
):
self
.
runtime
=
runtime
self
.
runtime
=
runtime
self
.
component
=
component
self
.
engine_client
=
engine
self
.
engine_client
=
engine
self
.
default_sampling_params
=
default_sampling_params
self
.
default_sampling_params
=
default_sampling_params
self
.
kv_publishers
:
list
[
KvEventPublisher
]
|
None
=
None
self
.
kv_publishers
:
list
[
KvEventPublisher
]
|
None
=
None
...
@@ -1233,7 +1231,6 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -1233,7 +1231,6 @@ class DecodeWorkerHandler(BaseWorkerHandler):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
,
engine
,
engine
,
default_sampling_params
,
default_sampling_params
,
model_max_len
:
int
|
None
=
None
,
model_max_len
:
int
|
None
=
None
,
...
@@ -1246,7 +1243,6 @@ class DecodeWorkerHandler(BaseWorkerHandler):
...
@@ -1246,7 +1243,6 @@ class DecodeWorkerHandler(BaseWorkerHandler):
):
):
super
().
__init__
(
super
().
__init__
(
runtime
,
runtime
,
component
,
engine
,
engine
,
default_sampling_params
,
default_sampling_params
,
model_max_len
,
model_max_len
,
...
@@ -1443,7 +1439,6 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -1443,7 +1439,6 @@ class PrefillWorkerHandler(BaseWorkerHandler):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
,
engine
,
engine
,
default_sampling_params
,
default_sampling_params
,
model_max_len
:
int
|
None
=
None
,
model_max_len
:
int
|
None
=
None
,
...
@@ -1456,7 +1451,6 @@ class PrefillWorkerHandler(BaseWorkerHandler):
...
@@ -1456,7 +1451,6 @@ class PrefillWorkerHandler(BaseWorkerHandler):
):
):
super
().
__init__
(
super
().
__init__
(
runtime
,
runtime
,
component
,
engine
,
engine
,
default_sampling_params
,
default_sampling_params
,
model_max_len
,
model_max_len
,
...
...
components/src/dynamo/vllm/main.py
View file @
80cac7c1
...
@@ -11,6 +11,7 @@ from typing import Optional
...
@@ -11,6 +11,7 @@ from typing import Optional
import
uvloop
import
uvloop
from
prometheus_client
import
REGISTRY
,
CollectorRegistry
,
multiprocess
from
prometheus_client
import
REGISTRY
,
CollectorRegistry
,
multiprocess
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_events
import
ZmqEventPublisher
from
vllm.distributed.kv_events
import
ZmqEventPublisher
from
vllm.entrypoints.cli.serve
import
run_headless
from
vllm.entrypoints.cli.serve
import
run_headless
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
...
@@ -47,7 +48,7 @@ except ImportError:
...
@@ -47,7 +48,7 @@ except ImportError:
MediaFetcher
=
None
MediaFetcher
=
None
MEDIA_DECODER_AVAILABLE
=
False
MEDIA_DECODER_AVAILABLE
=
False
from
dynamo.runtime
import
DistributedRuntime
from
dynamo.runtime
import
DistributedRuntime
,
Endpoint
from
dynamo.runtime.logging
import
configure_dynamo_logging
from
dynamo.runtime.logging
import
configure_dynamo_logging
from
dynamo.vllm.worker_factory
import
WorkerFactory
from
dynamo.vllm.worker_factory
import
WorkerFactory
...
@@ -295,9 +296,8 @@ def setup_metrics_collection(config: Config, generate_endpoint, logger):
...
@@ -295,9 +296,8 @@ def setup_metrics_collection(config: Config, generate_endpoint, logger):
def
setup_kv_event_publisher
(
def
setup_kv_event_publisher
(
config
:
Config
,
config
:
Config
,
component
,
generate_endpoint
:
Endpoint
,
generate_endpoint
,
vllm_config
:
VllmConfig
,
vllm_config
,
consolidator_enabled
:
bool
=
False
,
consolidator_enabled
:
bool
=
False
,
consolidator_port
:
Optional
[
int
]
=
5558
,
consolidator_port
:
Optional
[
int
]
=
5558
,
)
->
Optional
[
KvEventPublisher
]:
)
->
Optional
[
KvEventPublisher
]:
...
@@ -306,7 +306,6 @@ def setup_kv_event_publisher(
...
@@ -306,7 +306,6 @@ def setup_kv_event_publisher(
Creates one publisher per dp_rank since each dp_rank publishes to a different port.
Creates one publisher per dp_rank since each dp_rank publishes to a different port.
Args:
Args:
config: Worker configuration
config: Worker configuration
component: Component for runtime integration
generate_endpoint: Endpoint for worker ID
generate_endpoint: Endpoint for worker ID
vllm_config: vLLM configuration
vllm_config: vLLM configuration
consolidator_enabled: If True, subscribe to kv eventconsolidator's ZMQ endpoint
consolidator_enabled: If True, subscribe to kv eventconsolidator's ZMQ endpoint
...
@@ -355,7 +354,7 @@ def setup_kv_event_publisher(
...
@@ -355,7 +354,7 @@ def setup_kv_event_publisher(
)
)
kv_publisher
=
KvEventPublisher
(
kv_publisher
=
KvEventPublisher
(
component
=
compone
nt
,
endpoint
=
generate_endpoi
nt
,
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
zmq_endpoint
=
zmq_endpoint
,
zmq_endpoint
=
zmq_endpoint
,
zmq_topic
=
""
,
zmq_topic
=
""
,
...
@@ -573,8 +572,9 @@ async def init_prefill(
...
@@ -573,8 +572,9 @@ async def init_prefill(
generate_endpoint
=
runtime
.
endpoint
(
generate_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
generate_endpoint
.
component
()
clear_endpoint
=
runtime
.
endpoint
(
clear_endpoint
=
component
.
endpoint
(
"clear_kv_blocks"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.clear_kv_blocks"
)
# Use pre-created engine if provided (checkpoint mode), otherwise create new
# Use pre-created engine if provided (checkpoint mode), otherwise create new
if
pre_created_engine
is
not
None
:
if
pre_created_engine
is
not
None
:
...
@@ -596,7 +596,6 @@ async def init_prefill(
...
@@ -596,7 +596,6 @@ async def init_prefill(
handler
=
PrefillWorkerHandler
(
handler
=
PrefillWorkerHandler
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
default_sampling_params
,
default_sampling_params
,
getattr
(
getattr
(
vllm_config
,
"model_config"
,
None
),
"max_model_len"
,
None
),
getattr
(
getattr
(
vllm_config
,
"model_config"
,
None
),
"max_model_len"
,
None
),
...
@@ -627,7 +626,6 @@ async def init_prefill(
...
@@ -627,7 +626,6 @@ async def init_prefill(
# If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
# If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
kv_publishers
=
setup_kv_event_publisher
(
kv_publishers
=
setup_kv_event_publisher
(
config
,
config
,
component
,
generate_endpoint
,
generate_endpoint
,
vllm_config
,
vllm_config
,
consolidator_enabled
=
consolidator_enabled
,
consolidator_enabled
=
consolidator_enabled
,
...
@@ -717,8 +715,9 @@ async def init(
...
@@ -717,8 +715,9 @@ async def init(
generate_endpoint
=
runtime
.
endpoint
(
generate_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
generate_endpoint
.
component
()
clear_endpoint
=
runtime
.
endpoint
(
clear_endpoint
=
component
.
endpoint
(
"clear_kv_blocks"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.clear_kv_blocks"
)
shutdown_endpoints
[:]
=
[
shutdown_endpoints
[:]
=
[
generate_endpoint
,
generate_endpoint
,
...
@@ -727,9 +726,15 @@ async def init(
...
@@ -727,9 +726,15 @@ async def init(
lora_enabled
=
config
.
engine_args
.
enable_lora
lora_enabled
=
config
.
engine_args
.
enable_lora
if
lora_enabled
:
if
lora_enabled
:
load_lora_endpoint
=
component
.
endpoint
(
"load_lora"
)
load_lora_endpoint
=
runtime
.
endpoint
(
unload_lora_endpoint
=
component
.
endpoint
(
"unload_lora"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.load_lora"
list_loras_endpoint
=
component
.
endpoint
(
"list_loras"
)
)
unload_lora_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.unload_lora"
)
list_loras_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.list_loras"
)
shutdown_endpoints
.
extend
(
shutdown_endpoints
.
extend
(
[
[
...
@@ -739,8 +744,6 @@ async def init(
...
@@ -739,8 +744,6 @@ async def init(
]
]
)
)
model_name
=
config
.
served_model_name
or
config
.
model
# Use pre-created engine if provided (checkpoint mode), otherwise create new
# Use pre-created engine if provided (checkpoint mode), otherwise create new
if
pre_created_engine
is
not
None
:
if
pre_created_engine
is
not
None
:
(
(
...
@@ -752,19 +755,17 @@ async def init(
...
@@ -752,19 +755,17 @@ async def init(
)
=
pre_created_engine
)
=
pre_created_engine
# Factory is created after unpack so component_gauges is available
# Factory is created after unpack so component_gauges is available
factory
=
StatLoggerFactory
(
factory
=
StatLoggerFactory
(
compone
nt
,
endpoint
=
generate_endpoi
nt
,
component_gauges
=
component_gauges
,
component_gauges
=
component_gauges
,
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
,
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
,
metrics_labels
=
[(
"model"
,
model_name
)],
)
)
else
:
else
:
# Factory is created without component_gauges; setup_vllm_engine() will
# Factory is created without component_gauges; setup_vllm_engine() will
# create the gauges after setup_multiprocess_prometheus() and set them
# create the gauges after setup_multiprocess_prometheus() and set them
# on the factory before vLLM calls create_stat_logger().
# on the factory before vLLM calls create_stat_logger().
factory
=
StatLoggerFactory
(
factory
=
StatLoggerFactory
(
compone
nt
,
endpoint
=
generate_endpoi
nt
,
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
,
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
,
metrics_labels
=
[(
"model"
,
model_name
)],
)
)
(
(
engine_client
,
engine_client
,
...
@@ -780,7 +781,6 @@ async def init(
...
@@ -780,7 +781,6 @@ async def init(
handler
=
DecodeWorkerHandler
(
handler
=
DecodeWorkerHandler
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
default_sampling_params
,
default_sampling_params
,
getattr
(
getattr
(
vllm_config
,
"model_config"
,
None
),
"max_model_len"
,
None
),
getattr
(
getattr
(
vllm_config
,
"model_config"
,
None
),
"max_model_len"
,
None
),
...
@@ -811,7 +811,6 @@ async def init(
...
@@ -811,7 +811,6 @@ async def init(
# If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
# If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
kv_publishers
=
setup_kv_event_publisher
(
kv_publishers
=
setup_kv_event_publisher
(
config
,
config
,
component
,
generate_endpoint
,
generate_endpoint
,
vllm_config
,
vllm_config
,
consolidator_enabled
=
consolidator_enabled
,
consolidator_enabled
=
consolidator_enabled
,
...
@@ -957,7 +956,7 @@ async def init_omni(
...
@@ -957,7 +956,7 @@ async def init_omni(
generate_endpoint
=
runtime
.
endpoint
(
generate_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
generate_endpoint
.
component
()
shutdown_endpoints
[:]
=
[
generate_endpoint
]
shutdown_endpoints
[:]
=
[
generate_endpoint
]
# Initialize media filesystem for storing generated images/videos
# Initialize media filesystem for storing generated images/videos
...
@@ -968,7 +967,6 @@ async def init_omni(
...
@@ -968,7 +967,6 @@ async def init_omni(
# Initialize unified OmniHandler
# Initialize unified OmniHandler
handler
=
OmniHandler
(
handler
=
OmniHandler
(
runtime
=
runtime
,
runtime
=
runtime
,
component
=
component
,
config
=
config
,
config
=
config
,
default_sampling_params
=
{},
default_sampling_params
=
{},
shutdown_event
=
shutdown_event
,
shutdown_event
=
shutdown_event
,
...
...
components/src/dynamo/vllm/multimodal_handlers/multimodal_pd_worker_handler.py
View file @
80cac7c1
...
@@ -20,7 +20,7 @@ from dynamo.common.multimodal.embedding_transfer import (
...
@@ -20,7 +20,7 @@ from dynamo.common.multimodal.embedding_transfer import (
LocalEmbeddingReceiver
,
LocalEmbeddingReceiver
,
NixlPersistentEmbeddingReceiver
,
NixlPersistentEmbeddingReceiver
,
)
)
from
dynamo.runtime
import
Client
,
Component
,
DistributedRuntime
from
dynamo.runtime
import
Client
,
DistributedRuntime
from
..args
import
Config
from
..args
import
Config
from
..handlers
import
BaseWorkerHandler
,
build_sampling_params
from
..handlers
import
BaseWorkerHandler
,
build_sampling_params
...
@@ -44,7 +44,6 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
...
@@ -44,7 +44,6 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
:
Component
,
engine_client
:
AsyncLLM
,
engine_client
:
AsyncLLM
,
config
:
Config
,
config
:
Config
,
encode_worker_client
:
Client
|
None
=
None
,
encode_worker_client
:
Client
|
None
=
None
,
...
@@ -60,7 +59,6 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
...
@@ -60,7 +59,6 @@ class MultimodalPDWorkerHandler(BaseWorkerHandler):
# Call BaseWorkerHandler.__init__ with proper parameters
# Call BaseWorkerHandler.__init__ with proper parameters
super
().
__init__
(
super
().
__init__
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
default_sampling_params
,
default_sampling_params
,
enable_multimodal
=
config
.
enable_multimodal
,
enable_multimodal
=
config
.
enable_multimodal
,
...
...
components/src/dynamo/vllm/multimodal_handlers/worker_handler.py
View file @
80cac7c1
...
@@ -22,7 +22,6 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
...
@@ -22,7 +22,6 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
config
:
Config
,
config
:
Config
,
shutdown_event
=
None
,
shutdown_event
=
None
,
...
@@ -36,7 +35,6 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
...
@@ -36,7 +35,6 @@ class MultimodalDecodeWorkerHandler(BaseWorkerHandler):
# Call BaseWorkerHandler.__init__ with proper parameters
# Call BaseWorkerHandler.__init__ with proper parameters
super
().
__init__
(
super
().
__init__
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
default_sampling_params
,
default_sampling_params
,
enable_multimodal
=
config
.
enable_multimodal
,
enable_multimodal
=
config
.
enable_multimodal
,
...
...
components/src/dynamo/vllm/omni/omni_handler.py
View file @
80cac7c1
...
@@ -68,7 +68,6 @@ class OmniHandler(BaseOmniHandler):
...
@@ -68,7 +68,6 @@ class OmniHandler(BaseOmniHandler):
def
__init__
(
def
__init__
(
self
,
self
,
runtime
,
runtime
,
component
,
config
,
config
,
default_sampling_params
:
Dict
[
str
,
Any
],
default_sampling_params
:
Dict
[
str
,
Any
],
shutdown_event
:
asyncio
.
Event
|
None
=
None
,
shutdown_event
:
asyncio
.
Event
|
None
=
None
,
...
@@ -88,7 +87,6 @@ class OmniHandler(BaseOmniHandler):
...
@@ -88,7 +87,6 @@ class OmniHandler(BaseOmniHandler):
"""
"""
super
().
__init__
(
super
().
__init__
(
runtime
=
runtime
,
runtime
=
runtime
,
component
=
component
,
config
=
config
,
config
=
config
,
default_sampling_params
=
default_sampling_params
,
default_sampling_params
=
default_sampling_params
,
shutdown_event
=
shutdown_event
,
shutdown_event
=
shutdown_event
,
...
...
components/src/dynamo/vllm/publisher.py
View file @
80cac7c1
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
asyncio
import
asyncio
import
logging
import
logging
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
from
prometheus_client
import
CollectorRegistry
from
prometheus_client
import
CollectorRegistry
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
...
@@ -12,7 +12,7 @@ from vllm.v1.metrics.stats import IterationStats, SchedulerStats
...
@@ -12,7 +12,7 @@ from vllm.v1.metrics.stats import IterationStats, SchedulerStats
from
dynamo.common.utils.prometheus
import
LLMBackendMetrics
from
dynamo.common.utils.prometheus
import
LLMBackendMetrics
from
dynamo.llm
import
WorkerMetricsPublisher
from
dynamo.llm
import
WorkerMetricsPublisher
from
dynamo.runtime
import
Compone
nt
from
dynamo.runtime
import
Endpoi
nt
# Create a dedicated registry for dynamo_component metrics
# Create a dedicated registry for dynamo_component metrics
# This ensures these metrics are isolated and can be exposed via their own callback
# This ensures these metrics are isolated and can be exposed via their own callback
...
@@ -42,15 +42,14 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
...
@@ -42,15 +42,14 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
def
__init__
(
def
__init__
(
self
,
self
,
component
:
Component
,
endpoint
:
Endpoint
,
dp_rank
:
int
,
dp_rank
:
int
=
0
,
component_gauges
:
LLMBackendMetrics
,
component_gauges
:
Optional
[
LLMBackendMetrics
]
=
None
,
metrics_labels
:
Optional
[
List
[
Tuple
[
str
,
str
]]]
=
None
,
)
->
None
:
)
->
None
:
self
.
inner
=
WorkerMetricsPublisher
()
self
.
inner
=
WorkerMetricsPublisher
()
self
.
_
component
=
compone
nt
self
.
_
endpoint
=
endpoi
nt
self
.
dp_rank
=
dp_rank
self
.
dp_rank
=
dp_rank
self
.
component_gauges
=
component_gauges
self
.
component_gauges
=
component_gauges
or
LLMBackendMetrics
()
self
.
num_gpu_block
=
1
self
.
num_gpu_block
=
1
# Schedule async endpoint creation
# Schedule async endpoint creation
self
.
_endpoint_task
=
asyncio
.
create_task
(
self
.
_create_endpoint
())
self
.
_endpoint_task
=
asyncio
.
create_task
(
self
.
_create_endpoint
())
...
@@ -58,7 +57,7 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
...
@@ -58,7 +57,7 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
async
def
_create_endpoint
(
self
)
->
None
:
async
def
_create_endpoint
(
self
)
->
None
:
"""Create the NATS endpoint asynchronously."""
"""Create the NATS endpoint asynchronously."""
try
:
try
:
await
self
.
inner
.
create_endpoint
(
self
.
_
compone
nt
)
await
self
.
inner
.
create_endpoint
(
self
.
_
endpoi
nt
)
logging
.
debug
(
"vLLM metrics publisher endpoint created"
)
logging
.
debug
(
"vLLM metrics publisher endpoint created"
)
except
Exception
:
except
Exception
:
logging
.
exception
(
"Failed to create vLLM metrics publisher endpoint"
)
logging
.
exception
(
"Failed to create vLLM metrics publisher endpoint"
)
...
@@ -105,16 +104,14 @@ class StatLoggerFactory:
...
@@ -105,16 +104,14 @@ class StatLoggerFactory:
def
__init__
(
def
__init__
(
self
,
self
,
component
:
Compone
nt
,
endpoint
:
Endpoi
nt
,
component_gauges
:
Optional
[
LLMBackendMetrics
]
=
None
,
component_gauges
:
Optional
[
LLMBackendMetrics
]
=
None
,
dp_rank
:
int
=
0
,
dp_rank
:
int
=
0
,
metrics_labels
:
Optional
[
List
[
Tuple
[
str
,
str
]]]
=
None
,
)
->
None
:
)
->
None
:
self
.
component
=
compone
nt
self
.
endpoint
=
endpoi
nt
self
.
component_gauges
=
component_gauges
self
.
component_gauges
=
component_gauges
self
.
created_logger
:
Optional
[
DynamoStatLoggerPublisher
]
=
None
self
.
created_logger
:
Optional
[
DynamoStatLoggerPublisher
]
=
None
self
.
dp_rank
=
dp_rank
self
.
dp_rank
=
dp_rank
self
.
metrics_labels
=
metrics_labels
or
[]
def
create_stat_logger
(
self
,
dp_rank
:
int
)
->
StatLoggerBase
:
def
create_stat_logger
(
self
,
dp_rank
:
int
)
->
StatLoggerBase
:
if
self
.
dp_rank
!=
dp_rank
:
if
self
.
dp_rank
!=
dp_rank
:
...
@@ -124,11 +121,11 @@ class StatLoggerFactory:
...
@@ -124,11 +121,11 @@ class StatLoggerFactory:
assert
(
assert
(
self
.
component_gauges
is
not
None
self
.
component_gauges
is
not
None
),
"component_gauges must be set before creating stat loggers"
),
"component_gauges must be set before creating stat loggers"
logger
=
DynamoStatLoggerPublisher
(
logger
=
DynamoStatLoggerPublisher
(
self
.
compone
nt
,
endpoint
=
self
.
endpoi
nt
,
dp_rank
,
dp_rank
=
dp_rank
,
component_gauges
=
self
.
component_gauges
,
component_gauges
=
self
.
component_gauges
,
metrics_labels
=
self
.
metrics_labels
,
)
)
self
.
created_logger
=
logger
self
.
created_logger
=
logger
...
...
components/src/dynamo/vllm/tests/multimodal_handlers/test_vllm_multimodal_pd_worker_handler.py
View file @
80cac7c1
...
@@ -62,7 +62,6 @@ def _make_handler(
...
@@ -62,7 +62,6 @@ def _make_handler(
with
patch
.
object
(
mod
.
BaseWorkerHandler
,
"__init__"
,
return_value
=
None
):
with
patch
.
object
(
mod
.
BaseWorkerHandler
,
"__init__"
,
return_value
=
None
):
return
mod
.
MultimodalPDWorkerHandler
(
return
mod
.
MultimodalPDWorkerHandler
(
runtime
=
MagicMock
(),
runtime
=
MagicMock
(),
component
=
MagicMock
(),
engine_client
=
MagicMock
(),
engine_client
=
MagicMock
(),
config
=
config
,
config
=
config
,
encode_worker_client
=
encode_worker_client
,
encode_worker_client
=
encode_worker_client
,
...
...
components/src/dynamo/vllm/tests/test_vllm_worker_factory.py
View file @
80cac7c1
...
@@ -105,6 +105,7 @@ class TestCreate:
...
@@ -105,6 +105,7 @@ class TestCreate:
Mock
(),
Mock
(),
Mock
(),
Mock
(),
"/tmp/prometheus"
,
"/tmp/prometheus"
,
Mock
(),
)
)
await
factory
.
create
(
await
factory
.
create
(
...
...
components/src/dynamo/vllm/worker_factory.py
View file @
80cac7c1
...
@@ -100,14 +100,22 @@ class WorkerFactory:
...
@@ -100,14 +100,22 @@ class WorkerFactory:
generate_endpoint
=
runtime
.
endpoint
(
generate_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
generate_endpoint
.
component
()
clear_endpoint
=
runtime
.
endpoint
(
clear_endpoint
=
component
.
endpoint
(
"clear_kv_blocks"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.clear_kv_blocks"
)
shutdown_endpoints
[:]
=
[
generate_endpoint
,
clear_endpoint
]
shutdown_endpoints
[:]
=
[
generate_endpoint
,
clear_endpoint
]
lora_enabled
=
config
.
engine_args
.
enable_lora
lora_enabled
=
config
.
engine_args
.
enable_lora
if
lora_enabled
:
if
lora_enabled
:
load_lora_endpoint
=
component
.
endpoint
(
"load_lora"
)
load_lora_endpoint
=
runtime
.
endpoint
(
unload_lora_endpoint
=
component
.
endpoint
(
"unload_lora"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.load_lora"
list_loras_endpoint
=
component
.
endpoint
(
"list_loras"
)
)
unload_lora_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.unload_lora"
)
list_loras_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.list_loras"
)
shutdown_endpoints
.
extend
(
shutdown_endpoints
.
extend
(
[
load_lora_endpoint
,
unload_lora_endpoint
,
list_loras_endpoint
]
[
load_lora_endpoint
,
unload_lora_endpoint
,
list_loras_endpoint
]
)
)
...
@@ -152,7 +160,6 @@ class WorkerFactory:
...
@@ -152,7 +160,6 @@ class WorkerFactory:
if
config
.
multimodal_decode_worker
:
if
config
.
multimodal_decode_worker
:
handler
=
MultimodalDecodeWorkerHandler
(
handler
=
MultimodalDecodeWorkerHandler
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
config
,
config
,
shutdown_event
,
shutdown_event
,
...
@@ -161,7 +168,6 @@ class WorkerFactory:
...
@@ -161,7 +168,6 @@ class WorkerFactory:
else
:
else
:
handler
=
MultimodalPDWorkerHandler
(
handler
=
MultimodalPDWorkerHandler
(
runtime
,
runtime
,
component
,
engine_client
,
engine_client
,
config
,
config
,
encode_worker_client
,
encode_worker_client
,
...
@@ -175,7 +181,7 @@ class WorkerFactory:
...
@@ -175,7 +181,7 @@ class WorkerFactory:
# Set up KV event publisher for prefix caching if enabled
# Set up KV event publisher for prefix caching if enabled
kv_publisher
=
self
.
setup_kv_event_publisher
(
kv_publisher
=
self
.
setup_kv_event_publisher
(
config
,
component
,
generate_endpoint
,
vllm_config
config
,
generate_endpoint
,
vllm_config
)
)
if
kv_publisher
:
if
kv_publisher
:
handler
.
kv_publisher
=
kv_publisher
handler
.
kv_publisher
=
kv_publisher
...
...
examples/multimodal/components/publisher.py
View file @
80cac7c1
...
@@ -22,7 +22,7 @@ from vllm.v1.metrics.loggers import StatLoggerBase
...
@@ -22,7 +22,7 @@ from vllm.v1.metrics.loggers import StatLoggerBase
from
vllm.v1.metrics.stats
import
IterationStats
,
SchedulerStats
from
vllm.v1.metrics.stats
import
IterationStats
,
SchedulerStats
from
dynamo.llm
import
WorkerMetricsPublisher
from
dynamo.llm
import
WorkerMetricsPublisher
from
dynamo.runtime
import
Compone
nt
from
dynamo.runtime
import
Endpoi
nt
class
NullStatLogger
(
StatLoggerBase
):
class
NullStatLogger
(
StatLoggerBase
):
...
@@ -48,11 +48,11 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
...
@@ -48,11 +48,11 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
def
__init__
(
def
__init__
(
self
,
self
,
component
:
Compone
nt
,
endpoint
:
Endpoi
nt
,
dp_rank
:
int
,
dp_rank
:
int
,
)
->
None
:
)
->
None
:
self
.
inner
=
WorkerMetricsPublisher
()
self
.
inner
=
WorkerMetricsPublisher
()
self
.
_
component
=
compone
nt
self
.
_
endpoint
=
endpoi
nt
self
.
dp_rank
=
dp_rank
self
.
dp_rank
=
dp_rank
self
.
num_gpu_block
=
1
self
.
num_gpu_block
=
1
# Schedule async endpoint creation
# Schedule async endpoint creation
...
@@ -61,7 +61,7 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
...
@@ -61,7 +61,7 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
async
def
_create_endpoint
(
self
)
->
None
:
async
def
_create_endpoint
(
self
)
->
None
:
"""Create the NATS endpoint asynchronously."""
"""Create the NATS endpoint asynchronously."""
try
:
try
:
await
self
.
inner
.
create_endpoint
(
self
.
_
compone
nt
)
await
self
.
inner
.
create_endpoint
(
self
.
_
endpoi
nt
)
logging
.
debug
(
"Multimodal metrics publisher endpoint created"
)
logging
.
debug
(
"Multimodal metrics publisher endpoint created"
)
except
Exception
:
except
Exception
:
logging
.
exception
(
"Failed to create multimodal metrics publisher endpoint"
)
logging
.
exception
(
"Failed to create multimodal metrics publisher endpoint"
)
...
@@ -94,11 +94,11 @@ class StatLoggerFactory:
...
@@ -94,11 +94,11 @@ class StatLoggerFactory:
def
__init__
(
def
__init__
(
self
,
self
,
component
:
Compone
nt
,
endpoint
:
Endpoi
nt
,
dp_rank
:
int
=
0
,
dp_rank
:
int
=
0
,
metrics_labels
:
Optional
[
List
[
Tuple
[
str
,
str
]]]
=
None
,
metrics_labels
:
Optional
[
List
[
Tuple
[
str
,
str
]]]
=
None
,
)
->
None
:
)
->
None
:
self
.
component
=
compone
nt
self
.
endpoint
=
endpoi
nt
self
.
created_logger
:
Optional
[
DynamoStatLoggerPublisher
]
=
None
self
.
created_logger
:
Optional
[
DynamoStatLoggerPublisher
]
=
None
self
.
dp_rank
=
dp_rank
self
.
dp_rank
=
dp_rank
self
.
metrics_labels
=
metrics_labels
or
[]
self
.
metrics_labels
=
metrics_labels
or
[]
...
@@ -106,7 +106,7 @@ class StatLoggerFactory:
...
@@ -106,7 +106,7 @@ class StatLoggerFactory:
def
create_stat_logger
(
self
,
dp_rank
:
int
)
->
StatLoggerBase
:
def
create_stat_logger
(
self
,
dp_rank
:
int
)
->
StatLoggerBase
:
if
self
.
dp_rank
!=
dp_rank
:
if
self
.
dp_rank
!=
dp_rank
:
return
NullStatLogger
()
return
NullStatLogger
()
logger
=
DynamoStatLoggerPublisher
(
self
.
compone
nt
,
dp_rank
)
logger
=
DynamoStatLoggerPublisher
(
self
.
endpoi
nt
,
dp_rank
)
self
.
created_logger
=
logger
self
.
created_logger
=
logger
return
logger
return
logger
...
...
examples/multimodal/components/worker.py
View file @
80cac7c1
...
@@ -24,7 +24,7 @@ from vllm.v1.engine.async_llm import AsyncLLM
...
@@ -24,7 +24,7 @@ from vllm.v1.engine.async_llm import AsyncLLM
import
dynamo.nixl_connect
as
connect
import
dynamo.nixl_connect
as
connect
from
dynamo.llm
import
KvEventPublisher
from
dynamo.llm
import
KvEventPublisher
from
dynamo.runtime
import
Component
,
DistributedRuntime
,
Endpoint
,
dynamo_worker
from
dynamo.runtime
import
DistributedRuntime
,
Endpoint
,
dynamo_worker
from
dynamo.runtime.logging
import
configure_dynamo_logging
from
dynamo.runtime.logging
import
configure_dynamo_logging
sys
.
path
.
append
(
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
".."
))
sys
.
path
.
append
(
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)),
".."
))
...
@@ -104,7 +104,6 @@ class VllmBaseWorker:
...
@@ -104,7 +104,6 @@ class VllmBaseWorker:
def
__init__
(
def
__init__
(
self
,
self
,
args
:
argparse
.
Namespace
,
args
:
argparse
.
Namespace
,
component
:
Component
,
endpoint
:
Endpoint
,
endpoint
:
Endpoint
,
config
:
Config
,
config
:
Config
,
):
):
...
@@ -113,15 +112,15 @@ class VllmBaseWorker:
...
@@ -113,15 +112,15 @@ class VllmBaseWorker:
self
.
downstream_endpoint
=
args
.
downstream_endpoint
self
.
downstream_endpoint
=
args
.
downstream_endpoint
self
.
engine_args
=
config
.
engine_args
self
.
engine_args
=
config
.
engine_args
self
.
config
=
config
self
.
config
=
config
self
.
setup_vllm_engine
(
component
,
endpoint
)
self
.
setup_vllm_engine
(
endpoint
)
async
def
async_init
(
self
,
runtime
:
DistributedRuntime
):
async
def
async_init
(
self
,
runtime
:
DistributedRuntime
):
pass
pass
def
setup_vllm_engine
(
self
,
component
:
Component
,
endpoint
:
Endpoint
):
def
setup_vllm_engine
(
self
,
endpoint
:
Endpoint
):
"""Initialize the vLLM engine.
"""Initialize the vLLM engine.
This method sets up the vLLM engine client, and configures the dynamo-aware KV
This method sets up the vLLM engine client, and configures the dynamo-aware KV
event publisher and metrics stats logger based on
component and
endpoint.
event publisher and metrics stats logger based on endpoint.
"""
"""
os
.
environ
[
"VLLM_NO_USAGE_STATS"
]
=
"1"
# Avoid internal HTTP requests
os
.
environ
[
"VLLM_NO_USAGE_STATS"
]
=
"1"
# Avoid internal HTTP requests
...
@@ -138,9 +137,8 @@ class VllmBaseWorker:
...
@@ -138,9 +137,8 @@ class VllmBaseWorker:
# Create vLLM engine with metrics logger and KV event publisher attached
# Create vLLM engine with metrics logger and KV event publisher attached
self
.
stats_logger
=
StatLoggerFactory
(
self
.
stats_logger
=
StatLoggerFactory
(
component
,
endpoint
=
endpoint
,
self
.
engine_args
.
data_parallel_rank
or
0
,
dp_rank
=
self
.
engine_args
.
data_parallel_rank
or
0
,
metrics_labels
=
[(
"model"
,
self
.
config
.
model
)],
)
)
self
.
engine_client
=
AsyncLLM
.
from_vllm_config
(
self
.
engine_client
=
AsyncLLM
.
from_vllm_config
(
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
...
@@ -164,7 +162,7 @@ class VllmBaseWorker:
...
@@ -164,7 +162,7 @@ class VllmBaseWorker:
).
replace
(
"*"
,
"127.0.0.1"
)
).
replace
(
"*"
,
"127.0.0.1"
)
self
.
kv_publisher
=
KvEventPublisher
(
self
.
kv_publisher
=
KvEventPublisher
(
component
=
compone
nt
,
endpoint
=
endpoi
nt
,
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
kv_block_size
=
vllm_config
.
cache_config
.
block_size
,
zmq_endpoint
=
zmq_endpoint
,
zmq_endpoint
=
zmq_endpoint
,
)
)
...
@@ -435,15 +433,14 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
...
@@ -435,15 +433,14 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
generate_endpoint
=
runtime
.
endpoint
(
generate_endpoint
=
runtime
.
endpoint
(
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.
{
config
.
endpoint
}
"
)
)
component
=
generate_endpoint
.
component
()
clear_endpoint
=
runtime
.
endpoint
(
clear_endpoint
=
component
.
endpoint
(
"clear_kv_blocks"
)
f
"
{
config
.
namespace
}
.
{
config
.
component
}
.clear_kv_blocks"
)
if
args
.
worker_type
in
[
"prefill"
,
"encode_prefill"
]:
if
args
.
worker_type
in
[
"prefill"
,
"encode_prefill"
]:
handler
:
VllmBaseWorker
=
VllmPDWorker
(
handler
:
VllmBaseWorker
=
VllmPDWorker
(
args
,
generate_endpoint
,
config
)
args
,
component
,
generate_endpoint
,
config
)
elif
args
.
worker_type
==
"decode"
:
elif
args
.
worker_type
==
"decode"
:
handler
=
VllmDecodeWorker
(
args
,
component
,
generate_endpoint
,
config
)
handler
=
VllmDecodeWorker
(
args
,
generate_endpoint
,
config
)
await
handler
.
async_init
(
runtime
)
await
handler
.
async_init
(
runtime
)
logger
.
info
(
f
"Starting to serve the
{
args
.
endpoint
}
endpoint..."
)
logger
.
info
(
f
"Starting to serve the
{
args
.
endpoint
}
endpoint..."
)
...
...
lib/bindings/python/examples/error_handling/server.py
View file @
80cac7c1
...
@@ -40,8 +40,8 @@ async def worker(runtime: DistributedRuntime):
...
@@ -40,8 +40,8 @@ async def worker(runtime: DistributedRuntime):
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
"""
"""
Instantiate a `backend` component
and serve the `generate` endpoint
Create
and serve the `generate` endpoint
using the distributed runtime.
A `Compone
nt
`
can serve
multiple endpoints
Multiple endpoi
nt
s
can
be
serve
d from a single worker.
"""
"""
endpoint
=
runtime
.
endpoint
(
f
"
{
ns
}
.backend.generate"
)
endpoint
=
runtime
.
endpoint
(
f
"
{
ns
}
.backend.generate"
)
print
(
"Started server instance"
)
print
(
"Started server instance"
)
...
...
lib/bindings/python/examples/hello_world/server.py
View file @
80cac7c1
...
@@ -45,8 +45,8 @@ async def graceful_shutdown(runtime: DistributedRuntime):
...
@@ -45,8 +45,8 @@ async def graceful_shutdown(runtime: DistributedRuntime):
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
async
def
init
(
runtime
:
DistributedRuntime
,
ns
:
str
):
"""
"""
Instantiate a `backend` component
and serve the `generate` endpoint
Create
and serve the `generate` endpoint
using the distributed runtime.
A `Compone
nt
`
can serve
multiple endpoints
Multiple endpoi
nt
s
can
be
serve
d from a single worker.
"""
"""
endpoint
=
runtime
.
endpoint
(
f
"
{
ns
}
.backend.generate"
)
endpoint
=
runtime
.
endpoint
(
f
"
{
ns
}
.backend.generate"
)
print
(
"Started server instance"
)
print
(
"Started server instance"
)
...
...
lib/bindings/python/examples/typed/server.py
View file @
80cac7c1
...
@@ -38,8 +38,8 @@ class RequestHandler:
...
@@ -38,8 +38,8 @@ class RequestHandler:
@
dynamo_worker
()
@
dynamo_worker
()
async
def
worker
(
runtime
:
DistributedRuntime
):
async
def
worker
(
runtime
:
DistributedRuntime
):
"""
"""
Instantiate a `backend` component
and serve the `generate` endpoint
Create
and serve the `generate` endpoint
using the distributed runtime.
A `Compone
nt
`
can serve
multiple endpoints
Multiple endpoi
nt
s
can
be
serve
d from a single worker.
"""
"""
endpoint
=
runtime
.
endpoint
(
"dynamo.backend.generate"
)
endpoint
=
runtime
.
endpoint
(
"dynamo.backend.generate"
)
await
endpoint
.
serve_endpoint
(
RequestHandler
().
generate
)
await
endpoint
.
serve_endpoint
(
RequestHandler
().
generate
)
...
...
lib/bindings/python/rust/lib.rs
View file @
80cac7c1
...
@@ -150,7 +150,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
...
@@ -150,7 +150,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m
.add_function
(
wrap_pyfunction!
(
llm
::
entrypoint
::
run_input
,
m
)
?
)
?
;
m
.add_function
(
wrap_pyfunction!
(
llm
::
entrypoint
::
run_input
,
m
)
?
)
?
;
m
.add_class
::
<
DistributedRuntime
>
()
?
;
m
.add_class
::
<
DistributedRuntime
>
()
?
;
m
.add_class
::
<
Component
>
()
?
;
m
.add_class
::
<
Endpoint
>
()
?
;
m
.add_class
::
<
Endpoint
>
()
?
;
m
.add_class
::
<
ModelCardInstanceId
>
()
?
;
m
.add_class
::
<
ModelCardInstanceId
>
()
?
;
m
.add_class
::
<
Client
>
()
?
;
m
.add_class
::
<
Client
>
()
?
;
...
@@ -460,13 +459,6 @@ struct CancellationToken {
...
@@ -460,13 +459,6 @@ struct CancellationToken {
inner
:
rs
::
CancellationToken
,
inner
:
rs
::
CancellationToken
,
}
}
#[pyclass]
#[derive(Clone)]
struct
Component
{
inner
:
rs
::
component
::
Component
,
event_loop
:
PyObject
,
}
#[pyclass]
#[pyclass]
#[derive(Clone)]
#[derive(Clone)]
struct
Endpoint
{
struct
Endpoint
{
...
@@ -774,17 +766,6 @@ impl DistributedRuntime {
...
@@ -774,17 +766,6 @@ impl DistributedRuntime {
}
}
}
}
#[pymethods]
impl
Component
{
fn
endpoint
(
&
self
,
name
:
String
)
->
PyResult
<
Endpoint
>
{
let
inner
=
self
.inner
.endpoint
(
name
);
Ok
(
Endpoint
{
inner
,
event_loop
:
self
.event_loop
.clone
(),
})
}
}
#[pymethods]
#[pymethods]
impl
Endpoint
{
impl
Endpoint
{
#[pyo3(signature
=
(generator,
graceful_shutdown
=
true
,
metrics_labels
=
None,
health_check_payload
=
None))]
#[pyo3(signature
=
(generator,
graceful_shutdown
=
true
,
metrics_labels
=
None,
health_check_payload
=
None))]
...
@@ -907,17 +888,6 @@ impl Endpoint {
...
@@ -907,17 +888,6 @@ impl Endpoint {
Ok
(())
Ok
(())
})
})
}
}
/// Get the parent Component.
///
/// Note: To avoid duplicate metrics registries, reuse the returned Component for
/// multiple endpoints: `component.endpoint("ep1")`, `component.endpoint("ep2")`.
fn
component
(
&
self
)
->
Component
{
Component
{
inner
:
self
.inner
.component
()
.clone
(),
event_loop
:
self
.event_loop
.clone
(),
}
}
}
}
#[pymethods]
#[pymethods]
...
...
lib/bindings/python/rust/llm/kv.rs
View file @
80cac7c1
...
@@ -9,7 +9,7 @@ use std::sync::mpsc;
...
@@ -9,7 +9,7 @@ use std::sync::mpsc;
use
tokio_stream
::
StreamExt
;
use
tokio_stream
::
StreamExt
;
use
super
::
*
;
use
super
::
*
;
use
crate
::
Compone
nt
;
use
crate
::
Endpoi
nt
;
use
llm_rs
::
kv_router
::
protocols
::
compute_block_hash_for_seq
;
use
llm_rs
::
kv_router
::
protocols
::
compute_block_hash_for_seq
;
use
rs
::
pipeline
::{
AsyncEngine
,
SingleIn
};
use
rs
::
pipeline
::{
AsyncEngine
,
SingleIn
};
use
rs
::
protocols
::
annotated
::
Annotated
as
RsAnnotated
;
use
rs
::
protocols
::
annotated
::
Annotated
as
RsAnnotated
;
...
@@ -86,14 +86,14 @@ impl WorkerMetricsPublisher {
...
@@ -86,14 +86,14 @@ impl WorkerMetricsPublisher {
})
})
}
}
#[pyo3(signature
=
(
compone
nt))]
#[pyo3(signature
=
(
endpoi
nt))]
fn
create_endpoint
<
'p
>
(
fn
create_endpoint
<
'p
>
(
&
self
,
&
self
,
py
:
Python
<
'p
>
,
py
:
Python
<
'p
>
,
component
:
Compone
nt
,
endpoint
:
Endpoi
nt
,
)
->
PyResult
<
Bound
<
'p
,
PyAny
>>
{
)
->
PyResult
<
Bound
<
'p
,
PyAny
>>
{
let
rs_publisher
=
self
.inner
.clone
();
let
rs_publisher
=
self
.inner
.clone
();
let
rs_component
=
compone
nt
.inner
.clone
();
let
rs_component
=
endpoi
nt
.inner
.
component
()
.
clone
();
pyo3_async_runtimes
::
tokio
::
future_into_py
(
py
,
async
move
{
pyo3_async_runtimes
::
tokio
::
future_into_py
(
py
,
async
move
{
rs_publisher
rs_publisher
.create_endpoint
(
rs_component
)
.create_endpoint
(
rs_component
)
...
@@ -127,9 +127,9 @@ pub(crate) struct KvEventPublisher {
...
@@ -127,9 +127,9 @@ pub(crate) struct KvEventPublisher {
#[pymethods]
#[pymethods]
impl
KvEventPublisher
{
impl
KvEventPublisher
{
#[new]
#[new]
#[pyo3(signature
=
(
compone
nt,
worker_id=
0
,
kv_block_size=
0
,
dp_rank=
0
,
enable_local_indexer=
false
,
zmq_endpoint=None,
zmq_topic=None))]
#[pyo3(signature
=
(
endpoi
nt,
worker_id=
0
,
kv_block_size=
0
,
dp_rank=
0
,
enable_local_indexer=
false
,
zmq_endpoint=None,
zmq_topic=None))]
fn
new
(
fn
new
(
component
:
Compone
nt
,
endpoint
:
Endpoi
nt
,
worker_id
:
WorkerId
,
worker_id
:
WorkerId
,
kv_block_size
:
usize
,
kv_block_size
:
usize
,
dp_rank
:
DpRank
,
dp_rank
:
DpRank
,
...
@@ -139,8 +139,8 @@ impl KvEventPublisher {
...
@@ -139,8 +139,8 @@ impl KvEventPublisher {
)
->
PyResult
<
Self
>
{
)
->
PyResult
<
Self
>
{
let
_
=
worker_id
;
let
_
=
worker_id
;
let
source_config
=
zmq_endpoint
.map
(|
e
ndpoint
|
KvEventSourceConfig
::
Zmq
{
let
source_config
=
zmq_endpoint
.map
(|
e
p
|
KvEventSourceConfig
::
Zmq
{
endpoint
,
endpoint
:
ep
,
topic
:
zmq_topic
.unwrap_or_default
(),
topic
:
zmq_topic
.unwrap_or_default
(),
});
});
...
@@ -148,8 +148,11 @@ impl KvEventPublisher {
...
@@ -148,8 +148,11 @@ impl KvEventPublisher {
return
Err
(
to_pyerr
(
anyhow
::
anyhow!
(
"kv_block_size cannot be 0"
)));
return
Err
(
to_pyerr
(
anyhow
::
anyhow!
(
"kv_block_size cannot be 0"
)));
}
}
// Extract component from endpoint
let
component
=
endpoint
.inner
.component
()
.clone
();
let
inner
=
llm_rs
::
kv_router
::
publisher
::
KvEventPublisher
::
new_with_local_indexer
(
let
inner
=
llm_rs
::
kv_router
::
publisher
::
KvEventPublisher
::
new_with_local_indexer
(
component
.inner
,
component
,
kv_block_size
as
u32
,
kv_block_size
as
u32
,
source_config
,
source_config
,
enable_local_indexer
,
enable_local_indexer
,
...
...
lib/bindings/python/src/dynamo/_core.pyi
View file @
80cac7c1
...
@@ -111,19 +111,6 @@ class DistributedRuntime:
...
@@ -111,19 +111,6 @@ class DistributedRuntime:
"""
"""
...
...
class Component:
"""
A component is a collection of endpoints
"""
...
def endpoint(self, name: str) -> Endpoint:
"""
Create an endpoint
"""
...
class Endpoint:
class Endpoint:
"""
"""
...
@@ -190,25 +177,6 @@ class Endpoint:
...
@@ -190,25 +177,6 @@ class Endpoint:
"""
"""
...
...
def component(self) -> Component:
"""
Get the parent Component that this endpoint belongs to.
Returns:
Component: The parent component
Note:
To avoid duplicate metrics registries, reuse the returned Component for
multiple endpoints: component.endpoint("ep1"), component.endpoint("ep2")
Example:
endpoint = runtime.endpoint("demo.backend.generate")
component = endpoint.component()
health_endpoint = component.endpoint("health") # Reuse component
"""
...
class Client:
class Client:
"""
"""
A client capable of calling served instances of an endpoint
A client capable of calling served instances of an endpoint
...
@@ -404,14 +372,15 @@ class WorkerMetricsPublisher:
...
@@ -404,14 +372,15 @@ class WorkerMetricsPublisher:
Create a `WorkerMetricsPublisher` object
Create a `WorkerMetricsPublisher` object
"""
"""
async def create_endpoint(self,
component: Compone
nt) -> None:
async def create_endpoint(self,
endpoint: Endpoi
nt) -> None:
"""
"""
Creat
e the NATS endpoint for
metrics
publishing. Must be awaited.
Initializ
e the NATS endpoint for publishing
worker metrics
. Must be awaited.
Only service created through this method will interact with KV router of the same component.
Extracts component information from the endpoint to set up metrics publishing
on the correct NATS subject for routing decisions.
Args:
Args:
compone
nt: The
component to create the endpoint for
endpoi
nt: The
endpoint to extract component information from for metrics publishing
"""
"""
def publish(
def publish(
...
@@ -575,7 +544,7 @@ class KvIndexer:
...
@@ -575,7 +544,7 @@ class KvIndexer:
...
...
def __init__(self,
component: Compone
nt, block_size: int) -> None:
def __init__(self,
endpoint: Endpoi
nt, block_size: int) -> None:
"""
"""
Create a `KvIndexer` object
Create a `KvIndexer` object
"""
"""
...
@@ -622,7 +591,7 @@ class ApproxKvIndexer:
...
@@ -622,7 +591,7 @@ class ApproxKvIndexer:
def __init__(
def __init__(
self,
self,
component: Compone
nt,
endpoint: Endpoi
nt,
kv_block_size: int,
kv_block_size: int,
router_ttl_secs: float = 120.0,
router_ttl_secs: float = 120.0,
router_max_tree_size: int = 1048576,
router_max_tree_size: int = 1048576,
...
@@ -689,7 +658,7 @@ class KvEventPublisher:
...
@@ -689,7 +658,7 @@ class KvEventPublisher:
def __init__(
def __init__(
self,
self,
component: Compone
nt,
endpoint: Endpoi
nt,
worker_id: int = 0,
worker_id: int = 0,
kv_block_size: int = 0,
kv_block_size: int = 0,
dp_rank: int = 0,
dp_rank: int = 0,
...
@@ -706,8 +675,8 @@ class KvEventPublisher:
...
@@ -706,8 +675,8 @@ class KvEventPublisher:
When zmq_endpoint is None, events are pushed manually via publish_stored/publish_removed.
When zmq_endpoint is None, events are pushed manually via publish_stored/publish_removed.
Args:
Args:
compone
nt: The
component to publish events for
endpoi
nt: The
endpoint to extract component information from for event publishing
worker_id: The worker ID (unused, inferred from
compone
nt)
worker_id: The worker ID (unused, inferred from
endpoi
nt)
kv_block_size: The KV block size (must be > 0)
kv_block_size: The KV block size (must be > 0)
dp_rank: The data parallel rank (defaults to 0)
dp_rank: The data parallel rank (defaults to 0)
enable_local_indexer: Enable worker-local KV indexer
enable_local_indexer: Enable worker-local KV indexer
...
@@ -1612,7 +1581,6 @@ class VirtualConnectorClient:
...
@@ -1612,7 +1581,6 @@ class VirtualConnectorClient:
__all__ = [
__all__ = [
"Client",
"Client",
"Component",
"Context",
"Context",
"KserveGrpcService",
"KserveGrpcService",
"ModelDeploymentCard",
"ModelDeploymentCard",
...
...
lib/bindings/python/src/dynamo/runtime/__init__.py
View file @
80cac7c1
...
@@ -11,11 +11,9 @@ from pydantic import BaseModel, ValidationError
...
@@ -11,11 +11,9 @@ from pydantic import BaseModel, ValidationError
# List all the classes in the _core module for re-export
# List all the classes in the _core module for re-export
# import * causes "unable to detect undefined names"
# import * causes "unable to detect undefined names"
from
dynamo._core
import
Client
as
Client
from
dynamo._core
import
Client
as
Client
from
dynamo._core
import
Component
as
Component
from
dynamo._core
import
Context
as
Context
from
dynamo._core
import
Context
as
Context
from
dynamo._core
import
DistributedRuntime
as
DistributedRuntime
from
dynamo._core
import
DistributedRuntime
as
DistributedRuntime
from
dynamo._core
import
Endpoint
as
Endpoint
from
dynamo._core
import
Endpoint
as
Endpoint
from
dynamo._core
import
ModelDeploymentCard
as
ModelDeploymentCard
def
dynamo_worker
(
enable_nats
:
bool
=
True
):
def
dynamo_worker
(
enable_nats
:
bool
=
True
):
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment