feat: decouple bento dependency (#1266)

afb8495e · Biswa Panda · GitHub · e31f8d95 · afb8495e · afb8495e
Unverified Commit afb8495e authored Jun 04, 2025 by Biswa Panda Committed by GitHub Jun 04, 2025
6 changed files
--- a/docs/API/sdk.md
+++ b/docs/API/sdk.md
@@ -83,11 +83,11 @@ class ServiceA:
        self.engine = await initialize_model_engine(self.model_name)
        print(f"ServiceA initialized with model: {self.model_name}")

-    @async_on_shutdown
-    async def async_shutdown(self):
+    @on_shutdown
+    def shutdown(self):
        # Clean up resources
        if self.engine:
-            await self.engine.shutdown()
+            self.engine.shutdown()
        print("ServiceA engine shut down")

    @endpoint()
@@ -104,7 +104,7 @@ class ServiceA:
 Dynamo follows a class-based architecture similar to BentoML making it intuitive for users familiar with those frameworks. Each service is defined as a Python class, with the following components:
 1. Class attributes for dependencies using `depends()`
 2. An `__init__` method for standard initialization
-3. Optional lifecycle hooks like `@async_on_start` and `@async_on_shutdown`
+3. Optional lifecycle hooks like `@async_on_start` and `@on_shutdown`
 4. Endpoints defined with `@endpoint()`. Optionally, an endpoint can be given a name
   via `@endpoint("my_endpoint_name")`, but otherwise defaults to the name of the
   function being decorated if omitted.
@@ -170,15 +170,14 @@ This is especially useful for:
 - Initializing external connections
 - Setting up runtime resources that require async operations

-#### `@async_on_shutdown`
-The `@async_on_shutdown` hook is called when the service is shutdown handles cleanup.
+#### `@on_shutdown`
+The `@on_shutdown` hook is called when the service shuts down and handles cleanup.

 ```python
-@async_on_shutdown
-async def async_shutdown(self):
-    if self._engine_context is not None:
-        await self._engine_context.__aexit__(None, None, None)
-    print("VllmWorkerRouterLess shutting down")
+@on_shutdown
+def shutdown(self):
+    # Gracefully handle shutdown / cleanup
+    logger.info("worker shutting down")
 ```

 This ensures resources are properly released, preventing memory leaks and making sure external connections are properly closed. This is helpful to clean up vLLM engines that have been started outside of the main process.

--- a/examples/hello_world/hello_world.py
+++ b/examples/hello_world/hello_world.py
@@ -25,6 +25,7 @@ from dynamo.sdk import (
    depends,
    endpoint,
    liveness,
+    on_shutdown,
    readiness,
    service,
 )
@@ -87,6 +88,10 @@ class Backend:
        for token in text.split():
            yield f"Backend: {token}"

+    @on_shutdown
+    def shutdown(self):
+        logger.info("Shutting down backend")
+

 @service(
    dynamo={"namespace": "inference"},
@@ -112,6 +117,10 @@ class Middle:
            logger.info(f"Middle received response: {response}")
            yield f"Middle: {response}"

+    @on_shutdown
+    def shutdown(self):
+        logger.info("Shutting down middle")
+

 @service(
    dynamo={"namespace": "inference"},
@@ -152,3 +161,7 @@ class Frontend:
    @readiness
    def is_ready(self):
        return True
+
+    @on_shutdown
+    def shutdown(self):
+        logger.info("Shutting down frontend")
--- a/examples/llm/components/frontend.py
+++ b/examples/llm/components/frontend.py
@@ -24,7 +24,7 @@ from components.worker import VllmWorker
 from pydantic import BaseModel

 from dynamo import sdk
-from dynamo.sdk import api, async_on_shutdown, depends, service
+from dynamo.sdk import api, depends, on_shutdown, service
 from dynamo.sdk.lib.config import ServiceConfig
 from dynamo.sdk.lib.image import DYNAMO_IMAGE

@@ -120,7 +120,7 @@ class Frontend:
        The resulting api_endpoints in dynamo.yaml will be incorrect.
        """

-    @async_on_shutdown
+    @on_shutdown
    def cleanup(self):
        """Clean up resources before shutdown."""


--- a/examples/llm/configs/agg_router.yaml
+++ b/examples/llm/configs/agg_router.yaml
@@ -40,8 +40,6 @@ VllmWorker:
    workers: 1
    resources:
      gpu: '1'
-      cpu: '10'
-      memory: '20Gi'
  common-configs: [model, block-size, max-model-len, router, kv-transfer-config]

 Planner:

--- a/examples/tensorrt_llm/common/utils.py
+++ b/examples/tensorrt_llm/common/utils.py
@@ -25,6 +25,8 @@ from typing import Any, Callable, Coroutine, Optional, TypedDict, Union

 logger = logging.getLogger(__name__)

+AsyncTask = Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
+

 class RoutingStrategy(Enum):
    ROUND_ROBIN = "round_robin"
@@ -54,9 +56,7 @@ class ConversationMessage(TypedDict):
 class ManagedThread(threading.Thread):
    def __init__(
        self,
-        task: Optional[
-            Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
-        ],
+        task: Optional[AsyncTask],
        error_queue: Optional[Queue] = None,
        name: Optional[str] = None,
        loop: Optional[asyncio.AbstractEventLoop] = None,
@@ -76,9 +76,7 @@ class ManagedThread(threading.Thread):

    def run(self):
        while not self.stop_event.is_set():
-            task: Optional[
-                Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
-            ] = self.task
+            task: Optional[AsyncTask] = self.task
            if isinstance(task, weakref.WeakMethod):
                task = task()
                if task is None:

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ license-files = ["LICENSE"]
 requires-python = ">=3.10"
 dependencies = [
    "pytest>=8.3.4",
-    "bentoml==1.4.8",
    "types-psutil==7.0.0.20250218",
    "kubernetes==32.0.1",
    "ai-dynamo-runtime==0.3.0",
@@ -191,7 +190,7 @@ check_untyped_defs = true

 [[tool.mypy.overrides]]
 # Skip mypy analysis on internal dependencies of vllm
-module = ["vllm.*", "bentoml.*", "fs.*", "_bentoml_sdk.*"]
+module = ["vllm.*"]
 follow_imports = "skip"
 ignore_missing_imports = true