Unverified Commit afb8495e authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: decouple bento dependency (#1266)

parent e31f8d95
......@@ -83,12 +83,12 @@ class ServiceA:
self.engine = await initialize_model_engine(self.model_name)
print(f"ServiceA initialized with model: {self.model_name}")
@async_on_shutdown
async def async_shutdown(self):
@on_shutdown
def shutdown(self):
# Clean up resources
if self.engine:
await self.engine.shutdown()
print("ServiceA engine shut down")
self.engine.shutdown()
print("ServiceA engine shut down")
@endpoint()
async def generate(self, request: ChatCompletionRequest):
......@@ -104,7 +104,7 @@ class ServiceA:
Dynamo follows a class-based architecture similar to BentoML, making it intuitive for users familiar with such frameworks. Each service is defined as a Python class, with the following components:
1. Class attributes for dependencies using `depends()`
2. An `__init__` method for standard initialization
3. Optional lifecycle hooks like `@async_on_start` and `@async_on_shutdown`
3. Optional lifecycle hooks like `@async_on_start` and `@on_shutdown`
4. Endpoints defined with `@endpoint()`. Optionally, an endpoint can be given a name
via `@endpoint("my_endpoint_name")`; if the name is omitted, it defaults to the name of the
function being decorated.
......@@ -170,15 +170,14 @@ This is especially useful for:
- Initializing external connections
- Setting up runtime resources that require async operations
#### `@async_on_shutdown`
The `@async_on_shutdown` hook is called when the service is shutdown handles cleanup.
#### `@on_shutdown`
The `@on_shutdown` hook is called when the service shuts down and handles cleanup.
```python
@async_on_shutdown
async def async_shutdown(self):
if self._engine_context is not None:
await self._engine_context.__aexit__(None, None, None)
print("VllmWorkerRouterLess shutting down")
@on_shutdown
def shutdown(self):
# Gracefully handle shutdown / cleanup
logger.info("worker shutting down")
```
This ensures resources are properly released, preventing memory leaks and making sure external connections are properly closed. This is helpful to clean up vLLM engines that have been started outside of the main process.
......@@ -471,4 +470,4 @@ Think of all the depends statements as the maximal set of edges for the processo
Processor.link(Router)
```
This removes the `worker` dependency from the Processor and only spin up the Router.
\ No newline at end of file
This removes the `worker` dependency from the Processor and only spins up the Router.
......@@ -25,6 +25,7 @@ from dynamo.sdk import (
depends,
endpoint,
liveness,
on_shutdown,
readiness,
service,
)
......@@ -87,6 +88,10 @@ class Backend:
for token in text.split():
yield f"Backend: {token}"
@on_shutdown
def shutdown(self):
logger.info("Shutting down backend")
@service(
dynamo={"namespace": "inference"},
......@@ -112,6 +117,10 @@ class Middle:
logger.info(f"Middle received response: {response}")
yield f"Middle: {response}"
@on_shutdown
def shutdown(self):
logger.info("Shutting down middle")
@service(
dynamo={"namespace": "inference"},
......@@ -152,3 +161,7 @@ class Frontend:
@readiness
def is_ready(self):
return True
@on_shutdown
def shutdown(self):
logger.info("Shutting down frontend")
......@@ -24,7 +24,7 @@ from components.worker import VllmWorker
from pydantic import BaseModel
from dynamo import sdk
from dynamo.sdk import api, async_on_shutdown, depends, service
from dynamo.sdk import api, depends, on_shutdown, service
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE
......@@ -120,7 +120,7 @@ class Frontend:
The resulting api_endpoints in dynamo.yaml will be incorrect.
"""
@async_on_shutdown
@on_shutdown
def cleanup(self):
"""Clean up resources before shutdown."""
......
......@@ -40,8 +40,6 @@ VllmWorker:
workers: 1
resources:
gpu: '1'
cpu: '10'
memory: '20Gi'
common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
Planner:
......
......@@ -25,6 +25,8 @@ from typing import Any, Callable, Coroutine, Optional, TypedDict, Union
logger = logging.getLogger(__name__)
AsyncTask = Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
class RoutingStrategy(Enum):
ROUND_ROBIN = "round_robin"
......@@ -54,9 +56,7 @@ class ConversationMessage(TypedDict):
class ManagedThread(threading.Thread):
def __init__(
self,
task: Optional[
Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
],
task: Optional[AsyncTask],
error_queue: Optional[Queue] = None,
name: Optional[str] = None,
loop: Optional[asyncio.AbstractEventLoop] = None,
......@@ -76,9 +76,7 @@ class ManagedThread(threading.Thread):
def run(self):
while not self.stop_event.is_set():
task: Optional[
Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
] = self.task
task: Optional[AsyncTask] = self.task
if isinstance(task, weakref.WeakMethod):
task = task()
if task is None:
......
......@@ -26,7 +26,6 @@ license-files = ["LICENSE"]
requires-python = ">=3.10"
dependencies = [
"pytest>=8.3.4",
"bentoml==1.4.8",
"types-psutil==7.0.0.20250218",
"kubernetes==32.0.1",
"ai-dynamo-runtime==0.3.0",
......@@ -191,7 +190,7 @@ check_untyped_defs = true
[[tool.mypy.overrides]]
# Skip mypy analysis on internal dependencies of vllm
module = ["vllm.*", "bentoml.*", "fs.*", "_bentoml_sdk.*"]
module = ["vllm.*"]
follow_imports = "skip"
ignore_missing_imports = true
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment