Unverified Commit afb8495e authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: decouple bento dependency (#1266)

parent e31f8d95
...@@ -83,12 +83,12 @@ class ServiceA: ...@@ -83,12 +83,12 @@ class ServiceA:
self.engine = await initialize_model_engine(self.model_name) self.engine = await initialize_model_engine(self.model_name)
print(f"ServiceA initialized with model: {self.model_name}") print(f"ServiceA initialized with model: {self.model_name}")
@async_on_shutdown @on_shutdown
async def async_shutdown(self): def shutdown(self):
# Clean up resources # Clean up resources
if self.engine: if self.engine:
await self.engine.shutdown() self.engine.shutdown()
print("ServiceA engine shut down") print("ServiceA engine shut down")
@endpoint() @endpoint()
async def generate(self, request: ChatCompletionRequest): async def generate(self, request: ChatCompletionRequest):
...@@ -104,7 +104,7 @@ class ServiceA: ...@@ -104,7 +104,7 @@ class ServiceA:
Dynamo follows a class-based architecture similar to BentoML, making it intuitive for users familiar with that framework. Each service is defined as a Python class, with the following components: Dynamo follows a class-based architecture similar to BentoML, making it intuitive for users familiar with that framework. Each service is defined as a Python class, with the following components:
1. Class attributes for dependencies using `depends()` 1. Class attributes for dependencies using `depends()`
2. An `__init__` method for standard initialization 2. An `__init__` method for standard initialization
3. Optional lifecycle hooks like `@async_on_start` and `@async_on_shutdown` 3. Optional lifecycle hooks like `@async_on_start` and `@on_shutdown`
4. Endpoints defined with `@endpoint()`. Optionally, an endpoint can be given a name 4. Endpoints defined with `@endpoint()`. Optionally, an endpoint can be given a name
via `@endpoint("my_endpoint_name")`, but otherwise defaults to the name of the via `@endpoint("my_endpoint_name")`, but otherwise defaults to the name of the
function being decorated if omitted. function being decorated if omitted.
...@@ -170,15 +170,14 @@ This is especially useful for: ...@@ -170,15 +170,14 @@ This is especially useful for:
- Initializing external connections - Initializing external connections
- Setting up runtime resources that require async operations - Setting up runtime resources that require async operations
#### `@async_on_shutdown` #### `@on_shutdown`
The `@async_on_shutdown` hook is called when the service is shut down and handles cleanup. The `@on_shutdown` hook is called when the service is shut down and handles cleanup.
```python ```python
@async_on_shutdown @on_shutdown
async def async_shutdown(self): def shutdown(self):
if self._engine_context is not None: # gracefully Handle shutdown / cleanup
await self._engine_context.__aexit__(None, None, None) logger.info("worker shutting down")
print("VllmWorkerRouterLess shutting down")
``` ```
This ensures resources are properly released, preventing memory leaks and making sure external connections are properly closed. This is helpful to clean up vLLM engines that have been started outside of the main process. This ensures resources are properly released, preventing memory leaks and making sure external connections are properly closed. This is helpful to clean up vLLM engines that have been started outside of the main process.
...@@ -471,4 +470,4 @@ Think of all the depends statements as the maximal set of edges for the processo ...@@ -471,4 +470,4 @@ Think of all the depends statements as the maximal set of edges for the processo
Processor.link(Router) Processor.link(Router)
``` ```
This removes the `worker` dependency from the Processor and only spins up the Router. This removes the `worker` dependency from the Processor and only spins up the Router.
\ No newline at end of file
...@@ -25,6 +25,7 @@ from dynamo.sdk import ( ...@@ -25,6 +25,7 @@ from dynamo.sdk import (
depends, depends,
endpoint, endpoint,
liveness, liveness,
on_shutdown,
readiness, readiness,
service, service,
) )
...@@ -87,6 +88,10 @@ class Backend: ...@@ -87,6 +88,10 @@ class Backend:
for token in text.split(): for token in text.split():
yield f"Backend: {token}" yield f"Backend: {token}"
@on_shutdown
def shutdown(self):
logger.info("Shutting down backend")
@service( @service(
dynamo={"namespace": "inference"}, dynamo={"namespace": "inference"},
...@@ -112,6 +117,10 @@ class Middle: ...@@ -112,6 +117,10 @@ class Middle:
logger.info(f"Middle received response: {response}") logger.info(f"Middle received response: {response}")
yield f"Middle: {response}" yield f"Middle: {response}"
@on_shutdown
def shutdown(self):
logger.info("Shutting down middle")
@service( @service(
dynamo={"namespace": "inference"}, dynamo={"namespace": "inference"},
...@@ -152,3 +161,7 @@ class Frontend: ...@@ -152,3 +161,7 @@ class Frontend:
@readiness @readiness
def is_ready(self): def is_ready(self):
return True return True
@on_shutdown
def shutdown(self):
logger.info("Shutting down frontend")
...@@ -24,7 +24,7 @@ from components.worker import VllmWorker ...@@ -24,7 +24,7 @@ from components.worker import VllmWorker
from pydantic import BaseModel from pydantic import BaseModel
from dynamo import sdk from dynamo import sdk
from dynamo.sdk import api, async_on_shutdown, depends, service from dynamo.sdk import api, depends, on_shutdown, service
from dynamo.sdk.lib.config import ServiceConfig from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE from dynamo.sdk.lib.image import DYNAMO_IMAGE
...@@ -120,7 +120,7 @@ class Frontend: ...@@ -120,7 +120,7 @@ class Frontend:
The resulting api_endpoints in dynamo.yaml will be incorrect. The resulting api_endpoints in dynamo.yaml will be incorrect.
""" """
@async_on_shutdown @on_shutdown
def cleanup(self): def cleanup(self):
"""Clean up resources before shutdown.""" """Clean up resources before shutdown."""
......
...@@ -40,8 +40,6 @@ VllmWorker: ...@@ -40,8 +40,6 @@ VllmWorker:
workers: 1 workers: 1
resources: resources:
gpu: '1' gpu: '1'
cpu: '10'
memory: '20Gi'
common-configs: [model, block-size, max-model-len, router, kv-transfer-config] common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
Planner: Planner:
......
...@@ -25,6 +25,8 @@ from typing import Any, Callable, Coroutine, Optional, TypedDict, Union ...@@ -25,6 +25,8 @@ from typing import Any, Callable, Coroutine, Optional, TypedDict, Union
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
AsyncTask = Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
class RoutingStrategy(Enum): class RoutingStrategy(Enum):
ROUND_ROBIN = "round_robin" ROUND_ROBIN = "round_robin"
...@@ -54,9 +56,7 @@ class ConversationMessage(TypedDict): ...@@ -54,9 +56,7 @@ class ConversationMessage(TypedDict):
class ManagedThread(threading.Thread): class ManagedThread(threading.Thread):
def __init__( def __init__(
self, self,
task: Optional[ task: Optional[AsyncTask],
Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
],
error_queue: Optional[Queue] = None, error_queue: Optional[Queue] = None,
name: Optional[str] = None, name: Optional[str] = None,
loop: Optional[asyncio.AbstractEventLoop] = None, loop: Optional[asyncio.AbstractEventLoop] = None,
...@@ -76,9 +76,7 @@ class ManagedThread(threading.Thread): ...@@ -76,9 +76,7 @@ class ManagedThread(threading.Thread):
def run(self): def run(self):
while not self.stop_event.is_set(): while not self.stop_event.is_set():
task: Optional[ task: Optional[AsyncTask] = self.task
Union[Callable[..., Coroutine[Any, Any, bool]], weakref.WeakMethod]
] = self.task
if isinstance(task, weakref.WeakMethod): if isinstance(task, weakref.WeakMethod):
task = task() task = task()
if task is None: if task is None:
......
...@@ -26,7 +26,6 @@ license-files = ["LICENSE"] ...@@ -26,7 +26,6 @@ license-files = ["LICENSE"]
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [
"pytest>=8.3.4", "pytest>=8.3.4",
"bentoml==1.4.8",
"types-psutil==7.0.0.20250218", "types-psutil==7.0.0.20250218",
"kubernetes==32.0.1", "kubernetes==32.0.1",
"ai-dynamo-runtime==0.3.0", "ai-dynamo-runtime==0.3.0",
...@@ -191,7 +190,7 @@ check_untyped_defs = true ...@@ -191,7 +190,7 @@ check_untyped_defs = true
[[tool.mypy.overrides]] [[tool.mypy.overrides]]
# Skip mypy analysis on internal dependencies of vllm # Skip mypy analysis on internal dependencies of vllm
module = ["vllm.*", "bentoml.*", "fs.*", "_bentoml_sdk.*"] module = ["vllm.*"]
follow_imports = "skip" follow_imports = "skip"
ignore_missing_imports = true ignore_missing_imports = true
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment