Unverified Commit 16310b26 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

refactor: refactor dynamo serve part-1/N (#788)


Co-authored-by: default avatarishandhanani <ishandhanani@gmail.com>
parent dbdbd5e5
......@@ -81,7 +81,7 @@ Now you should see both workers are ready in Node 1's terminal.
- `Response: {"worker_output":"Which team won 2020 World Series_GeneratedBy_NODE2HOSTNAME","request_id":"id_number"}`
```
curl -X 'POST' \
'http://localhost:3000/generate' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
......@@ -89,7 +89,7 @@ curl -X 'POST' \
"request_id":"id_number"
}'
curl -X 'POST' \
'http://localhost:3000/generate' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
......@@ -100,7 +100,7 @@ curl -X 'POST' \
6. Then modify the prompt and you will notice prompts with a similar prefix will be routed to the same worker due to the simple routing algorithm used in this demo. For example, the following query will be routed to the worker that processed the "Tell me a joke" prompt.
```
curl -X 'POST' \
'http://localhost:3000/generate' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
......@@ -134,7 +134,7 @@ dynamo serve components.prefill_worker:PrefillWorker
3. Query the Frontend. This time decode workers push requests to the prefill queue, and the prefill worker pulls tasks from the queue to simulate the prefill task. The actual prefill is skipped in this demo.
```
curl -X 'POST' \
'http://localhost:3000/generate' \
'http://localhost:8000/generate' \
-H 'accept: text/event-stream' \
-H 'Content-Type: application/json' \
-d '{
......
......@@ -19,16 +19,21 @@ import sys
from components.processor import Processor
from components.utils import GeneralRequest
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from dynamo.sdk import api, depends, service
from dynamo.sdk import depends, dynamo_endpoint, service
from dynamo.sdk.lib.image import DYNAMO_IMAGE
logger = logging.getLogger(__name__)
app = FastAPI(title="Hello World LLM")
@service(
workers=1,
dynamo={"enabled": True, "namespace": "dynamo-demo"},
image=DYNAMO_IMAGE,
app=app,
)
class Frontend:
processor = depends(Processor)
......@@ -41,12 +46,16 @@ class Frontend:
logger.debug(f"Received signal {signum}, shutting down...")
sys.exit(0)
@api
@dynamo_endpoint(is_api=True)
async def generate(self, prompt, request_id): # from request body keys
"""Stream results from the pipeline."""
logger.info(f"Received: {prompt=},{request_id=}")
frontend_request = GeneralRequest(
prompt=prompt, request_id=request_id
).model_dump_json()
async for response in self.processor.processor_generate(frontend_request):
yield f"Response: {response}\n"
async def content_generator():
frontend_request = GeneralRequest(
prompt=prompt, request_id=request_id
).model_dump_json()
async for response in self.processor.processor_generate(frontend_request):
yield f"Response: {response}\n"
return StreamingResponse(content_generator())
......@@ -15,9 +15,11 @@
import logging
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from dynamo.sdk import DYNAMO_IMAGE, api, depends, dynamo_endpoint, service
from dynamo.sdk import DYNAMO_IMAGE, depends, dynamo_endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.logging import configure_server_logging
......@@ -102,10 +104,17 @@ class Middle:
yield f"Middle: {response}"
app = FastAPI(title="Hello World!")
@service(
dynamo={"enabled": True, "namespace": "inference"},
image=DYNAMO_IMAGE,
) # Regular HTTP API
app=app,
)
class Frontend:
"""A simple frontend HTTP API that forwards requests to the dynamo graph."""
middle = depends(Middle)
def __init__(self) -> None:
......@@ -119,12 +128,13 @@ class Frontend:
logger.info(f"Frontend config message: {self.message}")
logger.info(f"Frontend config port: {self.port}")
@api
async def generate(self, text):
@dynamo_endpoint(is_api=True)
async def generate(self, request: RequestType):
"""Stream results from the pipeline."""
logger.info(f"Frontend received: {text}")
logger.info(f"Frontend received type: {type(text)}")
txt = RequestType(text=text)
logger.info(f"Frontend sending: {type(txt)}")
async for response in self.middle.generate(txt.model_dump_json()):
yield f"Frontend: {response}"
logger.info(f"Frontend received: {request.text}")
async def content_generator():
async for response in self.middle.generate(request.model_dump_json()):
yield f"Frontend: {response}"
return StreamingResponse(content_generator())
......@@ -19,6 +19,7 @@ from pathlib import Path
from components.processor import Processor
from components.worker import VllmWorker
from fastapi import FastAPI
from pydantic import BaseModel
from dynamo import sdk
......@@ -49,9 +50,14 @@ class FrontendConfig(BaseModel):
# todo this should be called ApiServer
@service(
dynamo={
"enabled": True,
"namespace": "dynamo",
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
image=DYNAMO_IMAGE,
app=FastAPI(title="LLM Example"),
)
class Frontend:
worker = depends(VllmWorker)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment