"vscode:/vscode.git/clone" did not exist on "344c21dc0a0fd99a6653be51268bee5ff5db0d0d"
Unverified commit 6d46288c, authored by Biswa Panda and committed by GitHub
Browse files

feat: rename dynamo decorator (#1133)

parent b520bf44
......@@ -24,7 +24,7 @@ from transformers import AutoImageProcessor, LlavaForConditionalGeneration
from utils.protocol import EncodeRequest, EncodeResponse
from utils.vllm import parse_vllm_args
from dynamo.sdk import dynamo_endpoint, service
from dynamo.sdk import endpoint, service
logger = logging.getLogger(__name__)
......@@ -50,7 +50,7 @@ class EncodeWorker:
self.MODEL_ID, device_map="auto", torch_dtype=torch.float16
).eval()
@dynamo_endpoint()
@endpoint()
async def encode(self, request: EncodeRequest) -> AsyncIterator[EncodeResponse]:
image = self.open_image(request.image_url)
image_embeds = self.image_processor(images=image, return_tensors="pt")
......
......@@ -20,7 +20,7 @@ from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from utils.protocol import MultiModalRequest
from dynamo.sdk import DYNAMO_IMAGE, depends, dynamo_api, service
from dynamo.sdk import DYNAMO_IMAGE, api, depends, service
logger = logging.getLogger(__name__)
......@@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
class Frontend:
processor = depends(Processor)
@dynamo_api()
@api()
async def generate(self, request: MultiModalRequest):
async def content_generator():
async for response in self.processor.generate(request.model_dump_json()):
......
......@@ -34,7 +34,7 @@ from vllm.entrypoints.openai.api_server import (
from vllm.inputs.data import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
......@@ -223,6 +223,6 @@ class PrefillWorker:
):
yield
@dynamo_endpoint()
@endpoint()
async def mock(self, req: RequestType):
yield f"mock_response: {req}"
......@@ -31,7 +31,7 @@ from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from dynamo.runtime import EtcdKvCache
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
......@@ -195,7 +195,7 @@ class Processor(ProcessMixIn):
)
# The generate endpoint will be used by the frontend to handle incoming requests.
@dynamo_endpoint()
@endpoint()
async def generate(self, request: MultiModalRequest):
# TODO: After having the multimodal support in OpenAI compatible frontend, we can use that directly and remove the custom endpoint.
msg = {
......
......@@ -41,7 +41,7 @@ from vllm.inputs.data import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from vllm.sampling_params import RequestOutputKind
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
......@@ -175,7 +175,7 @@ class VllmWorker:
return callback
@dynamo_endpoint()
@endpoint()
async def generate(self, request: vLLMMultimodalRequest):
image_features = None
if self.do_remote_prefill:
......
......@@ -21,7 +21,7 @@ import sglang as sgl
from utils.protocol import DisaggPreprocessedRequest
from utils.sglang import parse_sglang_args
from dynamo.sdk import dynamo_endpoint, service
from dynamo.sdk import endpoint, service
logger = logging.getLogger(__name__)
......@@ -42,7 +42,7 @@ class SGLangDecodeWorker:
logger.warning("Decode worker initialized")
@dynamo_endpoint()
@endpoint()
async def generate(self, req: DisaggPreprocessedRequest):
g = await self.engine.async_generate(
input_ids=req.request.token_ids,
......
......@@ -36,7 +36,7 @@ from utils.protocol import DisaggPreprocessedRequest, PreprocessedRequest
from utils.sglang import parse_sglang_args
from dynamo.llm import ModelType, register_llm
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
......@@ -112,7 +112,7 @@ class SGLangWorker:
sampling_params["ignore_eos"] = request.stop_conditions.ignore_eos
return sampling_params
@dynamo_endpoint()
@endpoint()
async def generate(self, request: PreprocessedRequest):
# TODO: maintain a mapping from SGLang's Ouput struct to LLMEngineOuput
sampling_params = self._build_sampling_params(request)
......
......@@ -25,7 +25,7 @@ from common.protocol import Tokens
from components.worker import TensorRTLLMWorker
from dynamo.llm import AggregatedMetrics, KvIndexer, KvMetricsAggregator, OverlapScores
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -209,7 +209,7 @@ class Router:
return best_worker_id, worker_scores.get(best_worker_id, 0.0)
@dynamo_endpoint()
@endpoint()
async def generate(self, request: Tokens) -> AsyncIterator[WorkerId]:
if self.indexer is None or self.metrics_aggregator is None:
yield "_0.0"
......
......@@ -20,7 +20,7 @@ from common.parser import parse_tensorrt_llm_args
from common.protocol import TRTLLMWorkerRequest
from common.utils import ServerType
from dynamo.sdk import async_on_start, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -68,7 +68,7 @@ class TensorRTLLMPrefillWorker(BaseTensorrtLLMEngine):
component = dynamo_context["component"]
await self.kv_metrics_publisher.create_endpoint(component)
@dynamo_endpoint()
@endpoint()
async def generate(self, request: TRTLLMWorkerRequest):
async for response in super().generate(request):
yield response
......@@ -27,7 +27,7 @@ from common.utils import RequestType
from components.kv_router import Router
from components.worker import TensorRTLLMWorker
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -143,7 +143,7 @@ class Processor(ChatProcessorMixin):
logger.debug(f"[preprocessor] Response: {response}")
yield json.loads(response)
@dynamo_endpoint(name="chat/completions")
@endpoint(name="chat/completions")
async def generate_chat(self, raw_request: DynamoTRTLLMChatCompletionRequest):
# max_tokens is deprecated, however if the max_tokens is provided instead
# of max_completion_tokens, we will use the value as max_completion_tokens.
......@@ -172,7 +172,7 @@ class Processor(ChatProcessorMixin):
async for response in self._generate(raw_request, RequestType.CHAT):
yield response
@dynamo_endpoint(name="completions")
@endpoint(name="completions")
async def completions(self, raw_request: DynamoTRTLLMCompletionRequest):
# min_tokens isn't currently propagated through the Rust OpenAI HTTP frontend,
# and ignore_eos is passed through the 'nvext' field, so set both when found.
......
......@@ -21,7 +21,7 @@ from common.protocol import TRTLLMWorkerRequest
from common.utils import ServerType
from components.prefill_worker import TensorRTLLMPrefillWorker
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
from dynamo.sdk.lib.config import ServiceConfig
logger = logging.getLogger(__name__)
......@@ -91,7 +91,7 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
component = dynamo_context["component"]
await self._kv_metrics_publisher.create_endpoint(component)
@dynamo_endpoint()
@endpoint()
async def generate(self, request: TRTLLMWorkerRequest):
async for response in super().generate(request):
yield response
......@@ -25,7 +25,7 @@ from vllm.inputs import TokensPrompt
from vllm.sampling_params import SamplingParams
from dynamo.llm import ModelType, register_llm
from dynamo.sdk import async_on_start, depends, dynamo_context, dynamo_endpoint, service
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
......@@ -129,7 +129,7 @@ class SimpleLoadBalancer:
):
yield MyRequestOutput.model_validate_json(decode_response.data())
@dynamo_endpoint()
@endpoint()
async def generate(self, request: PreprocessedRequest):
logger.debug(
"Processor received completion request: %s", request.model_dump_json()
......
......@@ -27,7 +27,7 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from dynamo.sdk import async_on_start, dynamo_endpoint, service
from dynamo.sdk import async_on_start, endpoint, service
logger = logging.getLogger(__name__)
......@@ -65,7 +65,7 @@ class VllmBaseWorker:
finally:
loop.stop()
@dynamo_endpoint()
@endpoint()
async def generate(self, request: vLLMGenerateRequest):
gen = self.engine_client.generate(
prompt=request.prompt,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment