# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import asyncio import os from typing import Optional import bentoml with bentoml.importing(): from vllm.engine.arg_utils import AsyncEngineArgs from vllm.logger import logger as vllm_logger from vllm.sampling_params import RequestOutputKind from common.base_engine import BaseVllmEngine from common.protocol import MyRequestOutput, vLLMGenerateRequest from vllm.engine.multiprocessing.client import MQLLMEngineClient from dynamo.llm import KvMetricsPublisher from dynamo.sdk import ( async_onstart, dynamo_context, dynamo_endpoint, server_context, service, ) lease_id = None ## TODO: metrics_publisher.create_endpoint(worker_component), @service( dynamo={ "enabled": True, "namespace": "dynamo", }, resources={"gpu": 1, "cpu": "10", "memory": "20Gi"}, workers=1, ) class VllmEngine(BaseVllmEngine): """ vLLM Inference Engine """ def __init__(self): model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" self.engine_args = AsyncEngineArgs( model=model, gpu_memory_utilization=0.8, enable_prefix_caching=True, block_size=64, max_model_len=16384, ) VLLM_WORKER_ID = dynamo_context["endpoints"][0].lease_id() os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID) os.environ["VLLM_KV_NAMESPACE"] = "dynamo" os.environ["VLLM_KV_COMPONENT"] = "vllm" vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}") os.environ["CUDA_VISIBLE_DEVICES"] = f"{server_context.worker_index - 1}" self.metrics_publisher = KvMetricsPublisher() self.engine_client: Optional[MQLLMEngineClient] = None super().__init__(self.engine_args) async def create_metrics_publisher_endpoint(self): component = dynamo_context["component"] await self.metrics_publisher.create_service(component) @async_onstart async def init_engine(self): if self.engine_client is None: await super().initialize() print("vLLM worker initialized") assert self.engine_client is not None, "engine_client was not initialized" self.engine_client.set_metrics_publisher(self.metrics_publisher) self.metrics_publisher.publish( 0, 1024, 0, 1024, 0, 0, 0, ) task = asyncio.create_task(self.create_metrics_publisher_endpoint()) task.add_done_callback(lambda _: print("metrics publisher endpoint created")) @dynamo_endpoint() async def generate(self, request: vLLMGenerateRequest): sampling_params = request.sampling_params # rust HTTP requires Delta streaming sampling_params.output_kind = RequestOutputKind.DELTA async for response in self.engine_client.generate( # type: ignore request.engine_prompt, sampling_params, request.request_id ): # MyRequestOutput takes care of serializing the response as # vLLM's RequestOutput is not serializable by default resp = MyRequestOutput( request_id=response.request_id, prompt=response.prompt, prompt_token_ids=response.prompt_token_ids, prompt_logprobs=response.prompt_logprobs, outputs=response.outputs, finished=response.finished, ).model_dump_json() yield resp