# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import asyncio
import copy
import logging
import os
import signal
import sys
from typing import Tuple

import torch
import uvloop
from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.inputs.data import TokensPrompt
from vllm.usage.usage_lib import UsageContext
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.engine.async_llm import AsyncLLM

import dynamo.nixl_connect as connect
from dynamo.llm import ZmqKvEventPublisher, ZmqKvEventPublisherConfig
from dynamo.runtime import Component, DistributedRuntime, Endpoint, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
from publisher import StatLoggerFactory
from utils.args import (
    Config,
    base_parse_args,
    configure_ports,
    overwrite_args,
    parse_endpoint,
)
from utils.image_loader import ImageLoader
from utils.model import construct_mm_data
from utils.protocol import MyRequestOutput, vLLMMultimodalRequest

# Apply Dynamo's logging configuration at import time, before the module-level
# logger below is first used.
configure_dynamo_logging()
# Module-scoped logger used by the worker classes in this file.
logger = logging.getLogger(__name__)


class VllmBaseWorker:
    """Shared base for the vLLM-backed workers defined in this file.

    Provides command-line parsing, construction of the vLLM ``AsyncLLM``
    engine client, and wiring of the metrics stat logger and the ZMQ KV-event
    publisher. Subclasses implement ``generate`` and may override
    ``async_init`` and ``cleanup``.
    """

    @classmethod
    def parse_args(cls) -> Tuple[argparse.Namespace, Config]:
        """Parse the command line into ``(args, config)``.

        Registers the worker-specific options (``--endpoint``,
        ``--downstream-endpoint``, ``--worker-type``, ``--enable-disagg``) on a
        ``FlexibleArgumentParser`` and delegates the actual parsing to
        ``base_parse_args``, passing a hook that fills in endpoint defaults.

        Returns:
            The tuple returned by ``base_parse_args``.
        """
        parser = FlexibleArgumentParser(
            description="vLLM based encoder for Dynamo LLM."
        )
        parser.add_argument(
            "--endpoint",
            type=str,
            help="Dynamo endpoint string in 'dyn://namespace.component.endpoint' format.  Default value will vary based on the worker type, see --worker-type for details.",
        )
        parser.add_argument(
            "--downstream-endpoint",
            type=str,
            help="The endpoint string of the downstream LLM in 'dyn://namespace.component.endpoint' format. Default value will vary based on the worker type, see --worker-type for details.",
        )
        parser.add_argument(
            "--worker-type",
            type=str,
            choices=["prefill", "decode", "encode_prefill"],
            required=True,
            help="Specify the type of worker. Must be one of: 'prefill', 'decode', 'encode_prefill'",
        )
        parser.add_argument(
            "--enable-disagg",
            action="store_true",
            help="Enable disaggregated mode, where prefill and decode are handled by separate workers."
            " If not set, the '*prefill' worker type will handle both prefill and decode.",
        )

        # use endpoint_overwrite to set the default endpoint based on worker type
        def endpoint_overwrite(args):
            """Fill in endpoint defaults; explicitly supplied values are kept."""
            DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")

            # default endpoint for this worker
            if args.worker_type == "prefill":
                args.endpoint = args.endpoint or f"dyn://{DYN_NAMESPACE}.llm.generate"
            elif args.worker_type == "decode":
                args.endpoint = (
                    args.endpoint or f"dyn://{DYN_NAMESPACE}.decoder.generate"
                )
            elif args.worker_type == "encode_prefill":
                args.endpoint = (
                    args.endpoint or f"dyn://{DYN_NAMESPACE}.encoder.generate"
                )

            # set downstream endpoint for disaggregated workers
            if args.enable_disagg:
                args.downstream_endpoint = (
                    args.downstream_endpoint
                    or f"dyn://{DYN_NAMESPACE}.decoder.generate"
                )

            return args

        args, config = base_parse_args(parser, endpoint_overwrite)

        return args, config

    def __init__(
        self,
        args: argparse.Namespace,
        component: Component,
        endpoint: Endpoint,
        config: Config,
    ):
        """Store the parsed settings and build the vLLM engine.

        Args:
            args: Parsed CLI namespace; ``enable_disagg``, ``endpoint`` and
                ``downstream_endpoint`` are read from it.
            component: Dynamo component handed to the stat logger and the
                KV-event publisher.
            endpoint: Dynamo endpoint whose ``connection_id()`` is used as the
                publisher's worker id.
            config: Worker configuration; ``engine_args`` and ``model`` are
                read from it.
        """
        self.enable_disagg = args.enable_disagg
        self.endpoint = args.endpoint
        self.downstream_endpoint = args.downstream_endpoint
        self.engine_args = config.engine_args
        self.config = config
        # Engine construction happens synchronously in the constructor;
        # anything that must be awaited belongs in async_init().
        self.setup_vllm_engine(component, endpoint)

    async def async_init(self, runtime: DistributedRuntime):
        """Hook for asynchronous initialization; a no-op in the base class."""
        pass

    def setup_vllm_engine(self, component: Component, endpoint: Endpoint):
        """Initialize the vLLM engine.
        This method sets up the vLLM engine client, and configures the dynamo-aware KV
        event publisher and metrics stats logger based on component and endpoint.
        """

        os.environ["VLLM_NO_USAGE_STATS"] = "1"  # Avoid internal HTTP requests
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        # Load default sampling params from `generation_config.json`
        self.default_sampling_params = (
            self.engine_args.create_model_config().get_diff_sampling_param()
        )

        # Taken from build_async_engine_client_from_engine_args()
        usage_context = UsageContext.OPENAI_API_SERVER
        vllm_config = self.engine_args.create_engine_config(usage_context=usage_context)

        # Create vLLM engine with metrics logger and KV event publisher attached
        self.stats_logger = StatLoggerFactory(
            component,
            self.engine_args.data_parallel_rank or 0,
            metrics_labels=[("model", self.config.model)],
        )
        self.engine_client = AsyncLLM.from_vllm_config(
            vllm_config=vllm_config,
            usage_context=usage_context,
            stat_loggers=[self.stats_logger],
            enable_log_requests=self.engine_args.enable_log_requests,
            disable_log_stats=self.engine_args.disable_log_stats,
        )

        # TODO Hack to get data, move this to registering in ETCD
        self.stats_logger.set_num_gpu_blocks_all(
            vllm_config.cache_config.num_gpu_blocks
        )
        self.stats_logger.set_request_total_slots_all(
            vllm_config.scheduler_config.max_num_seqs
        )
        self.stats_logger.init_publish()

        # TODO: We start off with a valid endpoint, then we increment it by dp_rank
        # May no longer be valid. Lets remove the increment behavior from vLLM and here
        # NOTE(review): the "*" -> "127.0.0.1" rewrite presumably turns a
        # wildcard bind address into one this process can connect to - confirm.
        zmq_endpoint = ZmqEventPublisher.offset_endpoint_port(
            self.engine_args.kv_events_config.endpoint,
            data_parallel_rank=self.engine_args.data_parallel_rank or 0,
        ).replace("*", "127.0.0.1")

        zmq_config = ZmqKvEventPublisherConfig(
            worker_id=endpoint.connection_id(),
            kv_block_size=vllm_config.cache_config.block_size,
            zmq_endpoint=zmq_endpoint,
        )
        self.kv_publisher = ZmqKvEventPublisher(component=component, config=zmq_config)

        logger.info(f"Reading Events from {zmq_endpoint}")

        logger.info(f"VllmWorker for {self.engine_args.model} has been initialized")

    async def generate(self, request: vLLMMultimodalRequest):
        """Handle one generation request; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError(
            "This method should be implemented in subclasses to handle the generation logic."
        )

    async def clear_kv_blocks(self, request=None):
        """Reset the engine's prefix cache and yield a single status dict.

        Yields ``{"status": "success", ...}`` when the reset completes, or
        ``{"status": "error", "message": <str(exception)>}`` when it raises.
        The ``request`` argument is accepted but not used.
        """
        try:
            await self.engine_client.reset_prefix_cache()
            yield {"status": "success", "message": "KV cache cleared"}
        except Exception as e:
            # Report the failure to the caller as data instead of propagating.
            yield {"status": "error", "message": str(e)}

    def cleanup(self):
        """Override in subclasses if cleanup is needed."""
        pass


class VllmDecodeWorker(VllmBaseWorker):
    """Worker that runs generation from prompt token ids only."""

    async def generate(self, request: vLLMMultimodalRequest):
        """Stream generation results for ``request`` as JSON strings.

        Args:
            request: A ``vLLMMultimodalRequest``, or a JSON string / other
                payload that is validated into one.

        Yields:
            ``MyRequestOutput`` JSON (via ``model_dump_json``) for every
            response produced by the engine.
        """
        # Lazy %-style arguments: the (potentially large) request is only
        # rendered when DEBUG logging is actually enabled.
        logger.debug("Got raw request: %s", request)
        if not isinstance(request, vLLMMultimodalRequest):
            if isinstance(request, str):
                request = vLLMMultimodalRequest.model_validate_json(request)
            else:
                request = vLLMMultimodalRequest.model_validate(request)
        logger.debug("Received decode request: { id: %s }.", request.request_id)

        # The decode worker forwards no multimodal data: the prompt is built
        # from the request's token ids alone.
        gen = self.engine_client.generate(
            prompt=TokensPrompt(
                prompt_token_ids=request.engine_prompt["prompt_token_ids"],
            ),
            sampling_params=request.sampling_params,
            request_id=request.request_id,
        )

        async for response in gen:
            logger.debug(
                "Response kv_transfer_params: %s", response.kv_transfer_params
            )
            yield MyRequestOutput(
                request_id=response.request_id,
                prompt=response.prompt,
                prompt_token_ids=response.prompt_token_ids,
                prompt_logprobs=response.prompt_logprobs,
                outputs=response.outputs,
                finished=response.finished,
                metrics=response.metrics,
                kv_transfer_params=response.kv_transfer_params,
            ).model_dump_json()


class VllmPDWorker(VllmBaseWorker):
    """Prefill worker that also decodes locally unless disaggregation is on.

    With ``enable_disagg`` set, the worker runs a one-token prefill and then
    forwards the request to a downstream decode worker; otherwise it streams
    the engine's responses directly.
    """

    async def async_init(self, runtime: DistributedRuntime):
        """Create the downstream client, transfer connector and image loader.

        Args:
            runtime: Distributed runtime used to resolve the downstream decode
                endpoint when ``enable_disagg`` is set.
        """
        logger.info("Startup started.")

        if self.enable_disagg:
            (
                parsed_namespace,
                parsed_component_name,
                parsed_endpoint_name,
            ) = parse_endpoint(self.downstream_endpoint)
            self.decode_worker_client = (
                await runtime.namespace(parsed_namespace)
                .component(parsed_component_name)
                .endpoint(parsed_endpoint_name)
                .client()
            )

        # The dtype of the receive buffer is selected from the model name.
        if "video" in self.engine_args.model.lower():
            self.EMBEDDINGS_DTYPE = torch.uint8
        else:
            self.EMBEDDINGS_DTYPE = torch.float16

        self.EMBEDDINGS_DEVICE = "cpu"

        # Create and initialize a dynamo connector for this worker.
        # We'll need this to move data between this worker and remote workers efficiently.
        # NOTE(review): the parsed namespace is not used below; the call is
        # kept in case its handling of a malformed endpoint is relied on.
        parsed_namespace, _, _ = parse_endpoint(self.endpoint)
        self._connector = connect.Connector()

        self.image_loader = ImageLoader()

        logger.info("VllmPDWorker has been initialized")

    async def generate(self, request: vLLMMultimodalRequest):
        """Run prefill (and optionally remote decode) for ``request``.

        Args:
            request: A ``vLLMMultimodalRequest``, or a JSON string / other
                payload that is validated into one.

        Yields:
            ``MyRequestOutput`` JSON strings: the downstream decode worker's
            responses when ``enable_disagg`` is set, otherwise the local
            engine's responses.

        Raises:
            RuntimeError: If no transfer descriptor could be created for the
                embeddings buffer.
        """
        # Lazy %-style arguments: the (potentially large) request is only
        # rendered when DEBUG logging is actually enabled.
        logger.debug("Got raw request: %s", request)
        if not isinstance(request, vLLMMultimodalRequest):
            if isinstance(request, str):
                request = vLLMMultimodalRequest.model_validate_json(request)
            else:
                request = vLLMMultimodalRequest.model_validate(request)
        logger.debug("Received PD request: { id: %s }.", request.request_id)

        if (
            request.multimodal_input.image_url is None
            and request.multimodal_input.video_url is None
            and request.multimodal_input.audio_url is None
        ):
            # No raw media URL: the data is read through the connector instead.
            # Create a descriptor based on the embedding shape.
            embeddings = torch.empty(
                request.embeddings_shape,
                dtype=self.EMBEDDINGS_DTYPE,
                device=self.EMBEDDINGS_DEVICE,
            )
            descriptor = connect.Descriptor(embeddings)

            if descriptor is None:
                raise RuntimeError(
                    "Descriptor is None in PD worker - cannot process embeddings"
                )

            # Fill `embeddings` in place from the remote side, then wait for
            # the transfer to finish before using the buffer.
            read_op = await self._connector.begin_read(
                request.serialized_request, descriptor
            )
            await read_op.wait_for_completion()

            model_name = self.engine_args.model.lower()
            if "video" in model_name:
                video_numpy = embeddings.numpy()
                multi_modal_data = construct_mm_data(
                    self.engine_args.model,
                    self.EMBEDDINGS_DTYPE,
                    video_numpy=video_numpy,
                )
            elif "audio" in model_name:
                multi_modal_data = construct_mm_data(
                    self.engine_args.model,
                    self.EMBEDDINGS_DTYPE,
                    audio_embeds=embeddings,
                )
            else:
                multi_modal_data = construct_mm_data(
                    self.engine_args.model,
                    self.EMBEDDINGS_DTYPE,
                    image_embeds=embeddings,
                    image_grid_thw=request.image_grid_thw,
                )
        else:
            # Use PIL image instead of image embeddings
            # NOTE(review): this branch is also taken when only video_url or
            # audio_url is set, in which case image_url is None here - confirm
            # that load_image handles that as intended.
            multi_modal_data = {
                "image": await self.image_loader.load_image(
                    request.multimodal_input.image_url
                )
            }

        # Remove the image features from the request as they are not required
        request.multimodal_input.image_url = None
        request.multimodal_input.video_url = None
        request.multimodal_input.audio_url = None
        request.serialized_request = None

        pd_request = copy.deepcopy(request)
        # Do prefill and remote decode if enable_disagg is true
        if self.enable_disagg:
            extra_args = pd_request.sampling_params.extra_args or {}
            extra_args["kv_transfer_params"] = {
                "do_remote_decode": True,
            }
            pd_request.sampling_params.extra_args = extra_args
            # Prefill only: exactly one token is produced locally.
            pd_request.sampling_params.max_tokens = 1
            pd_request.sampling_params.min_tokens = 1

            logger.debug("Prefill request: %s", pd_request)

        gen = self.engine_client.generate(
            prompt=TokensPrompt(
                prompt_token_ids=pd_request.engine_prompt["prompt_token_ids"],
                multi_modal_data=multi_modal_data,
            ),
            sampling_params=pd_request.sampling_params,
            request_id=pd_request.request_id,
        )

        if self.enable_disagg:
            # Copied from `request`, so the decode request keeps the caller's
            # original sampling limits rather than the one-token prefill ones.
            decode_request = copy.deepcopy(request)
            async for prefill_response in gen:
                # Update the prompt token id in the decode request to the one
                # in response, which has image templated filled in. So that
                # the decode worker will fetch correct amount of KV blocks.
                decode_request.engine_prompt[
                    "prompt_token_ids"
                ] = prefill_response.prompt_token_ids
                logger.debug(
                    "Prefill response kv_transfer_params: %s",
                    prefill_response.kv_transfer_params,
                )
                extra_args = decode_request.sampling_params.extra_args or {}
                extra_args["kv_transfer_params"] = prefill_response.kv_transfer_params
                extra_args.pop("serialized_request", None)
                decode_request.sampling_params.extra_args = extra_args
                logger.debug("Decode request: %s", decode_request)
                async for (
                    decode_response
                ) in await self.decode_worker_client.round_robin(
                    decode_request.model_dump_json()
                ):
                    output = MyRequestOutput.model_validate_json(decode_response.data())
                    yield MyRequestOutput(
                        request_id=output.request_id,
                        prompt=output.prompt,
                        prompt_token_ids=output.prompt_token_ids,
                        prompt_logprobs=output.prompt_logprobs,
                        outputs=output.outputs,
                        finished=output.finished,
                        metrics=output.metrics,
                        kv_transfer_params=output.kv_transfer_params,
                    ).model_dump_json()

        else:
            async for response in gen:
                logger.debug(
                    "Response kv_transfer_params: %s", response.kv_transfer_params
                )
                yield MyRequestOutput(
                    request_id=response.request_id,
                    prompt=response.prompt,
                    prompt_token_ids=response.prompt_token_ids,
                    prompt_logprobs=response.prompt_logprobs,
                    outputs=response.outputs,
                    finished=response.finished,
                    metrics=response.metrics,
                    kv_transfer_params=response.kv_transfer_params,
                ).model_dump_json()


async def graceful_shutdown(runtime):
    """
    By calling `runtime.shutdown()`, the endpoints will immediately be unavailable.
    However, in-flight requests will still be processed until they are finished.
    After all in-flight requests are finished, the `serve_endpoint` functions will return
    and the engine will be shutdown by Python's garbage collector.

    Args:
        runtime: Object exposing a synchronous ``shutdown()`` method.
    """
    # Log through this module's named logger rather than the root logger, so
    # the records are attributed to this module like the rest of the file's.
    log = logging.getLogger(__name__)
    log.info("Received shutdown signal, shutting down DistributedRuntime")
    runtime.shutdown()
    log.info("DistributedRuntime shutdown complete")


410
@dynamo_worker()
async def worker(runtime: DistributedRuntime):
    """Entry point: install signal handlers, parse arguments and serve.

    Args:
        runtime: Distributed runtime passed on to ``graceful_shutdown`` and
            ``init``.
    """
    # Runtime setup
    # Set up signal handler for graceful shutdown
    loop = asyncio.get_running_loop()

    # The event loop keeps only weak references to tasks, so a task whose
    # result is discarded may be garbage-collected before it finishes. Hold a
    # strong reference until the shutdown task is done.
    shutdown_tasks = set()

    def signal_handler():
        task = asyncio.create_task(graceful_shutdown(runtime))
        shutdown_tasks.add(task)
        task.add_done_callback(shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logger.info("Signal handlers set up for graceful shutdown")

    # worker setup
    args, config = VllmBaseWorker.parse_args()

    # vLLM config overwrites
    configure_ports(config)
    overwrite_args(config)
    await init(runtime, args, config)


async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Config):
    """
    Instantiate and serve

    Builds the handler matching ``args.worker_type``, runs its asynchronous
    initialization, then serves the generate endpoint and the
    ``clear_kv_blocks`` endpoint concurrently until both return.

    Args:
        runtime: Distributed runtime used to resolve the component.
        args: Parsed CLI namespace; ``worker_type`` and ``endpoint`` are read.
        config: Worker configuration; ``namespace``, ``component``,
            ``endpoint`` and ``model`` are read.

    Raises:
        ValueError: If ``args.worker_type`` is not a supported worker type.
    """

    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)
    clear_endpoint = component.endpoint("clear_kv_blocks")

    handler: VllmBaseWorker
    if args.worker_type in ["prefill", "encode_prefill"]:
        handler = VllmPDWorker(args, component, generate_endpoint, config)
    elif args.worker_type == "decode":
        handler = VllmDecodeWorker(args, component, generate_endpoint, config)
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # `handler` further down.
        raise ValueError(f"Unsupported worker type: {args.worker_type!r}")
    await handler.async_init(runtime)

    logger.info(f"Starting to serve the {args.endpoint} endpoint...")

    metrics_labels = [("model", config.model)]

    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate, metrics_labels=metrics_labels
            ),
            clear_endpoint.serve_endpoint(
                handler.clear_kv_blocks, metrics_labels=metrics_labels
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
    finally:
        # Runs whether serving ended normally or with an error.
        handler.cleanup()


if __name__ == "__main__":
    # Install uvloop before asyncio.run() creates the event loop.
    uvloop.install()
    # worker() is called without arguments; its `runtime` parameter is
    # presumably supplied by the @dynamo_worker() decorator - confirm.
    asyncio.run(worker())