main.py

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import asyncio
import logging
import os
import tempfile
import time
from typing import Optional

import uvloop
from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus

from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.prometheus import (
    LLMBackendMetrics,
    register_engine_metrics_callback,
)
from dynamo.common.utils.runtime import create_runtime
from dynamo.llm import (
    KvEventPublisher,
    ModelInput,
    ModelRuntimeConfig,
    ModelType,
    ZmqKvEventPublisherConfig,
    fetch_llm,
    register_llm,
)

# Optional imports for frontend decoding support
try:
    from dynamo.llm import MediaDecoder, MediaFetcher

    MEDIA_DECODER_AVAILABLE = True
except ImportError:
    MediaDecoder = None
    MediaFetcher = None
    MEDIA_DECODER_AVAILABLE = False

from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.vllm.multimodal_handlers import (
    ECProcessorHandler,
    EncodeWorkerHandler,
    MultimodalDecodeWorkerHandler,
    MultimodalPDWorkerHandler,
    PreprocessedHandler,
    VLLMEncodeWorkerHandler,
)
from dynamo.vllm.multimodal_utils.encode_utils import create_ec_transfer_config

from .args import Config, overwrite_args, parse_args
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import (
    VllmHealthCheckPayload,
    VllmOmniHealthCheckPayload,
    VllmPrefillHealthCheckPayload,
)
from .publisher import DYNAMO_COMPONENT_REGISTRY, StatLoggerFactory

configure_dynamo_logging()
logger = logging.getLogger(__name__)


async def _handle_non_leader_node(dp_rank: int) -> None:
    """
    Handle non-leader node (data_parallel_rank >= 1) in multi-node deployments.
    Non-leader nodes run vLLM workers but don't serve Dynamo endpoints.
    """
    logger.info(
        f"Non-leader node detected (data_parallel_rank={dp_rank}). "
        "Skipping endpoint serving."
    )
    # Wait indefinitely - process terminated via signal handlers
    await asyncio.Event().wait()


async def await_checkpoint_and_was_restored(signal_file: str) -> bool:
    """
    Wait for checkpoint signal file OR restore marker file.

    In checkpoint creation mode, poll until either:
    1. The signal file exists (checkpoint complete, should exit)
    2. The restore marker file exists (restored by CRIU, should proceed)

    The restore marker file is created by the restore-entrypoint before CRIU restore,
    so the restored process can detect it was restored even though os.environ is
    restored from the checkpoint and doesn't contain new container env vars.

    Args:
        signal_file: Path to the checkpoint signal file

    Returns:
        True if restored (should proceed with registration)
        False if signal file detected (should exit)
    """
    # Get restore marker file path (created by restore entrypoint before CRIU restore)
    restore_marker = os.environ.get("DYN_RESTORE_MARKER_FILE", "/tmp/dynamo-restored")

    logger.info(
        f"CHECKPOINT_READY: Model loaded, ready for container checkpoint. Waiting for signal file: {signal_file} or restore marker file: {restore_marker}"
    )

    while True:
        # Check if we've been restored (marker file created by restore entrypoint)
        if os.path.exists(restore_marker):
            logger.info(
                f"Detected restore from checkpoint (marker file exists: {restore_marker})"
            )
            return True  # Restored - proceed with registration

        # Check if checkpoint is complete (signal file exists)
        if os.path.exists(signal_file):
            logger.info(f"Checkpoint signal file detected: {signal_file}")
            return False  # Checkpoint done - exit

        await asyncio.sleep(1)


async def worker():
    config = parse_args()

    overwrite_args(config)
    dump_config(config.dump_config_to, config)

    # Name the model. Use either the full path (vllm and sglang do the same),
    # or the HF name (e.g. "Qwen/Qwen3-0.6B"), depending on cmd line params.
    if not config.served_model_name:
        config.served_model_name = config.engine_args.served_model_name = config.model

    # Check checkpoint-related environment variables EARLY
    signal_file = os.environ.get("DYN_CHECKPOINT_SIGNAL_FILE")
    ready_file = os.environ.get("DYN_CHECKPOINT_READY_FILE")

    is_checkpoint_mode = signal_file is not None

    # EARLY EXIT: Check if checkpoint already exists (before downloading model!)
    if is_checkpoint_mode:
        storage_type = os.environ.get("DYN_CHECKPOINT_STORAGE_TYPE")
        checkpoint_location = os.environ.get("DYN_CHECKPOINT_LOCATION")

        if storage_type == "pvc" and checkpoint_location:
            done_marker = f"{checkpoint_location}/checkpoint.done"

            if os.path.exists(done_marker):
                logger.info(
                    f"Found existing checkpoint at {checkpoint_location}. Storage type: {storage_type}"
                )
                return
            else:
                logger.info(
                    f"Checkpoint not found at: {checkpoint_location}. creating new checkpoint"
                )

    # Download the model if necessary using modelexpress.
    # We want it on disk before we start vllm to avoid downloading from HuggingFace.
    #
    # We don't set `config.engine_args.model` to the local path fetch_llm returns
    # because vllm will send that name to its Ray pipeline-parallel workers, which
    # may not have the local path.
    # vllm will attempt to download the model again, but find it in the HF cache.
    # For non-HF models use a path instead of an HF name, and ensure all workers have
    # that path (ideally via a shared folder).
    if not os.path.exists(config.model):
        await fetch_llm(config.model)

    # CHECKPOINT MODE: Load engine BEFORE runtime creation
    # This allows checkpointing GPU state before runtime connections are established
    pre_created_engine = None
    is_restored = False
    if is_checkpoint_mode:
        logger.info(
            f"Checkpoint mode enabled (DYN_CHECKPOINT_SIGNAL_FILE={signal_file})"
        )

        # CHECKPOINT MODE: Load model, sleep, wait for signal file or restore
        pre_created_engine = setup_vllm_engine(config)
        engine_client = pre_created_engine[0]

        # Put model to sleep before checkpoint (if sleep mode enabled)
        if config.engine_args.enable_sleep_mode:
            logger.info(f"Putting model to sleep (level={config.sleep_mode_level})")
            await engine_client.sleep(level=config.sleep_mode_level)

        # Write ready file to signal that we're ready for checkpointing
        if ready_file:
            with open(ready_file, "w") as f:
                f.write("ready")
            logger.info(f"Wrote checkpoint ready file: {ready_file}")

        # Wait for checkpoint signal file OR restore detection
        is_restored = await await_checkpoint_and_was_restored(signal_file)

        if is_restored:
            # Wake up model and proceed with registration
            if config.engine_args.enable_sleep_mode:
                logger.info("Waking up model after checkpoint restore")
                await engine_client.wake_up()
            logger.info("Proceeding with endpoint registration after restore")
        else:
            # Checkpoint complete, exit
            logger.info("Exiting after checkpoint completion")
            return

    shutdown_event = asyncio.Event()
    runtime, _ = create_runtime(
        store_kv=config.store_kv,
        request_plane=config.request_plane,
        event_plane=config.event_plane,
        use_kv_events=config.use_kv_events,
        shutdown_event=shutdown_event,
    )

    # Route to appropriate initialization based on config flags
    if config.vllm_native_encoder_worker:
        await init_vllm_native_encoder(runtime, config, shutdown_event)
        logger.debug("init_vllm_native_encoder completed")
    elif config.ec_processor:
        await init_ec_processor(runtime, config, shutdown_event)
        logger.debug("init_ec_processor completed")
    elif config.multimodal_processor:
        await init_multimodal_processor(runtime, config, shutdown_event)
        logger.debug("init_multimodal_processor completed")
    elif config.multimodal_encode_worker:
        await init_multimodal_encode_worker(runtime, config, shutdown_event)
        logger.debug("init_multimodal_encode_worker completed")
    elif (
        config.multimodal_worker
        or config.multimodal_decode_worker
        or config.multimodal_encode_prefill_worker
    ):
        await init_multimodal_worker(
            runtime, config, shutdown_event, pre_created_engine=pre_created_engine
        )
        logger.debug("init_multimodal_worker completed")
    elif config.omni:
        await init_omni(runtime, config, shutdown_event)
        logger.debug("init_omni completed")
    elif config.is_prefill_worker:
        await init_prefill(
            runtime, config, shutdown_event, pre_created_engine=pre_created_engine
        )
        logger.debug("init_prefill completed")
    else:
        await init(
            runtime, config, shutdown_event, pre_created_engine=pre_created_engine
        )
        logger.debug("init completed")

    logger.debug("Worker function completed, exiting...")


def setup_metrics_collection(config: Config, generate_endpoint, logger):
    """Set up metrics collection for vLLM and LMCache metrics.

    In multiprocess mode (PROMETHEUS_MULTIPROC_DIR set), metrics are stored:
      1. In-memory: Metric objects in global REGISTRY
      2. On-disk: Metric values in .db files (PROMETHEUS_MULTIPROC_DIR)

    MultiProcessCollector reads from .db files but adding it to REGISTRY can fail
    with "Duplicated timeseries" if PROMETHEUS_MULTIPROC_DIR was set before process
    started (K8s deployments) because metrics are already in REGISTRY.

    Solution: Try adding MultiProcessCollector to REGISTRY. If that fails, use
    separate registry for multiprocess collection and register callbacks to both
    registries to ensure all metrics (vllm, lmcache, dynamo_component) are collected.
    """
    if config.engine_args.disable_log_stats is False:
        # Register the dedicated dynamo_component registry callback
        # IMPORTANT: We do NOT use MultiProcessCollector for DYNAMO_COMPONENT_REGISTRY
        # because our gauges use in-memory values which work fine for single-process
        # and multi-process (each process has its own gauge with dp_rank label).
        # Using MultiProcessCollector would read from .db files which causes stale
        # values to accumulate across test runs.
        register_engine_metrics_callback(
            endpoint=generate_endpoint,
            registry=DYNAMO_COMPONENT_REGISTRY,
        )

        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
            try:
                # MultiProcessCollector reads metrics from .db files in PROMETHEUS_MULTIPROC_DIR
                # Adding it to REGISTRY allows collecting both in-memory and .db file metrics
                multiprocess.MultiProcessCollector(REGISTRY)
                logger.debug("Added MultiProcessCollector to global REGISTRY")
                register_engine_metrics_callback(
                    endpoint=generate_endpoint,
                    registry=REGISTRY,
                    metric_prefix_filters=[
                        "vllm:",
                        "lmcache:",
                    ],
                )
            except ValueError as e:
                # Conflict: metrics already in REGISTRY, MultiProcessCollector tries to add same metrics from .db files
                # Solution: Use separate registry that ONLY reads from .db files (no in-memory conflicts)
                logger.debug(
                    f"Could not add MultiProcessCollector to REGISTRY ({e}), using separate registry"
                )
                multiproc_registry = CollectorRegistry()
                multiprocess.MultiProcessCollector(multiproc_registry)

                # Register both registries to collect all metrics
                # Global REGISTRY has in-memory metrics (vllm)
                register_engine_metrics_callback(
                    endpoint=generate_endpoint,
                    registry=REGISTRY,
                    metric_prefix_filters=["vllm:"],
                )
                # Multiproc registry has .db file metrics (lmcache, possibly vllm duplicates)
                register_engine_metrics_callback(
                    endpoint=generate_endpoint,
                    registry=multiproc_registry,
                    metric_prefix_filters=[
                        "vllm:",
                        "lmcache:",
                    ],
                )
        else:
            # No multiprocess mode
            register_engine_metrics_callback(
                endpoint=generate_endpoint,
                registry=REGISTRY,
                metric_prefix_filters=["vllm:", "lmcache:"],
            )


def setup_kv_event_publisher(
    config: Config,
    component,
    generate_endpoint,
    vllm_config,
    consolidator_enabled: bool = False,
    consolidator_port: Optional[int] = 5558,
) -> Optional[KvEventPublisher]:
    """
    Set up KV event publishers for prefix caching if enabled.
    Creates one publisher per dp_rank since each dp_rank publishes to a different port.
    Args:
        config: Worker configuration
        component: Component for runtime integration
        generate_endpoint: Endpoint for worker ID
        vllm_config: vLLM configuration
        consolidator_enabled: If True, subscribe to kv eventconsolidator's ZMQ endpoint
        consolidator_port: Port where kv event consolidator publishes (default: 5558)

    Returns:
        List of KvEventPublisher instances (one per dp_rank) if prefix caching is enabled, None otherwise.
    """
    if not config.engine_args.enable_prefix_caching:
        return None

    # Skip KV event publishing for decode workers
    if config.is_decode_worker:
        logger.info("Skipping KV event publisher setup for decode worker")
        return None

    if config.engine_args.kv_events_config is None:
        return None

    # Check if kv_cache_events are explicitly disabled
    if not config.engine_args.kv_events_config.enable_kv_cache_events:
        logger.info(
            "KV event publishing skipped: enable_kv_cache_events=False in kv_events_config"
        )
        return None

    # Get data_parallel_size to create publishers for all dp_ranks
    data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1)
    kv_publishers = []

    for dp_rank in range(data_parallel_size):
        if consolidator_enabled:
            # TODO: Use different port for each dp_rank once KVBM supports DP
            zmq_endpoint = f"tcp://127.0.0.1:{consolidator_port}"
            logger.info(
                f"KV event publisher for dp_rank={dp_rank} subscribing to consolidator at {zmq_endpoint}"
            )
        else:
            # Each dp_rank publishes to a different port
            zmq_endpoint = ZmqEventPublisher.offset_endpoint_port(
                config.engine_args.kv_events_config.endpoint,
                data_parallel_rank=dp_rank,
            ).replace("*", "127.0.0.1")
            logger.info(
                f"KV event publisher for dp_rank={dp_rank} subscribing to vLLM at {zmq_endpoint}"
            )

        zmq_config = ZmqKvEventPublisherConfig(
            worker_id=generate_endpoint.connection_id(),
            kv_block_size=vllm_config.cache_config.block_size,
            zmq_endpoint=zmq_endpoint,
            enable_local_indexer=config.enable_local_indexer,
            dp_rank=dp_rank,
        )
        kv_publisher = KvEventPublisher(component=component, zmq_config=zmq_config)
        kv_publishers.append(kv_publisher)

        logger.info(
            f"Worker reading KV events for dp_rank={dp_rank} from {zmq_endpoint}"
        )

    return kv_publishers if kv_publishers else None


def setup_vllm_engine(config, stat_logger=None):
    # vLLM v0.11.0 bug: vllm/v1.metrics/prometheus.py:79 passes TemporaryDirectory object
    # instead of .name string, causing false error on exit. Set PROMETHEUS_MULTIPROC_DIR
    # ourselves to avoid this and handle cleanup properly.
    prometheus_temp_dir = None
    if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
        prometheus_temp_dir = tempfile.TemporaryDirectory(prefix="vllm_prometheus_")
        os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_temp_dir.name
        logger.debug(
            f"Created PROMETHEUS_MULTIPROC_DIR at: {os.environ['PROMETHEUS_MULTIPROC_DIR']}"
        )

    setup_multiprocess_prometheus()  # call vLLM's library's function to setup multiprocess prometheus
    logger.debug(
        f"Prometheus multiproc dir set to: {os.environ.get('PROMETHEUS_MULTIPROC_DIR')}"
    )

    # Construct Prometheus gauges AFTER setup_multiprocess_prometheus() so Gauge objects
    # see the correct ValueClass (multiprocess vs in-memory).
    component_gauges = LLMBackendMetrics(
        registry=DYNAMO_COMPONENT_REGISTRY,
        model_name=config.served_model_name or "",
        component_name=config.component or "",
    )

    # If a StatLoggerFactory was provided, give it the gauges so the loggers
    # it creates can publish Prometheus metrics.
    if stat_logger is not None:
        stat_logger.component_gauges = component_gauges

    os.environ["VLLM_NO_USAGE_STATS"] = "1"  # Avoid internal HTTP requests
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    engine_args = config.engine_args

    if engine_args.enable_lora:
        if "VLLM_ALLOW_RUNTIME_LORA_UPDATING" not in os.environ:
            os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"
        if "VLLM_LORA_MODULES_LOADING_TIMEOUT" not in os.environ:
            os.environ["VLLM_LORA_MODULES_LOADING_TIMEOUT"] = "600"

    if engine_args.load_format == "gms":
        engine_args.worker_cls = "gpu_memory_service.integrations.vllm.worker.GMSWorker"

    # Load default sampling params from `generation_config.json`
    default_sampling_params = (
        engine_args.create_model_config().get_diff_sampling_param()
    )

    # Taken from build_async_engine_client_from_engine_args()
    usage_context = UsageContext.OPENAI_API_SERVER
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)

    # Set up consolidator endpoints if KVBM is enabled
    consolidator_endpoints = None
    if config.has_connector("kvbm"):
        try:
            from kvbm.vllm_integration.consolidator_config import (
                get_consolidator_endpoints,
            )

            consolidator_endpoints = get_consolidator_endpoints(vllm_config)
        except Exception as e:
            logger.warning(
                f"KVBM connector is enabled but failed to get consolidator endpoints: {e}. "
                "Continuing without KV event consolidation. "
                "Ensure 'kvbm' package is installed if this feature is needed."
            )
    vllm_config.consolidator_endpoints = consolidator_endpoints

    factory = []
    if stat_logger:
        factory.append(stat_logger)

    # Time engine initialization
    start_time = time.time()
    engine_client = AsyncLLM.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=usage_context,
        stat_loggers=factory,
        enable_log_requests=engine_args.enable_log_requests,
        disable_log_stats=engine_args.disable_log_stats,
    )
    load_time = time.time() - start_time

    # Record model load time
    component_gauges.set_model_load_time(load_time)

    logger.info(f"VllmWorker for {config.served_model_name} has been initialized")

    return (
        engine_client,
        vllm_config,
        default_sampling_params,
        prometheus_temp_dir,
        component_gauges,
    )


async def register_vllm_model(
    model_input: ModelInput,
    model_type: ModelType,
    generate_endpoint,
    config: Config,
    engine_client: AsyncLLM,
    vllm_config,
):
    """
    Helper function to register a vLLM model with runtime configuration.

    Args:
        model_input: Input type for the model (e.g., ModelInput.Tokens)
        model_type: Type of model (e.g., ModelType.Chat, ModelType.Prefill)
        generate_endpoint: Endpoint to register
        config: Configuration object
        engine_client: vLLM engine client
        vllm_config: vLLM configuration
    """
    runtime_config = ModelRuntimeConfig()

    # Get runtime configuration from vLLM engine
    logging.info(
        f"Getting engine runtime configuration metadata from vLLM engine for {model_type}..."
    )
    runtime_values = get_engine_cache_info(engine_client)
    runtime_config.total_kv_blocks = runtime_values["num_gpu_blocks"]
    runtime_config.max_num_seqs = runtime_values["max_num_seqs"]
    runtime_config.max_num_batched_tokens = runtime_values["max_num_batched_tokens"]
    # Decode workers don't create the WorkerKvQuery endpoint, so don't advertise local indexer
    runtime_config.enable_local_indexer = (
        config.enable_local_indexer and not config.is_decode_worker
    )

    # Add tool/reasoning parsers for decode models
    if model_type != ModelType.Prefill:
        runtime_config.tool_call_parser = config.tool_call_parser
        runtime_config.reasoning_parser = config.reasoning_parser

    # Get data_parallel_size from vllm_config (defaults to 1)
    data_parallel_size = getattr(vllm_config.parallel_config, "data_parallel_size", 1)
    runtime_config.data_parallel_size = data_parallel_size

    # Configure media decoder for frontend image decoding when enabled
    # This enables frontend to decode images and transfer via NIXL RDMA
    media_decoder = None
    media_fetcher = None
    if config.frontend_decoding:
        if not MEDIA_DECODER_AVAILABLE:
            raise RuntimeError(
                "--frontend-decoding requires MediaDecoder support. "
                "Ensure dynamo.llm module includes MediaDecoder and MediaFetcher."
            )
        media_decoder = MediaDecoder()
        media_decoder.enable_image({"limits": {"max_alloc": 128 * 1024 * 1024}})
        # media_decoder.enable_video({})

        media_fetcher = MediaFetcher()
        media_fetcher.timeout_ms(30000)
        media_fetcher.allow_direct_port(True)

    await register_llm(
        model_input,
        model_type,
        generate_endpoint,
        config.model,
        config.served_model_name,
        kv_cache_block_size=runtime_values["block_size"],
        runtime_config=runtime_config,
        custom_template_path=config.custom_jinja_template,
        media_decoder=media_decoder,
        media_fetcher=media_fetcher,
    )


async def init_prefill(
    runtime: DistributedRuntime,
    config: Config,
    shutdown_event: asyncio.Event,
    pre_created_engine=None,
):
    """
    Instantiate and serve
    """
    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)
    clear_endpoint = component.endpoint("clear_kv_blocks")

    # Use pre-created engine if provided (checkpoint mode), otherwise create new
    if pre_created_engine is not None:
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            _component_gauges,
        ) = pre_created_engine
    else:
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            _component_gauges,
        ) = setup_vllm_engine(config)

    handler = PrefillWorkerHandler(
        runtime,
        component,
        engine_client,
        default_sampling_params,
        getattr(getattr(vllm_config, "model_config", None), "max_model_len", None),
        enable_multimodal=config.enable_multimodal,
        generate_endpoint=generate_endpoint,
        config=config,
        use_vllm_tokenizer=config.use_vllm_tokenizer,
        shutdown_event=shutdown_event,
        enable_frontend_decoding=config.frontend_decoding,
    )
    handler.add_temp_dir(prometheus_temp_dir)

    # Check if kv event consolidator is enabled (port was allocated in setup_vllm_engine)
    consolidator_enabled = False
    consolidator_port = None

    if (
        hasattr(vllm_config, "consolidator_endpoints")
        and vllm_config.consolidator_endpoints
    ):
        # Extract connect endpoint (third element) for clients to subscribe
        # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
        consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
        consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
        consolidator_enabled = True

    # Set up KV event publishers for prefix caching if enabled (one per dp_rank)
    # If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
    kv_publishers = setup_kv_event_publisher(
        config,
        component,
        generate_endpoint,
        vllm_config,
        consolidator_enabled=consolidator_enabled,
        consolidator_port=consolidator_port,
    )
    if kv_publishers:
        handler.kv_publishers = kv_publishers

    setup_metrics_collection(config, generate_endpoint, logger)

    # Register sleep/wake_up engine routes
    runtime.register_engine_route("sleep", handler.sleep)
    runtime.register_engine_route("wake_up", handler.wake_up)
    logger.info("Registered engine routes: /engine/sleep, /engine/wake_up")

    # Handle non-leader nodes - don't serve endpoints
    if config.engine_args.data_parallel_rank:
        await _handle_non_leader_node(config.engine_args.data_parallel_rank)
        return

    # Register prefill model with ModelType.Prefill
    model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens
    await register_vllm_model(
        model_input,
        ModelType.Prefill,
        generate_endpoint,
        config,
        engine_client,
        vllm_config,
    )

    health_check_payload = VllmPrefillHealthCheckPayload(
        engine_client, use_text_input=config.use_vllm_tokenizer
    ).to_dict()

    try:
        logger.debug("Starting serve_endpoint for prefill worker")
        await asyncio.gather(
            # for prefill, we want to shutdown the engine after all prefill requests are finished because
            #     (temp reason): we don't support re-routing prefill requests
            #     (long-term reason): prefill engine should pull from a global queue so there is
            #                         only a few in-flight requests that can be quickly finished
            generate_endpoint.serve_endpoint(
                handler.generate,
                graceful_shutdown=True,
                # In practice config.served_model_name is always set, but mypy needs the "or" here.
                metrics_labels=[("model", config.served_model_name or config.model)],
                health_check_payload=health_check_payload,
            ),
            clear_endpoint.serve_endpoint(
                handler.clear_kv_blocks,
                metrics_labels=[("model", config.served_model_name)],
            ),
        )
        logger.debug("serve_endpoint completed for prefill worker")
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
    finally:
        logger.debug("Cleaning up prefill worker")
        handler.cleanup()


async def init(
    runtime: DistributedRuntime,
    config: Config,
    shutdown_event: asyncio.Event,
    pre_created_engine=None,
):
    """
    Instantiate and serve
    """

    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)
    clear_endpoint = component.endpoint("clear_kv_blocks")
    load_lora_endpoint = component.endpoint("load_lora")
    unload_lora_endpoint = component.endpoint("unload_lora")
    list_loras_endpoint = component.endpoint("list_loras")

    model_name = config.served_model_name or config.model

    # Use pre-created engine if provided (checkpoint mode), otherwise create new
    if pre_created_engine is not None:
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            component_gauges,
        ) = pre_created_engine
        # Factory is created after unpack so component_gauges is available
        factory = StatLoggerFactory(
            component,
            component_gauges=component_gauges,
            dp_rank=config.engine_args.data_parallel_rank or 0,
            metrics_labels=[("model", model_name)],
        )
    else:
        # Factory is created without component_gauges; setup_vllm_engine() will
        # create the gauges after setup_multiprocess_prometheus() and set them
        # on the factory before vLLM calls create_stat_logger().
        factory = StatLoggerFactory(
            component,
            dp_rank=config.engine_args.data_parallel_rank or 0,
            metrics_labels=[("model", model_name)],
        )
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            component_gauges,
        ) = setup_vllm_engine(config, factory)

    # TODO Hack to get data, move this to registering in TBD
    factory.set_num_gpu_blocks_all(vllm_config.cache_config.num_gpu_blocks)
    factory.init_publish()

    handler = DecodeWorkerHandler(
        runtime,
        component,
        engine_client,
        default_sampling_params,
        getattr(getattr(vllm_config, "model_config", None), "max_model_len", None),
        enable_multimodal=config.enable_multimodal,
        generate_endpoint=generate_endpoint,
        config=config,
        use_vllm_tokenizer=config.use_vllm_tokenizer,
        shutdown_event=shutdown_event,
        enable_frontend_decoding=config.frontend_decoding,
    )
    handler.add_temp_dir(prometheus_temp_dir)

    # Check if kv event consolidator is enabled (port was allocated in setup_vllm_engine)
    consolidator_enabled = False
    consolidator_port = None

    if (
        hasattr(vllm_config, "consolidator_endpoints")
        and vllm_config.consolidator_endpoints
    ):
        # Extract connect endpoint (third element) for clients to subscribe
        # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
        consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
        consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
        consolidator_enabled = True

    # Set up KV event publisher for prefix caching if enabled
    # If kv event consolidator is enabled, publisher will subscribe to kv event consolidator's output
    kv_publishers = setup_kv_event_publisher(
        config,
        component,
        generate_endpoint,
        vllm_config,
        consolidator_enabled=consolidator_enabled,
        consolidator_port=consolidator_port,
    )
    if kv_publishers:
        handler.kv_publishers = kv_publishers

    setup_metrics_collection(config, generate_endpoint, logger)

    # Register sleep/wake_up engine routes
    runtime.register_engine_route("sleep", handler.sleep)
    runtime.register_engine_route("wake_up", handler.wake_up)
    logger.info("Registered engine routes: /engine/sleep, /engine/wake_up")

    # Handle non-leader nodes - don't serve endpoints
    if config.engine_args.data_parallel_rank:
        await _handle_non_leader_node(config.engine_args.data_parallel_rank)
        return

    # Parse endpoint types from --dyn-endpoint-types flag
    model_type = parse_endpoint_types(config.dyn_endpoint_types)
    logger.info(f"Registering model with endpoint types: {config.dyn_endpoint_types}")

    model_input = ModelInput.Text if config.use_vllm_tokenizer else ModelInput.Tokens

    # Warn if custom template provided but chat endpoint not enabled
    if config.custom_jinja_template and "chat" not in config.dyn_endpoint_types:
        logger.warning(
            "Custom Jinja template provided (--custom-jinja-template) but 'chat' not in --dyn-endpoint-types. "
            "The chat template will be loaded but the /v1/chat/completions endpoint will not be available."
        )

    await register_vllm_model(
        model_input,
        model_type,
        generate_endpoint,
        config,
        engine_client,
        vllm_config,
    )

    health_check_payload = VllmHealthCheckPayload(
        engine_client, use_text_input=config.use_vllm_tokenizer
    ).to_dict()

    try:
        logger.debug("Starting serve_endpoint for decode worker")
        await asyncio.gather(
            # for decode, we want to transfer the in-flight requests to other decode engines,
            # because waiting them to finish can take a long time for long OSLs
            generate_endpoint.serve_endpoint(
                handler.generate,
                graceful_shutdown=True,
                metrics_labels=[("model", config.served_model_name or config.model)],
                health_check_payload=health_check_payload,
            ),
            clear_endpoint.serve_endpoint(
                handler.clear_kv_blocks,
                metrics_labels=[("model", config.served_model_name or config.model)],
            ),
            load_lora_endpoint.serve_endpoint(
                handler.load_lora,
                metrics_labels=[("model", config.served_model_name or config.model)],
            ),
            unload_lora_endpoint.serve_endpoint(
                handler.unload_lora,
                metrics_labels=[("model", config.served_model_name or config.model)],
            ),
            list_loras_endpoint.serve_endpoint(
                handler.list_loras,
                metrics_labels=[("model", config.served_model_name or config.model)],
            ),
        )
        logger.debug("serve_endpoint completed for decode worker")
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
    finally:
        logger.debug("Cleaning up decode worker")
        # Cleanup background tasks
        handler.cleanup()


def get_engine_cache_info(engine: AsyncLLM):
    """Retrieve cache configuration information from [`AsyncLLM`] engine."""

    try:
        # Get values directly from vllm_config instead of collective_rpc
        cache_values = {
            "num_gpu_blocks": engine.vllm_config.cache_config.num_gpu_blocks,
            "block_size": engine.vllm_config.cache_config.block_size,
        }

        scheduler_values = {
            "max_num_seqs": engine.vllm_config.scheduler_config.max_num_seqs,
            "max_num_batched_tokens": engine.vllm_config.scheduler_config.max_num_batched_tokens,
        }

        logging.info(f"Cache config values: {cache_values}")
        logging.info(f"Scheduler config values: {scheduler_values}")
        return {
            "num_gpu_blocks": cache_values["num_gpu_blocks"],
            "block_size": cache_values["block_size"],
            "max_num_seqs": scheduler_values["max_num_seqs"],
            "max_num_batched_tokens": scheduler_values["max_num_batched_tokens"],
        }
    except Exception as e:
        logging.error(f"Failed to get configuration values from vLLM config: {e}")
        raise


async def init_multimodal_processor(
    runtime: DistributedRuntime, config: Config, shutdown_event: asyncio.Event
):
    """Initialize multimodal processor component"""
    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)

    # Get encode worker client
    encode_worker_client = (
        await runtime.namespace(config.namespace)
        .component("encoder")
        .endpoint("generate")
        .client()
    )

    pd_worker_client = (
        await runtime.namespace(config.namespace)
        .component("backend")
        .endpoint("generate")
        .client()
    )

    handler = PreprocessedHandler(
        config.engine_args,
        encode_worker_client,
        pd_worker_client,
    )

    logger.info("Waiting for Encoder Worker Instances ...")
    await encode_worker_client.wait_for_instances()

    # Register the endpoint as entrypoint to a model
    await register_llm(
        ModelInput.Tokens,
        ModelType.Chat,
        generate_endpoint,
        config.model,
        config.served_model_name,
        kv_cache_block_size=config.engine_args.block_size,
    )

    logger.info("Starting to serve the processor endpoint...")

    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate, metrics_labels=[("model", config.model)]
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
    finally:
        handler.cleanup()


async def init_multimodal_encode_worker(
    runtime: DistributedRuntime, config: Config, shutdown_event: asyncio.Event
):
    """Initialize multimodal encode worker component"""
    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)

    # Get PD worker client
    # In multimodal mode, the PD worker always registers as "backend"
    # (even in disaggregated mode with prefill/decode split, we still connect to "backend")
    pd_worker_client = (
        await runtime.namespace(config.namespace)
        .component("backend")
        .endpoint("generate")
        .client()
    )

    handler = EncodeWorkerHandler(
        config.engine_args,
        pd_worker_client,
    )
    await handler.async_init(runtime)
    logger.info("Waiting for PD Worker Instances ...")
    await pd_worker_client.wait_for_instances()
    logger.info("Starting to serve the encode worker endpoint...")

    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate, metrics_labels=[("model", config.model)]
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve encode worker endpoint: {e}")
        raise
    finally:
        handler.cleanup()


async def init_vllm_native_encoder(
    runtime: DistributedRuntime, config: Config, shutdown_event: asyncio.Event
):
    """
    Initialize vLLM-native encoder worker component (ECConnector mode).
    In this mode, vLLM handles encoder execution, caching, and storage automatically.
    """
    # Create component and endpoint
    component = runtime.namespace(config.namespace).component(config.component)
    generate_endpoint = component.endpoint(config.endpoint)

    # Configure ECTransferConfig for producer role
    instance_id = 0
    engine_id = f"{config.namespace}.{config.component}.encoder.{instance_id}"

    # Configure encoder with producer role, it will be responsible for creating embeddings and storing them in the shared storage
    ec_transfer_config = create_ec_transfer_config(
        engine_id=engine_id,
        ec_role="ec_producer",
        ec_connector_backend=config.ec_connector_backend,
        ec_storage_path=config.ec_storage_path,
        ec_extra_config=config.ec_extra_config,
    )

    # Set ECTransferConfig on engine args
    config.engine_args.ec_transfer_config = ec_transfer_config

    # Setup vLLM engine
    (
        engine_client,
        vllm_config,
        default_sampling_params,
        prometheus_temp_dir,
        _component_gauges,
    ) = setup_vllm_engine(config)

    # Initialize vLLM Native Encoder Worker Handler
    handler = VLLMEncodeWorkerHandler(
        runtime,
        component,
        engine_client,
        config,
    )
    handler.add_temp_dir(prometheus_temp_dir)

    # 5. No async init needed - vLLM handles everything
    # await handler.async_init(runtime)  # Not needed for ECConnector mode

    logger.info("Starting to serve vLLM-native encoder endpoint...")

    # 6. Serve endpoint
    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate, metrics_labels=[("model", config.model)]
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve vLLM-native encoder endpoint: {e}")
        raise
    finally:
        handler.cleanup()


async def init_ec_processor(
    runtime: DistributedRuntime, config: Config, shutdown_event: asyncio.Event
):
    """
    Initialize ECConnector processor component.

    Simple processor that routes multimodal requests using ECConnector pattern:
    1. Preprocess request (same as regular processor)
    2. Send multimodal items to encoder workers (stores to shared storage)
    3. Forward preprocessed request to PD worker (loads from shared storage)
    4. Stream response back to client
    """
    # Create component and endpoint
    component = runtime.namespace(config.namespace).component(config.component)
    generate_endpoint = component.endpoint(config.endpoint)

    # Get encoder worker client
    encoder_client = (
        await runtime.namespace(config.namespace)
        .component("encoder")
        .endpoint("generate")
        .client()
    )

    # Get PD worker client
    pd_client = (
        await runtime.namespace(config.namespace)
        .component("backend")
        .endpoint("generate")
        .client()
    )

    # Get prompt template from args (must be passed via environment or command line)
    mm_prompt_template = config.mm_prompt_template

    # Create EC processor handler (with preprocessing like regular processor)
    handler = ECProcessorHandler(
        config.engine_args,
        encoder_worker_client=encoder_client,
        pd_worker_client=pd_client,
        prompt_template=mm_prompt_template,
    )

    logger.info("Waiting for encoder and PD worker instances...")
    await encoder_client.wait_for_instances()
    await pd_client.wait_for_instances()

    # Register the endpoint as entrypoint to a model (same as preprocessed_handler)
    await register_llm(
        ModelInput.Tokens,  # Use Rust tokenization for better performance and multi-image support
        ModelType.Chat,
        generate_endpoint,
        config.model,
        config.served_model_name,
        kv_cache_block_size=config.engine_args.block_size,
    )

    logger.info("Starting to serve EC processor endpoint...")

    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate, metrics_labels=[("model", config.model)]
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve EC processor endpoint: {e}")
        raise
    finally:
        handler.cleanup()


async def init_multimodal_worker(
    runtime: DistributedRuntime,
    config: Config,
    shutdown_event: asyncio.Event,
    pre_created_engine=None,
):
    """
    Initialize multimodal worker component.

    Supports two modes:
    1. --multimodal-worker: Receives embeddings from separate encoder
    2. --multimodal-encode-prefill-worker: Handles inline encoding (e.g., Llama 4)

    Both can operate in aggregated (P+D) or disaggregated (P→D) mode.

    When --ec-consumer-mode is enabled, configures as ECConnector consumer
    to load encoder embeddings from shared storage.
    """
    component = runtime.namespace(config.namespace).component(config.component)

    generate_endpoint = component.endpoint(config.endpoint)
    clear_endpoint = component.endpoint("clear_kv_blocks")

    # Configure ECConnector consumer mode if enabled
    if config.ec_consumer_mode:
        logger.info("Configuring as ECConnector consumer for encoder embeddings")
        instance_id = 0
        engine_id = f"{config.namespace}.{config.component}.backend.{instance_id}"

        # The PD Worker just load the embeddings from the shared storage, so it is a consumer
        ec_transfer_config = create_ec_transfer_config(
            engine_id=engine_id,
            ec_role="ec_consumer",
            ec_connector_backend=config.ec_connector_backend,
            ec_storage_path=config.ec_storage_path,
            ec_extra_config=config.ec_extra_config,
        )

        # Set ECTransferConfig on engine args
        config.engine_args.ec_transfer_config = ec_transfer_config
        logger.info(f"Configured as ECConnector consumer with engine_id={engine_id}")

    # Use pre-created engine if provided (checkpoint mode), otherwise create new
    if pre_created_engine is not None:
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            _component_gauges,
        ) = pre_created_engine
    else:
        (
            engine_client,
            vllm_config,
            default_sampling_params,
            prometheus_temp_dir,
            _component_gauges,
        ) = setup_vllm_engine(config)

    # Set up decode worker client for disaggregated mode
    decode_worker_client = None
    if config.is_prefill_worker:
        # Prefill worker needs to connect to decode worker
        decode_worker_client = (
            await runtime.namespace(config.namespace)
            .component("decoder")
            .endpoint("generate")
            .client()
        )
        await decode_worker_client.wait_for_instances()
        logger.info("Connected to decode worker for disaggregated mode")

    # Choose handler based on worker type
    if config.multimodal_decode_worker:
        handler = MultimodalDecodeWorkerHandler(
            runtime, component, engine_client, config, shutdown_event
        )
    else:
        handler = MultimodalPDWorkerHandler(
            runtime,
            component,
            engine_client,
            config,
            decode_worker_client,
            shutdown_event,
        )
    handler.add_temp_dir(prometheus_temp_dir)

    await handler.async_init(runtime)

    # Set up KV event publisher for prefix caching if enabled
    kv_publisher = setup_kv_event_publisher(
        config, component, generate_endpoint, vllm_config
    )
    if kv_publisher:
        handler.kv_publisher = kv_publisher

    metrics_labels = [("model", config.model)]
    try:
        await asyncio.gather(
            generate_endpoint.serve_endpoint(
                handler.generate,
                metrics_labels=metrics_labels,
            ),
            clear_endpoint.serve_endpoint(
                handler.clear_kv_blocks,
                metrics_labels=metrics_labels,
            ),
        )
    except Exception as e:
        logger.error(f"Failed to serve endpoints: {e}")
        raise
    finally:
        handler.cleanup()


async def init_omni(
    runtime: DistributedRuntime, config: Config, shutdown_event: asyncio.Event
):
    """
    Initialize Omni worker for text-to-text generation using vLLM-Omni orchestrator.

    Uses vLLM-Omni's Omni class for single-stage text generation pipeline.
    For now, supports text-to-text only (no multimodal).
    """
    # Lazy import to avoid loading vllm-omni unless explicitly needed
    from dynamo.vllm.omni import OmniHandler

    component = runtime.namespace(config.namespace).component(config.component)
    generate_endpoint = component.endpoint(config.endpoint)

    # Initialize OmniHandler with Omni orchestrator
    handler = OmniHandler(
        runtime=runtime,
        component=component,
        config=config,
        default_sampling_params={},
        shutdown_event=shutdown_event,
    )

    logger.info(f"Omni worker initialized for model: {config.model}")

    # Set up metrics collection for vLLM and LMCache metrics
    setup_metrics_collection(config, generate_endpoint, logger)

    # Handle non-leader nodes - don't serve endpoints
    if config.engine_args.data_parallel_rank:
        await _handle_non_leader_node(config.engine_args.data_parallel_rank)
        return

    # TODO: extend for multi-stage pipelines
    await register_llm(
        ModelInput.Text,
        ModelType.Images,
        generate_endpoint,
        config.model,
        config.served_model_name,
        kv_cache_block_size=config.engine_args.block_size,
    )

    logger.info("Starting to serve Omni worker endpoint...")

    health_check_payload = (
        await VllmOmniHealthCheckPayload.create(handler.engine_client)
    ).to_dict()

    try:
        await generate_endpoint.serve_endpoint(
            handler.generate,
            graceful_shutdown=True,
            metrics_labels=[("model", config.served_model_name or config.model)],
            health_check_payload=health_check_payload,
        )
    except Exception as e:
        logger.error(f"Failed to serve Omni endpoint: {e}")
        raise
    finally:
        logger.debug("Cleaning up Omni worker")
        handler.cleanup()


def main():
    uvloop.run(worker())


if __name__ == "__main__":
    main()