feat: vllm moe k8 FT tests (#3672)

Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>

feat: vllm moe k8 FT tests (#3672)
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
d81a00ef · Tzu-Ling Kan · GitHub · e01c6e99 · d81a00ef · d81a00ef
Unverified Commit d81a00ef authored Oct 21, 2025 by Tzu-Ling Kan Committed by GitHub Oct 21, 2025
5 changed files
--- a/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
+++ b/tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
+# syntax=docker/dockerfile:1.10.0
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dockerfile for using local/pre-built vLLM images with Dynamo
+# Based on container/Dockerfile.vllm but uses existing vLLM image instead of building from source
+
+# All ARGs used in FROM statements must be declared before any FROM
+# ARGs declared up here are only expanded in FROM lines; a stage that needs
+# one inside its body must redeclare it with a bare `ARG NAME` to inherit the default.
+ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
+ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
+ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
+ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
+
+# Other build arguments
+# PYTHON_VERSION and ARCH_ALT are redeclared inside the runtime stage.
+# NOTE(review): ARCH is not referenced anywhere else in this Dockerfile.
+ARG PYTHON_VERSION=3.12
+ARG ARCH=amd64
+ARG ARCH_ALT=x86_64
+
+# Use local vLLM image as source
+# (only used as a COPY --from source; nothing is built in this stage)
+FROM ${LOCAL_VLLM_IMAGE} AS vllm_source
+
+# Use Dynamo base image
+# (only used as a COPY --from source; nothing is built in this stage)
+FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base
+
+##################################################
+########## Runtime Image ########################
+##################################################
+FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime
+
+WORKDIR /workspace
+ENV DYNAMO_HOME=/opt/dynamo
+# The venv directory itself is created further down with `uv venv`;
+# these two lines only put its bin/ directory on PATH ahead of time.
+ENV VIRTUAL_ENV=/opt/dynamo/venv
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+
+# Redeclare the global ARGs so their defaults are visible inside this stage.
+ARG ARCH_ALT
+ARG PYTHON_VERSION
+# Each NIXL_* variable is built from the previous one, so they must stay in
+# separate ENV instructions (an ENV only sees values set by earlier instructions).
+ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
+ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
+ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins
+
+# Install Python, build-essential and runtime dependencies
+# update + install + list cleanup are kept in one RUN so the apt lists never persist in a layer.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        # Python runtime - CRITICAL for virtual environment to work
+        python${PYTHON_VERSION}-dev \
+        build-essential \
+        # jq and curl for polling various endpoints and health checks
+        jq \
+        git \
+        curl \
+        # Libraries required by UCX to find RDMA devices
+        libibverbs1 rdma-core ibverbs-utils libibumad3 \
+        libnuma1 librdmacm1 ibverbs-providers \
+        # JIT Kernel Compilation, flashinfer
+        ninja-build \
+        g++ \
+        # prometheus dependencies
+        ca-certificates \
+        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
+        cuda-command-line-tools-12-8 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy CUDA development tools from vLLM image (for JIT compilation)
+# Only the individual compiler binaries, the headers, nvvm and libcudart are
+# taken from the vLLM image rather than its whole /usr/local/cuda tree.
+COPY --from=vllm_source /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
+COPY --from=vllm_source /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
+COPY --from=vllm_source /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas
+COPY --from=vllm_source /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
+COPY --from=vllm_source /usr/local/cuda/include/ /usr/local/cuda/include/
+COPY --from=vllm_source /usr/local/cuda/nvvm /usr/local/cuda/nvvm
+COPY --from=vllm_source /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/
+
+### COPY NATS & ETCD ###
+COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
+COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
+# Add ETCD and CUDA binaries to PATH
+# Note: the only CUDA directory added here is nvvm/bin.
+# NOTE(review): presumably /usr/local/cuda/bin is already on PATH in the CUDA base image - confirm.
+ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH
+
+# Copy UCX and NIXL from dynamo base
+# $NIXL_PREFIX is the ENV set earlier in this stage (/opt/nvidia/nvda_nixl), used for both source and destination.
+COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
+COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
+ENV PATH=/usr/local/ucx/bin:$PATH
+
+### VIRTUAL ENVIRONMENT SETUP ###
+
+# Copy uv directly from official image (like Dockerfile.vllm does)
+# NOTE(review): `:latest` is not reproducible; pin a uv release tag or digest when one is agreed on.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Create fresh virtual environment (following Dockerfile.vllm pattern)
+RUN mkdir -p "${VIRTUAL_ENV}" && \
+    uv venv "${VIRTUAL_ENV}" --python "${PYTHON_VERSION}"
+
+# Activate virtual environment
+# VIRTUAL_ENV is unchanged from the top of this stage; PATH is prepended again so
+# that the venv's bin/ takes precedence over the etcd/nvvm/ucx directories that
+# were put in front of it since then.
+ENV VIRTUAL_ENV=/opt/dynamo/venv
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+
+# Copy vLLM installation from local image
+# vLLM workspace is at /vllm-workspace in the image
+COPY --from=vllm_source /vllm-workspace /opt/vllm
+
+# Copy ALL Python packages from vLLM image directly to venv
+# Since vLLM is already installed (not as wheels), we copy the site-packages.
+# The source path uses the same ${PYTHON_VERSION} as the destination, so a
+# non-default PYTHON_VERSION fails at build time if the vLLM image has no
+# matching dist-packages, instead of mixing packages built for another interpreter.
+COPY --from=vllm_source /usr/local/lib/python${PYTHON_VERSION}/dist-packages ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages
+
+# Fix the .pth files to point to the correct location for pplx_kernels and DeepEP
+# (the editable installs were made under /vllm-workspace, which now lives at /opt/vllm)
+RUN SITE_PACKAGES="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages" && \
+    if [ -f "${SITE_PACKAGES}/__editable__.pplx_kernels-0.0.1.pth" ]; then \
+        echo "/opt/vllm/ep_kernels_workspace/pplx-kernels/src" > "${SITE_PACKAGES}/__editable__.pplx_kernels-0.0.1.pth"; \
+    fi && \
+    if [ -f "${SITE_PACKAGES}/__editable__.deep_ep-0.0.1.pth" ]; then \
+        echo "/opt/vllm/ep_kernels_workspace/DeepEP" > "${SITE_PACKAGES}/__editable__.deep_ep-0.0.1.pth"; \
+    fi && \
+    # Also check for any other .pth files that might reference /vllm-workspace
+    find "${SITE_PACKAGES}" -name "*.pth" -exec sed -i 's|/vllm-workspace|/opt/vllm|g' {} \;
+
+# Set LD_LIBRARY_PATH for all components
+# The continuation lines below join into a single colon-separated value:
+# NVSHMEM (from the copied vLLM workspace), NIXL libs and plugins, UCX, and
+# finally whatever LD_LIBRARY_PATH was already set to.
+ENV LD_LIBRARY_PATH=\
+/opt/vllm/ep_kernels_workspace/nvshmem_install/lib:\
+$NIXL_LIB_DIR:\
+$NIXL_PLUGIN_DIR:\
+/usr/local/ucx/lib:\
+/usr/local/ucx/lib/ucx:\
+$LD_LIBRARY_PATH
+
+# DeepGemm JIT compilation support
+# (points the compiler at the CUDA headers copied from the vLLM image)
+ENV CPATH=/usr/local/cuda/include
+
+# Install Dynamo and dependencies (following Dockerfile.vllm pattern)
+# First install basic Python packages
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install pip setuptools wheel
+
+# Install Dynamo wheels from dynamo_base
+# The wheelhouse is bind-mounted read-only from the dynamo_base stage for the
+# duration of this RUN. A COPY followed by `rm -rf` in a later instruction would
+# still ship the wheels inside the COPY layer; mounting keeps them out of every layer.
+# The `cp312` glob ties the runtime wheel to CPython 3.12 (the PYTHON_VERSION default).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=dynamo_base,source=/opt/dynamo/wheelhouse,target=/tmp/wheelhouse \
+    uv pip install \
+    /tmp/wheelhouse/ai_dynamo_runtime*cp312*.whl \
+    /tmp/wheelhouse/ai_dynamo*any.whl \
+    /tmp/wheelhouse/nixl/nixl*.whl
+
+# Install common and test dependencies
+# Paths are relative to the build context, which build_from_local_vllm.sh sets to the project root.
+COPY container/deps/requirements.txt /tmp/requirements.txt
+COPY container/deps/requirements.test.txt /tmp/requirements.test.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt && \
+    rm /tmp/requirements*.txt
+
+# Copy workspace files
+# Copies the entire build context; kept after the dependency installs so that
+# source edits do not invalidate those layers.
+COPY . /workspace/
+
+# Copy attribution files
+# NOTE(review): these look already covered by `COPY .` above - confirm whether this line is still needed.
+COPY ATTRIBUTION* LICENSE /workspace/
+
+# Setup entrypoint
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
+CMD []
+
+###########################################################
+########## Development Image ##############################
+###########################################################
+# Extends the runtime stage with interactive tooling, the Rust toolchain from
+# dynamo_base and an editable install of the workspace.
+FROM runtime AS dev
+
+# Install development tools
+# DEBIAN_FRONTEND is set inline (as in the runtime stage) so no package can
+# stop the build on a configuration prompt, without leaking into the runtime env.
+RUN apt-get update -y && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    nvtop \
+    wget \
+    tmux \
+    vim \
+    openssh-client \
+    iproute2 \
+    rsync \
+    zip \
+    unzip \
+    htop \
+    autoconf \
+    automake \
+    cmake \
+    libtool \
+    meson \
+    net-tools \
+    pybind11-dev \
+    clang \
+    libclang-dev \
+    protobuf-compiler && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set workspace directory
+# DYNAMO_HOME is re-pointed from /opt/dynamo (runtime stage) to the source checkout.
+ENV WORKSPACE_DIR=/workspace \
+    DYNAMO_HOME=/workspace \
+    RUSTUP_HOME=/usr/local/rustup \
+    CARGO_HOME=/usr/local/cargo \
+    PATH=/usr/local/cargo/bin:$PATH
+
+# Copy Rust toolchain if needed
+COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
+COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
+
+# Install maturin for development
+# The extras specifier is quoted: unquoted, `[patchelf]` is a shell glob pattern
+# and could be expanded against files in the working directory.
+# `-e .` resolves against WORKDIR (/workspace), which holds the copied sources.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "maturin[patchelf]" && \
+    uv pip install --no-deps -e .
+
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
+CMD []
--- a/tests/fault_tolerance/deploy/container/build_from_local_vllm.sh
+++ b/tests/fault_tolerance/deploy/container/build_from_local_vllm.sh
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Build script for Dynamo with local/pre-built vLLM images
+
+set -e
+
+# Default values
+LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
+DYNAMO_BASE_TAG="dynamo:latest-none"
+OUTPUT_TAG="my-dynamo-vllm:local"
+TARGET="dev"
+NO_CACHE=""
+BUILD_DYNAMO_BASE=true
+DOCKERFILE_DIR=$(dirname "$(readlink -f "$0")")
+PROJECT_ROOT=$(dirname $(dirname $(dirname $(dirname "$DOCKERFILE_DIR"))))
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Print the command-line help. Defaults shown are the current values of the
+# corresponding variables at the time of the call.
+print_usage() {
+    cat <<EOF
+Usage: $0 [OPTIONS]
+
+Build Dynamo image using a local/pre-built vLLM image
+
+Options:
+  --vllm-image IMAGE        Local vLLM image to use (default: $LOCAL_VLLM_IMAGE)
+  --tag TAG                 Output image tag (default: $OUTPUT_TAG)
+  --target TARGET           Build target: runtime or dev (default: $TARGET)
+  --no-cache               Disable Docker build cache
+  --skip-base              Skip building dynamo base (assumes it exists)
+  --dynamo-base TAG        Dynamo base image tag (default: $DYNAMO_BASE_TAG)
+  --help                   Show this help message
+
+Examples:
+  # Use default vLLM image
+  $0
+
+  # Use custom vLLM image
+  $0 --vllm-image my-vllm:custom --tag my-dynamo:test
+
+  # Build runtime image only
+  $0 --target runtime --tag my-dynamo:prod
+EOF
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --vllm-image)
+            LOCAL_VLLM_IMAGE="$2"
+            shift 2
+            ;;
+        --tag)
+            OUTPUT_TAG="$2"
+            shift 2
+            ;;
+        --target)
+            TARGET="$2"
+            if [[ "$TARGET" != "runtime" && "$TARGET" != "dev" ]]; then
+                echo -e "${RED}Error: --target must be 'runtime' or 'dev'${NC}"
+                exit 1
+            fi
+            shift 2
+            ;;
+        --no-cache)
+            NO_CACHE="--no-cache"
+            shift
+            ;;
+        --skip-base)
+            BUILD_DYNAMO_BASE=false
+            shift
+            ;;
+        --dynamo-base)
+            DYNAMO_BASE_TAG="$2"
+            shift 2
+            ;;
+        --help|-h)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo -e "${RED}Unknown option: $1${NC}"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# Banner followed by the effective configuration
+echo -e "${GREEN}========================================${NC}"
+echo -e "${GREEN}Building Dynamo with Local vLLM Image${NC}"
+echo -e "${GREEN}========================================${NC}"
+cat <<EOF
+
+Configuration:
+  vLLM Image: $LOCAL_VLLM_IMAGE
+  Output Tag: $OUTPUT_TAG
+  Target: $TARGET
+  Dynamo Base: $DYNAMO_BASE_TAG
+  Project Root: $PROJECT_ROOT
+
+EOF
+
+# The vLLM image must already be present locally; on a miss, list the
+# vllm-looking images that do exist and stop.
+docker image inspect "$LOCAL_VLLM_IMAGE" > /dev/null 2>&1 || {
+    echo -e "${RED}Error: Local vLLM image '$LOCAL_VLLM_IMAGE' not found${NC}"
+    echo "Available vLLM images:"
+    docker images | grep -E "^REPOSITORY|vllm" || echo "No vLLM images found"
+    exit 1
+}
+
+# Step 1: Build Dynamo base if requested
+if [ "$BUILD_DYNAMO_BASE" = true ]; then
+    echo -e "${YELLOW}Step 1: Building Dynamo base image...${NC}"
+    cd "$PROJECT_ROOT"
+
+    # Check if build.sh exists
+    if [ ! -f "container/build.sh" ]; then
+        echo -e "${RED}Error: container/build.sh not found in $PROJECT_ROOT${NC}"
+        exit 1
+    fi
+
+    # The build runs as the `if` condition: with `set -e` active, a separate
+    # `$?` test after the command is never reached on failure, so the error
+    # message below would never be printed.
+    # ${NO_CACHE:+...} expands to nothing when NO_CACHE is empty.
+    if ! ./container/build.sh \
+        --framework none \
+        --tag "$DYNAMO_BASE_TAG" \
+        ${NO_CACHE:+"$NO_CACHE"}; then
+        echo -e "${RED}Error: Failed to build Dynamo base image${NC}"
+        exit 1
+    fi
+    echo -e "${GREEN}✓ Dynamo base image built successfully${NC}"
+else
+    echo -e "${YELLOW}Step 1: Skipping Dynamo base build (using existing)${NC}"
+    # Check if base image exists
+    if ! docker image inspect "$DYNAMO_BASE_TAG" > /dev/null 2>&1; then
+        echo -e "${RED}Error: Dynamo base image '$DYNAMO_BASE_TAG' not found${NC}"
+        echo "Please build it first or remove --skip-base flag"
+        exit 1
+    fi
+fi
+
+# Step 2: Build combined image with local vLLM
+echo ""
+echo -e "${YELLOW}Step 2: Building combined Dynamo + vLLM image...${NC}"
+cd "$PROJECT_ROOT"
+
+# Build the combined image
+docker build \
+    -f "$DOCKERFILE_DIR/Dockerfile.local_vllm" \
+    --build-arg LOCAL_VLLM_IMAGE="$LOCAL_VLLM_IMAGE" \
+    --build-arg DYNAMO_BASE_IMAGE="$DYNAMO_BASE_TAG" \
+    --target "$TARGET" \
+    --tag "$OUTPUT_TAG" \
+    $NO_CACHE \
+    .
+
+if [ $? -ne 0 ]; then
+    echo -e "${RED}Error: Failed to build combined image${NC}"
+    exit 1
+fi
+
+# Success banner
+echo ""
+echo -e "${GREEN}========================================${NC}"
+echo -e "${GREEN}✓ Build completed successfully!${NC}"
+echo -e "${GREEN}========================================${NC}"
+
+# Follow-up hints. A doubled backslash in the heredoc prints one literal
+# backslash, so the pytest example keeps its line continuations.
+cat <<EOF
+
+Output image: $OUTPUT_TAG
+
+To test the image:
+  docker run --rm -it --gpus all $OUTPUT_TAG python -c 'import vllm; print(vllm.__version__)'
+
+To use in pytest:
+  pytest tests/fault_tolerance/deploy/test_deployment.py::test_fault_scenario[vllm-moe-agg-tp-1-dp-2-none] \\
+    --image $OUTPUT_TAG \\
+    --namespace dynamo-kubernetes \\
+    -v -s
+
+To push to registry:
+  docker tag $OUTPUT_TAG <your-registry>/$OUTPUT_TAG
+  docker push <your-registry>/$OUTPUT_TAG
+EOF
+
--- a/tests/fault_tolerance/deploy/scenarios.py
+++ b/tests/fault_tolerance/deploy/scenarios.py
@@ -223,12 +223,45 @@ def _create_deployments_for_backend(backend):
    return deployments


+def _create_moe_deployments_for_backend(backend="vllm"):
+    """Create MoE-specific deployment configurations for DeepSeek-V2-Lite."""
+    deployments = {}
+
+    # Only test tp=1, dp=2 for now
+    tp_size = 1
+    dp_replicas = (
+        2  # Note: this is handled internally by vLLM with --data-parallel-size
+    )
+
+    template_dir = "tests/fault_tolerance/deploy/templates"
+    yaml_files = {
+        "agg": f"{template_dir}/{backend}/moe_agg.yaml",
+        "disagg": f"{template_dir}/{backend}/moe_disagg.yaml",
+    }
+
+    for deploy_type in ["agg", "disagg"]:
+        scenario_name = f"{backend}-moe-{deploy_type}-tp-{tp_size}-dp-{dp_replicas}"
+        deployment = {
+            "spec": DeploymentSpec(yaml_files[deploy_type]),
+            "backend": backend,
+            "model": "deepseek-ai/DeepSeek-V2-Lite",
+            "is_moe": True,
+        }
+
+        deployments[scenario_name] = deployment
+
+    return deployments
+
+
 # Create all deployment specifications
 deployment_specs = {}
 deployment_specs.update(_create_deployments_for_backend("vllm"))
 deployment_specs.update(_create_deployments_for_backend("sglang"))
 deployment_specs.update(_create_deployments_for_backend("trtllm"))

+# Add MoE deployments for vLLM only
+deployment_specs.update(_create_moe_deployments_for_backend("vllm"))
+

 # Each failure scenario contains a list of failure injections
 # Each failure injection has a time in seconds after the previous injection and
@@ -378,6 +411,18 @@ def create_legacy_load(
 # Default load configuration (using AI-Perf)
 load = Load()

+# MoE-specific load configuration
+moe_load = Load(
+    clients=3,  # Fewer clients for MoE testing
+    requests_per_client=30,  # Reduced for MoE complexity
+    input_token_length=100,
+    output_token_length=100,
+    max_retries=3,
+    sla=None,
+    client_type="aiperf",
+    max_request_rate=0.5,  # Lower rate for MoE
+)
+
 # model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

 model = None
@@ -397,6 +442,9 @@ for backend in ["vllm", "sglang", "trtllm"]:
 for deployment_name, deployment_info in deployment_specs.items():
    backend = deployment_info["backend"]

+    # Check if this is an MoE deployment
+    is_moe = deployment_info.get("is_moe", False)
+
    # Determine deployment type from deployment name
    deploy_type = (
        "agg"
@@ -419,10 +467,17 @@ for deployment_name, deployment_info in deployment_specs.items():
            continue

        scenario_name = f"{deployment_name}-{failure_name}"
+
+        # Use MoE-specific load configuration if it's an MoE model
+        load_config = moe_load if is_moe else load
+
+        # Get model from deployment info or use the global model
+        scenario_model = deployment_info.get("model", model)
+
        scenarios[scenario_name] = Scenario(
            deployment=deployment_info["spec"],
-            load=load,
+            load=load_config,
            failures=failure,
-            model=model,
+            model=scenario_model,
            backend=backend,
        )
--- a/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
+++ b/tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Aggregated MoE deployment: one frontend plus one worker that serves both
+# prefill and decode.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-moe-agg
+spec:
+  services:
+    Frontend:
+      dynamoNamespace: vllm-moe-agg
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        # Same pull secret as the worker below and as the Frontend in
+        # moe_disagg.yaml: the frontend pulls the same nvcr.io image.
+        imagePullSecrets:
+        - name: nvcr-imagepullsecret
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+    # Single worker pod running dynamo.vllm with DeepSeek-V2-Lite.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: vllm-moe-agg
+      componentType: worker
+      replicas: 1
+      # Two GPUs: one per data-parallel rank
+      # (--data-parallel-size 2 x --tensor-parallel-size 1 in args below).
+      resources:
+        requests:
+          memory: "50Gi"
+          gpu: "2"
+        limits:
+          memory: "100Gi"
+          gpu: "2"
+      envs:
+        # MoE-specific environment variables
+        - name: VLLM_ALL2ALL_BACKEND
+          value: "pplx"
+        - name: VLLM_USE_ELASTIC_EP
+          value: "1"
+        - name: VLLM_USE_DEEP_GEMM
+          value: "1"
+        - name: VLLM_USE_V1
+          value: "1"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: "spawn"
+        # Two device indices, matching the gpu: "2" request above.
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0,1"
+        # Verbose logging for the fault-tolerance runs.
+        - name: VLLM_DEBUG
+          value: "1"
+        - name: VLLM_LOGGING_LEVEL
+          value: "DEBUG"
+      extraPodSpec:
+        imagePullSecrets:
+        - name: nvcr-imagepullsecret
+        mainContainer:
+          # NOTE(review): `my-tag` looks like a placeholder - presumably replaced
+          # by the test harness (e.g. pytest --image); confirm.
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/components/backends/vllm
+          command:
+          - python3
+          - -m
+          - dynamo.vllm
+          args:
+            - --model
+            - deepseek-ai/DeepSeek-V2-Lite
+            - --trust-remote-code
+            - --disable-log-requests
+            - --tensor-parallel-size
+            - "1"
+            - --data-parallel-size
+            - "2"
+            - --gpu-memory-utilization
+            - "0.5"
+            - --max-model-len
+            - "1024"
+            # Expert parallelism with elastic EP and expert load balancing (EPLB).
+            - --enable-expert-parallel
+            - --enable-elastic-ep
+            - --enable-eplb
+            - --eplb-config.num_redundant_experts
+            - "24"
+            - --eplb-config.window_size
+            - "100"
+            - --eplb-config.step_interval
+            - "10"
+            - --no-enable-prefix-caching
+            - --enforce-eager
--- a/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
+++ b/tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Disaggregated MoE deployment: one frontend plus separate decode and
+# prefill workers, all in the vllm-moe-disagg namespace.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-moe-disagg
+spec:
+  services:
+    Frontend:
+      dynamoNamespace: vllm-moe-disagg
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        imagePullSecrets:
+        - name: nvcr-imagepullsecret
+        mainContainer:
+          # NOTE(review): `my-tag` looks like a placeholder - presumably replaced
+          # by the test harness (e.g. pytest --image); confirm.
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+    # Decode-side worker (subComponentType: decode) running dynamo.vllm.
+    VllmDecodeWorker:
+      dynamoNamespace: vllm-moe-disagg
+      envFromSecret: hf-token-secret
+      componentType: worker
+      subComponentType: decode
+      replicas: 1
+      # Two GPUs: one per data-parallel rank
+      # (--data-parallel-size 2 x --tensor-parallel-size 1 in args below).
+      resources:
+        requests:
+          memory: "50Gi"
+          gpu: "2"
+        limits:
+          memory: "100Gi"
+          gpu: "2"
+      envs:
+        # MoE-specific environment variables
+        - name: VLLM_ALL2ALL_BACKEND
+          value: "pplx"
+        - name: VLLM_USE_ELASTIC_EP
+          value: "1"
+        - name: VLLM_USE_DEEP_GEMM
+          value: "1"
+        - name: VLLM_USE_V1
+          value: "1"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: "spawn"
+        # Two device indices, matching the gpu: "2" request above.
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0,1"
+        # Verbose logging for the fault-tolerance runs.
+        - name: VLLM_DEBUG
+          value: "1"
+        - name: VLLM_LOGGING_LEVEL
+          value: "DEBUG"
+      extraPodSpec:
+        imagePullSecrets:
+        - name: nvcr-imagepullsecret
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/components/backends/vllm
+          command:
+          - python3
+          - -m
+          - dynamo.vllm
+          args:
+            - --model
+            - deepseek-ai/DeepSeek-V2-Lite
+            - --trust-remote-code
+            - --disable-log-requests
+            - --tensor-parallel-size
+            - "1"
+            - --data-parallel-size
+            - "2"
+            - --gpu-memory-utilization
+            - "0.5"
+            - --max-model-len
+            - "1024"
+            # Expert parallelism with elastic EP and expert load balancing (EPLB).
+            - --enable-expert-parallel
+            - --enable-elastic-ep
+            - --enable-eplb
+            - --eplb-config.num_redundant_experts
+            - "24"
+            - --eplb-config.window_size
+            - "100"
+            - --eplb-config.step_interval
+            - "10"
+            - --no-enable-prefix-caching
+            - --enforce-eager
+    # Prefill-side worker (subComponentType: prefill). Resources, envs and
+    # args mirror VllmDecodeWorker except for the extra --is-prefill-worker flag.
+    VllmPrefillWorker:
+      dynamoNamespace: vllm-moe-disagg
+      envFromSecret: hf-token-secret
+      componentType: worker
+      subComponentType: prefill
+      replicas: 1
+      # Two GPUs: one per data-parallel rank
+      # (--data-parallel-size 2 x --tensor-parallel-size 1 in args below).
+      resources:
+        requests:
+          memory: "50Gi"
+          gpu: "2"
+        limits:
+          memory: "100Gi"
+          gpu: "2"
+      envs:
+        # MoE-specific environment variables
+        - name: VLLM_ALL2ALL_BACKEND
+          value: "pplx"
+        - name: VLLM_USE_ELASTIC_EP
+          value: "1"
+        - name: VLLM_USE_DEEP_GEMM
+          value: "1"
+        - name: VLLM_USE_V1
+          value: "1"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: "spawn"
+        # Two device indices, matching the gpu: "2" request above.
+        - name: CUDA_VISIBLE_DEVICES
+          value: "0,1"
+        # Verbose logging for the fault-tolerance runs.
+        - name: VLLM_DEBUG
+          value: "1"
+        - name: VLLM_LOGGING_LEVEL
+          value: "DEBUG"
+      extraPodSpec:
+        imagePullSecrets:
+        - name: nvcr-imagepullsecret
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/components/backends/vllm
+          command:
+          - python3
+          - -m
+          - dynamo.vllm
+          args:
+            - --model
+            - deepseek-ai/DeepSeek-V2-Lite
+            - --trust-remote-code
+            - --disable-log-requests
+            # The one flag that distinguishes this worker from the decode worker.
+            - --is-prefill-worker
+            - --tensor-parallel-size
+            - "1"
+            - --data-parallel-size
+            - "2"
+            - --gpu-memory-utilization
+            - "0.5"
+            - --max-model-len
+            - "1024"
+            # Expert parallelism with elastic EP and expert load balancing (EPLB).
+            - --enable-expert-parallel
+            - --enable-elastic-ep
+            - --enable-eplb
+            - --eplb-config.num_redundant_experts
+            - "24"
+            - --eplb-config.window_size
+            - "100"
+            - --eplb-config.step_interval
+            - "10"
+            - --no-enable-prefix-caching
+            - --enforce-eager