Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d81a00ef
Unverified
Commit
d81a00ef
authored
Oct 21, 2025
by
Tzu-Ling Kan
Committed by
GitHub
Oct 21, 2025
Browse files
feat: vllm moe k8 FT tests (#3672)
Signed-off-by:
tzulingk@nvidia.com
<
tzulingk@nvidia.com
>
parent
e01c6e99
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
673 additions
and
2 deletions
+673
-2
tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
+204
-0
tests/fault_tolerance/deploy/container/build_from_local_vllm.sh
...fault_tolerance/deploy/container/build_from_local_vllm.sh
+183
-0
tests/fault_tolerance/deploy/scenarios.py
tests/fault_tolerance/deploy/scenarios.py
+57
-2
tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
+80
-0
tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
+149
-0
No files found.
tests/fault_tolerance/deploy/container/Dockerfile.local_vllm
0 → 100644
View file @
d81a00ef
# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Dockerfile for using local/pre-built vLLM images with Dynamo
# Based on container/Dockerfile.vllm but uses existing vLLM image instead of building from source
# All ARGs used in FROM statements must be declared before any FROM
ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
# Other build arguments
ARG PYTHON_VERSION=3.12
ARG ARCH=amd64
ARG ARCH_ALT=x86_64
# Use local vLLM image as source
FROM ${LOCAL_VLLM_IMAGE} AS vllm_source
# Use Dynamo base image
FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base
##################################################
########## Runtime Image ########################
##################################################
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

WORKDIR /workspace

# Dynamo install prefix and the virtual environment used by the later `uv`
# steps. The venv's bin/ directory is put first on PATH exactly once, here, so
# it is already in effect for the rest of this stage and in the final image.
ENV DYNAMO_HOME=/opt/dynamo
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Re-declare the global build arguments so they are visible inside this stage
ARG ARCH_ALT
ARG PYTHON_VERSION

ENV NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV NIXL_LIB_DIR=$NIXL_PREFIX/lib/${ARCH_ALT}-linux-gnu
ENV NIXL_PLUGIN_DIR=$NIXL_LIB_DIR/plugins

# Install Python, build-essential and runtime dependencies
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        # Python runtime - CRITICAL for virtual environment to work
        python${PYTHON_VERSION}-dev \
        build-essential \
        # jq and curl for polling various endpoints and health checks
        jq \
        git \
        curl \
        # Libraries required by UCX to find RDMA devices
        libibverbs1 rdma-core ibverbs-utils libibumad3 \
        libnuma1 librdmacm1 ibverbs-providers \
        # JIT Kernel Compilation, flashinfer
        ninja-build \
        g++ \
        # prometheus dependencies
        ca-certificates \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
        cuda-command-line-tools-12-8 && \
    rm -rf /var/lib/apt/lists/*

# Copy CUDA development tools from vLLM image (for JIT compilation)
COPY --from=vllm_source /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc
COPY --from=vllm_source /usr/local/cuda/bin/cudafe++ /usr/local/cuda/bin/cudafe++
COPY --from=vllm_source /usr/local/cuda/bin/ptxas /usr/local/cuda/bin/ptxas
COPY --from=vllm_source /usr/local/cuda/bin/fatbinary /usr/local/cuda/bin/fatbinary
COPY --from=vllm_source /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=vllm_source /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=vllm_source /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/

### COPY NATS & ETCD ###
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/
# Add ETCD and CUDA binaries to PATH
ENV PATH=/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:$PATH

# Copy UCX and NIXL from dynamo base
COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
COPY --from=dynamo_base $NIXL_PREFIX $NIXL_PREFIX
ENV PATH=/usr/local/ucx/bin:$PATH

### VIRTUAL ENVIRONMENT SETUP ###
# Copy uv directly from official image (like Dockerfile.vllm does)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Create fresh virtual environment (following Dockerfile.vllm pattern).
# VIRTUAL_ENV and PATH were exported at the top of this stage, so they are not
# declared a second time here (doing so would only duplicate the PATH entry).
RUN mkdir -p "${VIRTUAL_ENV}" && \
    uv venv "${VIRTUAL_ENV}" --python ${PYTHON_VERSION}

# Copy vLLM installation from local image
# vLLM workspace is at /vllm-workspace in the image
COPY --from=vllm_source /vllm-workspace /opt/vllm

# Copy ALL Python packages from vLLM image directly to venv
# Since vLLM is already installed (not as wheels), we copy the site-packages
# NOTE(review): the source path hard-codes python3.12 while the destination
# follows PYTHON_VERSION - confirm they match if PYTHON_VERSION is overridden.
COPY --from=vllm_source /usr/local/lib/python3.12/dist-packages ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages

# Fix the .pth files to point to the correct location for pplx_kernels and DeepEP
RUN if [ -f ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/__editable__.pplx_kernels-0.0.1.pth ]; then \
        echo "/opt/vllm/ep_kernels_workspace/pplx-kernels/src" > ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/__editable__.pplx_kernels-0.0.1.pth; \
    fi && \
    if [ -f ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/__editable__.deep_ep-0.0.1.pth ]; then \
        echo "/opt/vllm/ep_kernels_workspace/DeepEP" > ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/__editable__.deep_ep-0.0.1.pth; \
    fi && \
    # Also check for any other .pth files that might reference /vllm-workspace
    find ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages -name "*.pth" -exec sed -i 's|/vllm-workspace|/opt/vllm|g' {} \;

# Set LD_LIBRARY_PATH for all components.
# The continuation lines are deliberately flush-left: they are joined into a
# single value, so indentation must not be introduced between the pieces.
ENV LD_LIBRARY_PATH=\
/opt/vllm/ep_kernels_workspace/nvshmem_install/lib:\
$NIXL_LIB_DIR:\
$NIXL_PLUGIN_DIR:\
/usr/local/ucx/lib:\
/usr/local/ucx/lib/ucx:\
$LD_LIBRARY_PATH

# DeepGemm JIT compilation support
ENV CPATH=/usr/local/cuda/include

# Install Dynamo and dependencies (following Dockerfile.vllm pattern)
# First install basic Python packages
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install pip setuptools wheel

# Copy and install Dynamo wheels from dynamo_base
COPY --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/wheelhouse/
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*cp312*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
        /opt/dynamo/wheelhouse/nixl/nixl*.whl \
    && rm -rf /opt/dynamo/wheelhouse

# Install common and test dependencies
COPY container/deps/requirements.txt /tmp/requirements.txt
COPY container/deps/requirements.test.txt /tmp/requirements.test.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt && \
    rm /tmp/requirements*.txt

# Copy workspace files
COPY . /workspace/

# Copy attribution files
COPY ATTRIBUTION* LICENSE /workspace/

# Setup entrypoint
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
###########################################################
########## Development Image ##############################
###########################################################
FROM runtime AS dev

# Install development tools.
# DEBIAN_FRONTEND=noninteractive matches the runtime stage's apt invocation and
# keeps package configuration from prompting during an unattended build.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        nvtop \
        wget \
        tmux \
        vim \
        openssh-client \
        iproute2 \
        rsync \
        zip \
        unzip \
        htop \
        autoconf \
        automake \
        cmake \
        libtool \
        meson \
        net-tools \
        pybind11-dev \
        clang \
        libclang-dev \
        protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*

# Set workspace directory (DYNAMO_HOME is redirected to the source checkout
# for development) and expose the Rust toolchain locations
ENV WORKSPACE_DIR=/workspace \
    DYNAMO_HOME=/workspace \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH

# Copy Rust toolchain if needed
COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

# Install maturin for development, then install the workspace in editable mode.
# The requirement is quoted so the shell never treats "[patchelf]" as a glob.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install "maturin[patchelf]" && \
    uv pip install --no-deps -e .

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
tests/fault_tolerance/deploy/container/build_from_local_vllm.sh
0 → 100755
View file @
d81a00ef
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Build script for Dynamo with local/pre-built vLLM images
# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Default values (each may be overridden by the command-line options)
LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
DYNAMO_BASE_TAG="dynamo:latest-none"
OUTPUT_TAG="my-dynamo-vllm:local"
TARGET="dev"
NO_CACHE=""
BUILD_DYNAMO_BASE=true

# Directory containing this script (symlinks resolved); Dockerfile.local_vllm
# is looked up here.
DOCKERFILE_DIR=$(dirname "$(readlink -f "$0")")

# Four directory levels above DOCKERFILE_DIR. Every nested substitution is
# quoted so a checkout path containing spaces is not word-split.
PROJECT_ROOT=$(dirname "$(dirname "$(dirname "$(dirname "$DOCKERFILE_DIR")")")")

# Colors for output (escape sequences are interpreted by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
#######################################
# Print the command-line help text.
# Globals:   LOCAL_VLLM_IMAGE, OUTPUT_TAG, TARGET, DYNAMO_BASE_TAG (read;
#            displayed as the current defaults)
# Arguments: none
# Outputs:   usage text on stdout
#######################################
print_usage() {
  # Unquoted here-doc delimiter: $0 and the default values are expanded.
  cat <<EOF
Usage: $0 [OPTIONS]

Build Dynamo image using a local/pre-built vLLM image

Options:
  --vllm-image IMAGE    Local vLLM image to use (default: $LOCAL_VLLM_IMAGE)
  --tag TAG             Output image tag (default: $OUTPUT_TAG)
  --target TARGET       Build target: runtime or dev (default: $TARGET)
  --no-cache            Disable Docker build cache
  --skip-base           Skip building dynamo base (assumes it exists)
  --dynamo-base TAG     Dynamo base image tag (default: $DYNAMO_BASE_TAG)
  -h, --help            Show this help message

Examples:
  # Use default vLLM image
  $0

  # Use custom vLLM image
  $0 --vllm-image my-vllm:custom --tag my-dynamo:test

  # Build runtime image only
  $0 --target runtime --tag my-dynamo:prod
EOF
}
#######################################
# Abort unless the option in $1 is followed by a usable value in $2.
# A missing value, an empty value, or a value that looks like another option
# (leading '-') is rejected with a message instead of letting `shift 2` fail.
# Globals:   RED, NC (read)
# Arguments: the remaining command line, starting at the option itself
# Outputs:   error message and usage text on stderr when the value is missing
#######################################
require_value() {
  if [[ $# -lt 2 || -z "$2" || "$2" == -* ]]; then
    echo -e "${RED}Error: $1 requires a value${NC}" >&2
    print_usage >&2
    exit 1
  fi
}

#######################################
# Parse command line arguments into the configuration globals.
# Globals:   LOCAL_VLLM_IMAGE, OUTPUT_TAG, TARGET, NO_CACHE,
#            BUILD_DYNAMO_BASE, DYNAMO_BASE_TAG (written); RED, NC (read)
# Arguments: the script's command line
# Outputs:   diagnostics on stderr; help text on stdout for --help / -h
# Returns:   exits 0 after printing help, exits 1 on any invalid input
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --vllm-image)
        require_value "$@"
        LOCAL_VLLM_IMAGE="$2"
        shift 2
        ;;
      --tag)
        require_value "$@"
        OUTPUT_TAG="$2"
        shift 2
        ;;
      --target)
        require_value "$@"
        TARGET="$2"
        if [[ "$TARGET" != "runtime" && "$TARGET" != "dev" ]]; then
          echo -e "${RED}Error: --target must be 'runtime' or 'dev'${NC}" >&2
          exit 1
        fi
        shift 2
        ;;
      --no-cache)
        NO_CACHE="--no-cache"
        shift
        ;;
      --skip-base)
        BUILD_DYNAMO_BASE=false
        shift
        ;;
      --dynamo-base)
        require_value "$@"
        DYNAMO_BASE_TAG="$2"
        shift 2
        ;;
      --help | -h)
        print_usage
        exit 0
        ;;
      *)
        echo -e "${RED}Unknown option: $1${NC}" >&2
        print_usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Banner and the effective configuration after argument parsing
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}Building Dynamo with Local vLLM Image${NC}"
echo -e "${GREEN}========================================${NC}"
echo ""
echo "Configuration:"
echo "  vLLM Image:    $LOCAL_VLLM_IMAGE"
echo "  Output Tag:    $OUTPUT_TAG"
echo "  Target:        $TARGET"
echo "  Dynamo Base:   $DYNAMO_BASE_TAG"
echo "  Project Root:  $PROJECT_ROOT"
echo ""

# Fail early with an accurate message when the docker CLI itself is missing;
# otherwise the inspect below would misreport it as "image not found".
if ! command -v docker > /dev/null 2>&1; then
  echo -e "${RED}Error: docker CLI not found in PATH${NC}" >&2
  exit 1
fi

# Check if local vLLM image exists
if ! docker image inspect "$LOCAL_VLLM_IMAGE" > /dev/null 2>&1; then
  echo -e "${RED}Error: Local vLLM image '$LOCAL_VLLM_IMAGE' not found${NC}" >&2
  # List candidate images to help the user pick a valid --vllm-image value
  echo "Available vLLM images:" >&2
  {
    docker images | grep -E "^REPOSITORY|vllm" || echo "No vLLM images found"
  } >&2
  exit 1
fi
# Step 1: Build Dynamo base if requested
if [[ "$BUILD_DYNAMO_BASE" == "true" ]]; then
  echo -e "${YELLOW}Step 1: Building Dynamo base image...${NC}"

  # build.sh is invoked by a path relative to the project root
  if ! cd "$PROJECT_ROOT"; then
    echo -e "${RED}Error: cannot change directory to $PROJECT_ROOT${NC}" >&2
    exit 1
  fi

  # Check if build.sh exists
  if [[ ! -f "container/build.sh" ]]; then
    echo -e "${RED}Error: container/build.sh not found in $PROJECT_ROOT${NC}" >&2
    exit 1
  fi

  # Collect the arguments in an array so the optional --no-cache flag is added
  # only when set, without relying on unquoted word splitting.
  base_build_args=(--framework none --tag "$DYNAMO_BASE_TAG")
  if [[ -n "${NO_CACHE:-}" ]]; then
    base_build_args+=("$NO_CACHE")
  fi

  # Test the command directly: with `set -e` active, a separate `$?` check
  # placed after the command is never reached when the command fails.
  if ! ./container/build.sh "${base_build_args[@]}"; then
    echo -e "${RED}Error: Failed to build Dynamo base image${NC}" >&2
    exit 1
  fi

  echo -e "${GREEN}✓ Dynamo base image built successfully${NC}"
else
  echo -e "${YELLOW}Step 1: Skipping Dynamo base build (using existing)${NC}"

  # Check if base image exists
  if ! docker image inspect "$DYNAMO_BASE_TAG" > /dev/null 2>&1; then
    echo -e "${RED}Error: Dynamo base image '$DYNAMO_BASE_TAG' not found${NC}" >&2
    echo "Please build it first or remove --skip-base flag" >&2
    exit 1
  fi
fi
# Step 2: Build combined image with local vLLM
echo ""
echo -e "${YELLOW}Step 2: Building combined Dynamo + vLLM image...${NC}"

# The build context is the project root (the trailing "." below)
if ! cd "$PROJECT_ROOT"; then
  echo -e "${RED}Error: cannot change directory to $PROJECT_ROOT${NC}" >&2
  exit 1
fi

# Collect the arguments in an array so the optional --no-cache flag is added
# only when set, without relying on unquoted word splitting.
docker_build_args=(
  -f "$DOCKERFILE_DIR/Dockerfile.local_vllm"
  --build-arg "LOCAL_VLLM_IMAGE=$LOCAL_VLLM_IMAGE"
  --build-arg "DYNAMO_BASE_IMAGE=$DYNAMO_BASE_TAG"
  --target "$TARGET"
  --tag "$OUTPUT_TAG"
)
if [[ -n "${NO_CACHE:-}" ]]; then
  docker_build_args+=("$NO_CACHE")
fi

# Build the combined image. The command is tested directly because, with
# `set -e` active, a separate `$?` check after it is never reached on failure.
if ! docker build "${docker_build_args[@]}" .; then
  echo -e "${RED}Error: Failed to build combined image${NC}" >&2
  exit 1
fi

echo ""
echo -e "${GREEN}========================================${NC}"
echo -e "${GREEN}✓ Build completed successfully!${NC}"
echo -e "${GREEN}========================================${NC}"

# Follow-up hints. Inside the here-doc "\\" prints one literal backslash, so
# the pytest example is shown as a multi-line shell command.
cat <<EOF

Output image: $OUTPUT_TAG

To test the image:
  docker run --rm -it --gpus all $OUTPUT_TAG python -c 'import vllm; print(vllm.__version__)'

To use in pytest:
  pytest tests/fault_tolerance/deploy/test_deployment.py::test_fault_scenario[vllm-moe-agg-tp-1-dp-2-none] \\
    --image $OUTPUT_TAG \\
    --namespace dynamo-kubernetes \\
    -v -s

To push to registry:
  docker tag $OUTPUT_TAG <your-registry>/$OUTPUT_TAG
  docker push <your-registry>/$OUTPUT_TAG
EOF
tests/fault_tolerance/deploy/scenarios.py
View file @
d81a00ef
...
...
@@ -223,12 +223,45 @@ def _create_deployments_for_backend(backend):
return
deployments
def _create_moe_deployments_for_backend(backend="vllm"):
    """Build the MoE deployment table for DeepSeek-V2-Lite.

    Args:
        backend: Backend name; used in the scenario names and as the template
            sub-directory. Defaults to ``"vllm"``.

    Returns:
        A dict with one entry per deployment type (``agg`` then ``disagg``),
        keyed ``"<backend>-moe-<type>-tp-1-dp-2"``. Each value is a dict with
        the keys ``spec`` (a ``DeploymentSpec`` built from the matching
        ``moe_<type>.yaml`` template), ``backend``, ``model`` and ``is_moe``.
    """
    # Only tp=1, dp=2 is covered for now. Per the original note, the dp value
    # is handled internally by vLLM via --data-parallel-size.
    tensor_parallel = 1
    data_parallel = 2
    templates_root = "tests/fault_tolerance/deploy/templates"

    return {
        f"{backend}-moe-{kind}-tp-{tensor_parallel}-dp-{data_parallel}": {
            "spec": DeploymentSpec(f"{templates_root}/{backend}/moe_{kind}.yaml"),
            "backend": backend,
            "model": "deepseek-ai/DeepSeek-V2-Lite",
            "is_moe": True,
        }
        for kind in ("agg", "disagg")
    }
# Create all deployment specifications
deployment_specs
=
{}
deployment_specs
.
update
(
_create_deployments_for_backend
(
"vllm"
))
deployment_specs
.
update
(
_create_deployments_for_backend
(
"sglang"
))
deployment_specs
.
update
(
_create_deployments_for_backend
(
"trtllm"
))
# Add MoE deployments for vLLM only
deployment_specs
.
update
(
_create_moe_deployments_for_backend
(
"vllm"
))
# Each failure scenario contains a list of failure injections
# Each failure injection has a time in seconds after the previous injection and
...
...
@@ -378,6 +411,18 @@ def create_legacy_load(
# Default load configuration (using AI-Perf)
load
=
Load
()
# Load profile applied to MoE deployments instead of the default `load`.
moe_load = Load(
    # Client fan-out, kept small for MoE testing
    clients=3,
    requests_per_client=30,
    # Request shape
    input_token_length=100,
    output_token_length=100,
    # Retry / SLA policy
    max_retries=3,
    sla=None,
    # Load generator and its rate cap (lower rate for MoE)
    client_type="aiperf",
    max_request_rate=0.5,
)
# model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model
=
None
...
...
@@ -397,6 +442,9 @@ for backend in ["vllm", "sglang", "trtllm"]:
for
deployment_name
,
deployment_info
in
deployment_specs
.
items
():
backend
=
deployment_info
[
"backend"
]
# Check if this is an MoE deployment
is_moe
=
deployment_info
.
get
(
"is_moe"
,
False
)
# Determine deployment type from deployment name
deploy_type
=
(
"agg"
...
...
@@ -419,10 +467,17 @@ for deployment_name, deployment_info in deployment_specs.items():
continue
scenario_name
=
f
"
{
deployment_name
}
-
{
failure_name
}
"
# Use MoE-specific load configuration if it's an MoE model
load_config
=
moe_load
if
is_moe
else
load
# Get model from deployment info or use the global model
scenario_model
=
deployment_info
.
get
(
"model"
,
model
)
scenarios
[
scenario_name
]
=
Scenario
(
deployment
=
deployment_info
[
"spec"
],
load
=
load
,
load
=
load
_config
,
failures
=
failure
,
model
=
model
,
model
=
scenario_
model
,
backend
=
backend
,
)
tests/fault_tolerance/deploy/templates/vllm/moe_agg.yaml
0 → 100644
View file @
d81a00ef
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
vllm-moe-agg
spec
:
services
:
Frontend
:
dynamoNamespace
:
vllm-moe-agg
componentType
:
frontend
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
VllmDecodeWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
vllm-moe-agg
componentType
:
worker
replicas
:
1
resources
:
requests
:
memory
:
"
50Gi"
gpu
:
"
2"
limits
:
memory
:
"
100Gi"
gpu
:
"
2"
envs
:
# MoE-specific environment variables
-
name
:
VLLM_ALL2ALL_BACKEND
value
:
"
pplx"
-
name
:
VLLM_USE_ELASTIC_EP
value
:
"
1"
-
name
:
VLLM_USE_DEEP_GEMM
value
:
"
1"
-
name
:
VLLM_USE_V1
value
:
"
1"
-
name
:
VLLM_WORKER_MULTIPROC_METHOD
value
:
"
spawn"
-
name
:
CUDA_VISIBLE_DEVICES
value
:
"
0,1"
-
name
:
VLLM_DEBUG
value
:
"
1"
-
name
:
VLLM_LOGGING_LEVEL
value
:
"
DEBUG"
extraPodSpec
:
imagePullSecrets
:
-
name
:
nvcr-imagepullsecret
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
python3
-
-m
-
dynamo.vllm
args
:
-
--model
-
deepseek-ai/DeepSeek-V2-Lite
-
--trust-remote-code
-
--disable-log-requests
-
--tensor-parallel-size
-
"
1"
-
--data-parallel-size
-
"
2"
-
--gpu-memory-utilization
-
"
0.5"
-
--max-model-len
-
"
1024"
-
--enable-expert-parallel
-
--enable-elastic-ep
-
--enable-eplb
-
--eplb-config.num_redundant_experts
-
"
24"
-
--eplb-config.window_size
-
"
100"
-
--eplb-config.step_interval
-
"
10"
-
--no-enable-prefix-caching
-
--enforce-eager
tests/fault_tolerance/deploy/templates/vllm/moe_disagg.yaml
0 → 100644
View file @
d81a00ef
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
vllm-moe-disagg
spec
:
services
:
Frontend
:
dynamoNamespace
:
vllm-moe-disagg
componentType
:
frontend
replicas
:
1
extraPodSpec
:
imagePullSecrets
:
-
name
:
nvcr-imagepullsecret
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
VllmDecodeWorker
:
dynamoNamespace
:
vllm-moe-disagg
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
decode
replicas
:
1
resources
:
requests
:
memory
:
"
50Gi"
gpu
:
"
2"
limits
:
memory
:
"
100Gi"
gpu
:
"
2"
envs
:
# MoE-specific environment variables
-
name
:
VLLM_ALL2ALL_BACKEND
value
:
"
pplx"
-
name
:
VLLM_USE_ELASTIC_EP
value
:
"
1"
-
name
:
VLLM_USE_DEEP_GEMM
value
:
"
1"
-
name
:
VLLM_USE_V1
value
:
"
1"
-
name
:
VLLM_WORKER_MULTIPROC_METHOD
value
:
"
spawn"
-
name
:
CUDA_VISIBLE_DEVICES
value
:
"
0,1"
-
name
:
VLLM_DEBUG
value
:
"
1"
-
name
:
VLLM_LOGGING_LEVEL
value
:
"
DEBUG"
extraPodSpec
:
imagePullSecrets
:
-
name
:
nvcr-imagepullsecret
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
python3
-
-m
-
dynamo.vllm
args
:
-
--model
-
deepseek-ai/DeepSeek-V2-Lite
-
--trust-remote-code
-
--disable-log-requests
-
--tensor-parallel-size
-
"
1"
-
--data-parallel-size
-
"
2"
-
--gpu-memory-utilization
-
"
0.5"
-
--max-model-len
-
"
1024"
-
--enable-expert-parallel
-
--enable-elastic-ep
-
--enable-eplb
-
--eplb-config.num_redundant_experts
-
"
24"
-
--eplb-config.window_size
-
"
100"
-
--eplb-config.step_interval
-
"
10"
-
--no-enable-prefix-caching
-
--enforce-eager
VllmPrefillWorker
:
dynamoNamespace
:
vllm-moe-disagg
envFromSecret
:
hf-token-secret
componentType
:
worker
subComponentType
:
prefill
replicas
:
1
resources
:
requests
:
memory
:
"
50Gi"
gpu
:
"
2"
limits
:
memory
:
"
100Gi"
gpu
:
"
2"
envs
:
# MoE-specific environment variables
-
name
:
VLLM_ALL2ALL_BACKEND
value
:
"
pplx"
-
name
:
VLLM_USE_ELASTIC_EP
value
:
"
1"
-
name
:
VLLM_USE_DEEP_GEMM
value
:
"
1"
-
name
:
VLLM_USE_V1
value
:
"
1"
-
name
:
VLLM_WORKER_MULTIPROC_METHOD
value
:
"
spawn"
-
name
:
CUDA_VISIBLE_DEVICES
value
:
"
0,1"
-
name
:
VLLM_DEBUG
value
:
"
1"
-
name
:
VLLM_LOGGING_LEVEL
value
:
"
DEBUG"
extraPodSpec
:
imagePullSecrets
:
-
name
:
nvcr-imagepullsecret
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/components/backends/vllm
command
:
-
python3
-
-m
-
dynamo.vllm
args
:
-
--model
-
deepseek-ai/DeepSeek-V2-Lite
-
--trust-remote-code
-
--disable-log-requests
-
--is-prefill-worker
-
--tensor-parallel-size
-
"
1"
-
--data-parallel-size
-
"
2"
-
--gpu-memory-utilization
-
"
0.5"
-
--max-model-len
-
"
1024"
-
--enable-expert-parallel
-
--enable-elastic-ep
-
--enable-eplb
-
--eplb-config.num_redundant_experts
-
"
24"
-
--eplb-config.window_size
-
"
100"
-
--eplb-config.step_interval
-
"
10"
-
--no-enable-prefix-caching
-
--enforce-eager
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment