Unverified Commit d81a00ef authored by Tzu-Ling Kan's avatar Tzu-Ling Kan Committed by GitHub
Browse files

feat: vllm moe k8 FT tests (#3672)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
parent e01c6e99
# syntax=docker/dockerfile:1.10.0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Dockerfile for using local/pre-built vLLM images with Dynamo
# Based on container/Dockerfile.vllm but uses existing vLLM image instead of building from source
# All ARGs used in FROM statements must be declared before any FROM
# (an ARG declared here is visible only to FROM lines; a stage that needs the
# value has to re-declare it with a bare `ARG <name>`).
# Image providing the pre-built vLLM installation (becomes stage `vllm_source`).
ARG LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
# Image providing the Dynamo artifacts (becomes stage `dynamo_base`).
ARG DYNAMO_BASE_IMAGE="dynamo:latest-none"
# Repository and tag of the image the `runtime` stage is built FROM.
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
# Other build arguments
# PYTHON_VERSION and ARCH_ALT are re-declared and consumed inside the `runtime` stage.
ARG PYTHON_VERSION=3.12
# NOTE(review): ARCH is not referenced in the visible part of this file — confirm it is still needed.
ARG ARCH=amd64
ARG ARCH_ALT=x86_64
# Use local vLLM image as source
FROM ${LOCAL_VLLM_IMAGE} AS vllm_source
# Use Dynamo base image
FROM ${DYNAMO_BASE_IMAGE} AS dynamo_base
##################################################
########## Runtime Image ########################
##################################################
FROM ${RUNTIME_IMAGE}:${RUNTIME_IMAGE_TAG} AS runtime

# Bring the global build args into this stage's scope.
ARG PYTHON_VERSION
ARG ARCH_ALT

WORKDIR /workspace

# Fixed install locations. Values that are derived from another variable are
# set in a later ENV instruction, because an ENV instruction only sees
# variables defined by earlier instructions.
ENV DYNAMO_HOME=/opt/dynamo \
    VIRTUAL_ENV=/opt/dynamo/venv \
    NIXL_PREFIX=/opt/nvidia/nvda_nixl
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" \
    NIXL_LIB_DIR="${NIXL_PREFIX}/lib/${ARCH_ALT}-linux-gnu"
ENV NIXL_PLUGIN_DIR="${NIXL_LIB_DIR}/plugins"

# System packages for the runtime image (alphabetical):
#   python${PYTHON_VERSION}-dev, build-essential
#       Python runtime - CRITICAL for the virtual environment to work
#   jq, curl
#       polling various endpoints and health checks
#   libibverbs1, rdma-core, ibverbs-utils, libibumad3, libnuma1, librdmacm1,
#   ibverbs-providers
#       libraries required by UCX to find RDMA devices
#   ninja-build, g++
#       JIT kernel compilation, flashinfer
#   ca-certificates
#       prometheus dependencies
#   cuda-command-line-tools-12-8
#       DeepGemm uses 'cuobjdump', which does not come with the CUDA image
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cuda-command-line-tools-12-8 \
        curl \
        g++ \
        git \
        ibverbs-providers \
        ibverbs-utils \
        jq \
        libibumad3 \
        libibverbs1 \
        libnuma1 \
        librdmacm1 \
        ninja-build \
        "python${PYTHON_VERSION}-dev" \
        rdma-core \
 && rm -rf /var/lib/apt/lists/*
# CUDA development tools taken from the vLLM image (for JIT compilation).
# The individual compiler binaries land in the CUDA bin directory in one step.
COPY --from=vllm_source \
    /usr/local/cuda/bin/nvcc \
    /usr/local/cuda/bin/cudafe++ \
    /usr/local/cuda/bin/ptxas \
    /usr/local/cuda/bin/fatbinary \
    /usr/local/cuda/bin/
COPY --from=vllm_source /usr/local/cuda/include/ /usr/local/cuda/include/
COPY --from=vllm_source /usr/local/cuda/nvvm /usr/local/cuda/nvvm
COPY --from=vllm_source /usr/local/cuda/lib64/libcudart.so* /usr/local/cuda/lib64/

### NATS & ETCD (from the Dynamo base image) ###
COPY --from=dynamo_base /usr/bin/nats-server /usr/bin/nats-server
COPY --from=dynamo_base /usr/local/bin/etcd/ /usr/local/bin/etcd/

### UCX and NIXL (from the Dynamo base image) ###
COPY --from=dynamo_base /usr/local/ucx /usr/local/ucx
COPY --from=dynamo_base ${NIXL_PREFIX} ${NIXL_PREFIX}

# Search order: UCX tools, then etcd, then the CUDA nvvm binaries, then
# whatever PATH already contained.
ENV PATH=/usr/local/ucx/bin:/usr/local/bin/etcd/:/usr/local/cuda/nvvm/bin:${PATH}
### VIRTUAL ENVIRONMENT SETUP ###
# Copy uv directly from official image (like Dockerfile.vllm does)
# NOTE(review): ':latest' is unpinned; consider pinning a uv release so that
# rebuilds are reproducible.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Create fresh virtual environment (following Dockerfile.vllm pattern)
RUN mkdir -p /opt/dynamo/venv && \
    uv venv /opt/dynamo/venv --python ${PYTHON_VERSION}

# Activate virtual environment. PATH is prepended again here so the venv's
# bin directory is searched before the tool directories added earlier in
# this stage.
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Copy vLLM installation from local image
# vLLM workspace is at /vllm-workspace in the image
COPY --from=vllm_source /vllm-workspace /opt/vllm

# Copy ALL Python packages from vLLM image directly to venv
# Since vLLM is already installed (not as wheels), we copy the site-packages.
# The source path uses ${PYTHON_VERSION} just like the destination, so both
# sides always refer to the same interpreter version; PYTHON_VERSION must
# therefore match the Python version inside the vLLM image.
COPY --from=vllm_source /usr/local/lib/python${PYTHON_VERSION}/dist-packages ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages

# Fix the .pth files to point to the correct location for pplx_kernels and
# DeepEP (the workspace moved from /vllm-workspace to /opt/vllm), then rewrite
# any other .pth file that still references /vllm-workspace.
RUN SITE_PACKAGES="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages" && \
    if [ -f "${SITE_PACKAGES}/__editable__.pplx_kernels-0.0.1.pth" ]; then \
        echo "/opt/vllm/ep_kernels_workspace/pplx-kernels/src" > "${SITE_PACKAGES}/__editable__.pplx_kernels-0.0.1.pth"; \
    fi && \
    if [ -f "${SITE_PACKAGES}/__editable__.deep_ep-0.0.1.pth" ]; then \
        echo "/opt/vllm/ep_kernels_workspace/DeepEP" > "${SITE_PACKAGES}/__editable__.deep_ep-0.0.1.pth"; \
    fi && \
    find "${SITE_PACKAGES}" -name "*.pth" -exec sed -i 's|/vllm-workspace|/opt/vllm|g' {} +
# Set LD_LIBRARY_PATH for all components: nvshmem (from the vLLM workspace),
# NIXL libraries and plugins, UCX, then whatever the base image already set.
# Kept on a single line so that no continuation whitespace can leak into the
# value.
ENV LD_LIBRARY_PATH=/opt/vllm/ep_kernels_workspace/nvshmem_install/lib:${NIXL_LIB_DIR}:${NIXL_PLUGIN_DIR}:/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:${LD_LIBRARY_PATH}

# DeepGemm JIT compilation support
ENV CPATH=/usr/local/cuda/include

# Install Dynamo and dependencies (following Dockerfile.vllm pattern)
# First install basic Python packages
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install pip setuptools wheel

# Install the Dynamo wheels straight from the dynamo_base stage.
# The wheelhouse is bind-mounted for this step only, so the wheel files are
# never written into an image layer (a COPY followed by `rm` in a later RUN
# would keep them in the COPY layer).
# PY_TAG turns PYTHON_VERSION (e.g. 3.12) into the wheel tag (e.g. cp312) so
# the runtime wheel always matches the interpreter of the virtual environment.
# NOTE(review): BuildKit may leave an empty mount-point directory behind at
# the mount target — harmless, but confirm if the path must not exist.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=dynamo_base,source=/opt/dynamo/wheelhouse,target=/opt/dynamo/wheelhouse \
    PY_TAG="cp${PYTHON_VERSION%%.*}${PYTHON_VERSION#*.}" && \
    uv pip install \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*"${PY_TAG}"*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
        /opt/dynamo/wheelhouse/nixl/nixl*.whl

# Install common and test dependencies.
# The requirement files are bind-mounted from the build context for the same
# reason as the wheelhouse: nothing has to be copied in and deleted afterwards.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=container/deps/requirements.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=container/deps/requirements.test.txt,target=/tmp/requirements.test.txt \
    uv pip install --requirement /tmp/requirements.txt --requirement /tmp/requirements.test.txt
# Copy workspace files
# The whole build context is copied; this is the last filesystem step of the
# stage, so source changes do not invalidate the dependency layers above.
COPY . /workspace/
# Copy attribution files
# NOTE(review): these paths are also covered by `COPY .` above unless a
# .dockerignore excludes them — confirm whether this step is still required.
COPY ATTRIBUTION* LICENSE /workspace/
# Setup entrypoint
# NOTE(review): the entrypoint script is not created anywhere in this file;
# presumably it is shipped by the runtime base image — confirm.
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
# No default arguments; callers supply the command to run.
CMD []
###########################################################
########## Development Image ##############################
###########################################################
FROM runtime AS dev

# Install development tools.
# DEBIAN_FRONTEND=noninteractive is set inline (as in the runtime stage) so a
# package's configuration prompt cannot stall the build, without leaking the
# variable into the final image environment.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        nvtop \
        wget \
        tmux \
        vim \
        openssh-client \
        iproute2 \
        rsync \
        zip \
        unzip \
        htop \
        autoconf \
        automake \
        cmake \
        libtool \
        meson \
        net-tools \
        pybind11-dev \
        clang \
        libclang-dev \
        protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*

# Set workspace directory.
# In this stage DYNAMO_HOME points at the checked-out sources in /workspace,
# and the Rust toolchain locations are put on PATH.
ENV WORKSPACE_DIR=/workspace \
    DYNAMO_HOME=/workspace \
    RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
    PATH=/usr/local/cargo/bin:$PATH

# Copy Rust toolchain if needed
COPY --from=dynamo_base /usr/local/rustup /usr/local/rustup
COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo

# Install maturin for development, then install the workspace in editable
# mode. The extras specifier is quoted so the shell cannot treat the square
# brackets as a glob pattern.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install "maturin[patchelf]" && \
    uv pip install --no-deps -e .

ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Build script for Dynamo with local/pre-built vLLM images
# Abort on the first command that fails.
set -e
# Default values (the option parser further down can override most of them)
LOCAL_VLLM_IMAGE="vllm-elastic-ep:latest_all2all_buffer_input"
DYNAMO_BASE_TAG="dynamo:latest-none"
OUTPUT_TAG="my-dynamo-vllm:local"
TARGET="dev"
NO_CACHE=""
BUILD_DYNAMO_BASE=true
# Directory containing this script, with symlinks resolved.
DOCKERFILE_DIR=$(dirname "$(readlink -f "$0")")
# Project root: four directory levels above the script directory.
# Every nested substitution is quoted so paths containing spaces survive.
PROJECT_ROOT=$(dirname "$(dirname "$(dirname "$(dirname "$DOCKERFILE_DIR")")")")
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Print the command-line help, including the current default of every option
# that takes a value.
print_usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Build Dynamo image using a local/pre-built vLLM image

Options:
 --vllm-image IMAGE Local vLLM image to use (default: $LOCAL_VLLM_IMAGE)
 --tag TAG Output image tag (default: $OUTPUT_TAG)
 --target TARGET Build target: runtime or dev (default: $TARGET)
 --no-cache Disable Docker build cache
 --skip-base Skip building dynamo base (assumes it exists)
 --dynamo-base TAG Dynamo base image tag (default: $DYNAMO_BASE_TAG)
 --help Show this help message

Examples:
 # Use default vLLM image
 $0

 # Use custom vLLM image
 $0 --vllm-image my-vllm:custom --tag my-dynamo:test

 # Build runtime image only
 $0 --target runtime --tag my-dynamo:prod
EOF
}
# Abort with a clear message when an option that takes a value is the last
# argument on the command line. Without this check `shift 2` fails and,
# because the script runs under `set -e`, it would exit without any output.
#   $1 - option name as typed by the user
#   $2 - number of arguments still unparsed (including the option itself)
_require_value() {
    if [[ "$2" -lt 2 ]]; then
        echo -e "${RED}Error: $1 requires a value${NC}"
        print_usage
        exit 1
    fi
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --vllm-image)
            _require_value "$1" $#
            LOCAL_VLLM_IMAGE="$2"
            shift 2
            ;;
        --tag)
            _require_value "$1" $#
            OUTPUT_TAG="$2"
            shift 2
            ;;
        --target)
            _require_value "$1" $#
            TARGET="$2"
            # Only the two stages defined in the Dockerfile are accepted.
            if [[ "$TARGET" != "runtime" && "$TARGET" != "dev" ]]; then
                echo -e "${RED}Error: --target must be 'runtime' or 'dev'${NC}"
                exit 1
            fi
            shift 2
            ;;
        --no-cache)
            NO_CACHE="--no-cache"
            shift
            ;;
        --skip-base)
            BUILD_DYNAMO_BASE=false
            shift
            ;;
        --dynamo-base)
            _require_value "$1" $#
            DYNAMO_BASE_TAG="$2"
            shift 2
            ;;
        --help|-h)
            print_usage
            exit 0
            ;;
        *)
            echo -e "${RED}Unknown option: $1${NC}"
            print_usage
            exit 1
            ;;
    esac
done
# Banner (printf %b expands the colour escapes the same way `echo -e` does)
printf '%b\n' \
    "${GREEN}========================================${NC}" \
    "${GREEN}Building Dynamo with Local vLLM Image${NC}" \
    "${GREEN}========================================${NC}"
# Show the effective configuration, framed by blank lines
cat <<EOF

Configuration:
 vLLM Image: $LOCAL_VLLM_IMAGE
 Output Tag: $OUTPUT_TAG
 Target: $TARGET
 Dynamo Base: $DYNAMO_BASE_TAG
 Project Root: $PROJECT_ROOT

EOF
# The local vLLM image must already exist; otherwise list candidates and stop
docker image inspect "$LOCAL_VLLM_IMAGE" > /dev/null 2>&1 || {
    printf '%b\n' "${RED}Error: Local vLLM image '$LOCAL_VLLM_IMAGE' not found${NC}"
    echo "Available vLLM images:"
    docker images | grep -E "^REPOSITORY|vllm" || echo "No vLLM images found"
    exit 1
}
# Step 1: Build Dynamo base if requested
if [ "$BUILD_DYNAMO_BASE" = true ]; then
    echo -e "${YELLOW}Step 1: Building Dynamo base image...${NC}"
    cd "$PROJECT_ROOT"
    # Check if build.sh exists
    if [ ! -f "container/build.sh" ]; then
        echo -e "${RED}Error: container/build.sh not found in $PROJECT_ROOT${NC}"
        exit 1
    fi
    # The build runs as the `if` condition: the script uses `set -e`, so a
    # bare failing command would abort the script before a following `$?`
    # test could print the error message.
    # NO_CACHE expands to nothing when empty and to one word otherwise.
    if ! ./container/build.sh \
        --framework none \
        --tag "$DYNAMO_BASE_TAG" \
        ${NO_CACHE:+"$NO_CACHE"}; then
        echo -e "${RED}Error: Failed to build Dynamo base image${NC}"
        exit 1
    fi
    echo -e "${GREEN}✓ Dynamo base image built successfully${NC}"
else
    echo -e "${YELLOW}Step 1: Skipping Dynamo base build (using existing)${NC}"
    # Check if base image exists
    if ! docker image inspect "$DYNAMO_BASE_TAG" > /dev/null 2>&1; then
        echo -e "${RED}Error: Dynamo base image '$DYNAMO_BASE_TAG' not found${NC}"
        echo "Please build it first or remove --skip-base flag"
        exit 1
    fi
fi
# Step 2: Build combined image with local vLLM
echo ""
echo -e "${YELLOW}Step 2: Building combined Dynamo + vLLM image...${NC}"
# The project root is the build context (the trailing `.` below).
cd "$PROJECT_ROOT"
# Build the combined image.
# The build runs as the `if` condition: the script uses `set -e`, so a bare
# failing command would abort the script before a following `$?` test could
# print the error message.
# NO_CACHE expands to nothing when empty and to one word otherwise.
if ! docker build \
    -f "$DOCKERFILE_DIR/Dockerfile.local_vllm" \
    --build-arg LOCAL_VLLM_IMAGE="$LOCAL_VLLM_IMAGE" \
    --build-arg DYNAMO_BASE_IMAGE="$DYNAMO_BASE_TAG" \
    --target "$TARGET" \
    --tag "$OUTPUT_TAG" \
    ${NO_CACHE:+"$NO_CACHE"} \
    .; then
    echo -e "${RED}Error: Failed to build combined image${NC}"
    exit 1
fi
# Success banner (printf %b expands the colour escapes like `echo -e`)
echo ""
printf '%b\n' \
    "${GREEN}========================================${NC}" \
    "${GREEN}✓ Build completed successfully!${NC}" \
    "${GREEN}========================================${NC}"
# Follow-up hints: how to smoke-test, run the fault-tolerance test, and push.
# A doubled backslash inside the here-document prints a single backslash.
cat <<EOF

Output image: $OUTPUT_TAG

To test the image:
 docker run --rm -it --gpus all $OUTPUT_TAG python -c 'import vllm; print(vllm.__version__)'

To use in pytest:
 pytest tests/fault_tolerance/deploy/test_deployment.py::test_fault_scenario[vllm-moe-agg-tp-1-dp-2-none] \\
 --image $OUTPUT_TAG \\
 --namespace dynamo-kubernetes \\
 -v -s

To push to registry:
 docker tag $OUTPUT_TAG <your-registry>/$OUTPUT_TAG
 docker push <your-registry>/$OUTPUT_TAG
EOF
......@@ -223,12 +223,45 @@ def _create_deployments_for_backend(backend):
return deployments
def _create_moe_deployments_for_backend(backend="vllm"):
    """Build the MoE deployment table for DeepSeek-V2-Lite.

    Args:
        backend: Backend name; selects the template sub-directory and is
            used as the prefix of every scenario name.

    Returns:
        A dict keyed by ``"<backend>-moe-<agg|disagg>-tp-1-dp-2"``. Each value
        holds the ``DeploymentSpec`` built from the matching template, the
        backend name, the model name and an ``is_moe`` flag set to ``True``.
    """
    # Only tp=1, dp=2 is covered for now. The dp value is handled internally
    # by vLLM via --data-parallel-size; here it only appears in the name.
    tensor_parallel = 1
    data_parallel = 2

    templates_root = "tests/fault_tolerance/deploy/templates"
    # (deployment type, template file name) in the order they are registered
    templates = (
        ("agg", "moe_agg.yaml"),
        ("disagg", "moe_disagg.yaml"),
    )

    return {
        f"{backend}-moe-{kind}-tp-{tensor_parallel}-dp-{data_parallel}": {
            "spec": DeploymentSpec(f"{templates_root}/{backend}/{file_name}"),
            "backend": backend,
            "model": "deepseek-ai/DeepSeek-V2-Lite",
            "is_moe": True,
        }
        for kind, file_name in templates
    }
# Create all deployment specifications
# Maps deployment name -> deployment info. The tables are merged in the order
# below, so an entry added later replaces an earlier one with the same name.
deployment_specs = {}
deployment_specs.update(_create_deployments_for_backend("vllm"))
deployment_specs.update(_create_deployments_for_backend("sglang"))
deployment_specs.update(_create_deployments_for_backend("trtllm"))
# Add MoE deployments for vLLM only
# (their names contain "-moe-" and their info dicts carry "is_moe": True)
deployment_specs.update(_create_moe_deployments_for_backend("vllm"))
# Each failure scenario contains a list of failure injections
# Each failure injection has a time in seconds after the previous injection and
......@@ -378,6 +411,18 @@ def create_legacy_load(
# Default load configuration (using AI-Perf)
# Built with the defaults of `Load`; used for every non-MoE deployment.
load = Load()
# MoE-specific load configuration
# Used in place of `load` for deployments whose info dict has "is_moe" set.
moe_load = Load(
clients=3, # Fewer clients for MoE testing
requests_per_client=30, # Reduced for MoE complexity
input_token_length=100,
output_token_length=100,
max_retries=3,
# NOTE(review): presumably None disables the SLA check — confirm against `Load`.
sla=None,
client_type="aiperf",
max_request_rate=0.5, # Lower rate for MoE
)
# model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
model = None
......@@ -397,6 +442,9 @@ for backend in ["vllm", "sglang", "trtllm"]:
for deployment_name, deployment_info in deployment_specs.items():
backend = deployment_info["backend"]
# Check if this is an MoE deployment
is_moe = deployment_info.get("is_moe", False)
# Determine deployment type from deployment name
deploy_type = (
"agg"
......@@ -419,10 +467,17 @@ for deployment_name, deployment_info in deployment_specs.items():
continue
scenario_name = f"{deployment_name}-{failure_name}"
# Use MoE-specific load configuration if it's an MoE model
load_config = moe_load if is_moe else load
# Get model from deployment info or use the global model
scenario_model = deployment_info.get("model", model)
scenarios[scenario_name] = Scenario(
deployment=deployment_info["spec"],
load=load,
load=load_config,
failures=failure,
model=model,
model=scenario_model,
backend=backend,
)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Aggregated MoE deployment: one frontend plus one vLLM worker running
# deepseek-ai/DeepSeek-V2-Lite.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-moe-agg
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-moe-agg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        # The frontend pulls the same nvcr.io image as the worker, so it is
        # given the pull secret the worker already uses.
        imagePullSecrets:
          - name: nvcr-imagepullsecret
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: vllm-moe-agg
      componentType: worker
      replicas: 1
      # Two GPUs: the args below use tensor-parallel-size 1 and
      # data-parallel-size 2.
      resources:
        requests:
          memory: "50Gi"
          gpu: "2"
        limits:
          memory: "100Gi"
          gpu: "2"
      envs:
        # MoE-specific environment variables
        - name: VLLM_ALL2ALL_BACKEND
          value: "pplx"
        - name: VLLM_USE_ELASTIC_EP
          value: "1"
        - name: VLLM_USE_DEEP_GEMM
          value: "1"
        - name: VLLM_USE_V1
          value: "1"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
        - name: CUDA_VISIBLE_DEVICES
          value: "0,1"
        # Verbose logging
        - name: VLLM_DEBUG
          value: "1"
        - name: VLLM_LOGGING_LEVEL
          value: "DEBUG"
      extraPodSpec:
        imagePullSecrets:
          - name: nvcr-imagepullsecret
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - deepseek-ai/DeepSeek-V2-Lite
            - --trust-remote-code
            - --disable-log-requests
            - --tensor-parallel-size
            - "1"
            - --data-parallel-size
            - "2"
            - --gpu-memory-utilization
            - "0.5"
            - --max-model-len
            - "1024"
            # Expert-parallel settings
            - --enable-expert-parallel
            - --enable-elastic-ep
            - --enable-eplb
            - --eplb-config.num_redundant_experts
            - "24"
            - --eplb-config.window_size
            - "100"
            - --eplb-config.step_interval
            - "10"
            - --no-enable-prefix-caching
            - --enforce-eager
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated MoE deployment: one frontend, one decode worker and one
# prefill worker, all running deepseek-ai/DeepSeek-V2-Lite.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: vllm-moe-disagg
spec:
  services:
    Frontend:
      dynamoNamespace: vllm-moe-disagg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        imagePullSecrets: [{name: nvcr-imagepullsecret}]
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag

    # Decode worker: 2 GPUs (tensor-parallel-size 1, data-parallel-size 2).
    VllmDecodeWorker:
      dynamoNamespace: vllm-moe-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: decode
      replicas: 1
      resources:
        requests: {memory: "50Gi", gpu: "2"}
        limits: {memory: "100Gi", gpu: "2"}
      envs:
        # MoE-specific environment variables
        - {name: VLLM_ALL2ALL_BACKEND, value: "pplx"}
        - {name: VLLM_USE_ELASTIC_EP, value: "1"}
        - {name: VLLM_USE_DEEP_GEMM, value: "1"}
        - {name: VLLM_USE_V1, value: "1"}
        - {name: VLLM_WORKER_MULTIPROC_METHOD, value: "spawn"}
        - {name: CUDA_VISIBLE_DEVICES, value: "0,1"}
        - {name: VLLM_DEBUG, value: "1"}
        - {name: VLLM_LOGGING_LEVEL, value: "DEBUG"}
      extraPodSpec:
        imagePullSecrets: [{name: nvcr-imagepullsecret}]
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command: ["python3", "-m", "dynamo.vllm"]
          # One flag (and its value, if any) per line.
          args: [
            "--model", "deepseek-ai/DeepSeek-V2-Lite",
            "--trust-remote-code",
            "--disable-log-requests",
            "--tensor-parallel-size", "1",
            "--data-parallel-size", "2",
            "--gpu-memory-utilization", "0.5",
            "--max-model-len", "1024",
            "--enable-expert-parallel",
            "--enable-elastic-ep",
            "--enable-eplb",
            "--eplb-config.num_redundant_experts", "24",
            "--eplb-config.window_size", "100",
            "--eplb-config.step_interval", "10",
            "--no-enable-prefix-caching",
            "--enforce-eager",
          ]

    # Prefill worker: same settings as the decode worker, plus
    # --is-prefill-worker in the args.
    VllmPrefillWorker:
      dynamoNamespace: vllm-moe-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: prefill
      replicas: 1
      resources:
        requests: {memory: "50Gi", gpu: "2"}
        limits: {memory: "100Gi", gpu: "2"}
      envs:
        # MoE-specific environment variables
        - {name: VLLM_ALL2ALL_BACKEND, value: "pplx"}
        - {name: VLLM_USE_ELASTIC_EP, value: "1"}
        - {name: VLLM_USE_DEEP_GEMM, value: "1"}
        - {name: VLLM_USE_V1, value: "1"}
        - {name: VLLM_WORKER_MULTIPROC_METHOD, value: "spawn"}
        - {name: CUDA_VISIBLE_DEVICES, value: "0,1"}
        - {name: VLLM_DEBUG, value: "1"}
        - {name: VLLM_LOGGING_LEVEL, value: "DEBUG"}
      extraPodSpec:
        imagePullSecrets: [{name: nvcr-imagepullsecret}]
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command: ["python3", "-m", "dynamo.vllm"]
          # One flag (and its value, if any) per line.
          args: [
            "--model", "deepseek-ai/DeepSeek-V2-Lite",
            "--trust-remote-code",
            "--disable-log-requests",
            "--is-prefill-worker",
            "--tensor-parallel-size", "1",
            "--data-parallel-size", "2",
            "--gpu-memory-utilization", "0.5",
            "--max-model-len", "1024",
            "--enable-expert-parallel",
            "--enable-elastic-ep",
            "--enable-eplb",
            "--eplb-config.num_redundant_experts", "24",
            "--eplb-config.window_size", "100",
            "--eplb-config.step_interval", "10",
            "--no-enable-prefix-caching",
            "--enforce-eager",
          ]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment