Unverified Commit 106a184a authored by Tushar Sharma's avatar Tushar Sharma Committed by GitHub
Browse files

feat: add SGLang recipe for DeepSeek-V4 (#8704) (#8712)


Co-authored-by: Krishnan Prashanth <140860868+KrishnanPrash@users.noreply.github.com>
parent 35fa7129
# DeepSeek-V4-Flash SGLang + Dynamo runtime image
#
# Two-step build:
#   1. Build dynamo:latest-sglang-runtime locally per container/README.md:
#        python3 container/render.py --framework sglang --target runtime
#        docker build -f container/sglang-runtime-*.Dockerfile -t dynamo:latest-sglang-runtime .
#   2. Build this overlay (adds V4 parsers + routed_experts fix):
#        docker build -f recipes/deepseek-v4-flash/sglang/Dockerfile.dsv4-sglang \
#          -t <your-registry>/sglang-dsv4:<tag> .
# Image used as the "donor" stage (dynamo_src) from which nats-server, etcd,
# UCX and NIXL files are copied. Defaults to the locally built runtime tag;
# override with --build-arg DYNAMO_SRC_IMAGE=<image>.
ARG DYNAMO_SRC_IMAGE=dynamo:latest-sglang-runtime
# Base image of the final stage; override with --build-arg DSV4_BASE_IMAGE=<image>.
ARG DSV4_BASE_IMAGE=lmsysorg/sglang:deepseek-v4-blackwell
# ---------- Stage 1: Build dynamo wheels with V4 parsers ----------
# Produces two wheels in /opt/dynamo/wheelhouse (the maturin-built runtime
# bindings and the pure-python ai-dynamo package). Later stages pull artifacts
# out of this stage with COPY --from=wheel_builder.
FROM quay.io/pypa/manylinux_2_28_x86_64 AS wheel_builder

# Make a RUN step fail when any command of a pipeline fails. With the default
# /bin/sh, a failed `curl ... | sh` would still exit 0 because only the last
# command's status is checked, deferring the error to the maturin build.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# System packages needed at build time.
RUN yum install -y openssl-devel clang-devel && yum clean all

# Modern protoc. -f makes curl exit non-zero on an HTTP error instead of
# saving the error page as protoc.zip; -S keeps the error message visible.
RUN curl -fsSL https://github.com/protocolbuffers/protobuf/releases/download/v28.3/protoc-28.3-linux-x86_64.zip \
        -o /tmp/protoc.zip \
    && unzip -o /tmp/protoc.zip -d /usr/local \
    && rm /tmp/protoc.zip

# Rust toolchain via rustup (non-interactive: -y).
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

# Put cargo and the CPython 3.12 toolchain first on PATH so that `pip` and
# `maturin` below resolve to the cp312 interpreter.
ENV PATH=/root/.cargo/bin:/opt/python/cp312-cp312/bin:${PATH}

RUN pip install --no-cache-dir maturin

# Clone dynamo release branch (includes V4 parsers + sglang compat fixes).
# --depth 1 fetches the current branch tip only, so a rebuild without cache
# may pick up newer commits from release/deepseekv4.
RUN git clone --depth 1 -b release/deepseekv4 \
        https://github.com/ai-dynamo/dynamo.git /workspace

# Build dynamo runtime wheel
WORKDIR /workspace/lib/bindings/python
RUN maturin build --release --interpreter /opt/python/cp312-cp312/bin/python3 \
        -o /opt/dynamo/wheelhouse

# Build ai-dynamo pure-python wheel (--no-deps: only this project's wheel,
# dependencies are not built or downloaded here).
WORKDIR /workspace
RUN pip wheel --no-deps -w /opt/dynamo/wheelhouse .
# ---------- Stage 2: Dynamo sglang-runtime donor ----------
# Gives the donor image a stage name so files can be taken from it with
# COPY --from=dynamo_src; nothing is built in this stage.
FROM ${DYNAMO_SRC_IMAGE} AS dynamo_src
# ---------- Stage 3: Final image ----------
# Overlays dynamo infra binaries/libs from the donor stage and the freshly
# built V4-capable dynamo wheels on top of the SGLang DeepSeek-V4 base image.
FROM ${DSV4_BASE_IMAGE}

# Build-time only: declared as ARG so the setting applies to RUN steps in this
# stage without being baked into the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive

# Infra from dynamo sglang-runtime (etcd, nats, UCX, NIXL)
COPY --from=dynamo_src /usr/bin/nats-server /usr/bin/nats-server
# NOTE(review): the PATH entry below only helps if /usr/local/bin/etcd is a
# directory containing the etcd binaries in the donor image — confirm.
COPY --from=dynamo_src /usr/local/bin/etcd /usr/local/bin/etcd
ENV PATH=/usr/local/bin/etcd:${PATH}

# UCX libs
COPY --from=dynamo_src /usr/lib/x86_64-linux-gnu/ucx /usr/lib/x86_64-linux-gnu/ucx
COPY --from=dynamo_src /usr/lib/x86_64-linux-gnu/libuc*.so* /usr/lib/x86_64-linux-gnu/

# NIXL + deps (pip packages with native libs). The nixl* glob already matches
# every nixl_cu12* entry, so one COPY covers both.
# NOTE(review): when a COPY wildcard matches directories, Docker copies each
# directory's contents into the destination rather than the directory itself.
# Verify the resulting dist-packages layout in the built image (for example
# with `python3 -c "import nixl"`).
COPY --from=dynamo_src /usr/local/lib/python3.12/dist-packages/nixl* /usr/local/lib/python3.12/dist-packages/

# Dynamo wheels with V4 parsers (overrides the base dynamo from donor)
COPY --from=wheel_builder /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl /opt/dynamo/wheelhouse/
COPY --from=wheel_builder /opt/dynamo/wheelhouse/ai_dynamo-*.whl /opt/dynamo/wheelhouse/
# --force-reinstall replaces any dynamo already present in the base image;
# --no-deps leaves the base image's dependency set untouched. The trailing
# python check fails the build if the deepseek_v4 tool parser is not registered.
RUN pip install --no-cache-dir --force-reinstall --no-deps \
        /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
        /opt/dynamo/wheelhouse/ai_dynamo-*.whl && \
    python3 -c "from dynamo._core import get_tool_parser_names; assert 'deepseek_v4' in get_tool_parser_names(), 'V4 parser missing!'; print('V4 parser verified')"

# Dynamo Python components from V4 branch
COPY --from=wheel_builder /workspace/components/src/dynamo /workspace/components/src/dynamo

# Fix: sglang repo dir at /workspace/sglang shadows the Python package.
# ${PYTHONPATH:+...} appends the inherited value only when it is set, so an
# unset PYTHONPATH does not leave a trailing ':' (an empty entry, which Python
# interprets as the current directory).
ENV PYTHONPATH=/workspace/sglang/python:/workspace/components/src${PYTHONPATH:+:${PYTHONPATH}}

# DeepGEMM JIT env vars
ENV SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1

WORKDIR /workspace

# Clear any entrypoint inherited from the base image; default to a shell.
ENTRYPOINT []
CMD ["bash"]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DeepSeek-V4-Flash SGLang DynamoGraphDeployment
# Aggregated mode (no P/D disagg), B200x4 TP4, MXFP4 MoE, EAGLE MTP 3/4
#
# Deploy:
# kubectl apply -f sglang-dgd.yaml -n <namespace>
#
# Test:
# kubectl port-forward -n <namespace> svc/sglang-dsv4-flash-frontend 8000:8000
# curl http://localhost:8000/v1/models
# curl http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' \
# -d '{"model":"deepseek-ai/DeepSeek-V4-Flash","messages":[{"role":"user","content":"Hello"}]}'
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-dsv4-flash
spec:
  services:
    # HTTP frontend; probed on port 8000.
    Frontend:
      componentType: frontend
      replicas: 1
      volumeMounts:
        - name: shared-model-cache
          mountPoint: /models
      extraPodSpec:
        mainContainer:
          # NOTE(review): this tag lives under a developer registry path; swap in
          # the image you built and pushed if it is not pullable from your cluster.
          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
          imagePullPolicy: Always
          env:
            # Hugging Face cache lives on the shared PVC mounted at /models.
            - name: HF_HOME
              value: /models
            # Offline mode: no downloads from the Hub at runtime.
            - name: HF_HUB_OFFLINE
              value: "1"
          # 360 failures x 10s period = up to 1 hour allowed for startup.
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 360
    # SGLang worker: 4 GPUs, tensor parallel 4.
    decode:
      componentType: worker
      subComponentType: decode
      replicas: 1
      resources:
        limits:
          gpu: "4"
      volumeMounts:
        - name: shared-model-cache
          mountPoint: /models
      sharedMemory:
        size: 200Gi
      extraPodSpec:
        nodeSelector:
          nvidia.com/gpu.product: NVIDIA-B200
        tolerations:
          - key: nvidia.com/gpu
            operator: Equal
            value: "true"
            effect: NoSchedule
        mainContainer:
          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
          imagePullPolicy: Always
          workingDir: /workspace
          command:
            - python3
            - -m
            - dynamo.sglang
          args:
            - --model-path
            - deepseek-ai/DeepSeek-V4-Flash
            - --served-model-name
            - deepseek-ai/DeepSeek-V4-Flash
            - --trust-remote-code
            - --tp
            - "4"
            - --moe-runner-backend
            - flashinfer_mxfp4
            # EAGLE speculative decoding: 3 steps, top-k 1, 4 draft tokens.
            - --speculative-algo
            - EAGLE
            - --speculative-num-steps
            - "3"
            - --speculative-eagle-topk
            - "1"
            - --speculative-num-draft-tokens
            - "4"
            - --chunked-prefill-size
            - "4096"
            - --disable-flashinfer-autotune
            - --dyn-tool-call-parser
            - deepseek_v4
            - --dyn-reasoning-parser
            - deepseek_v4
          env:
            - name: HF_HOME
              value: /models
            - name: HF_HUB_OFFLINE
              value: "1"
            # Four devices, matching the gpu limit ("4") and --tp 4 above.
            - name: CUDA_VISIBLE_DEVICES
              value: "0,1,2,3"
            - name: SGLANG_JIT_DEEPGEMM_PRECOMPILE
              value: "0"
            - name: SGLANG_JIT_DEEPGEMM_FAST_WARMUP
              value: "1"
            - name: NCCL_CUMEM_ENABLE
              value: "1"
            - name: GLOO_SOCKET_IFNAME
              value: eth0
          # 360 failures x 10s period = up to 1 hour allowed for startup.
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 360
  # Pre-existing PVC holding the model cache (create: false => not created here).
  pvcs:
    - name: shared-model-cache
      create: false
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DeepSeek-V4-Pro SGLang DynamoGraphDeployment
# Aggregated mode, B200x8 TP8, MXFP4 MoE, EAGLE MTP 3/4
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: sglang-dsv4-pro
spec:
  services:
    # --- Frontend: single replica, health-checked on port 8000 ---
    Frontend:
      componentType: frontend
      replicas: 1
      volumeMounts:
        - name: shared-model-cache
          mountPoint: /models
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
          imagePullPolicy: Always
          env:
            # Model cache is read from the shared volume; Hub access is disabled.
            - name: HF_HOME
              value: /models
            - name: HF_HUB_OFFLINE
              value: "1"
          startupProbe:
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 360

    # --- Worker: 8 GPUs, tensor parallel 8, health-checked on port 9090 ---
    decode:
      componentType: worker
      subComponentType: decode
      replicas: 1
      resources:
        limits:
          gpu: "8"
      volumeMounts:
        - name: shared-model-cache
          mountPoint: /models
      sharedMemory:
        size: 200Gi
      extraPodSpec:
        nodeSelector:
          nvidia.com/gpu.product: NVIDIA-B200
        tolerations:
          - key: nvidia.com/gpu
            operator: Equal
            value: "true"
            effect: NoSchedule
        mainContainer:
          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
          imagePullPolicy: Always
          workingDir: /workspace
          command: ["python3", "-m", "dynamo.sglang"]
          args:
            # Model identity
            - --model-path
            - deepseek-ai/DeepSeek-V4-Pro
            - --served-model-name
            - deepseek-ai/DeepSeek-V4-Pro
            - --trust-remote-code
            # Parallelism and MoE backend
            - --tp
            - "8"
            - --moe-runner-backend
            - flashinfer_mxfp4
            # Speculative decoding settings
            - --speculative-algo
            - EAGLE
            - --speculative-num-steps
            - "3"
            - --speculative-eagle-topk
            - "1"
            - --speculative-num-draft-tokens
            - "4"
            - --chunked-prefill-size
            - "4096"
            - --disable-flashinfer-autotune
            # Dynamo-side parsers for tool calls and reasoning output
            - --dyn-tool-call-parser
            - deepseek_v4
            - --dyn-reasoning-parser
            - deepseek_v4
          env:
            - name: HF_HOME
              value: /models
            - name: HF_HUB_OFFLINE
              value: "1"
            # Eight devices, in line with the gpu limit ("8") and --tp 8.
            - name: CUDA_VISIBLE_DEVICES
              value: "0,1,2,3,4,5,6,7"
            - name: SGLANG_JIT_DEEPGEMM_PRECOMPILE
              value: "0"
            - name: SGLANG_JIT_DEEPGEMM_FAST_WARMUP
              value: "1"
            - name: NCCL_CUMEM_ENABLE
              value: "1"
            - name: GLOO_SOCKET_IFNAME
              value: eth0
          startupProbe:
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 10
            failureThreshold: 360

  # The shared model cache PVC is expected to exist already (create: false).
  pvcs:
    - name: shared-model-cache
      create: false
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment