feat: add SGLang recipe for DeepSeek-V4 (#8704) (#8712)

Co-authored-by: Krishnan Prashanth <140860868+KrishnanPrash@users.noreply.github.com>

feat: add SGLang recipe for DeepSeek-V4 (#8704) (#8712)
Co-authored-by: Krishnan Prashanth <140860868+KrishnanPrash@users.noreply.github.com>
106a184a · Tushar Sharma · GitHub · 35fa7129 · 106a184a · 106a184a
Unverified Commit 106a184a authored Apr 24, 2026 by Tushar Sharma Committed by GitHub Apr 24, 2026
3 changed files
--- a/recipes/deepseek-v4-flash/sglang/Dockerfile.dsv4-sglang
+++ b/recipes/deepseek-v4-flash/sglang/Dockerfile.dsv4-sglang
+# DeepSeek-V4-Flash SGLang + Dynamo runtime image
+#
+# Two-step build:
+#   1. Build dynamo:latest-sglang-runtime locally per container/README.md:
+#        python3 container/render.py --framework sglang --target runtime
+#        docker build -f container/sglang-runtime-*.Dockerfile -t dynamo:latest-sglang-runtime .
+#   2. Build this overlay (adds V4 parsers + routed_experts fix):
+#        docker build -f recipes/deepseek-v4-flash/sglang/Dockerfile.dsv4-sglang \
+#          -t <your-registry>/sglang-dsv4:<tag> .
+ARG DYNAMO_SRC_IMAGE=dynamo:latest-sglang-runtime
+ARG DSV4_BASE_IMAGE=lmsysorg/sglang:deepseek-v4-blackwell
+# ---------- Stage 1: Build dynamo wheels with V4 parsers ----------
+# Produces two wheels in /opt/dynamo/wheelhouse (the maturin-built bindings
+# wheel and the pure-python wheel) that the final stage installs.
+FROM quay.io/pypa/manylinux_2_28_x86_64 AS wheel_builder
+RUN yum install -y openssl-devel clang-devel && yum clean all
+# Modern protoc
+# -f makes curl exit non-zero on an HTTP error instead of saving the error
+# page as protoc.zip and failing later inside unzip; -S keeps the error
+# message visible despite -s.
+RUN curl -fsSL https://github.com/protocolbuffers/protobuf/releases/download/v28.3/protoc-28.3-linux-x86_64.zip \
+    -o /tmp/protoc.zip && unzip -o /tmp/protoc.zip -d /usr/local && rm /tmp/protoc.zip
+# Rust
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+# Put cargo and the CPython 3.12 toolchain first on PATH so the bare `pip`
+# and `maturin` commands below resolve to them.
+ENV PATH=/root/.cargo/bin:/opt/python/cp312-cp312/bin:${PATH}
+RUN pip install maturin
+# Clone dynamo release branch (includes V4 parsers + sglang compat fixes)
+# NOTE(review): this tracks the branch tip, so rebuilds are not reproducible;
+# pin a commit if that matters.
+RUN git clone --depth 1 -b release/deepseekv4 \
+    https://github.com/ai-dynamo/dynamo.git /workspace
+# Build dynamo runtime wheel
+WORKDIR /workspace/lib/bindings/python
+RUN maturin build --release --interpreter /opt/python/cp312-cp312/bin/python3 \
+    -o /opt/dynamo/wheelhouse
+# Build ai-dynamo pure-python wheel
+WORKDIR /workspace
+RUN pip wheel --no-deps -w /opt/dynamo/wheelhouse .
+# ---------- Stage 2: Dynamo sglang-runtime donor ----------
+# Adds nothing of its own: it only gives the locally built Dynamo runtime
+# image a stage name so the final stage can COPY --from=dynamo_src.
+FROM ${DYNAMO_SRC_IMAGE} AS dynamo_src
+# ---------- Stage 3: Final image ----------
+FROM ${DSV4_BASE_IMAGE}
+ENV DEBIAN_FRONTEND=noninteractive
+# Infra from dynamo sglang-runtime (etcd, nats, UCX, NIXL)
+COPY --from=dynamo_src /usr/bin/nats-server /usr/bin/nats-server
+COPY --from=dynamo_src /usr/local/bin/etcd /usr/local/bin/etcd
+# NOTE(review): this PATH entry only helps if /usr/local/bin/etcd is a
+# directory holding the etcd binaries in the donor image - confirm.
+ENV PATH=/usr/local/bin/etcd:${PATH}
+# UCX libs
+COPY --from=dynamo_src /usr/lib/x86_64-linux-gnu/ucx /usr/lib/x86_64-linux-gnu/ucx
+COPY --from=dynamo_src /usr/lib/x86_64-linux-gnu/libuc*.so* /usr/lib/x86_64-linux-gnu/
+# NIXL + deps (pip packages with native libs)
+# One nixl* glob is enough: it already matches every nixl_cu12* entry, so a
+# second COPY for nixl_cu12* would only add a redundant layer.
+COPY --from=dynamo_src /usr/local/lib/python3.12/dist-packages/nixl* /usr/local/lib/python3.12/dist-packages/
+# Dynamo wheels with V4 parsers (overrides the base dynamo from donor)
+COPY --from=wheel_builder /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl /opt/dynamo/wheelhouse/
+COPY --from=wheel_builder /opt/dynamo/wheelhouse/ai_dynamo-*.whl /opt/dynamo/wheelhouse/
+# --force-reinstall --no-deps swaps in just these two wheels without touching
+# their dependencies; the python3 one-liner then fails the build if the
+# 'deepseek_v4' tool parser is not registered.
+RUN pip install --no-cache-dir --force-reinstall --no-deps \
+    /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
+    /opt/dynamo/wheelhouse/ai_dynamo-*.whl && \
+    python3 -c "from dynamo._core import get_tool_parser_names; assert 'deepseek_v4' in get_tool_parser_names(), 'V4 parser missing!'; print('V4 parser verified')"
+# Dynamo Python components from V4 branch
+COPY --from=wheel_builder /workspace/components/src/dynamo /workspace/components/src/dynamo
+# Fix: sglang repo dir at /workspace/sglang shadows the Python package
+ENV PYTHONPATH=/workspace/sglang/python:/workspace/components/src:${PYTHONPATH}
+# DeepGEMM JIT env vars
+ENV SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1
+WORKDIR /workspace
+# Clear any entrypoint inherited from the base image; the container command
+# is supplied at run time (the manifests use `python3 -m dynamo.sglang`).
+ENTRYPOINT []
+CMD ["bash"]
--- a/recipes/deepseek-v4-flash/sglang/sglang-dgd.yaml
+++ b/recipes/deepseek-v4-flash/sglang/sglang-dgd.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSeek-V4-Flash SGLang DynamoGraphDeployment
+# Aggregated mode (no P/D disagg), B200x4 TP4, MXFP4 MoE, EAGLE MTP 3/4
+#
+# Deploy:
+#   kubectl apply -f sglang-dgd.yaml -n <namespace>
+#
+# Test:
+#   kubectl port-forward -n <namespace> svc/sglang-dsv4-flash-frontend 8000:8000
+#   curl http://localhost:8000/v1/models
+#   curl http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' \
+#     -d '{"model":"deepseek-ai/DeepSeek-V4-Flash","messages":[{"role":"user","content":"Hello"}]}'
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-dsv4-flash
+spec:
+  services:
+    Frontend:
+      replicas: 1
+      componentType: frontend
+      # Mount the shared-model-cache volume (listed under spec.pvcs) at /models.
+      volumeMounts:
+        - mountPoint: /models
+          name: shared-model-cache
+      extraPodSpec:
+        mainContainer:
+          # NOTE(review): developer-registry image with a mutable tag. If you
+          # cannot pull it, use the image you built from Dockerfile.dsv4-sglang.
+          imagePullPolicy: Always
+          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
+          # Gate the pod on GET /health at port 8000: probed every 10 s,
+          # tolerating 360 failures (about one hour) before giving up.
+          startupProbe:
+            failureThreshold: 360
+            timeoutSeconds: 10
+            periodSeconds: 10
+            httpGet:
+              port: 8000
+              path: /health
+          # Hugging Face cache lives on the mounted volume; hub access is off.
+          env:
+            - name: HF_HOME
+              value: /models
+            - name: HF_HUB_OFFLINE
+              value: "1"
+    decode:
+      # SGLang worker: one replica requesting 4 GPUs, matching `--tp 4` below.
+      componentType: worker
+      subComponentType: decode
+      replicas: 1
+      resources:
+        limits:
+          gpu: "4"
+      volumeMounts:
+        - name: shared-model-cache
+          mountPoint: /models
+      sharedMemory:
+        size: 200Gi
+      extraPodSpec:
+        # Schedule only on B200 nodes and tolerate the
+        # nvidia.com/gpu=true:NoSchedule taint.
+        nodeSelector:
+          nvidia.com/gpu.product: NVIDIA-B200
+        tolerations:
+          - key: nvidia.com/gpu
+            operator: Equal
+            value: "true"
+            effect: NoSchedule
+        mainContainer:
+          # NOTE(review): developer-registry image with a mutable tag. If you
+          # cannot pull it, use the image you built from Dockerfile.dsv4-sglang.
+          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
+          imagePullPolicy: Always
+          workingDir: /workspace
+          command:
+            - python3
+            - -m
+            - dynamo.sglang
+          # Each flag and its value are separate list items; keep them adjacent
+          # and in this order.
+          args:
+            - --model-path
+            - deepseek-ai/DeepSeek-V4-Flash
+            - --served-model-name
+            - deepseek-ai/DeepSeek-V4-Flash
+            - --trust-remote-code
+            - --tp
+            - "4"
+            - --moe-runner-backend
+            - flashinfer_mxfp4
+            - --speculative-algo
+            - EAGLE
+            - --speculative-num-steps
+            - "3"
+            - --speculative-eagle-topk
+            - "1"
+            - --speculative-num-draft-tokens
+            - "4"
+            - --chunked-prefill-size
+            - "4096"
+            - --disable-flashinfer-autotune
+            - --dyn-tool-call-parser
+            - deepseek_v4
+            - --dyn-reasoning-parser
+            - deepseek_v4
+          env:
+            - name: HF_HOME
+              value: /models
+            - name: HF_HUB_OFFLINE
+              value: "1"
+            # List exactly the 4 GPUs this pod requests (gpu: "4", --tp 4).
+            # The previous 0-7 list named devices the pod is not given; CUDA
+            # silently drops everything from the first invalid index onward.
+            - name: CUDA_VISIBLE_DEVICES
+              value: "0,1,2,3"
+            - name: SGLANG_JIT_DEEPGEMM_PRECOMPILE
+              value: "0"
+            - name: SGLANG_JIT_DEEPGEMM_FAST_WARMUP
+              value: "1"
+            - name: NCCL_CUMEM_ENABLE
+              value: "1"
+            # NOTE(review): assumes the pod network interface is eth0 - confirm
+            # for your cluster's CNI.
+            - name: GLOO_SOCKET_IFNAME
+              value: eth0
+          # Gate the pod on GET /health at port 9090: probed every 10 s,
+          # tolerating 360 failures (about one hour) before giving up.
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            timeoutSeconds: 10
+            failureThreshold: 360
+  # create: false - this manifest does not ask for the PVC to be created, so
+  # a claim named shared-model-cache presumably must already exist.
+  pvcs:
+    - name: shared-model-cache
+      create: false
--- a/recipes/deepseek-v4-pro/sglang/sglang-dgd.yaml
+++ b/recipes/deepseek-v4-pro/sglang/sglang-dgd.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
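+# DeepSeek-V4-Pro SGLang DynamoGraphDeployment
+# Aggregated mode (no P/D disagg), B200x8 TP8, MXFP4 MoE, EAGLE MTP 3/4
+#
+# Deploy:
+#   kubectl apply -f sglang-dgd.yaml -n <namespace>
+#
+# Test:
+#   kubectl port-forward -n <namespace> svc/sglang-dsv4-pro-frontend 8000:8000
+#   curl http://localhost:8000/v1/models
+#   curl http://localhost:8000/v1/chat/completions -H 'Content-Type: application/json' \
+#     -d '{"model":"deepseek-ai/DeepSeek-V4-Pro","messages":[{"role":"user","content":"Hello"}]}'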
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-dsv4-pro
+spec:
+  services:
+    Frontend:
+      replicas: 1
+      componentType: frontend
+      # Mount the shared-model-cache volume (listed under spec.pvcs) at /models.
+      volumeMounts:
+        - mountPoint: /models
+          name: shared-model-cache
+      extraPodSpec:
+        mainContainer:
+          # NOTE(review): developer-registry image with a mutable tag; replace
+          # it with an image you can pull if this one is not accessible.
+          imagePullPolicy: Always
+          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
+          # Gate the pod on GET /health at port 8000: probed every 10 s,
+          # tolerating 360 failures (about one hour) before giving up.
+          startupProbe:
+            failureThreshold: 360
+            timeoutSeconds: 10
+            periodSeconds: 10
+            httpGet:
+              port: 8000
+              path: /health
+          # Hugging Face cache lives on the mounted volume; hub access is off.
+          env:
+            - name: HF_HOME
+              value: /models
+            - name: HF_HUB_OFFLINE
+              value: "1"
+    decode:
+      # SGLang worker: one replica requesting 8 GPUs, matching `--tp 8` below.
+      replicas: 1
+      componentType: worker
+      subComponentType: decode
+      resources:
+        limits:
+          gpu: "8"
+      sharedMemory:
+        size: 200Gi
+      volumeMounts:
+        - mountPoint: /models
+          name: shared-model-cache
+      extraPodSpec:
+        # Tolerate the nvidia.com/gpu=true:NoSchedule taint and schedule only
+        # on B200 nodes.
+        tolerations:
+          - effect: NoSchedule
+            key: nvidia.com/gpu
+            operator: Equal
+            value: "true"
+        nodeSelector:
+          nvidia.com/gpu.product: NVIDIA-B200
+        mainContainer:
+          # NOTE(review): developer-registry image with a mutable tag; replace
+          # it with an image you can pull if this one is not accessible.
+          imagePullPolicy: Always
+          image: nvcr.io/nvidian/dynamo-dev/kprashanth:sglang-dsv4-v2
+          workingDir: /workspace
+          env:
+            - name: HF_HOME
+              value: /models
+            - name: HF_HUB_OFFLINE
+              value: "1"
+            # All eight requested GPUs (gpu: "8", --tp 8).
+            - name: CUDA_VISIBLE_DEVICES
+              value: "0,1,2,3,4,5,6,7"
+            - name: SGLANG_JIT_DEEPGEMM_PRECOMPILE
+              value: "0"
+            - name: SGLANG_JIT_DEEPGEMM_FAST_WARMUP
+              value: "1"
+            - name: NCCL_CUMEM_ENABLE
+              value: "1"
+            # NOTE(review): assumes the pod network interface is eth0 - confirm
+            # for your cluster's CNI.
+            - name: GLOO_SOCKET_IFNAME
+              value: eth0
+          command:
+            - python3
+            - -m
+            - dynamo.sglang
+          # Each flag and its value are separate list items; keep them adjacent
+          # and in this order.
+          args:
+            - --model-path
+            - deepseek-ai/DeepSeek-V4-Pro
+            - --served-model-name
+            - deepseek-ai/DeepSeek-V4-Pro
+            - --trust-remote-code
+            - --tp
+            - "8"
+            - --moe-runner-backend
+            - flashinfer_mxfp4
+            - --speculative-algo
+            - EAGLE
+            - --speculative-num-steps
+            - "3"
+            - --speculative-eagle-topk
+            - "1"
+            - --speculative-num-draft-tokens
+            - "4"
+            - --chunked-prefill-size
+            - "4096"
+            - --disable-flashinfer-autotune
+            - --dyn-tool-call-parser
+            - deepseek_v4
+            - --dyn-reasoning-parser
+            - deepseek_v4
+          # Gate the pod on GET /health at port 9090: probed every 10 s,
+          # tolerating 360 failures (about one hour) before giving up.
+          startupProbe:
+            failureThreshold: 360
+            timeoutSeconds: 10
+            periodSeconds: 10
+            httpGet:
+              port: 9090
+              path: /health
+  # create: false - this manifest does not ask for the PVC to be created, so
+  # a claim named shared-model-cache presumably must already exist.
+  pvcs:
+    - create: false
+      name: shared-model-cache