Unverified Commit 285a8e69 authored by ishandhanani, committed by GitHub

docker: add CUDA 13 support in Dockerfile and update GDRCopy/NVSHMEM for Blackwell support (#11517)


Co-authored-by: Baizhou Zhang <sobereddiezhang@gmail.com>
parent 813bd6f8
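For context on what this commit produces: once the release workflow below has been dispatched and the manifests published, the multi-arch CUDA 13 dev image should be consumable roughly as sketched here. This is a usage sketch, not part of the commit; the tag name comes from the workflow's manifest step, and availability depends on the release actually having run.

```bash
# Pull the multi-arch CUDA 13 dev image the workflow publishes (tag from the workflow below).
docker pull lmsysorg/sglang:dev-cu13

# Quick smoke test: confirm the torch build inside matches CUDA 13.
# --gpus all assumes the NVIDIA Container Toolkit is installed on the host.
docker run --gpus all --rm lmsysorg/sglang:dev-cu13 \
  python3 -c "import torch; print(torch.__version__, torch.version.cuda)"
```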
name: Build and Push CUDA 13 Docker Images
# release this manually via workflow_dispatch for now
on:
  workflow_dispatch:

jobs:
  build-dev:
    if: ${{ github.repository == 'sgl-project/sglang' }}
    runs-on: ${{ matrix.runner }}
    strategy:
      matrix:
        include:
          - runner: x64-docker-build-node
            platform: linux/amd64
            build_type: all
            grace_blackwell: 0
            tag: dev-x86-cu13
            version: 13.0.1
          - runner: arm-docker-build-node
            platform: linux/arm64
            build_type: all
            grace_blackwell: 1
            tag: dev-arm64-cu13
            version: 13.0.1
    steps:
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Free disk space
        uses: jlumbroso/free-disk-space@main
        with:
          tool-cache: true
          docker-images: true
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          swap-storage: true
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Build and Push Dev Image
        run: |
          docker buildx build \
            --platform ${{ matrix.platform }} \
            --push \
            -f docker/Dockerfile \
            --build-arg CUDA_VERSION=${{ matrix.version }} \
            --build-arg BUILD_TYPE=${{ matrix.build_type }} \
            --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \
            --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} \
            -t lmsysorg/sglang:${{ matrix.tag }} \
            --no-cache \
            .

  create-manifests:
    runs-on: ubuntu-22.04
    needs: [build-dev]
    if: ${{ github.repository == 'sgl-project/sglang' }}
    strategy:
      matrix:
        variant:
          - tag: dev-cu13
            x86_tag: dev-x86-cu13
            arm64_tag: dev-arm64-cu13
    steps:
      - uses: docker/setup-buildx-action@v3
      - uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - run: |
          docker buildx imagetools create \
            -t lmsysorg/sglang:${{ matrix.variant.tag }} \
            -t lmsysorg/sglang:nightly-${{ matrix.variant.tag }}-$(date +%Y%m%d)-${GITHUB_SHA:0:8} \
            lmsysorg/sglang:${{ matrix.variant.x86_tag }} \
            lmsysorg/sglang:${{ matrix.variant.arm64_tag }}
      - name: Cleanup Old Nightly Builds
        run: |
          # Get a JWT token for the Docker Hub API
          TOKEN=$(curl -s -H "Content-Type: application/json" -X POST -d '{"username": "${{ secrets.DOCKERHUB_USERNAME }}", "password": "${{ secrets.DOCKERHUB_TOKEN }}"}' https://hub.docker.com/v2/users/login/ | jq -r .token)
          # Get all tags for the repository
          TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/?page_size=100")
          # Extract tags that match our pattern and sort by last_updated timestamp (most recent first)
          TAGS=$(echo "$TAGS_RESPONSE" | jq -r '.results[] | select(.name | startswith("nightly-${{ matrix.variant.tag }}-")) | "\(.last_updated)|\(.name)"' | sort -r | cut -d'|' -f2)
          # Count total tags and keep only the 14 most recent
          TAG_COUNT=$(echo "$TAGS" | wc -l)
          if [ "$TAG_COUNT" -gt 14 ]; then
            echo "Found $TAG_COUNT nightly builds, keeping only the 14 most recent"
            TAGS_TO_DELETE=$(echo "$TAGS" | tail -n +15)
            echo "Tags to delete: $TAGS_TO_DELETE"
            # Delete old tags
            for tag in $TAGS_TO_DELETE; do
              echo "Deleting tag: $tag"
              curl -X DELETE \
                -H "Authorization: JWT $TOKEN" \
                "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags/$tag/"
            done
          else
            echo "Only $TAG_COUNT nightly builds found, no cleanup needed"
          fi
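Since the workflow is gated on workflow_dispatch, it has to be launched by hand. A minimal sketch with the GitHub CLI follows; the workflow filename here is an assumption (it is not visible in this diff), so substitute the real path under .github/workflows/.

```bash
# Dispatch the CUDA 13 release workflow manually (filename is hypothetical).
gh workflow run release-docker-cu13.yml --repo sgl-project/sglang

# List recent runs of that workflow to follow progress.
gh run list --repo sgl-project/sglang --workflow=release-docker-cu13.yml --limit 5
```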
@@ -53,7 +53,17 @@ jobs:
       - name: Build and Push Dev Image
         run: |
-          docker buildx build --platform ${{ matrix.platform }} --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.tag }} --no-cache .
+          docker buildx build \
+            --platform ${{ matrix.platform }} \
+            --push \
+            -f docker/Dockerfile \
+            --build-arg CUDA_VERSION=${{ matrix.version }} \
+            --build-arg BUILD_TYPE=${{ matrix.build_type }} \
+            --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \
+            --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} \
+            -t lmsysorg/sglang:${{ matrix.tag }} \
+            --no-cache \
+            .

   create-manifests:
     runs-on: ubuntu-22.04
......
 ARG CUDA_VERSION=12.9.1
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base

-ARG TARGETARCH
-ARG GRACE_BLACKWELL=0
+ARG TARGETARCH
 ARG BUILD_TYPE=all
 ARG BRANCH_TYPE=remote
+ARG GRACE_BLACKWELL=0
+ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
 ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG FLASHMLA_COMMIT=1408756a88e52a25196b759eaf8db89d2b51b5a1
+ARG FAST_HADAMARD_TRANSFORM_COMMIT=7fd811c2b47f63b0b08d2582619f939e14dad77c
-ARG CMAKE_BUILD_PARALLEL_LEVEL=2
+ARG TRITON_LANG_COMMIT=4caa0328bf8df64896dd5f6fb9df41b0eb2e750a
 ARG SGL_KERNEL_VERSION=0.3.16.post4
+ARG GDRCOPY_VERSION=2.5.1
+ARG NVSHMEM_VERSION=3.4.5

 ENV DEBIAN_FRONTEND=noninteractive \
     CUDA_HOME=/usr/local/cuda \
-    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
+    GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
     NVSHMEM_DIR=/sgl-workspace/nvshmem/install

 # Add GKE default lib and bin locations.
 ENV PATH="${PATH}:/usr/local/nvidia/bin" \
@@ -55,7 +61,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # GDRCopy installation
 RUN mkdir -p /tmp/gdrcopy && cd /tmp \
-    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
+    && git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \
     && cd gdrcopy/packages \
     && CUDA=/usr/local/cuda ./build-deb-packages.sh \
     && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
@@ -69,6 +75,7 @@ COPY . /src
 FROM base AS build-image

 # Install SGLang
+# Until torch 2.9 and cu13 are stable we manually update torch if you are on CUDA 13
 WORKDIR /sgl-workspace
 ARG BRANCH_TYPE
 COPY --from=local_src /src /tmp/local_src
@@ -84,36 +91,64 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
         12.6.1) CUINDEX=126 ;; \
         12.8.1) CUINDEX=128 ;; \
         12.9.1) CUINDEX=129 ;; \
+        13.0.1) CUINDEX=130 ;; \
         *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
     esac \
-    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
-        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
-    fi \
-    && if [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
-        python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} ; \
+    && if [ "$CUDA_VERSION" = "12.6.1" ]; then \
+        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+        ; \
+    elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
+        python3 -m pip install --no-cache-dir sgl-kernel==${SGL_KERNEL_VERSION} \
+        ; \
+    elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
+        python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
+        ; \
+    else \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
+        ; \
     fi \
     && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
-    && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
+    && if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
+        python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
+    elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
+        python3 -m pip install --no-cache-dir nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
+        python3 -m pip uninstall -y torch torchaudio torchvision ; \
+        python3 -m pip install --no-cache-dir torch==2.9.0 torchaudio==2.9.0 torchvision --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} ; \
+    else \
+        echo "No NCCL mapping for CUDA_VERSION=${CUDA_VERSION}" && exit 1 ; \
+    fi \
     && FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin

 # Download NVSHMEM source files
 # We use Tom's DeepEP fork for GB200 for now; the 1fd57b0276311d035d16176bb0076426166e52f3 commit is https://github.com/fzyzcjy/DeepEP/tree/gb200_blog_part_2
-RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
-    if [ "$GRACE_BLACKWELL" = "1" ]; then \
-        git clone https://github.com/fzyzcjy/DeepEP.git \
-        && cd DeepEP && git checkout 1fd57b0276311d035d16176bb0076426166e52f3 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
-    else \
-        git clone https://github.com/deepseek-ai/DeepEP.git \
-        && cd DeepEP && git checkout ${DEEPEP_COMMIT} && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
-    fi \
-    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
-    && mv nvshmem_src nvshmem \
-    && rm -f /sgl-workspace/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+RUN set -eux; \
+    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
+        wget "https://github.com/NVIDIA/nvshmem/releases/download/v${NVSHMEM_VERSION}-0/nvshmem_src_cuda-all-all-${NVSHMEM_VERSION}.tar.gz"; \
+        NVSHMEM_TARBALL="nvshmem_src_cuda-all-all-${NVSHMEM_VERSION}.tar.gz"; \
+    else \
+        wget "https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz"; \
+        NVSHMEM_TARBALL="nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz"; \
+    fi && \
+    if [ "$GRACE_BLACKWELL" = "1" ]; then \
+        git clone https://github.com/fzyzcjy/DeepEP.git && \
+        cd DeepEP && \
+        git checkout ${GRACE_BLACKWELL_DEEPEP_BRANCH} && \
+        sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
+        cd .. ; \
+    else \
+        git clone https://github.com/deepseek-ai/DeepEP.git && \
+        cd DeepEP && \
+        git checkout "${DEEPEP_COMMIT}" && \
+        sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && \
+        cd .. ; \
    fi && \
+    tar -xf "${NVSHMEM_TARBALL}" && \
+    mv nvshmem_src nvshmem && \
+    rm -f "/sgl-workspace/${NVSHMEM_TARBALL}"

 # Build and install NVSHMEM
 RUN cd /sgl-workspace/nvshmem && \
-    if [ "$GRACE_BLACKWELL" = "1" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
+    if [ "$GRACE_BLACKWELL" = "1" ]; then CUDA_ARCH="90;100;103;120"; else CUDA_ARCH="90"; fi && \
     NVSHMEM_SHMEM_SUPPORT=0 \
     NVSHMEM_UCX_SUPPORT=0 \
     NVSHMEM_USE_NCCL=0 \
@@ -126,29 +161,50 @@ RUN cd /sgl-workspace/nvshmem && \
     cmake --build build --target install -j${CMAKE_BUILD_PARALLEL_LEVEL}

 # Install DeepEP
+# CTK13 requires the cccl include
 RUN cd /sgl-workspace/DeepEP && \
     case "$CUDA_VERSION" in \
         12.6.1) \
             CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' \
             ;; \
-        12.8.1|12.9.1) \
-            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' \
+        12.8.1|12.9.1|13.0.1) \
+            CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' \
             ;; \
         *) \
             echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 \
             ;; \
     esac && \
+    if [ "${CUDA_VERSION%%.*}" = "13" ]; then \
+        sed -i "/^ include_dirs = \['csrc\/'\]/a\ include_dirs.append('${CUDA_HOME}/include/cccl')" setup.py; \
+    fi && \
     NVSHMEM_DIR=${NVSHMEM_DIR} TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" pip install --no-build-isolation .

 # Install flashmla
-RUN git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla && \
+RUN if [ "$CUDA_VERSION" != "13.0.1" ]; then \
+    git clone https://github.com/deepseek-ai/FlashMLA.git flash-mla && \
     cd flash-mla && \
     git checkout ${FLASHMLA_COMMIT} && \
     git submodule update --init --recursive && \
     if [ "$CUDA_VERSION" = "12.6.1" ]; then \
         export FLASH_MLA_DISABLE_SM100=1; \
     fi && \
-    pip install --no-build-isolation -v . ;
+    pip install --no-build-isolation -v . ; \
+    fi

+# In order to use flashinfer_cutedsl without IMA for WideEP configs we must install
+# latest flashinfer_cutedsl. Once 0.4.3 is officially released, remove this
+RUN python3 -m pip install --no-cache-dir --upgrade --pre "nvidia-cutlass-dsl==4.3.0.dev0"

+# For cuda 13, we install triton from source to fix some sm103 issues
+# This can be reverted after >3.4.5 is released
+# See the conversation in: https://github.com/triton-lang/triton/pull/8536
+RUN if [ "$CUDA_VERSION" = "13.0.1" ]; then \
+    git clone https://github.com/triton-lang/triton.git && \
+    cd triton && \
+    git checkout ${TRITON_LANG_COMMIT} && \
+    pip install --break-system-packages -r python/requirements.txt && \
+    MAX_JOBS=20 pip install --break-system-packages -e .; \
+    fi

 # Python tools
 RUN python3 -m pip install --no-cache-dir \
......
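To reproduce the CI image locally, the Dockerfile above can be built with the same build args the workflow passes. A sketch for an x86 CUDA 13 build; set GRACE_BLACKWELL=1 only for the GB200/aarch64 variant, matching the workflow matrix. The local tag name is just an example.

```bash
# Local equivalent of the CI build for the x86 CUDA 13 image.
docker build \
  -f docker/Dockerfile \
  --build-arg CUDA_VERSION=13.0.1 \
  --build-arg BUILD_TYPE=all \
  --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) \
  --build-arg GRACE_BLACKWELL=0 \
  -t sglang:dev-cu13-local \
  .
```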
@@ -12,10 +12,11 @@ It is recommended to use uv for faster installation:

 ```bash
 pip install --upgrade pip
 pip install uv
-uv pip install sglang --prerelease=allow
+uv pip install "sglang" --prerelease=allow
 ```

 **Quick fixes to common problems**

 - If you encounter `OSError: CUDA_HOME environment variable is not set`. Please set it to your CUDA install root with either of the following solutions:
   1. Use `export CUDA_HOME=/usr/local/cuda-<your-cuda-version>` to set the `CUDA_HOME` environment variable.
   2. Install FlashInfer first following [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html), then install SGLang as described above.
@@ -33,6 +34,7 @@ pip install -e "python"
 ```

 **Quick fixes to common problems**

 - If you want to develop SGLang, it is recommended to use docker. Please refer to [setup docker container](../developer_guide/development_guide_using_docker.md#setup-docker-container). The docker image is `lmsysorg/sglang:dev`.

 ## Method 3: Using docker
......
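As a companion to the CUDA_HOME fix described in the doc above, a quick sanity check that the variable points at a real toolkit and agrees with the installed torch wheel; the path is an example, adjust to your install.

```bash
# Point CUDA_HOME at the toolkit root (example path) and verify both sides agree.
export CUDA_HOME=/usr/local/cuda-13.0
"$CUDA_HOME/bin/nvcc" --version                        # toolkit version on disk
python3 -c "import torch; print(torch.version.cuda)"   # CUDA version the torch wheel was built for
```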
@@ -60,11 +60,11 @@ dependencies = [
     "soundfile==0.13.1",
     "tiktoken",
     "timm==1.0.16",
-    "torch==2.8.0",
     "torch_memory_saver==0.0.9",
-    "torchao==0.9.0",
+    "torch==2.8.0",
     "torchaudio==2.8.0",
     "torchvision",
+    "torchao==0.9.0",
     "tqdm",
     "transformers==4.57.1",
     "uvicorn",
@@ -77,7 +77,7 @@ dependencies = [
 ]

 [project.optional-dependencies]
-modelopt = ["nvidia-modelopt"]
+checkpoint-engine = ["checkpoint-engine==0.1.2"]
 test = [
     "accelerate",
     "expecttest",
@@ -89,21 +89,6 @@ test = [
     "sentence_transformers",
     "tabulate",
 ]
-checkpoint-engine = ["checkpoint-engine==0.1.2"]
-all = []
-dev = ["sglang[test]"]
-
-# Temporary tags
-cu130 = [
-    "torch==2.9.0",
-    "torchaudio==2.9.0",
-    "torchvision==0.24.0",
-]
-cu130_all = [
-    "sglang[test]",
-    "sglang[decord]",
-    "sglang[cu130]"
-]
 tracing = [
     "opentelemetry-api",
     "opentelemetry-exporter-otlp",
@@ -111,10 +96,6 @@ tracing = [
     "opentelemetry-sdk",
 ]

-# To be deprecated in 2 weeks
-blackwell = ["sglang[dev]"]
-blackwell_aarch64 = ["sglang[dev]"]
-
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
......
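The removed cu130/cu130_all extras are superseded by the Dockerfile's explicit torch upgrade on CUDA 13. For reference, the manual equivalent of the dropped `sglang[cu130]` extra, using the same pins the Dockerfile diff above installs:

```bash
# Manual replacement for the removed `sglang[cu130]` extra (pins taken from the Dockerfile diff).
pip install torch==2.9.0 torchaudio==2.9.0 torchvision \
  --extra-index-url https://download.pytorch.org/whl/cu130
```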
@@ -4,7 +4,7 @@ set -euxo pipefail

 bash scripts/ci/ci_install_dependency.sh

-export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
+export GDRCOPY_HOME=/usr/src/gdrdrv-2.5.1/
 export NVSHMEM_DIR=/opt/nvshmem/install
 export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
 export PATH="${NVSHMEM_DIR}/bin:$PATH"
@@ -27,9 +27,9 @@ rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy
 rm -rf /opt/nvshmem && mkdir -p /opt/nvshmem
 cd /opt/gdrcopy
 git clone https://github.com/NVIDIA/gdrcopy.git .
-git checkout v2.4.4
+git checkout v2.5.1
 apt update
-apt install -y nvidia-dkms-535
+apt install -y nvidia-dkms-580
 apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
 apt install -y check libsubunit0 libsubunit-dev python3-venv
 cd packages
@@ -46,8 +46,8 @@ apt-get update && apt-get install -y libfabric-dev

 # Install NVSHMEM
 cd /opt/nvshmem
-wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
-tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
+wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/source/nvshmem_src_cuda12-all-all-3.4.5.tar.gz
+tar -xf nvshmem_src_cuda12-all-all-3.4.5.tar.gz
 mv nvshmem_src nvshmem && cd nvshmem
 NVSHMEM_SHMEM_SUPPORT=0 \
 NVSHMEM_UCX_SUPPORT=0 \
@@ -57,7 +57,7 @@ NVSHMEM_IBGDA_SUPPORT=1 \
 NVSHMEM_PMIX_SUPPORT=0 \
 NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
 NVSHMEM_USE_GDRCOPY=1 \
-cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90
+cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES="90;100;103;121"
 cd build
 make -j$(nproc) install
......
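After the GDRCopy 2.5.1 / NVSHMEM 3.4.5 upgrade above, one hedged way to verify both installs on a CI node; the binary names assume the gdrcopy test package and NVSHMEM's bundled tooling are laid out as this script installs them.

```bash
# Sanity-check GDRCopy: gdrcopy_sanity ships with the gdrcopy-tests package
# and exercises the gdrdrv kernel module end to end (assumes the module is loaded).
gdrcopy_sanity

# Inspect the NVSHMEM build that was just installed (feature flags, CUDA arch list).
/opt/nvshmem/install/bin/nvshmem-info -a | head -n 20
```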
@@ -55,7 +55,7 @@ else
     $PIP_CMD install flashinfer-python==0.4.1 --prerelease=allow $PIP_INSTALL_SUFFIX

     # Install the main package
-    $PIP_CMD install -e "python[dev]" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --upgrade
+    $PIP_CMD install -e "python" --extra-index-url https://download.pytorch.org/whl/${CU_VERSION} $PIP_INSTALL_SUFFIX --upgrade
 fi

 # Install router for pd-disagg test
@@ -68,7 +68,7 @@ echo "SGL_KERNEL_VERSION_FROM_KERNEL=${SGL_KERNEL_VERSION_FROM_KERNEL} SGL_KERNE
 if [ "${CUSTOM_BUILD_SGL_KERNEL:-}" = "true" ]; then
     ls -alh sgl-kernel/dist
-    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
+    $PIP_CMD install sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}+${CU_VERSION}-cp310-abi3-manylinux2014_x86_64.whl --force-reinstall $PIP_INSTALL_SUFFIX
 else
     $PIP_CMD install sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT} --force-reinstall $PIP_INSTALL_SUFFIX
 fi
......
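The wheel path change above appends the CUDA local version tag, so a custom-built sgl-kernel wheel is matched per CUDA variant. A worked example of how the filename resolves, using values visible elsewhere in this commit (shown for illustration only):

```bash
# With these example values, the install path resolves as shown below.
SGL_KERNEL_VERSION_FROM_KERNEL=0.3.16.post4
CU_VERSION=cu130
echo "sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}+${CU_VERSION}-cp310-abi3-manylinux2014_x86_64.whl"
# -> sgl-kernel/dist/sgl_kernel-0.3.16.post4+cu130-cp310-abi3-manylinux2014_x86_64.whl
```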