Unverified commit 47488cc3, authored by ishandhanani, committed by GitHub
Browse files

docker: x86 dev builds for hopper and blackwell (#11075)

parent 61305291
...@@ -3,7 +3,7 @@ name: Build and Push Development Docker Images ...@@ -3,7 +3,7 @@ name: Build and Push Development Docker Images
on: on:
workflow_dispatch: workflow_dispatch:
schedule: schedule:
- cron: '0 0 * * *' - cron: "0 0 * * *"
jobs: jobs:
build-dev-x86: build-dev-x86:
...@@ -14,7 +14,7 @@ jobs: ...@@ -14,7 +14,7 @@ jobs:
variant: variant:
- version: 12.9.1 - version: 12.9.1
type: all type: all
tag: dev tag: dev-x86
steps: steps:
- name: Delete huge unnecessary tools folder - name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache run: rm -rf /opt/hostedtoolcache
...@@ -46,15 +46,15 @@ jobs: ...@@ -46,15 +46,15 @@ jobs:
run: | run: |
docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache . docker buildx build --platform linux/amd64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
build-blackwell-arm: build-dev-arm:
if: ${{ github.repository == 'sgl-project/sglang' }} if: ${{ github.repository == 'sgl-project/sglang' }}
runs-on: labubu runs-on: sgl-kernel-release-node-arm
strategy: strategy:
matrix: matrix:
variant: variant:
- version: 12.9.1 - version: 12.9.1
type: blackwell_aarch type: blackwell_aarch64
tag: blackwell-cu129 tag: dev-arm64
steps: steps:
- name: Delete huge unnecessary tools folder - name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache run: rm -rf /opt/hostedtoolcache
...@@ -84,19 +84,18 @@ jobs: ...@@ -84,19 +84,18 @@ jobs:
- name: Build and Push Blackwell Image (ARM) - name: Build and Push Blackwell Image (ARM)
run: | run: |
docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }}-arm64 --no-cache . docker buildx build --platform linux/arm64 --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.variant.version }} --build-arg BUILD_TYPE=${{ matrix.variant.type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.variant.tag }} --no-cache .
create-manifests: create-manifests:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
needs: [build-dev-x86, build-blackwell-arm] needs: [build-dev-x86, build-dev-arm]
if: ${{ github.repository == 'sgl-project/sglang' }} if: ${{ github.repository == 'sgl-project/sglang' }}
strategy: strategy:
matrix: matrix:
variant: variant:
- tag: dev-manifest - tag: dev
x86_tag: dev x86_tag: dev-x86
arm64_tag: blackwell-cu129-arm64 arm64_tag: dev-arm64
steps: steps:
- uses: docker/setup-buildx-action@v3 - uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v2 - uses: docker/login-action@v2
......
...@@ -10,16 +10,16 @@ on: ...@@ -10,16 +10,16 @@ on:
jobs: jobs:
publish-x86: publish-x86:
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
environment: 'prod' environment: "prod"
strategy: strategy:
matrix: matrix:
variant: variant:
- cuda_version: '12.6.1' - cuda_version: "12.6.1"
build_type: 'all' build_type: "all"
- cuda_version: '12.8.1' - cuda_version: "12.8.1"
build_type: 'blackwell' build_type: "blackwell"
- cuda_version: '12.9.1' - cuda_version: "12.9.1"
build_type: 'blackwell' build_type: "blackwell"
runs-on: nvidia runs-on: nvidia
steps: steps:
- name: Delete huge unnecessary tools folder - name: Delete huge unnecessary tools folder
...@@ -82,13 +82,13 @@ jobs: ...@@ -82,13 +82,13 @@ jobs:
publish-arm64: publish-arm64:
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
environment: 'prod' environment: "prod"
strategy: strategy:
matrix: matrix:
variant: variant:
- cuda_version: '12.9.1' - cuda_version: "12.9.1"
build_type: 'blackwell_aarch' build_type: "blackwell_aarch64"
runs-on: labubu runs-on: sgl-kernel-release-node-arm
steps: steps:
- name: Delete huge unnecessary tools folder - name: Delete huge unnecessary tools folder
run: rm -rf /opt/hostedtoolcache run: rm -rf /opt/hostedtoolcache
......
...@@ -206,7 +206,7 @@ jobs: ...@@ -206,7 +206,7 @@ jobs:
build-cu129-aarch64: build-cu129-aarch64:
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
runs-on: labubu runs-on: sgl-kernel-release-node-arm
strategy: strategy:
matrix: matrix:
python-version: ["3.10"] python-version: ["3.10"]
......
...@@ -93,9 +93,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li ...@@ -93,9 +93,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
&& FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin && FLASHINFER_LOGGING_LEVEL=warning python3 -m flashinfer --download-cubin
# Download source files # Download NVSHMEM source files
# We use Tom's DeepEP fork for GB200 for now
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
if [ "$BUILD_TYPE" = "blackwell_aarch" ] && [ "$(uname -m)" = "aarch64" ]; then \ if [ "$BUILD_TYPE" = "blackwell_aarch64" ]; then \
git clone https://github.com/fzyzcjy/DeepEP.git \ git clone https://github.com/fzyzcjy/DeepEP.git \
&& cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \ && cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
else \ else \
...@@ -108,7 +109,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour ...@@ -108,7 +109,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
# Build and install NVSHMEM # Build and install NVSHMEM
RUN cd /sgl-workspace/nvshmem && \ RUN cd /sgl-workspace/nvshmem && \
if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \ export CUDA_ARCH="90;100;120" && \
NVSHMEM_SHMEM_SUPPORT=0 \ NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \ NVSHMEM_USE_NCCL=0 \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment