Unverified Commit 8969240a authored by Dillon Cullinan, committed by GitHub
Browse files

ci: OPS-980: Add ARM build (#3146)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 89cf9107
...@@ -9,6 +9,10 @@ inputs: ...@@ -9,6 +9,10 @@ inputs:
description: 'Target to build' description: 'Target to build'
required: false required: false
default: 'runtime' default: 'runtime'
# Docker platform string; forwarded to container/build.sh as --platform.
platform:
  description: 'Docker platform to build on, e.g. linux/amd64'
  required: false
  default: 'linux/amd64'
image_tag: image_tag:
description: 'Custom image tag (optional, defaults to framework:latest)' description: 'Custom image tag (optional, defaults to framework:latest)'
required: false required: false
...@@ -63,6 +67,7 @@ runs: ...@@ -63,6 +67,7 @@ runs:
SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }} SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }}
AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
PLATFORM: ${{ inputs.platform }}
run: | run: |
# Determine image tag # Determine image tag
if [ -n "${{ inputs.image_tag }}" ]; then if [ -n "${{ inputs.image_tag }}" ]; then
...@@ -70,11 +75,13 @@ runs: ...@@ -70,11 +75,13 @@ runs:
else else
IMAGE_TAG="${{ inputs.framework }}:latest" IMAGE_TAG="${{ inputs.framework }}:latest"
fi fi
echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
echo "image_tag=$IMAGE_TAG" >> $GITHUB_OUTPUT
./container/build.sh --tag "$IMAGE_TAG" \ ./container/build.sh --tag "$IMAGE_TAG" \
--target ${{ inputs.target }} \ --target ${{ inputs.target }} \
--vllm-max-jobs 10 \
--framework ${{ inputs.framework }} \ --framework ${{ inputs.framework }} \
--platform ${{ inputs.platform }} \
--use-sccache \ --use-sccache \
--sccache-bucket "$SCCACHE_S3_BUCKET" \ --sccache-bucket "$SCCACHE_S3_BUCKET" \
--sccache-region "$AWS_DEFAULT_REGION" --sccache-region "$AWS_DEFAULT_REGION"
# Composite action: re-tags a locally built image and pushes it to AWS ECR
# and/or Azure ACR, depending on the aws_push / azure_push switches.
description: 'Tag and Push Docker Images'
inputs:
  local_image:
    description: 'Local Image Name:Tag'
    required: true
  push_tag:
    description: 'Target Name:Tag'
    required: true
  # The *_push switches are compared against the string 'true' below.
  aws_push:
    description: 'Push to AWS Boolean'
    required: false
    default: 'false'
  azure_push:
    description: 'Push to Azure Container Registry (ACR) Boolean'
    required: false
    default: 'false'
  aws_account_id:
    description: 'AWS Account ID'
    required: false
  aws_default_region:
    description: 'AWS Default Region'
    required: false
  aws_access_key_id:
    description: 'AWS Access Key ID'
    required: false
  aws_secret_access_key:
    description: 'AWS Secret Access Key'
    required: false
  azure_acr_hostname:
    description: 'Azure ACR hostname'
    required: false
  azure_acr_user:
    description: 'Azure ACR user'
    required: false
  azure_acr_password:
    description: 'Azure ACR password'
    required: false
outputs:
  image_tag:
    description: 'Image Tag'
    value: ${{ inputs.push_tag }}
runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Install awscli
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      run: |
        # Unpack in a scratch directory so the checkout is not polluted and a
        # re-run never trips over leftovers from a previous install.
        workdir="$(mktemp -d)"
        # uname -m supplies the architecture part of the installer file name.
        curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "${workdir}/awscliv2.zip"
        unzip -q "${workdir}/awscliv2.zip" -d "${workdir}"
        # --update lets the installer succeed when a CLI is already present.
        sudo "${workdir}/aws/install" --update
        rm -rf "${workdir}"
    - name: ECR Login
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      # Inputs are handed over as environment variables instead of being
      # expanded into the script text, so their content is never parsed as shell.
      env:
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
        ECR_REGION: ${{ inputs.aws_default_region }}
        ECR_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
        ECR_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
      run: |
        # Use the credentials passed to this action when both are provided;
        # otherwise leave whatever credentials the runner already has untouched.
        if [ -n "${ECR_ACCESS_KEY_ID}" ] && [ -n "${ECR_SECRET_ACCESS_KEY}" ]; then
          export AWS_ACCESS_KEY_ID="${ECR_ACCESS_KEY_ID}"
          export AWS_SECRET_ACCESS_KEY="${ECR_SECRET_ACCESS_KEY}"
        fi
        aws ecr get-login-password --region "${ECR_REGION}" | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
    - name: ACR Login
      if: ${{ inputs.azure_push == 'true' }}
      shell: bash
      # The password travels through the environment, never through the
      # script text, so quotes or '$' inside it cannot break or alter the command.
      env:
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
        AZURE_ACR_USER: ${{ inputs.azure_acr_user }}
        AZURE_ACR_PASSWORD: ${{ inputs.azure_acr_password }}
      run: |
        printf '%s' "${AZURE_ACR_PASSWORD}" | docker login "${AZURE_ACR_HOSTNAME}" --username "${AZURE_ACR_USER}" --password-stdin
    - name: ECR Tag and Push
      if: ${{ inputs.aws_push == 'true' }}
      shell: bash
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAG: ${{ inputs.push_tag }}
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
      run: |
        docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${PUSH_TAG}"
        docker push "${ECR_HOSTNAME}/${PUSH_TAG}"
    - name: ACR Tag and Push
      if: ${{ inputs.azure_push == 'true' }}
      shell: bash
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAG: ${{ inputs.push_tag }}
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
      run: |
        docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${PUSH_TAG}"
        docker push "${AZURE_ACR_HOSTNAME}/${PUSH_TAG}"
...@@ -8,6 +8,7 @@ on: ...@@ -8,6 +8,7 @@ on:
branches: branches:
- main - main
- "pull-request/[0-9]+" - "pull-request/[0-9]+"
- release/*.*.*
concurrency: concurrency:
group: ${{ github.workflow }}-build-test-${{ github.ref_name || github.run_id }} group: ${{ github.workflow }}-build-test-${{ github.ref_name || github.run_id }}
...@@ -37,9 +38,16 @@ jobs: ...@@ -37,9 +38,16 @@ jobs:
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))' echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
vllm: vllm:
runs-on: gpu-l40-amd64
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: vllm (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
...@@ -49,22 +57,47 @@ jobs: ...@@ -49,22 +57,47 @@ jobs:
with: with:
framework: vllm framework: vllm
target: runtime target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }} ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests - name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and vllm and gpu_1 and not slow" pytest_marks: "e2e and vllm and gpu_1 and not slow"
sglang: sglang:
runs-on: gpu-l40-amd64
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
# OPS-1140: Uncomment this for sglang arm switch to wideep
# strategy:
# fail-fast: false
# matrix:
# platform:
# - { arch: amd64, runner: gpu-l40-amd64 }
# - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
# name: sglang (${{ matrix.platform.arch }})
# runs-on: ${{ matrix.platform.runner }}
# OPS-1140: Remove this runs-on line, replaced with the above line
runs-on: gpu-l40-amd64
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
...@@ -74,22 +107,49 @@ jobs: ...@@ -74,22 +107,49 @@ jobs:
with: with:
framework: sglang framework: sglang
target: runtime target: runtime
platform: 'linux/amd64'
# OPS-1140: Replace the above line with the uncommented below line
# platform: 'linux/${{ matrix.platform.arch }}'
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }} ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-amd64
# OPS-1140: Replace the above line with the uncommented below line
# push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests - name: Run tests
# OPS-1140: Uncomment the below line
# if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "e2e and sglang and gpu_1" pytest_marks: "e2e and sglang and gpu_1"
trtllm: trtllm:
runs-on: gpu-l40-amd64
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: trtllm (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
...@@ -99,13 +159,28 @@ jobs: ...@@ -99,13 +159,28 @@ jobs:
with: with:
framework: trtllm framework: trtllm
target: runtime target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }} ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run tests - name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.build-image.outputs.image_tag }} image_tag: ${{ steps.build-image.outputs.image_tag }}
......
...@@ -332,6 +332,15 @@ get_options() { ...@@ -332,6 +332,15 @@ get_options() {
missing_requirement "$1" missing_requirement "$1"
fi fi
;; ;;
--vllm-max-jobs)
if [ "$2" ]; then
MAX_JOBS=$2
shift
else
missing_requirement "$1"
fi
;;
-?*) -?*)
error 'ERROR: Unknown option: ' "$1" error 'ERROR: Unknown option: ' "$1"
;; ;;
...@@ -708,6 +717,10 @@ if [ -n "${NIXL_UCX_REF}" ]; then ...@@ -708,6 +717,10 @@ if [ -n "${NIXL_UCX_REF}" ]; then
BUILD_ARGS+=" --build-arg NIXL_UCX_REF=${NIXL_UCX_REF} " BUILD_ARGS+=" --build-arg NIXL_UCX_REF=${NIXL_UCX_REF} "
fi fi
# Forward MAX_JOBS (set by --vllm-max-jobs) to the image build when it is non-empty.
[ -z "${MAX_JOBS}" ] ||
    BUILD_ARGS+=" --build-arg MAX_JOBS=${MAX_JOBS} "
# Add sccache build arguments # Add sccache build arguments
if [ "$USE_SCCACHE" = true ]; then if [ "$USE_SCCACHE" = true ]; then
BUILD_ARGS+=" --build-arg USE_SCCACHE=true" BUILD_ARGS+=" --build-arg USE_SCCACHE=true"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment