"lib/runtime/src/vscode:/vscode.git/clone" did not exist on "c376655f4b9652247a1a4cec9b477afc310dba8c"
Unverified commit 3d8c497e, authored by Ran Rubin, committed by GitHub
Browse files

ci: Prod Runners (#5534)

Replacing infra for GitHub runners
parent 2a2ad756
......@@ -58,8 +58,8 @@ jobs:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
- { arch: amd64, runner: prod-builder-amd-v1 }
- { arch: arm64, runner: prod-builder-arm-v1 }
runs-on: ${{ matrix.platform.runner }}
steps:
- name: Checkout repository
......
......@@ -75,7 +75,7 @@ env:
jobs:
build-amd64:
name: Build ${{ matrix.framework }} (amd64)
runs-on: cpu-amd-m5-4xlarge
runs-on: prod-builder-amd-v1
timeout-minutes: 120
strategy:
fail-fast: false
......@@ -126,7 +126,7 @@ jobs:
build-arm64:
name: Build ${{ matrix.framework }} (arm64)
runs-on: cpu-arm-r8g-4xlarge
runs-on: prod-builder-arm-v1
timeout-minutes: 120
strategy:
fail-fast: false
......@@ -304,9 +304,9 @@ jobs:
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
runner: prod-builder-amd-gpu-v1
- arch: arm64
runner: cpu-arm-r8g-4xlarge
runner: prod-builder-arm-v1
steps:
- uses: actions/checkout@v4
- name: Check if build succeeded
......@@ -437,10 +437,10 @@ jobs:
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
runner: prod-builder-amd-gpu-v1
timeout: 90
- arch: arm64
runner: cpu-arm-r8g-4xlarge
runner: prod-builder-arm-v1
timeout: 90
steps:
- uses: actions/checkout@v4
......@@ -571,10 +571,10 @@ jobs:
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
runner: prod-builder-amd-gpu-v1
timeout: 120
- arch: arm64
runner: cpu-arm-r8g-4xlarge
runner: prod-builder-arm-v1
timeout: 120
steps:
- uses: actions/checkout@v4
......@@ -707,10 +707,10 @@ jobs:
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
runner: prod-builder-amd-gpu-v1
timeout: 150
- arch: arm64
runner: cpu-arm-r8g-4xlarge
runner: prod-builder-arm-v1
timeout: 150
steps:
- uses: actions/checkout@v4
......@@ -836,7 +836,7 @@ jobs:
name: ${{ matrix.framework.name }}-ft-k8s
needs: [build-amd64]
if: always()
runs-on: cpu-amd-m5-4xlarge
runs-on: prod-builder-amd-v1
timeout-minutes: 60
strategy:
fail-fast: false
......@@ -1074,7 +1074,7 @@ jobs:
############################## SLACK NOTIFICATION ##############################
notify-slack:
name: Notify Slack
runs-on: cpu-amd-m5-4xlarge
runs-on: prod-builder-amd-v1
if: always() && inputs.enable_slack_notification && !github.event.repository.fork
needs: results-summary
permissions:
......
......@@ -63,8 +63,8 @@ jobs:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: cpu-amd-m5-2xlarge }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
- { arch: amd64, runner: prod-builder-amd-v1 }
- { arch: arm64, runner: prod-builder-arm-v1 }
name: operator-build (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
steps:
......@@ -81,7 +81,6 @@ jobs:
- name: Docker Login
uses: ./.github/actions/docker-login
with:
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
......@@ -141,8 +140,8 @@ jobs:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
- { arch: amd64, runner: prod-builder-amd-gpu-v1 }
- { arch: arm64, runner: prod-builder-arm-v1 }
cuda_version:
- { major_minor: '13.0', major: '13' }
- { major_minor: '12.9', major: '12' }
......@@ -219,8 +218,8 @@ jobs:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
- { arch: amd64, runner: prod-builder-amd-gpu-v1 }
- { arch: arm64, runner: prod-builder-arm-v1 }
cuda_version:
- { major_minor: '13.0', major: '13' }
- { major_minor: '12.9', major: '12' }
......@@ -238,8 +237,8 @@ jobs:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
- { arch: amd64, runner: prod-builder-amd-gpu-v1 }
- { arch: arm64, runner: prod-builder-arm-v1 }
cuda_version:
- { major_minor: '13.0', major: '13' }
name: trtllm-build-test (cuda${{ matrix.cuda_version.major_minor}}, ${{ matrix.platform.arch }})
......@@ -250,7 +249,7 @@ jobs:
steps: *runtime-container-build-push-test
deploy-operator:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
if: needs.changed-files.outputs.core == 'true'
needs: [changed-files, operator, vllm, sglang, trtllm]
env:
......@@ -325,7 +324,7 @@ jobs:
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
deploy-test-vllm:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
# Run if push to main, or manually triggered
if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
needs: [changed-files, deploy-operator, vllm]
......@@ -515,7 +514,7 @@ jobs:
# - Run vllm disagg_router on all commits
# - Run rest of jobs only on push to main or manual trigger
deploy-test-vllm-disagg-router:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
needs: [changed-files, deploy-operator, vllm]
permissions:
......@@ -535,7 +534,7 @@ jobs:
steps: *deploy-test-steps
deploy-test-sglang:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
# Run if push to main, or manually triggered
if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
needs: [changed-files, deploy-operator, sglang]
......@@ -557,7 +556,7 @@ jobs:
steps: *deploy-test-steps
deploy-test-trtllm:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
# Run if push to main, or manually triggered
if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
needs: [changed-files, deploy-operator, trtllm]
......@@ -581,7 +580,7 @@ jobs:
steps: *deploy-test-steps
cleanup:
runs-on: cpu-amd-m5-2xlarge
runs-on: prod-default-v1
if: always()
needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm, deploy-test-vllm-disagg-router]
steps:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment