Unverified Commit 81c9b938 authored by Yuge Zhang's avatar Yuge Zhang Committed by GitHub
Browse files

Fix VM image builder on 1ES (#5060)

parent ac892fc7
......@@ -245,16 +245,18 @@ We appreciate all contributions from community to make NNI thrive.
| Type | Status |
| :---: | :---: |
| Fast test | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/fast%20test?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=54&branchName=master) |
| Full linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20linux?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=62&repoName=microsoft%2Fnni&branchName=master) |
| Full windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=63&branchName=master) |
| Full test - HPO | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20HPO?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=90&repoName=microsoft%2Fnni&branchName=master) |
| Full test - NAS | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20NAS?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=89&repoName=microsoft%2Fnni&branchName=master) |
| Full test - compression | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/full%20test%20-%20compression?repoName=microsoft%2Fnni&branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=91&repoName=microsoft%2Fnni&branchName=master) |
### Training services
| Type | Status |
| :---: | :---: |
| Local - linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20local%20-%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=92&branchName=master) |
| Local - windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20local%20-%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=98&branchName=master) |
| Remote - linux to linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20linux%20to%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=64&branchName=master) |
| Remote - linux to windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20linux%20to%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=67&branchName=master) |
| Remote - windows to linux | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20windows%20to%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=68&branchName=master) |
| Remote - windows to windows | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20remote%20-%20windows%20to%20windows?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=99&branchName=master) |
| OpenPAI | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20openpai%20-%20linux?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=65&branchName=master) |
| Frameworkcontroller | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20frameworkcontroller?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=70&branchName=master) |
| Kubeflow | [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/integration%20test%20-%20kubeflow?branchName=master)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=69&branchName=master) |
......
......@@ -4,8 +4,8 @@ trigger: none
pr: none
variables:
resource_group: nni
gallery_name: nniImageGallery
resource_group: nni-image-builder
gallery_name: nniImageGalleryV2
network_security_group: nni-image-builder-nsg
managed_image_name: nni-linux-image
image_definition_name: nniLinuxImage
......
......@@ -4,11 +4,11 @@ trigger: none
pr: none
variables:
resource_group: nni
gallery_name: nniImageGallery
resource_group: nni-image-builder
gallery_name: nniImageGalleryV2
network_security_group: nni-image-builder-nsg
managed_image_name: nni-windows-image
image_name: nniWindowsImage
image_definition_name: nniWindowsImage
packer_config: config_windows
jobs:
......
......@@ -46,3 +46,11 @@ jobs:
- template: templates/cache-dependencies-template.yml
parameters:
platform: ubuntu-latest-gpu
- job: windows_gpu
pool:
vmImage: windows-latest
steps:
- template: templates/cache-dependencies-template.yml
parameters:
platform: windows-gpu
......@@ -35,9 +35,7 @@ stages:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:
......
......@@ -35,9 +35,7 @@ stages:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:
......
......@@ -32,12 +32,10 @@ stages:
jobs:
- job: linux
pool: nni-it-1es-11
timeoutInMinutes: 60
timeoutInMinutes: 90
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:
......@@ -55,24 +53,25 @@ stages:
- job: windows
pool: nni-it-1es-windows
timeoutInMinutes: 60
timeoutInMinutes: 90
steps:
# FIXME: Windows should use GPU,
# but it's not used now since driver is not installed in the image.
- template: templates/check-gpu-status.yml
parameters:
platform: windows
- template: templates/install-dependencies.yml
parameters:
platform: windows
platform: windows-gpu
python_env: noop
- template: templates/install-nni.yml
parameters:
user: false
# NOTE: Data needs to be downloaded if Windows has GPU.
# Also, the download template needs to be updated with powershell syntax.
# - template: templates/download-test-data.yml
- template: templates/download-test-data.yml
parameters:
platform: windows
- powershell: |
python test/vso_tools/ssl_patch.py
......
......@@ -11,8 +11,7 @@ jobs:
timeoutInMinutes: 90
steps:
# FIXME: should use GPU here
- template: templates/fix-apt-1es.yml
# TODO: consider adding GPU tests here
- template: templates/install-dependencies.yml
parameters:
......
......@@ -11,9 +11,7 @@ jobs:
timeoutInMinutes: 60
steps:
- template: templates/fix-apt-1es.yml
parameters:
check_gpu: true
- template: templates/check-gpu-status.yml
- template: templates/install-dependencies.yml
parameters:
......
......@@ -11,6 +11,9 @@ jobs:
timeoutInMinutes: 120
steps:
- template: templates/check-gpu-status.yml
parameters:
platform: windows
- template: templates/install-dependencies.yml
parameters:
......@@ -38,8 +41,7 @@ jobs:
# We can't install it on-the-fly because we can't elevate the permission here.
- powershell: |
cd test
python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local `
--exclude mnist-pytorch-local-gpu
python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local
displayName: Integration test
- template: templates/save-crashed-info.yml
......
......@@ -16,9 +16,8 @@ jobs:
timeoutInMinutes: 120
steps:
- template: templates/fix-apt-1es.yml
# FIXME: GPU is not supported yet.
# Change to ubuntu-latest-gpu when it's done.
# TODO: We don't currently have a test for GPU.
# And nvidia-docker is not installed yet.
- template: templates/install-dependencies.yml
parameters:
......
# BEFORE READING:
#
# 1. We are now running agents on 1ES, so all the notes about VMSS can be safely ignored.
# 2. Many actions can be done on both cloud shell and web portal. Choose whichever you prefer.
steps:
- script: |
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
displayName: Install azcli
# Please follow the tutorial of [image builder](https://docs.microsoft.com/en-us/azure/virtual-machines/image-builder-overview)
# to set up a managed identity, and,
# 1. Assign the role following the instruction.
# 2. Assign contributor role of the resource group to the identity.
# 3. Add the identity to VMSS.
#
# Update 2022/7 (running on Microsoft-hosted agents).
# Update 2022/7 (running on Microsoft-hosted agents / 1ES agents).
# Use a service principal. This service principal must be assigned contributor access to the resource group.
#
# Alternative option: managed identity.
# Follow the tutorial of [image builder](https://docs.microsoft.com/en-us/azure/virtual-machines/image-builder-overview).
#
# Either way, the identity / service principal must be assigned contributor access to the resource group.
# We also added the following role (but I'm not sure whether it's necessary):
#
# {
# "properties": {
# "roleName": "ImageBuilderRole",
# "description": "Image Builder access to create resources for the image build, you should delete or split out as appropriate",
# "assignableScopes": [
# "/subscriptions/<subscription_id>/resourceGroups/<resource_group>"
# ],
# "permissions": [
# {
# "actions": [
# "Microsoft.Compute/galleries/read",
# "Microsoft.Compute/galleries/images/read",
# "Microsoft.Compute/galleries/images/versions/read",
# "Microsoft.Compute/galleries/images/versions/write",
# "Microsoft.Compute/images/write",
# "Microsoft.Compute/images/read",
# "Microsoft.Compute/images/delete",
# "Microsoft.VirtualMachineImages/imageTemplates/write",
# "Microsoft.VirtualMachineImages/imageTemplates/read",
# "Microsoft.VirtualMachineImages/imageTemplates/delete"
# ],
# "notActions": [],
# "dataActions": [],
# "notDataActions": []
# }
# ]
# }
# }
#
- script: |
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
displayName: Login to Azure
......@@ -28,10 +63,12 @@ steps:
az provider show -n Microsoft.Network -o json
displayName: Register features
# Need to create an image gallerybefore this.
# Only need to create once.
# Need to create an image gallery before this.
# Only need to create once (can be done on web portal).
# az sig create --resource-group <resource_group> --gallery-name <sig_name>
#
# NOTE: Remember to add READER access to the image gallery for "1ES Resource Management".
#
# Add an image definition (also only once).
# az sig image-definition create -g <resource_group> \
# --gallery-name <sig_name> \
......@@ -45,6 +82,8 @@ steps:
# --sku 20_04-nni \
# --os-type Linux \
# --hyper-v-generation V2
#
# This can be done on the web portal; remember to choose V2 for Hyper-V generation.
- script: |
set -e
......@@ -96,7 +135,7 @@ steps:
# The workaround here is to use a monitor to detect the machine ready signal and change its WinRM port.
- script: |
cd test/vso_tools/build_vm
python3 packer_build_windows.py
python3 packer_build_windows.py $(packer_config).json $(resource_group)
displayName: (Windows) Packer build
condition: and(succeeded(), contains(variables['packer_config'], 'windows'))
......@@ -109,14 +148,14 @@ steps:
# TODO: Should delete the managed image after build is done.
# Image gallery alone is enough. Keeping it for now for debugging purposes.
# No further actions are needed here. VM images are already set to latest. They should be auto-updated.
# In case you want to do it on your own:
#
# To deploy the image on VMSS, run this in Cloud Shell:
# az vmss update --resource-group nni --name nni-windows-it \
# --set virtualMachineProfile.storageProfile.imageReference.id=/subscriptions/{subscriptionId}/resourceGroups/nni/providers/Microsoft.Compute/galleries/nniImageGallery/images/nniWindowsImage/versions/Latest
#
# To deploy the image on 1ES, similar actions need to be performed on the web portal of 1ES managed images.
#
# Probably need to enlarge the disk size, in case it's too small:
# az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
#
# No need to update the image every time, because it's already set to latest.
#
# NOTE: After using 1ES pool, the pool image has to be updated manually to the latest version.
# However, no successful build has been performed yet, because of resource shortage in Southeast Asia.
parameters:
- name: platform
type: string
default: linux
steps:
# Install GPU driver on Windows.
# Installer has already been downloaded and saved in the image.
- powershell: |
Start-Process -Verb RunAs -FilePath "$env:ProgramData\driver_installer.exe" -ArgumentList "/s /n" -Wait
displayName: (Windows) Install GPU driver
condition: and(succeeded(), contains('${{ parameters.platform }}', 'windows'))
# Make sure GPU isn't broken.
- script: |
nvidia-smi
displayName: Check GPU status
......@@ -7,13 +7,26 @@
# because it's not easy to setup auto-download for some datasets.
# See cache-dependencies-template.yml on how to generate credentials to upload new test data.
parameters:
- name: platform
type: string
default: linux
steps:
- script: |
set -e
mkdir -p test/data
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
ls -al data
- ${{ if contains(parameters.platform, 'windows') }}:
powershell: |
New-Item -Path test/data -ItemType directory -Force
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
Get-ChildItem data
${{ else }}:
script: |
set -e
mkdir -p test/data
cd test
azcopy copy 'https://nni.blob.core.windows.net/testdata/*' data
python vso_tools/unpack_testdata.py
ls -al data
displayName: Download test data
# Fix apt-related issues on 1ES linux pipeline.
# 1ES has an auto-upgrade with apt-get running periodically in the background.
# This leads to bad consequences:
# 1) apt is locked when install is actually needed
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
#
# The ultimate solution should be to upgrade the VM image correctly,
# but it's currently infeasible because of a resource group limitation.
# We introduce a workaround here by forcibly disabling the auto-upgrade and
# fixing the broken dependencies if the upgrade has already been accidentally run.
#
# This file can be removed after image is updated to latest.
parameters:
- name: check_gpu
type: boolean
default: false
steps:
# Don't set -e
# Always make sure the lock is released.
- script: |
set -x
sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install
displayName: (1ES) Disable apt upgrade
# Make sure GPU isn't broken.
# Sometimes we can't save the GPU because upgrade runs too early.
# We have to rerun the pipeline if unlucky. But it doesn't matter if we don't intend to use GPU at all.
- script: |
echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
nvidia-smi
displayName: (1ES) Check GPU status
condition: and(succeeded(), ${{ parameters.check_gpu }})
......@@ -261,6 +261,7 @@ _yarn_env['PATH'] = str(Path().resolve() / 'nni_node') + path_env_seperator + os
_yarn_path = Path().resolve() / 'toolchain/yarn/bin' / yarn_executable
def _yarn(path, *args):
_print('yarn ' + ' '.join(args) + f' (path: {path})')
if os.environ.get('GLOBAL_TOOLCHAIN'):
subprocess.run(['yarn', *args], cwd=path, check=True)
else:
......
import logging
import sys
import pytest
import numpy as np
......@@ -196,6 +197,18 @@ def test_hub_oneshot(space_type, strategy_type):
if strategy_type in ['darts', 'gumbel'] and space_type == 'mobilenetv3':
pytest.skip('Skip as it consumes too much memory.')
WINDOWS_SPACES = [
# Only these spaces are tested on Windows; all others are skipped as Windows platform is slow.
'nasbench201',
'mobilenetv3',
'proxylessnas',
'shufflenet',
'autoformer',
'darts',
]
if sys.platform == 'win32' and space_type not in WINDOWS_SPACES:
pytest.skip('Skip as Windows is too slow.')
model_space = _hub_factory(space_type)
dataset_type = 'cifar10'
......
......@@ -22,11 +22,11 @@
"gallery_name": "<gallery_name>",
"image_name": "<image_name>",
"image_version": "<image_version>",
"replication_regions": ["southeastasia", "westus2", "eastus"],
"replication_regions": ["southeastasia", "westus3", "eastus"],
"storage_account_type": "Standard_LRS"
},
"build_resource_group_name": "nni",
"build_resource_group_name": "<resource_group>",
"vm_size": "Standard_DS2_v2"
}],
"provisioners": [{
......
......@@ -20,11 +20,11 @@
"gallery_name": "<gallery_name>",
"image_name": "<image_name>",
"image_version": "<image_version>",
"replication_regions": ["southeastasia", "westus2", "eastus"],
"replication_regions": ["southeastasia", "westus3", "eastus"],
"storage_account_type": "Standard_LRS"
},
"build_resource_group_name": "nni",
"build_resource_group_name": "<resource_group>",
"vm_size": "Standard_D2s_v4",
"allowed_inbound_ip_addresses": ["<ip_address>"],
......
......@@ -15,8 +15,8 @@ import subprocess
import sys
import time
BUILD_COMMAND = 'PACKER_LOG=1 packer build packer_windows.json'
RESOURCE_GROUP = 'nni'
BUILD_COMMAND = 'PACKER_LOG=1 packer build ' + sys.argv[1]
RESOURCE_GROUP = sys.argv[2]
def monitor_print(*args):
......@@ -24,6 +24,9 @@ def monitor_print(*args):
def main():
monitor_print('Build command:', BUILD_COMMAND)
monitor_print('Resource group:', RESOURCE_GROUP)
process = subprocess.Popen(BUILD_COMMAND, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while True:
retcode = process.poll()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment