Unverified Commit 75ea2117 authored by Yuge Zhang's avatar Yuge Zhang Committed by GitHub
Browse files

Refactor integration test (step 5) - VM image builder (#4896)

parent a8f86a78
# Pipeline that builds the Linux VM image.
# Both CI and PR triggers are disabled, so it only runs when queued by hand.
trigger: none
pr: none

# Values consumed by the shared build template.
variables:
  resource_group: nni
  gallery_name: nniImageGallery
  network_security_group: nni-image-builder-nsg
  managed_image_name: nni-linux-image
  image_definition_name: nniLinuxImage
  packer_config: config_linux

jobs:
- job: linux
  pool: nni-it
  steps:
  - template: templates/build-vm-image-template.yml
# Pipeline that builds the Windows VM image.
# Both CI and PR triggers are disabled, so it only runs when queued by hand.
trigger: none
pr: none

# Values consumed by the shared build template.
variables:
  resource_group: nni
  gallery_name: nniImageGallery
  network_security_group: nni-image-builder-nsg
  managed_image_name: nni-windows-image
  # The shared template fills the <image_name> placeholder from
  # $(image_definition_name), so the variable must carry that exact name.
  image_definition_name: nniWindowsImage
  packer_config: config_windows

jobs:
- job: windows
  pool: nni-it
  timeoutInMinutes: 90
  steps:
  - template: templates/build-vm-image-template.yml
steps:

# Install the Azure CLI (`az`) on the agent using Microsoft's Debian installer script.
- script: |
    curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
  displayName: Install azcli
# Prerequisite: a managed identity, set up as described in the image builder tutorial
# (https://docs.microsoft.com/en-us/azure/virtual-machines/image-builder-overview), with:
#   1. The role assigned as the tutorial instructs.
#   2. The contributor role on the resource group.
#   3. The identity attached to the VMSS.
# The step logs in as that identity; its ID comes from the $(identity_id) variable.
- script: |
    az login --identity --allow-no-subscriptions --username $(identity_id)
  displayName: Login to Azure
# Every resource provider listed here has to be registered already.
# If one is not, register it with `az provider register -n <name>`
# (this requires write access on the subscription).
# The step itself only shows each provider; `set -e` stops at the first failing call.
- script: |
    set -e
    for provider in \
        Microsoft.VirtualMachineImages \
        Microsoft.KeyVault \
        Microsoft.Compute \
        Microsoft.Storage \
        Microsoft.Network
    do
      az provider show -n "$provider" -o json
    done
  displayName: Register features
# An image gallery must exist before this step. It only has to be created once:
#   az sig create --resource-group <resource_group> --gallery-name <sig_name>
# An image definition must be added as well (also only once):
#   az sig image-definition create -g <resource_group> \
#       --gallery-name <sig_name> \
#       --gallery-image-definition <image_def>
#
# For example,
#   az sig image-definition create -g nni --gallery-name nniImageGallery \
#       --gallery-image-definition nniLinuxImage \
#       --publisher NNI \
#       --offer ubuntu \
#       --sku 20_04-nni \
#       --os-type Linux \
#       --hyper-v-generation V2
#
# The step prints the managed images in the resource group and deletes the one
# named $(managed_image_name) if it is present.
- script: |
    set -e
    set -x
    az image list -g $(resource_group)
    # `-o tsv` prints one bare name per line, so `grep -Fx` matches the whole
    # name exactly; an image whose name merely contains the target is ignored.
    if az image list -g $(resource_group) --query '[].name' -o tsv | grep -Fxq '$(managed_image_name)'; then
      az image delete -n $(managed_image_name) -g $(resource_group)
    fi
  displayName: List existing images (and delete)
# Install packer from the HashiCorp apt repository.
- script: |
    set -e
    curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
    sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
    # -y: the job has no terminal to answer a confirmation prompt.
    sudo apt-get update && sudo apt-get install -y packer
  displayName: Install packer
# Fill the <placeholder> tokens of the packer config in place, then print the result.
- script: |
    set -e
    cd test/vso_tools/build_vm
    # Public IP of this agent, as reported by ifconfig.me.
    export IP_ADDRESS=$(curl -s ifconfig.me)
    # Image version <year>.<month+day>.<time>, taken from a single `date` call so
    # that all three components describe the same instant.
    export VERSION=$(date "+%Y.%m%d.%H%M%S")
    export CONFIG_PATH=$(packer_config).json
    sed -i \
        -e "s/<client_id>/$(identity_id)/g" \
        -e "s/<subscription_id>/$(subscription_id)/g" \
        -e "s/<managed_image_name>/$(managed_image_name)/g" \
        -e "s/<resource_group>/$(resource_group)/g" \
        -e "s/<network_security_group>/$(network_security_group)/g" \
        -e "s/<gallery_name>/$(gallery_name)/g" \
        -e "s/<image_name>/$(image_definition_name)/g" \
        -e "s/<image_version>/${VERSION}/g" \
        -e "s/<ip_address>/${IP_ADDRESS}/g" \
        "$CONFIG_PATH"
    cat "$CONFIG_PATH"
    echo "##vso[task.logissue type=warning]During packer build, please avoid cancelling this task. Otherwise, created resources might need manual cleanup."
  displayName: Prepare configuration
# VMs created under Microsoft subscriptions sit behind a network security group which,
# judging from observation (the author had no clearance to inspect it), contains:
#   1. A low-priority rule denying all unintended incoming traffic.
#   2. A medium-high-priority rule denying all traffic to low ports (probably below 8000).
#   3. A high-priority rule allowing traffic from Microsoft-internal IPs.
#
# New rules can only be inserted below medium priority. Therefore:
#   1. On Linux, the SSH port is changed to 10022 at provisioning time by injecting user / custom data.
#   2. On Windows, the user data script cannot be executed:
#      https://stackoverflow.com/questions/62888359/custom-data-with-azure-windows-vm-run-powersell-script
#      Custom script extensions are not an option either, because packer does not support them.
#      A shell-local provisioner cannot invoke the command, because once the VM is ready
#      packer always tries to connect to WinRM first.
#      The workaround is a monitor that detects the machine-ready signal and then changes the WinRM port.
#
# Runs only when $(packer_config) contains 'windows'.
- script: |
    cd test/vso_tools/build_vm
    python3 packer_build_windows.py
  displayName: (Windows) Packer build
  condition: and(succeeded(), contains(variables['packer_config'], 'windows'))
# Runs only when $(packer_config) contains 'linux'.
# PACKER_LOG=1 makes packer emit its detailed log.
- script: |
    cd test/vso_tools/build_vm
    export PACKER_LOG=1
    packer build $(packer_config).json
  displayName: (Linux) Packer build
  condition: and(succeeded(), contains(variables['packer_config'], 'linux'))
# TODO: Should delete the managed image after build is done.
# Image gallery alone is enough. Keeping it for now for debugging purposes.
# To deploy the image on VMSS, run this in Cloud Shell:
# az vmss update --resource-group nni --name nni-windows-it \
# --set virtualMachineProfile.storageProfile.imageReference.id=/subscriptions/{subscriptionId}/resourceGroups/nni/providers/Microsoft.Compute/galleries/nniImageGallery/images/nniWindowsImage/versions/Latest
#
# Probably need to enlarge the disk size, in case it's too small:
# az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
#
# No need to update the image every time, because it's already set to latest.
#!/bin/bash

# The Microsoft-internal subscription is protected by a firewall (network security group)
# that denies traffic to low ports (possibly everything below 10000).
# The SSH port therefore has to be moved while the VM is being provisioned,
# so that the VM is reachable and the build scripts can run.

SSHD_CONFIG=/etc/ssh/sshd_config

# Replace the commented-out default "#Port 22" with an active "Port 10022".
sudo sed -i -e "s/#Port 22/Port 10022/g" "$SSHD_CONFIG"

# Restart the SSH service so the new port takes effect.
sudo service ssh restart
# Windows counterpart of change_ssh_port.sh (see that script for the reasoning).
# This script does NOT run at provision time, because Azure does not support
# provision-time PowerShell scripts. It is invoked through azcli
# (az vm run-command) instead; see packer_build_windows.py.

# Move the HTTPS WinRM listener to port 15986.
winrm set winrm/config/Listener?Address=*+Transport=HTTPS '@{Port="15986"}'

# Allow inbound TCP traffic on the new port.
$FirewallRule = @{
    Name        = 'Custom-WinRM'
    DisplayName = 'Custom WinRM Port Rule'
    Enabled     = 'True'
    Direction   = 'Inbound'
    Protocol    = 'TCP'
    Action      = 'Allow'
    Program     = 'System'
    LocalPort   = 15986
}
New-NetFirewallRule @FirewallRule

# Restart WinRM so the listener change takes effect.
Restart-Service -Name WinRM
{
  "builders": [
    {
      "type": "azure-arm",
      "client_id": "<client_id>",
      "build_resource_group_name": "nni",
      "vm_size": "Standard_DS2_v2",
      "os_type": "Linux",
      "os_disk_size_gb": 50,
      "image_publisher": "Canonical",
      "image_offer": "0001-com-ubuntu-server-focal",
      "image_sku": "20_04-lts-gen2",
      "managed_image_name": "<managed_image_name>",
      "managed_image_resource_group_name": "<resource_group>",
      "shared_image_gallery_destination": {
        "subscription": "<subscription_id>",
        "resource_group": "<resource_group>",
        "gallery_name": "<gallery_name>",
        "image_name": "<image_name>",
        "image_version": "<image_version>",
        "replication_regions": [
          "southeastasia",
          "westus2"
        ],
        "storage_account_type": "Standard_LRS"
      }
    }
  ],
  "provisioners": [
    {
      "type": "shell",
      "script": "setup_linux.sh",
      "timeout": "30m"
    }
  ]
}
{
  "builders": [
    {
      "type": "azure-arm",
      "client_id": "<client_id>",
      "build_resource_group_name": "nni",
      "vm_size": "Standard_D2s_v4",
      "os_type": "Windows",
      "image_publisher": "MicrosoftWindowsServer",
      "image_offer": "WindowsServer",
      "image_sku": "2019-datacenter-gensecond",
      "managed_image_name": "<managed_image_name>",
      "managed_image_resource_group_name": "<resource_group>",
      "shared_image_gallery_destination": {
        "subscription": "<subscription_id>",
        "resource_group": "<resource_group>",
        "gallery_name": "<gallery_name>",
        "image_name": "<image_name>",
        "image_version": "<image_version>",
        "replication_regions": [
          "southeastasia",
          "westus2"
        ],
        "storage_account_type": "Standard_LRS"
      },
      "allowed_inbound_ip_addresses": [
        "<ip_address>"
      ],
      "communicator": "winrm",
      "winrm_use_ssl": true,
      "winrm_insecure": true,
      "winrm_timeout": "10m",
      "winrm_username": "packer",
      "winrm_port": 15986
    }
  ],
  "provisioners": [
    {
      "type": "powershell",
      "script": "setup_windows.ps1",
      "elevated_user": "packer",
      "elevated_password": "{{ build `Password` }}",
      "timeout": "40m",
      "valid_exit_codes": [
        0,
        3010
      ]
    },
    {
      "type": "windows-restart",
      "restart_timeout": "15m"
    },
    {
      "type": "powershell",
      "script": "setup_windows_finalize.ps1",
      "elevated_user": "packer",
      "elevated_password": "{{ build `Password` }}",
      "timeout": "15m"
    }
  ]
}
"""Start and monitor a packer building process, and change WinRM ports at an appropriate time.
It first catches the name of the VM. It's a randomly generated string by packer.
It then catches the signal of packer's waiting for WinRM ready.
Actually, it will be ready soon, but packer is waiting on the wrong port.
Invoking a command here to change the port of WinRM so that packer could connect to it.
The monitor is designed to be as robust as possible, so that packer won't easily crash.
It's painful to manually clean up the resources that packer has created,
as we are reusing an existing resource group.
"""
import re
import subprocess
import sys
import time
# Shell command that launches the packer build with verbose logging.
# It must point at the config file that the pipeline filled in beforehand
# (`$(packer_config).json`, i.e. config_windows.json for the Windows job).
BUILD_COMMAND = 'PACKER_LOG=1 packer build config_windows.json'

# Resource group passed to `az vm run-command` when changing the WinRM port.
RESOURCE_GROUP = 'nni'
def monitor_print(*args):
    """Print ``args`` behind a fixed ``packer build monitor:`` tag and flush immediately."""
    tag = 'packer build monitor:'
    print(tag, *args, flush=True)
def main():
    """Run the packer build, echo its output, and switch the VM's WinRM port once packer waits for it.

    The combined stdout/stderr of the build is scanned line by line:

    1. The first match of ``pkrvm[a-z0-9]{10,}`` is remembered as the VM name.
    2. The first line containing ``Waiting for WinRM`` (seen after the VM name is known)
       starts a timer.
    3. On the first line that arrives more than 60 seconds after that, ``az vm run-command``
       is invoked once with ``change_winrm_port.ps1``. Whatever its outcome, no further
       attempt is made and the rest of the output is only echoed.

    Returns:
        The exit code of the packer build process.
    """
    process = subprocess.Popen(BUILD_COMMAND, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    vm_name = None
    winrm_start_waiting = None
    winrm_finished = False

    # The iteration ends when packer closes its output, i.e. when the build is over.
    for raw_line in process.stdout:
        try:
            line = raw_line.decode()
            sys.stdout.write(line)
            sys.stdout.flush()
        except UnicodeDecodeError:
            monitor_print('Decode error:', str(raw_line))
            # Keep a text version of the line so the matching below still works;
            # leaving it as bytes would raise TypeError and kill the monitor.
            line = raw_line.decode(errors='replace')

        if winrm_finished:
            continue

        # Find VM name
        vm_name_grp = re.search(r'pkrvm[a-z0-9]{10,}', line)
        if vm_name is None and vm_name_grp is not None:
            vm_name = vm_name_grp.group(0)
            monitor_print('VM name found:', vm_name)

        # Waiting for WinRM
        if winrm_start_waiting is None and 'Waiting for WinRM' in line:
            if vm_name is None:
                monitor_print('VM name not found. This is not normal.')
            else:
                winrm_start_waiting = time.time()
                monitor_print('Waiting for WinRM detected. You might see some errors. No worry.')

        # After WinRM has a waiting signal, wait another minute to make sure it's ready.
        if winrm_start_waiting is not None and time.time() - winrm_start_waiting > 60:
            monitor_print('WinRM waits time has exceeded 60 seconds. Starting to invoke command to change its port.')
            result = subprocess.run(
                'az vm run-command invoke --command-id RunPowerShellScript '
                f'--name {vm_name} -g {RESOURCE_GROUP} '
                '--scripts @change_winrm_port.ps1',
                shell=True
            )
            if result.returncode != 0:
                monitor_print('Return code of command invoking is non-zero:', result.returncode)
            else:
                monitor_print('Command invocation successfully triggered.')
            # To make the packer resource cleanup robust,
            # WinRM is always finished regardless of subprocess status.
            winrm_finished = True

    # Output is exhausted; collect the exit status of the build.
    retcode = process.wait()
    if retcode != 0:
        monitor_print('packer build fails with return code:', retcode)
    else:
        monitor_print('packer build succeeds')
    return retcode


if __name__ == '__main__':
    sys.exit(main())
#!/bin/bash

# Provisioning script for the Linux build VM.
# Installs build tools, azcli, azcopy, docker, NFS, NVIDIA drivers and Python 3.7 / 3.9,
# then deprovisions the VM so that it can be captured as an image.

# Stop at the first failing command and echo every command to the log.
set -e
set -x

# Build essentials are required.
# But clean the apt cache and package lists first...
sudo rm -rvf /var/lib/apt/lists/*
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y software-properties-common
sudo apt-get update
sudo apt-get install -y build-essential cmake uidmap

# Install azcli for Azure resources access and management.
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

# Install azcopy for cache download.
# https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#use-azcopy-in-a-script
mkdir -p tmp
cd tmp
wget -O azcopy_v10.tar.gz https://aka.ms/downloadazcopy-v10-linux && tar -xf azcopy_v10.tar.gz --strip-components=1
sudo cp ./azcopy /usr/bin/
sudo chmod +x /usr/bin/azcopy

# Install docker.
# This docker must run with sudo.
# We don't know in advance which user the pipeline will run as.
curl -fsSL https://get.docker.com | sh
sudo systemctl --now enable docker

# TODO: nvidia-docker should be installed here.

# Install NFS server / client.
# This should only be done when necessary, but installing it does no harm.
# The NFS server can be accessed through the path: host.docker.internal
# A host alias is added so that the same name also works outside the container.
# Inside the container, exactly the same uid/gid should be used to read/write files.
sudo apt-get install -y nfs-kernel-server nfs-common
sudo mkdir -p /var/nfs/general
sudo chmod 777 /var/nfs/general
echo "/var/nfs/general *(rw,sync,insecure,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports
echo "127.0.0.1 host.docker.internal" | sudo tee -a /etc/hosts
sudo systemctl restart nfs-kernel-server

# VMs with a GPU need drivers. Reference:
# https://docs.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup
# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
# https://linuxhint.com/install-cuda-ubuntu/
sudo apt-get install -y linux-headers-$(uname -r)
sudo wget -O /etc/apt/preferences.d/cuda-repository-pin-600 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
# -y: provisioning is unattended, nobody can confirm a prompt.
sudo add-apt-repository -y "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
sudo apt-get update
sudo apt-get install -y cuda-drivers

# The UsePythonVersion task only works when the specific Python version is already installed.
# The following is for Linux.
# Reference: https://dev.to/akaszynski/create-an-azure-self-hosted-agent-without-going-insane-173g
# We only need Python 3.7 and 3.9 for now.
sudo add-apt-repository -y ppa:deadsnakes/ppa
# Refresh explicitly so the PPA packages are visible regardless of add-apt-repository's own behaviour.
sudo apt-get update
sudo apt-get install -y python3.7-dev python3.7-venv python3.9-dev python3.9-venv

# Deprovision
sudo /usr/sbin/waagent -force -deprovision
sudo HISTSIZE=0 sync
#Requires -RunAsAdministrator
# Provisioning script for the Windows build VM.
# Installs, in order: Chocolatey, NuGet, OpenSSH (client + server) with a local SSH user,
# Visual Studio 2019 build tools and VC++ redistributables, CUDA 11.7, azcopy, swig,
# cmake and Python 3.9.12. Each tool directory is appended to $env:path as it is installed,
# and the resulting PATH is written to the machine-wide environment at the end.

# Stop at the first PowerShell error.
$ErrorActionPreference = "Stop"
# Choco.
# https://docs.chocolatey.org/en-us/choco/setup
# Community version can't customize output directory.
Write-Host "Installing Choco..."
# 3072 is the TLS 1.2 flag; it is added to the protocols already enabled.
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072
iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
# From here on, trace every executed line into the log.
Set-PSDebug -Trace 1
# Nuget.
# Doesn't have azcopy.
Write-Host "Installing Nuget..."
$NugetDir = "$env:ProgramData\nuget"
New-Item "$NugetDir" -ItemType Directory -Force | Out-Null
Invoke-WebRequest -Uri "https://dist.nuget.org/win-x86-commandline/latest/nuget.exe" -OutFile "${NugetDir}\nuget.exe"
$env:path = "$env:path;$NugetDir"
# Install SSH.
Write-Host "Installing SSH..."
# https://docs.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse
Get-WindowsCapability -Online | Where-Object Name -like 'OpenSSH*'
# Install the OpenSSH Client
Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0
# Install the OpenSSH Server
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
# Start the sshd service
# Tracing is switched off for the service / firewall part and re-enabled afterwards.
Set-PSDebug -Trace 0
Write-Host "Starting SSH service..."
Start-Service sshd
Set-Service -Name sshd -StartupType 'Automatic'
Write-Host "Configure firewall for SSH..."
# Confirm the firewall rule is configured. It should be created automatically by setup.
# If it is missing, create an inbound allow rule for TCP port 22.
if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) {
Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..."
New-NetFirewallRule -Name 'OpenSSH-Server-In-TCP' -DisplayName 'OpenSSH Server (sshd)' -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22
} else {
Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists."
}
Set-PSDebug -Trace 1
# Create a new user (for SSH login).
# NOTE(review): the password is hard-coded in this script, so every VM provisioned by it
# gets the same, publicly visible credential. Consider injecting it from a secret instead.
$Password = ConvertTo-SecureString "P@ssW0rD!" -AsPlainText -Force
New-LocalUser "NNIUser" -Password $Password -PasswordNeverExpires
# These installations do not seem to work.
# Visual Studio C++ Build tools (for Cython)
# Invoke-WebRequest "https://aka.ms/vs/17/release/vs_BuildTools.exe" -OutFile "vs_BuildTools.exe"
# Start-Process -FilePath "vs_BuildTools.exe" -ArgumentList "--quiet --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended" -Wait
# Remove-Item "vs_BuildTools.exe"
# Microsoft Visual C++ Redistributable (for PyTorch)
# Invoke-WebRequest "https://aka.ms/vs/16/release/vc_redist.x64.exe" -OutFile "vc_redist.x64.exe"
# Start-Process -FilePath ".\vc_redist.x64.exe" -ArgumentList "/q /norestart" -Wait
# Remove-Item "vc_redist.x64.exe"
# Use choco instead.
choco install -y --no-progress visualstudio2019buildtools
choco install -y --no-progress visualstudio2019-workload-vctools
choco install -y --no-progress vcredist2012 vcredist2013 vcredist2015 vcredist2017
# Install CUDA.
# The network installer is downloaded, run and waited for, then removed.
Write-Host "Installing CUDA..."
$CudaUrl = "https://developer.download.nvidia.com/compute/cuda/11.7.0/network_installers/cuda_11.7.0_windows_network.exe"
Invoke-WebRequest $CudaUrl -OutFile "cuda_installer.exe"
Start-Process -FilePath "cuda_installer.exe" -ArgumentList "/s /n" -Wait
Remove-Item "cuda_installer.exe"
# Verify CUDA.
# Listing the bin directory fails (and, with ErrorActionPreference = Stop, aborts the script)
# if the toolkit directory does not exist.
Write-Host "Verify CUDA installation..."
$CudaDir = "$env:ProgramFiles\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin"
# The GPU driver can't be installed without the hardware.
# Get-Command nvidia-smi
Get-ChildItem $CudaDir
$env:path = "$env:path;$CudaDir"
Write-Host "Installing utilities..."
# Install azcopy for cache download.
# Something wrong with the latest (10.15.0) checksum.
choco install -y --force azcopy10 --version=10.14.1 --no-progress
azcopy --version
# Install swig.
# Note that swig 4.0 is not compatible with ConfigSpace.
choco install -y --force swig --version=3.0.12 --no-progress
swig -version
# Install cmake.
choco install -y --no-progress cmake
$env:path = "$env:path;$env:ProgramFiles\CMake\bin"
cmake --version
# Install python.
# Originally I tried to install Python by downloading the official installer and running it.
#
# Invoke-WebRequest -Uri "https://www.python.org/ftp/python/3.9.12/python-3.9.12-amd64.exe" -OutFile "python-installer.exe"
# Start-Process -FilePath .\python-installer.exe -NoNewWindow -Wait \
# -ArgumentList "/quiet InstallAllUsers=1 TargetDir=$(Agent.ToolsDirectory)\Python\3.9.12\x64 Include_launcher=0"
# New-Item -Path $(Agent.ToolsDirectory)\Python\3.9.12\x64.complete -ItemType file -Force
#
# But ``Start-Process`` fails for mysterious reasons (exit code is not zero and no error message).
# I tried with -PassThru, -NoNewWindow, -Wait, /quiet, /passive, InstallAllUsers and some other flags, but none works.
# (InstallAllUsers is the key to make it work on my local machine, but not on the pipeline).
# I guess it's related to a lack of administrative privileges.
# I kept this attempt here in case anyone can make it work.
#
# Two other workarounds.
# 1) choco install python. The community version can't customize the output directory,
# and the output directory is only a guess (e.g., C:\Python310).
# 2) nuget install python. This seems to work.
#
# Can't move the installed python to $PythonDir\3.9.12\x64 because,
# 1. If we copy it, Python path will complain in the next few steps.
# 2. If we try to create a symlink, it will tell us that we don't have administrative rights.
#
# After all this struggle, the workaround here is simple:
# install with nuget, then don't use `UsePythonVersion` in the next step.
# The workaround works because we actually never need multiple python versions on Windows.
Write-Host "Installing Python..."
$PythonDir = "$env:ProgramData\Python"
nuget install python -Version 3.9.12 -OutputDirectory "$PythonDir"
$env:path = "$env:path;$PythonDir\python.3.9.12\tools\;$PythonDir\python.3.9.12\tools\Scripts"
Write-Host "Verify Python installation..."
python --version
# Permanently update the PATHs
# https://codingbee.net/powershell/powershell-make-a-permanent-change-to-the-path-environment-variable
# The PATH accumulated in this session is written to the machine-wide environment key.
Write-Host "Prepare PATHs..."
Write-Host $env:path
Set-ItemProperty -Path "Registry::HKEY_LOCAL_MACHINE\System\CurrentControlSet\Control\Session Manager\Environment" -Name PATH -Value $env:path
#Requires -RunAsAdministrator
# Final provisioning step for the Windows build VM: generalize it with Sysprep
# and block until the image reports the generalized state.
# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/build-image-with-packer

# Stop at the first PowerShell error.
$ErrorActionPreference = "Stop"

# NOTE: the following waits are only needed if the Guest Agent is installed.
while ((Get-Service RdAgent).Status -ne 'Running') { Start-Sleep -s 5 }
# Seems we don't have this.
# while ((Get-Service WindowsAzureTelemetryService).Status -ne 'Running') { Start-Sleep -s 5 }
while ((Get-Service WindowsAzureGuestAgent).Status -ne 'Running') { Start-Sleep -s 5 }

# Remove a leftover answer file next to Sysprep.exe, if there is one.
# $env:SystemRoot already points at the Windows directory, so the path must not
# repeat "\windows"; otherwise the check can never find the file.
$UnattendFile = "$env:SystemRoot\System32\Sysprep\unattend.xml"
if (Test-Path $UnattendFile) {
    Remove-Item $UnattendFile -Force
}

& $env:SystemRoot\System32\Sysprep\Sysprep.exe /oobe /generalize /quiet /quit /mode:vm

# Poll every 10 seconds, printing the current state, until Sysprep has finished generalizing.
while ($true) {
    $imageState = Get-ItemProperty HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Setup\State | Select ImageState;
    if ($imageState.ImageState -ne 'IMAGE_STATE_GENERALIZE_RESEAL_TO_OOBE') {
        Write-Output $imageState.ImageState; Start-Sleep -s 10
    } else {
        break
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment