fix-apt-1es.yml 1.39 KB
Newer Older
Yuge Zhang's avatar
Yuge Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Fix apt-related issues on 1ES linux pipeline.

# 1ES runs an apt-get auto-upgrade periodically in the background.
# This leads to bad consequences:
# 1) apt is locked when install is actually needed
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
#
# The ultimate solution should be to upgrade the VM image correctly,
# but it's currently infeasible because of a resource group limitation.
# We introduce a workaround here by force-disabling the auto-upgrade and
# fixing the broken dependencies if an upgrade has already been accidentally run.
#
# This file can be removed once the VM image is updated to the latest version.

parameters:
# check_gpu: when true, the final step runs `nvidia-smi` to verify that the GPU
# is still usable. Leave it false (the default) for jobs that do not need a GPU.
- name: check_gpu
  type: boolean
  default: false

steps:

# Deliberately run without `set -e`: every command below must execute even if
# an earlier one fails, so that the apt lock is always released.
- displayName: (1ES) Disable apt upgrade
  script: |
    set -x
    sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
    sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install

# Verify that the GPU is still working.
# If the background upgrade ran too early, the GPU may already be beyond saving
# and the only remedy is to rerun the pipeline. That is irrelevant for jobs
# that never use the GPU, so this step only runs when `check_gpu` is true.
- displayName: (1ES) Check GPU status
  script: |
    echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
    nvidia-smi
  condition: and(succeeded(), ${{ parameters.check_gpu }})