OpenDAS / nni · Commits

Unverified commit 4e71ed62, authored Jul 08, 2022 by Yuge Zhang, committed by GitHub on Jul 08, 2022
Parent commit: 570448ea

Migrate pipeline to 1ES (#4986)

Showing 20 of 23 changed files, with 233 additions and 85 deletions.
Changed files in this view:

nni/experiment/config/utils/internal.py (+2, -1)
nni/retiarii/execution/base.py (+11, -9)
nni/retiarii/execution/cgo_engine.py (+13, -6)
nni/retiarii/integration.py (+64, -17)
nni/retiarii/strategy/bruteforce.py (+2, -0)
nni/tools/gpu_tool/gpu_metrics_collector.py (+12, -7)
pipelines/build-vm-image-linux.yml (+4, -1)
pipelines/build-vm-image-windows.yml (+3, -1)
pipelines/full-test-compression.yml (+12, -10)
pipelines/full-test-hpo.yml (+8, -8)
pipelines/full-test-nas.yml (+12, -7)
pipelines/integration-test-hybrid.yml (+2, -1)
pipelines/integration-test-local-linux.yml (+5, -1)
pipelines/integration-test-local-windows.yml (+3, -1)
pipelines/integration-test-remote-l2l.yml (+3, -2)
pipelines/integration-test-remote-w2w.yml (+2, -2)
pipelines/templates/build-vm-image-template.yml (+9, -2)
pipelines/templates/fix-apt-1es.yml (+37, -0, new file)
pipelines/templates/save-crashed-info.yml (+25, -5)
test/ut/tools/nnictl/test_kill_command.py (+4, -4)
nni/experiment/config/utils/internal.py

```diff
@@ -171,7 +171,8 @@ def load_training_service_config(config) -> TrainingServiceConfig:
         cls = _get_ts_config_class(config['platform'])
         if cls is not None:
             return cls(**config)
-    return config  # not valid json, don't touch
+    # not valid json, don't touch
+    return config  # type: ignore
 
 def _get_ts_config_class(platform: str) -> type[TrainingServiceConfig] | None:
     from ..training_service import TrainingServiceConfig  # avoid circular import
```
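The hunk above only touches the fallback branch, but it is easier to read with the surrounding dispatch spelled out. Below is a minimal, self-contained sketch of the same pattern — map a config dict to a config class via its 'platform' key, and return the input untouched when nothing matches. The class names and fields here are hypothetical stand-ins, not NNI's actual training-service config classes.

```python
from dataclasses import dataclass, field

# Hypothetical configs standing in for NNI's TrainingServiceConfig subclasses.
@dataclass
class LocalConfig:
    platform: str = 'local'
    use_active_gpu: bool = False

@dataclass
class RemoteConfig:
    platform: str = 'remote'
    machine_list: list = field(default_factory=list)

_PLATFORM_TO_CLASS = {'local': LocalConfig, 'remote': RemoteConfig}

def load_config(config):
    """Instantiate the matching config class; return the input untouched otherwise."""
    if isinstance(config, dict) and 'platform' in config:
        cls = _PLATFORM_TO_CLASS.get(config['platform'])
        if cls is not None:
            return cls(**config)
    # not a recognized platform dict, don't touch
    return config

print(load_config({'platform': 'local', 'use_active_gpu': True}))  # -> LocalConfig(...)
print(load_config('some opaque value'))                            # -> returned as-is
```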
nni/retiarii/execution/base.py

```diff
@@ -10,6 +10,7 @@ import string
 from typing import Any, Dict, Iterable, List
 
 from nni.experiment import rest
+from nni.retiarii.integration import RetiariiAdvisor
 from .interface import AbstractExecutionEngine, AbstractGraphListener
 from .utils import get_mutation_summary
@@ -75,20 +76,21 @@ class BaseExecutionEngine(AbstractExecutionEngine):
         self.url_prefix = rest_url_prefix
 
         self._listeners: List[AbstractGraphListener] = []
 
-        # register advisor callbacks
-        advisor = get_advisor()
-        advisor.send_trial_callback = self._send_trial_callback
-        advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
-        advisor.trial_end_callback = self._trial_end_callback
-        advisor.intermediate_metric_callback = self._intermediate_metric_callback
-        advisor.final_metric_callback = self._final_metric_callback
-
         self._running_models: Dict[int, Model] = dict()
         self._history: List[Model] = []
 
         self.resources = 0
 
+        # register advisor callbacks
+        advisor: RetiariiAdvisor = get_advisor()
+        advisor.register_callbacks({
+            'send_trial': self._send_trial_callback,
+            'request_trial_jobs': self._request_trial_jobs_callback,
+            'trial_end': self._trial_end_callback,
+            'intermediate_metric': self._intermediate_metric_callback,
+            'final_metric': self._final_metric_callback
+        })
+
     def submit_models(self, *models: Model) -> None:
         for model in models:
             data = self.pack_model_data(model)
```
nni/retiarii/execution/cgo_engine.py

```diff
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from nni.common.device import GPUDevice, Device
 from nni.experiment.config.training_services import RemoteConfig
+from nni.retiarii.integration import RetiariiAdvisor
 from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo
 from .. import codegen, utils
 from ..graph import Model, ModelStatus, MetricData, Node
@@ -28,6 +29,10 @@ from .base import BaseGraphData
 _logger = logging.getLogger(__name__)
 
 
+def _noop(*args, **kwargs):
+    pass
+
+
 @dataclass
 class TrialSubmission:
     model: Model
@@ -90,12 +95,14 @@ class CGOExecutionEngine(AbstractExecutionEngine):
         self._queue_lock = threading.Lock()
 
         # register advisor callbacks
-        advisor = get_advisor()
-        # advisor.send_trial_callback = self._send_trial_callback
-        # advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
-        advisor.trial_end_callback = self._trial_end_callback
-        advisor.intermediate_metric_callback = self._intermediate_metric_callback
-        advisor.final_metric_callback = self._final_metric_callback
+        advisor: RetiariiAdvisor = get_advisor()
+        advisor.register_callbacks({
+            'send_trial': _noop,
+            'request_trial_jobs': _noop,
+            'trial_end': self._trial_end_callback,
+            'intermediate_metric': self._intermediate_metric_callback,
+            'final_metric': self._final_metric_callback
+        })
 
         self._stopped = False
         self._consumer_thread = threading.Thread(target=self._consume_models)
```
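Both execution engines now hand the advisor a single dictionary of callbacks instead of assigning five attributes one by one, and CGOExecutionEngine plugs `_noop` into the slots it does not react to. The following is a minimal sketch of that consumer-side pattern with hypothetical classes; it is not NNI's actual advisor or engine, just an illustration of the registration shape.

```python
from typing import Callable, Dict

def _noop(*args, **kwargs):
    # placeholder for callbacks this engine has no use for
    pass

class DummyAdvisor:
    """Hypothetical stand-in for the callback registry on RetiariiAdvisor."""
    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]) -> None:
        for name, fn in callbacks.items():
            setattr(self, f'{name}_callback', fn)

class DummyEngine:
    def __init__(self, advisor: DummyAdvisor) -> None:
        # register only the events this engine cares about; the rest are no-ops
        advisor.register_callbacks({
            'send_trial': _noop,
            'request_trial_jobs': _noop,
            'trial_end': self._on_trial_end,
            'intermediate_metric': _noop,
            'final_metric': self._on_final_metric,
        })

    def _on_trial_end(self, parameter_id: int, success: bool) -> None:
        print(f'trial {parameter_id} ended, success={success}')

    def _on_final_metric(self, parameter_id: int, value) -> None:
        print(f'trial {parameter_id} reported {value}')

advisor = DummyAdvisor()
engine = DummyEngine(advisor)
advisor.trial_end_callback(1, True)  # simulate the backend firing an event
```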
nni/retiarii/integration.py

```diff
@@ -3,7 +3,7 @@
 import logging
 import os
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Dict, List, Tuple
 
 import nni
 from nni.common.serializer import PayloadTooLarge
@@ -21,6 +21,7 @@ _logger = logging.getLogger(__name__)
 class RetiariiAdvisor(MsgDispatcherBase):
     """
     The class is to connect Retiarii components to NNI backend.
+    It can be considered as a Python wrapper of NNI manager.
 
     It will function as the main thread when running a Retiarii experiment through NNI.
     Strategy will be launched as its thread, who will call APIs in execution engine. Execution
@@ -32,9 +33,6 @@ class RetiariiAdvisor(MsgDispatcherBase):
     The conversion advisor provides are minimum. It is only a send/receive module, and execution engine
     needs to handle all the rest.
 
-    FIXME
-        How does advisor exit when strategy exists?
-
     Attributes
     ----------
     send_trial_callback
@@ -61,6 +59,63 @@ class RetiariiAdvisor(MsgDispatcherBase):
         self.parameters_count = 0
 
+        # Sometimes messages arrive first before the callbacks get registered.
+        # Or in case that we allow engine to be absent during the experiment.
+        # Here we need to store the messages and invoke them later.
+        self.call_queue: List[Tuple[str, list]] = []
+
+    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
+        """
+        Register callbacks for NNI backend.
+
+        Parameters
+        ----------
+        callbacks
+            A dictionary of callbacks.
+            The key is the name of the callback. The value is the callback function.
+        """
+        self.send_trial_callback = callbacks.get('send_trial')
+        self.request_trial_jobs_callback = callbacks.get('request_trial_jobs')
+        self.trial_end_callback = callbacks.get('trial_end')
+        self.intermediate_metric_callback = callbacks.get('intermediate_metric')
+        self.final_metric_callback = callbacks.get('final_metric')
+
+        self.process_queued_callbacks()
+
+    def process_queued_callbacks(self) -> None:
+        """
+        Process callbacks in queue.
+        Consume the messages that haven't been handled previously.
+        """
+        processed_idx = []
+        for queue_idx, (call_name, call_args) in enumerate(self.call_queue):
+            if call_name == 'send_trial' and self.send_trial_callback is not None:
+                self.send_trial_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'request_trial_jobs' and self.request_trial_jobs_callback is not None:
+                self.request_trial_jobs_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'trial_end' and self.trial_end_callback is not None:
+                self.trial_end_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'intermediate_metric' and self.intermediate_metric_callback is not None:
+                self.intermediate_metric_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'final_metric' and self.final_metric_callback is not None:
+                self.final_metric_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+
+        # Remove processed messages
+        for idx in reversed(processed_idx):
+            self.call_queue.pop(idx)
+
+    def invoke_callback(self, name: str, *args: Any) -> None:
+        """
+        Invoke callback.
+        """
+        self.call_queue.append((name, list(args)))
+        self.process_queued_callbacks()
+
     def handle_initialize(self, data):
         """callback for initializing the advisor
 
         Parameters
@@ -140,8 +195,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
         # nevertheless, there could still be blocked by pipe / nni-manager
         self.send(CommandType.NewTrialJob, send_payload)
-        if self.send_trial_callback is not None:
-            self.send_trial_callback(parameters)  # pylint: disable=not-callable
+        self.invoke_callback('send_trial', parameters)
         return self.parameters_count
 
     def mark_experiment_as_ending(self):
@@ -149,8 +203,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
     def handle_request_trial_jobs(self, num_trials):
         _logger.debug('Request trial jobs: %s', num_trials)
-        if self.request_trial_jobs_callback is not None:
-            self.request_trial_jobs_callback(num_trials)  # pylint: disable=not-callable
+        self.invoke_callback('request_trial_jobs', num_trials)
 
     def handle_update_search_space(self, data):
         _logger.debug('Received search space: %s', data)
@@ -158,22 +211,16 @@ class RetiariiAdvisor(MsgDispatcherBase):
     def handle_trial_end(self, data):
         _logger.debug('Trial end: %s', data)
-        if self.trial_end_callback is not None:
-            self.trial_end_callback(nni.load(data['hyper_params'])['parameter_id'],  # pylint: disable=not-callable
-                                    data['event'] == 'SUCCEEDED')
+        self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')
 
     def handle_report_metric_data(self, data):
         _logger.debug('Metric reported: %s', data)
         if data['type'] == MetricType.REQUEST_PARAMETER:
             raise ValueError('Request parameter not supported')
         elif data['type'] == MetricType.PERIODICAL:
-            if self.intermediate_metric_callback is not None:
-                self.intermediate_metric_callback(data['parameter_id'],  # pylint: disable=not-callable
-                                                  self._process_value(data['value']))
+            self.invoke_callback('intermediate_metric', data['parameter_id'], self._process_value(data['value']))
         elif data['type'] == MetricType.FINAL:
-            if self.final_metric_callback is not None:
-                self.final_metric_callback(data['parameter_id'],  # pylint: disable=not-callable
-                                           self._process_value(data['value']))
+            self.invoke_callback('final_metric', data['parameter_id'], self._process_value(data['value']))
 
     @staticmethod
     def _process_value(value) -> Any:  # hopefully a float
```
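The new `call_queue` exists because, per the comments in the hunk, commands from the NNI manager can arrive before any engine has called `register_callbacks` (or while no engine is attached at all). `invoke_callback` therefore appends every event to the queue and drains whatever currently has a handler, so nothing is lost and ordering is preserved. A stripped-down, self-contained sketch of that buffering behavior follows; the class and names are simplified illustrations, not the actual RetiariiAdvisor.

```python
from typing import Any, Callable, Dict, List, Optional, Tuple

class BufferedDispatcher:
    """Queue events until a handler for them is registered, then replay them in order."""

    def __init__(self) -> None:
        self._handlers: Dict[str, Optional[Callable[..., None]]] = {}
        self._queue: List[Tuple[str, list]] = []

    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]) -> None:
        self._handlers.update(callbacks)
        self._drain()

    def invoke_callback(self, name: str, *args: Any) -> None:
        # always enqueue first, then try to flush: late registration still sees events in order
        self._queue.append((name, list(args)))
        self._drain()

    def _drain(self) -> None:
        remaining = []
        for name, args in self._queue:
            handler = self._handlers.get(name)
            if handler is not None:
                handler(*args)
            else:
                remaining.append((name, args))
        self._queue = remaining

dispatcher = BufferedDispatcher()
dispatcher.invoke_callback('trial_end', 42, True)    # no handler yet -> buffered
dispatcher.register_callbacks({'trial_end': print})  # replays the buffered event
dispatcher.invoke_callback('trial_end', 43, False)   # dispatched immediately
```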
nni/retiarii/strategy/bruteforce.py

```diff
@@ -127,9 +127,11 @@ class Random(BaseStrategy):
                 if budget_exhausted():
                     return
                 time.sleep(self._polling_interval)
+                _logger.debug('Still waiting for resource.')
             try:
                 model = get_targeted_model(base_model, applied_mutators, sample)
                 if filter_model(self.filter, model):
+                    _logger.debug('Submitting model: %s', model)
                     submit_models(model)
             except InvalidMutation as e:
                 _logger.warning(f'Invalid mutation: {e}. Skip.')
```
nni/tools/gpu_tool/gpu_metrics_collector.py

```diff
@@ -15,14 +15,19 @@ def main(argv):
     metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
 
     cmd = 'nvidia-smi -q -x'.split()
-    while(True):
-        try:
-            smi_output = subprocess.check_output(cmd)
-        except Exception:
-            traceback.print_exc()
+    retry = 0
+    while True:
+        smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if smi.returncode != 0:
+            retry += 1
+            print(f'gpu_metrics_collector error: nvidia-smi return code is {smi.returncode}', file=sys.stderr)
+            print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr)
+            print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr)
             gen_empty_gpu_metric(metrics_output_dir)
-            break
-        parse_nvidia_smi_result(smi_output, metrics_output_dir)
+            if retry >= 5:
+                break
+        else:
+            parse_nvidia_smi_result(smi.stdout, metrics_output_dir)
         # TODO: change to sleep time configurable via arguments
         time.sleep(5)
```
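The old collector loop bailed out on the first `check_output` exception; the new one captures stdout and stderr from `subprocess.run`, logs them, emits an empty metric, and only gives up after five failed polls — which matters on 1ES agents where an unattended apt upgrade can temporarily break the GPU driver. The sketch below is a generic illustration of that retry-with-fallback polling idea, not the collector itself; it just prints where the real code writes metric files.

```python
import subprocess
import sys
import time

def poll_command(cmd, max_failures=5, interval=5):
    """Poll `cmd` periodically; log failures and stop only after `max_failures` bad exits."""
    failures = 0
    while True:
        result = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            failures += 1
            print(f'command failed with return code {result.returncode}', file=sys.stderr)
            print(f'captured stderr: {result.stderr!r}', file=sys.stderr)
            if failures >= max_failures:
                break  # give up after repeated failures instead of on the first one
        else:
            print(f'captured stdout: {result.stdout!r}')
        time.sleep(interval)  # like the collector, keep polling until told to stop

# Example (requires nvidia-smi on PATH): poll_command('nvidia-smi -q -x'.split())
```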
pipelines/build-vm-image-linux.yml

```diff
+# FIXME: This pipeline is broken due to resource group location limitation.
 trigger: none
 pr: none
@@ -11,6 +13,7 @@ variables:
 jobs:
 - job: linux
-  pool: nni-it
+  pool:
+    vmImage: ubuntu-latest
   steps:
   - template: templates/build-vm-image-template.yml
```
pipelines/build-vm-image-windows.yml

```diff
+# FIXME: This pipeline is broken due to resource group location limitation.
 trigger: none
 pr: none
@@ -11,7 +13,7 @@ variables:
 jobs:
 - job: windows
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 90
   steps:
   - template: templates/build-vm-image-template.yml
```
pipelines/full-test-compression.yml

```diff
@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
     - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
+
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
           python_env: venv
 
       - template: templates/install-nni.yml
@@ -48,10 +51,9 @@ stages:
       - script: |
           cd test/algo
           python -m pytest compression
-        displayName: compression unit test
+        displayName: Compression unit test
 
-      # add back after we complete the 1ES pool...
-      # - script: |
-      #     cd test
-      #     source scripts/model_compression.sh
-      #   displayName: Model compression test
+      - script: |
+          cd test
+          source scripts/model_compression.sh
+        displayName: Model compression test
```
pipelines/full-test-hpo.yml

```diff
@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
     - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
+
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
           python_env: venv
 
       - template: templates/install-nni.yml
@@ -57,10 +60,7 @@ stages:
       - script: |
           cd test
-          python training_service/nnitest/run_tests.py \
-            --config training_service/config/integration_tests.yml \
-            --ts local \
-            --exclude mnist-pytorch-local-gpu
+          python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local
         displayName: Integration test
 
       # TODO: should add a test on platforms other than linux
```
pipelines/full-test-nas.yml

```diff
@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
     - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
+
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
           python_env: venv
 
       - template: templates/install-nni.yml
@@ -51,15 +54,17 @@ stages:
         displayName: NAS test
 
     - job: windows
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: windows-latest
+      pool: nni-it-1es-windows
       timeoutInMinutes: 60
       steps:
+      # FIXME: Windows should use GPU,
+      # but it's not used now since driver is not installed in the image.
       - template: templates/install-dependencies.yml
         parameters:
           platform: windows
           python_env: noop
 
       - template: templates/install-nni.yml
         parameters:
```
pipelines/integration-test-hybrid.yml

```diff
@@ -7,11 +7,12 @@ schedules:
 jobs:
 - job: hybrid
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 90
   steps:
   # FIXME: should use GPU here
+  - template: templates/fix-apt-1es.yml
   - template: templates/install-dependencies.yml
     parameters:
```
pipelines/integration-test-local-linux.yml

```diff
@@ -7,10 +7,14 @@ schedules:
 jobs:
 - job: linux
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 60
   steps:
+  - template: templates/fix-apt-1es.yml
+    parameters:
+      check_gpu: true
+
   - template: templates/install-dependencies.yml
     parameters:
       platform: ubuntu-latest-gpu
```
pipelines/integration-test-local-windows.yml

```diff
@@ -7,7 +7,7 @@ schedules:
 jobs:
 - job: windows
-  pool: nni-it-windows
+  pool: nni-it-1es-windows
   timeoutInMinutes: 120
   steps:
@@ -43,3 +43,5 @@ jobs:
     displayName: Integration test
 
   - template: templates/save-crashed-info.yml
+    parameters:
+      training_service: local
```
pipelines/integration-test-remote-l2l.yml

```diff
@@ -12,10 +12,11 @@ schedules:
 jobs:
 - job: remote_linux2linux
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 120
   steps:
+  - template: templates/fix-apt-1es.yml
 
   # FIXME: GPU is not supported yet.
   # Change to ubuntu-latest-gpu when it's done.
@@ -97,4 +98,4 @@ jobs:
   - template: templates/save-crashed-info.yml
     parameters:
-      remote: true
+      training_service: remote
```
pipelines/integration-test-remote-w2w.yml

```diff
@@ -11,7 +11,7 @@ variables:
 jobs:
 - job: remote_windows2windows
-  pool: nni-it-windows
+  pool: nni-it-1es-windows
   timeoutInMinutes: 120
   steps:
@@ -49,4 +49,4 @@ jobs:
   - template: templates/save-crashed-info.yml
     parameters:
-      remote: true
+      training_service: remote
```
pipelines/templates/build-vm-image-template.yml

```diff
@@ -8,8 +8,11 @@ steps:
 # 1. Assign the role following the instruction.
 # 2. Assign contributor role of the resource group to the identity.
 # 3. Add the identity to VMSS.
+#
+# Update 2022/7 (running on Microsoft-hosted agents).
+# Use a service principal. This service principal must be assigned contributor access to the resource group.
 - script: |
-    az login --identity --allow-no-subscriptions --username $(identity_id)
+    az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
   displayName: Login to Azure
 
 # Make sure all these are registered.
@@ -65,7 +68,8 @@ steps:
     export IP_ADDRESS=$(curl -s ifconfig.me)
     export VERSION=$(date "+%Y").$(date "+%m%d").$(date "+%H%M%S")
     export CONFIG_PATH=$(packer_config).json
-    sed -i -e "s/<client_id>/$(identity_id)/g" $CONFIG_PATH
+    sed -i -e "s/<client_id>/$(client_id)/g" $CONFIG_PATH
+    sed -i -e "s/<client_secret>/$(client_secret)/g" $CONFIG_PATH
     sed -i -e "s/<subscription_id>/$(subscription_id)/g" $CONFIG_PATH
     sed -i -e "s/<managed_image_name>/$(managed_image_name)/g" $CONFIG_PATH
     sed -i -e "s/<resource_group>/$(resource_group)/g" $CONFIG_PATH
@@ -113,3 +117,6 @@ steps:
 # az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
 #
 # No need to update the image every time, because it's already set to latest.
+#
+# NOTE: After using 1ES pool, the pool image has to be updated manually to the latest version.
+# However, no successful build has been performed yet, because of resource shortage in Southeast Asia.
```
pipelines/templates/fix-apt-1es.yml (new file, 0 → 100644)

```yaml
# Fix apt-related issues on 1ES linux pipeline.
# 1ES has an auto-upgraded with apt-get running in the background, periodically.
# This leads to bad consequences:
# 1) apt is locked when install is actually needed
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
#
# The ultimate solution should be to upgrade the VM image correctly,
# but it's currently infeasible because of a resource group limitation.
# We introduce a workaround here by force disabling the auto-upgrade and,
# fix the broken dependencies if upgrade has already been accidentally run.
#
# This file can be removed after image is updated to latest.

parameters:
- name: check_gpu
  type: boolean
  default: false

steps:
# Don't set -e
# Always make sure the lock is released.
- script: |
    set -x
    sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
    sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install
  displayName: (1ES) Disable apt upgrade

# Make sure GPU isn't broken.
# Sometimes we can't save the GPU because upgrade runs too early.
# We have to rerun the pipeline if unlucky. But it doesn't matter if we don't intend to use GPU at all.
- script: |
    echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
    nvidia-smi
  displayName: (1ES) Check GPU status
  condition: and(succeeded(), ${{ parameters.check_gpu }})
```
pipelines/templates/save-crashed-info.yml

```diff
@@ -2,9 +2,9 @@
 # so that further offline investigations are possible.
 
 parameters:
-- name: remote
-  type: boolean
-  default: false
+- name: training_service
+  type: string
+  default: unknown
 
 steps:
@@ -16,11 +16,16 @@ steps:
   condition: and(failed(), not(contains(variables['Agent.OS'], 'Windows')))
   displayName: (failed) (POSIX) Latest experiment directory
 
+- script: |
+    cp -r /tmp/$USER/nni ${EXPERIMENT_DIR}/local && echo "Copy successful" || echo "Copy failed"
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), not(contains(variables['Agent.OS'], 'Windows')))
+  displayName: (failed) (POSIX) Harvest GPU scheduler logs
+
 - script: |
     set -e
     export EXPERIMENT_ID=$(echo ${EXPERIMENT_DIR} | sed -e 's/\/.*\///g')
     sudo docker cp $(Build.BuildId):/tmp/nni-experiments/${EXPERIMENT_ID} ${EXPERIMENT_DIR}/remote && echo "Copy successful" || echo "Copy failed"
-  condition: and(variables['experiment_dir'], ${{ parameters.remote }}, not(contains(variables['Agent.OS'], 'Windows')))
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), not(contains(variables['Agent.OS'], 'Windows')))
   displayName: (failed) (POSIX) Harvest remote trial logs
 
 - powershell: |
@@ -30,6 +35,21 @@ steps:
   condition: and(failed(), contains(variables['Agent.OS'], 'Windows'))
   displayName: (failed) (Windows) Latest experiment directory
 
+- powershell: |
+    $latestDir = Get-Item $(experiment_dir)
+    $tmpPath = "${env:Temp}\${env:UserName}\nni"
+    $destPath = "${latestDir}\local"
+    if (Test-Path $tmpPath) {
+      Write-Host "Copying $tmpPath to $destPath"
+      Copy-Item $tmpPath -Destination $destPath -Recurse
+    }
+    else {
+      Write-host "$tmpPath doesn't exist"
+    }
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), contains(variables['Agent.OS'], 'Windows'))
+  displayName: (failed) (Windows) Harvest GPU scheduler logs
+
 - powershell: |
     $latestDir = Get-Item $(experiment_dir)
     $experimentId = $latestDir.name
@@ -43,7 +63,7 @@ steps:
     else {
       Write-host "$remotePath doesn't exist"
    }
-  condition: and(variables['experiment_dir'], ${{ parameters.remote }}, contains(variables['Agent.OS'], 'Windows'))
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), contains(variables['Agent.OS'], 'Windows'))
   displayName: (failed) (Windows) Harvest remote trial logs
 
 - publish: $(experiment_dir)
```
test/ut/tools/nnictl/test_kill_command.py

```diff
@@ -53,8 +53,9 @@ def test_kill_process_slow_no_patience():
     start_time = time.time()
     kill_command(process.pid, timeout=1)  # didn't wait long enough
     end_time = time.time()
-    if sys.platform == 'linux':  # FIXME: on non-linux, seems that the time of termination can't be controlled
-        assert 0.5 < end_time - start_time < 2
+    if sys.platform == 'linux':
+        # There was assert 0.5 < end_time - start_time. It's not stable.
+        assert end_time - start_time < 2
         assert process.poll() is None
         assert _check_pid_running(process.pid)
     else:
@@ -73,8 +74,7 @@ def test_kill_process_slow_patiently():
     kill_command(process.pid, timeout=3)  # wait long enough
     end_time = time.time()
     assert end_time - start_time < 5
-    if sys.platform == 'linux':
-        assert end_time - start_time > 1
     # I don't know why windows is super fast
+    # assert end_time - start_time > 1  # This check is disabled because it's not stable
 
 @pytest.mark.skipif(sys.platform != 'linux', reason='Signal issues on non-linux.')
```