OpenDAS / nni — commit 4e71ed62 (unverified)

Migrate pipeline to 1ES (#4986)

Authored by Yuge Zhang on Jul 08, 2022; committed via GitHub on Jul 08, 2022.
Parent commit: 570448ea

Changes: 23 files in total. Showing 20 changed files on this page, with 233 additions and 85 deletions (+233 -85); the remaining files are on the next page.
Changed files shown on this page:

nni/experiment/config/utils/internal.py            +2   -1
nni/retiarii/execution/base.py                      +11  -9
nni/retiarii/execution/cgo_engine.py                +13  -6
nni/retiarii/integration.py                         +64  -17
nni/retiarii/strategy/bruteforce.py                 +2   -0
nni/tools/gpu_tool/gpu_metrics_collector.py         +12  -7
pipelines/build-vm-image-linux.yml                  +4   -1
pipelines/build-vm-image-windows.yml                +3   -1
pipelines/full-test-compression.yml                 +12  -10
pipelines/full-test-hpo.yml                         +8   -8
pipelines/full-test-nas.yml                         +12  -7
pipelines/integration-test-hybrid.yml               +2   -1
pipelines/integration-test-local-linux.yml          +5   -1
pipelines/integration-test-local-windows.yml        +3   -1
pipelines/integration-test-remote-l2l.yml           +3   -2
pipelines/integration-test-remote-w2w.yml           +2   -2
pipelines/templates/build-vm-image-template.yml     +9   -2
pipelines/templates/fix-apt-1es.yml                 +37  -0  (new file)
pipelines/templates/save-crashed-info.yml           +25  -5
test/ut/tools/nnictl/test_kill_command.py           +4   -4
nni/experiment/config/utils/internal.py

@@ -171,7 +171,8 @@ def load_training_service_config(config) -> TrainingServiceConfig:
     cls = _get_ts_config_class(config['platform'])
     if cls is not None:
         return cls(**config)
-    return config  # not valid json, don't touch
+    # not valid json, don't touch
+    return config  # type: ignore

 def _get_ts_config_class(platform: str) -> type[TrainingServiceConfig] | None:
     from ..training_service import TrainingServiceConfig  # avoid circular import
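For readers unfamiliar with this helper, here is a minimal, self-contained sketch of the dispatch pattern it implements: look up a config class by the 'platform' key and instantiate it, otherwise return the input untouched. The _DemoTrainingServiceConfig and _DemoLocalConfig classes and their fields are hypothetical stand-ins, not NNI's real training service configs.

# Illustrative sketch only; the _Demo* classes are made-up stand-ins.
from dataclasses import dataclass
from typing import Optional, Type


@dataclass
class _DemoTrainingServiceConfig:
    platform: str


@dataclass
class _DemoLocalConfig(_DemoTrainingServiceConfig):
    use_gpu: bool = False   # hypothetical field


def _get_ts_config_class(platform: str) -> Optional[Type[_DemoTrainingServiceConfig]]:
    # The real helper imports TrainingServiceConfig lazily to avoid a circular import.
    return {'local': _DemoLocalConfig}.get(platform)


def load_training_service_config(config):
    if isinstance(config, dict) and 'platform' in config:
        cls = _get_ts_config_class(config['platform'])
        if cls is not None:
            return cls(**config)
    # not valid config, don't touch
    return config


print(load_training_service_config({'platform': 'local', 'use_gpu': True}))
print(load_training_service_config('unrecognized input'))  # passes through unchanged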
nni/retiarii/execution/base.py

@@ -10,6 +10,7 @@ import string
 from typing import Any, Dict, Iterable, List

 from nni.experiment import rest
+from nni.retiarii.integration import RetiariiAdvisor
 from .interface import AbstractExecutionEngine, AbstractGraphListener
 from .utils import get_mutation_summary

@@ -75,20 +76,21 @@ class BaseExecutionEngine(AbstractExecutionEngine):
         self.url_prefix = rest_url_prefix

         self._listeners: List[AbstractGraphListener] = []

-        # register advisor callbacks
-        advisor = get_advisor()
-        advisor.send_trial_callback = self._send_trial_callback
-        advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
-        advisor.trial_end_callback = self._trial_end_callback
-        advisor.intermediate_metric_callback = self._intermediate_metric_callback
-        advisor.final_metric_callback = self._final_metric_callback
-
         self._running_models: Dict[int, Model] = dict()
         self._history: List[Model] = []

         self.resources = 0

+        # register advisor callbacks
+        advisor: RetiariiAdvisor = get_advisor()
+        advisor.register_callbacks({
+            'send_trial': self._send_trial_callback,
+            'request_trial_jobs': self._request_trial_jobs_callback,
+            'trial_end': self._trial_end_callback,
+            'intermediate_metric': self._intermediate_metric_callback,
+            'final_metric': self._final_metric_callback
+        })
+
     def submit_models(self, *models: Model) -> None:
         for model in models:
             data = self.pack_model_data(model)
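Read together with the integration.py change further down, registration now goes through a single dictionary whose keys must match the names later passed to invoke_callback. A compact sketch of the handler signatures implied by this commit (DummyEngine is a stand-in for illustration, not BaseExecutionEngine):

from typing import Any


class DummyEngine:
    """Stand-in engine; signatures mirror the invoke_callback() calls in integration.py."""

    def _send_trial_callback(self, parameters: Any) -> None:
        print('send_trial', parameters)

    def _request_trial_jobs_callback(self, num_trials: int) -> None:
        print('request_trial_jobs', num_trials)

    def _trial_end_callback(self, parameter_id: int, success: bool) -> None:
        print('trial_end', parameter_id, success)

    def _intermediate_metric_callback(self, parameter_id: int, value: Any) -> None:
        print('intermediate_metric', parameter_id, value)

    def _final_metric_callback(self, parameter_id: int, value: Any) -> None:
        print('final_metric', parameter_id, value)

    def callback_dict(self) -> dict:
        # Keys are the same strings used with advisor.invoke_callback(...).
        return {
            'send_trial': self._send_trial_callback,
            'request_trial_jobs': self._request_trial_jobs_callback,
            'trial_end': self._trial_end_callback,
            'intermediate_metric': self._intermediate_metric_callback,
            'final_metric': self._final_metric_callback,
        }


engine = DummyEngine()
engine.callback_dict()['trial_end'](7, True)   # prints: trial_end 7 True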
nni/retiarii/execution/cgo_engine.py

@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from nni.common.device import GPUDevice, Device
 from nni.experiment.config.training_services import RemoteConfig
+from nni.retiarii.integration import RetiariiAdvisor
 from .interface import AbstractExecutionEngine, AbstractGraphListener, WorkerInfo
 from .. import codegen, utils
 from ..graph import Model, ModelStatus, MetricData, Node

@@ -28,6 +29,10 @@ from .base import BaseGraphData
 _logger = logging.getLogger(__name__)

+def _noop(*args, **kwargs):
+    pass
+
 @dataclass
 class TrialSubmission:
     model: Model

@@ -90,12 +95,14 @@ class CGOExecutionEngine(AbstractExecutionEngine):
         self._queue_lock = threading.Lock()

         # register advisor callbacks
-        advisor = get_advisor()
-        # advisor.send_trial_callback = self._send_trial_callback
-        # advisor.request_trial_jobs_callback = self._request_trial_jobs_callback
-        advisor.trial_end_callback = self._trial_end_callback
-        advisor.intermediate_metric_callback = self._intermediate_metric_callback
-        advisor.final_metric_callback = self._final_metric_callback
+        advisor: RetiariiAdvisor = get_advisor()
+        advisor.register_callbacks({
+            'send_trial': _noop,
+            'request_trial_jobs': _noop,
+            'trial_end': self._trial_end_callback,
+            'intermediate_metric': self._intermediate_metric_callback,
+            'final_metric': self._final_metric_callback
+        })

         self._stopped = False
         self._consumer_thread = threading.Thread(target=self._consume_models)
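A side effect worth calling out (my inference from this diff plus the integration.py change below, not something stated in the commit): because the new advisor queues every message until some handler consumes it, leaving a callback unregistered would let its messages accumulate. Registering _noop for the events the CGO engine ignores makes the discard explicit. A tiny hypothetical illustration:

# Hypothetical illustration of why a do-nothing handler is still registered.
def _noop(*args, **kwargs):
    pass


handlers = {'send_trial': _noop, 'request_trial_jobs': _noop}
call_queue = [('send_trial', [{'parameter_id': 0}]), ('request_trial_jobs', [1])]

remaining = []
for name, args in call_queue:
    handler = handlers.get(name)
    if handler is not None:
        handler(*args)          # consumed (and, for _noop, silently discarded)
    else:
        remaining.append((name, args))

print(remaining)   # [] -- nothing piles up for the ignored events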
nni/retiarii/integration.py

@@ -3,7 +3,7 @@
 import logging
 import os
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Dict, List, Tuple

 import nni
 from nni.common.serializer import PayloadTooLarge

@@ -21,6 +21,7 @@ _logger = logging.getLogger(__name__)
 class RetiariiAdvisor(MsgDispatcherBase):
     """
     The class is to connect Retiarii components to NNI backend.
+    It can be considered as a Python wrapper of NNI manager.

     It will function as the main thread when running a Retiarii experiment through NNI.
     Strategy will be launched as its thread, who will call APIs in execution engine. Execution

@@ -32,9 +33,6 @@ class RetiariiAdvisor(MsgDispatcherBase):
     The conversion advisor provides are minimum. It is only a send/receive module, and execution engine
     needs to handle all the rest.

-    FIXME
-        How does advisor exit when strategy exists?
-
     Attributes
     ----------
     send_trial_callback

@@ -61,6 +59,63 @@ class RetiariiAdvisor(MsgDispatcherBase):
         self.parameters_count = 0

+        # Sometimes messages arrive first before the callbacks get registered.
+        # Or in case that we allow engine to be absent during the experiment.
+        # Here we need to store the messages and invoke them later.
+        self.call_queue: List[Tuple[str, list]] = []
+
+    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]):
+        """
+        Register callbacks for NNI backend.
+
+        Parameters
+        ----------
+        callbacks
+            A dictionary of callbacks.
+            The key is the name of the callback. The value is the callback function.
+        """
+        self.send_trial_callback = callbacks.get('send_trial')
+        self.request_trial_jobs_callback = callbacks.get('request_trial_jobs')
+        self.trial_end_callback = callbacks.get('trial_end')
+        self.intermediate_metric_callback = callbacks.get('intermediate_metric')
+        self.final_metric_callback = callbacks.get('final_metric')
+
+        self.process_queued_callbacks()
+
+    def process_queued_callbacks(self) -> None:
+        """
+        Process callbacks in queue.
+        Consume the messages that haven't been handled previously.
+        """
+        processed_idx = []
+        for queue_idx, (call_name, call_args) in enumerate(self.call_queue):
+            if call_name == 'send_trial' and self.send_trial_callback is not None:
+                self.send_trial_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'request_trial_jobs' and self.request_trial_jobs_callback is not None:
+                self.request_trial_jobs_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'trial_end' and self.trial_end_callback is not None:
+                self.trial_end_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'intermediate_metric' and self.intermediate_metric_callback is not None:
+                self.intermediate_metric_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+            if call_name == 'final_metric' and self.final_metric_callback is not None:
+                self.final_metric_callback(*call_args)  # pylint: disable=not-callable
+                processed_idx.append(queue_idx)
+
+        # Remove processed messages
+        for idx in reversed(processed_idx):
+            self.call_queue.pop(idx)
+
+    def invoke_callback(self, name: str, *args: Any) -> None:
+        """
+        Invoke callback.
+        """
+        self.call_queue.append((name, list(args)))
+        self.process_queued_callbacks()
+
     def handle_initialize(self, data):
         """callback for initializing the advisor
         Parameters

@@ -140,8 +195,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
         # nevertheless, there could still be blocked by pipe / nni-manager
         self.send(CommandType.NewTrialJob, send_payload)
-        if self.send_trial_callback is not None:
-            self.send_trial_callback(parameters)  # pylint: disable=not-callable
+        self.invoke_callback('send_trial', parameters)
         return self.parameters_count

     def mark_experiment_as_ending(self):

@@ -149,8 +203,7 @@ class RetiariiAdvisor(MsgDispatcherBase):
     def handle_request_trial_jobs(self, num_trials):
         _logger.debug('Request trial jobs: %s', num_trials)
-        if self.request_trial_jobs_callback is not None:
-            self.request_trial_jobs_callback(num_trials)  # pylint: disable=not-callable
+        self.invoke_callback('request_trial_jobs', num_trials)

     def handle_update_search_space(self, data):
         _logger.debug('Received search space: %s', data)

@@ -158,22 +211,16 @@ class RetiariiAdvisor(MsgDispatcherBase):
     def handle_trial_end(self, data):
         _logger.debug('Trial end: %s', data)
-        if self.trial_end_callback is not None:
-            self.trial_end_callback(nni.load(data['hyper_params'])['parameter_id'],  # pylint: disable=not-callable
-                                    data['event'] == 'SUCCEEDED')
+        self.invoke_callback('trial_end', nni.load(data['hyper_params'])['parameter_id'], data['event'] == 'SUCCEEDED')

     def handle_report_metric_data(self, data):
         _logger.debug('Metric reported: %s', data)
         if data['type'] == MetricType.REQUEST_PARAMETER:
             raise ValueError('Request parameter not supported')
         elif data['type'] == MetricType.PERIODICAL:
-            if self.intermediate_metric_callback is not None:
-                self.intermediate_metric_callback(data['parameter_id'],  # pylint: disable=not-callable
-                                                  self._process_value(data['value']))
+            self.invoke_callback('intermediate_metric', data['parameter_id'], self._process_value(data['value']))
         elif data['type'] == MetricType.FINAL:
-            if self.final_metric_callback is not None:
-                self.final_metric_callback(data['parameter_id'],  # pylint: disable=not-callable
-                                           self._process_value(data['value']))
+            self.invoke_callback('final_metric', data['parameter_id'], self._process_value(data['value']))

     @staticmethod
     def _process_value(value) -> Any:  # hopefully a float
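The queueing logic above is the heart of this commit: as the added comment notes, messages from the NNI manager may arrive before an execution engine has registered its handlers (or while no engine is attached at all), so they are buffered and replayed once register_callbacks runs. A minimal standalone sketch of that behaviour, using a toy class rather than the real RetiariiAdvisor:

from typing import Any, Callable, Dict, List, Tuple


class MiniAdvisor:
    """Toy illustration of queue-then-replay; not the real RetiariiAdvisor API."""

    def __init__(self) -> None:
        self.callbacks: Dict[str, Callable[..., None]] = {}
        self.call_queue: List[Tuple[str, list]] = []

    def invoke_callback(self, name: str, *args: Any) -> None:
        # Enqueue first, then try to flush: a message whose handler is not
        # registered yet simply waits in the queue.
        self.call_queue.append((name, list(args)))
        self._flush()

    def register_callbacks(self, callbacks: Dict[str, Callable[..., None]]) -> None:
        self.callbacks.update(callbacks)
        self._flush()   # replay anything that arrived before registration

    def _flush(self) -> None:
        remaining = []
        for name, args in self.call_queue:
            handler = self.callbacks.get(name)
            if handler is not None:
                handler(*args)
            else:
                remaining.append((name, args))
        self.call_queue = remaining


advisor = MiniAdvisor()
advisor.invoke_callback('trial_end', 42, True)      # engine not attached yet: message is queued
advisor.register_callbacks({'trial_end': print})    # replays the queued message: prints "42 True"
advisor.invoke_callback('trial_end', 43, False)     # handled immediately: prints "43 False"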
nni/retiarii/strategy/bruteforce.py

@@ -127,9 +127,11 @@ class Random(BaseStrategy):
                 if budget_exhausted():
                     return
                 time.sleep(self._polling_interval)
+                _logger.debug('Still waiting for resource.')
             try:
                 model = get_targeted_model(base_model, applied_mutators, sample)
                 if filter_model(self.filter, model):
+                    _logger.debug('Submitting model: %s', model)
                     submit_models(model)
             except InvalidMutation as e:
                 _logger.warning(f'Invalid mutation: {e}. Skip.')
nni/tools/gpu_tool/gpu_metrics_collector.py

@@ -15,14 +15,19 @@ def main(argv):
     metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']

     cmd = 'nvidia-smi -q -x'.split()
-    while(True):
-        try:
-            smi_output = subprocess.check_output(cmd)
-        except Exception:
-            traceback.print_exc()
+    retry = 0
+    while True:
+        smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        if smi.returncode != 0:
+            retry += 1
+            print(f'gpu_metrics_collector error: nvidia-smi return code is {smi.returncode}', file=sys.stderr)
+            print('=' * 20 + f'\nCaptured stdout: {smi.stdout}', file=sys.stderr)
+            print('=' * 20 + f'\nCaptured stderr: {smi.stderr}', file=sys.stderr)
             gen_empty_gpu_metric(metrics_output_dir)
-            break
-        parse_nvidia_smi_result(smi_output, metrics_output_dir)
+            if retry >= 5:
+                break
+        else:
+            parse_nvidia_smi_result(smi.stdout, metrics_output_dir)
         # TODO: change to sleep time configurable via arguments
         time.sleep(5)
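As a self-contained reference, here is a sketch of the new polling loop in plain Python: subprocess.run() with a timeout replaces check_output(), non-zero exit codes are logged to stderr and tolerated, and the loop gives up after five failed invocations. The two helper functions are stubbed purely for illustration; the real ones live in gpu_metrics_collector.py.

# Sketch mirroring the new loop; the stubs below stand in for the real helpers.
import subprocess
import sys
import time


def parse_nvidia_smi_result(xml_bytes: bytes, output_dir: str) -> None:   # stub
    print(f'parsed {len(xml_bytes)} bytes of nvidia-smi XML into {output_dir}')


def gen_empty_gpu_metric(output_dir: str) -> None:                        # stub
    print(f'wrote an empty metric file into {output_dir}')


def collect(metrics_output_dir: str) -> None:
    cmd = 'nvidia-smi -q -x'.split()
    retry = 0
    while True:
        # timeout=20 raises subprocess.TimeoutExpired if nvidia-smi hangs.
        smi = subprocess.run(cmd, timeout=20, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if smi.returncode != 0:
            retry += 1
            print(f'nvidia-smi return code is {smi.returncode}', file=sys.stderr)
            gen_empty_gpu_metric(metrics_output_dir)
            if retry >= 5:          # stop after five failed invocations
                break
        else:
            parse_nvidia_smi_result(smi.stdout, metrics_output_dir)
        time.sleep(5)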
pipelines/build-vm-image-linux.yml

+# FIXME: This pipeline is broken due to resource group location limitation.
+
 trigger: none
 pr: none

@@ -11,6 +13,7 @@ variables:
 jobs:
 - job: linux
-  pool: nni-it
+  pool:
+    vmImage: ubuntu-latest
   steps:
   - template: templates/build-vm-image-template.yml
pipelines/build-vm-image-windows.yml

+# FIXME: This pipeline is broken due to resource group location limitation.
+
 trigger: none
 pr: none

@@ -11,7 +13,7 @@ variables:
 jobs:
 - job: windows
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 90
   steps:
   - template: templates/build-vm-image-template.yml
pipelines/full-test-compression.yml

@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
     - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
+          python_env: venv
       - template: templates/install-nni.yml

@@ -48,10 +51,9 @@ stages:
       - script: |
           cd test/algo
           python -m pytest compression
-        displayName: compression unit test
+        displayName: Compression unit test

-      # add back after we complete the 1ES pool...
-      # - script: |
-      #     cd test
-      #     source scripts/model_compression.sh
-      #   displayName: Model compression test
+      - script: |
+          cd test
+          source scripts/model_compression.sh
+        displayName: Model compression test
pipelines/full-test-hpo.yml

@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
    - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
+          python_env: venv
       - template: templates/install-nni.yml

@@ -57,10 +60,7 @@ stages:
       - script: |
           cd test
-          python training_service/nnitest/run_tests.py \
-              --config training_service/config/integration_tests.yml \
-              --ts local \
-              --exclude mnist-pytorch-local-gpu
+          python training_service/nnitest/run_tests.py --config training_service/config/integration_tests.yml --ts local
        displayName: Integration test

       # TODO: should add a test on platforms other than linux
pipelines/full-test-nas.yml

@@ -31,15 +31,18 @@ stages:
     condition: and(succeeded(), ne(dependencies.filter.outputs['check.execution.skipsubsequent'], 'true'))
     jobs:
    - job: linux
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: ubuntu-latest
+      pool: nni-it-1es-11
       timeoutInMinutes: 60
       steps:
+      - template: templates/fix-apt-1es.yml
+        parameters:
+          check_gpu: true
       - template: templates/install-dependencies.yml
         parameters:
-          platform: ubuntu-latest
+          platform: ubuntu-latest-gpu
+          python_env: venv
       - template: templates/install-nni.yml

@@ -51,15 +54,17 @@ stages:
        displayName: NAS test

    - job: windows
-      # move back after we complete the 1ES pool...
-      pool:
-        vmImage: windows-latest
+      pool: nni-it-1es-windows
      timeoutInMinutes: 60
      steps:
+      # FIXME: Windows should use GPU,
+      # but it's not used now since driver is not installed in the image.
      - template: templates/install-dependencies.yml
        parameters:
          platform: windows
+          python_env: noop
      - template: templates/install-nni.yml
        parameters:
pipelines/integration-test-hybrid.yml

@@ -7,11 +7,12 @@ schedules:
 jobs:
 - job: hybrid
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 90
   steps:
   # FIXME: should use GPU here
+  - template: templates/fix-apt-1es.yml
   - template: templates/install-dependencies.yml
     parameters:
pipelines/integration-test-local-linux.yml

@@ -7,10 +7,14 @@ schedules:
 jobs:
 - job: linux
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 60
   steps:
+  - template: templates/fix-apt-1es.yml
+    parameters:
+      check_gpu: true
   - template: templates/install-dependencies.yml
     parameters:
       platform: ubuntu-latest-gpu
pipelines/integration-test-local-windows.yml

@@ -7,7 +7,7 @@ schedules:
 jobs:
 - job: windows
-  pool: nni-it-windows
+  pool: nni-it-1es-windows
   timeoutInMinutes: 120
   steps:

@@ -43,3 +43,5 @@ jobs:
   displayName: Integration test

 - template: templates/save-crashed-info.yml
+  parameters:
+    training_service: local
pipelines/integration-test-remote-l2l.yml

@@ -12,10 +12,11 @@ schedules:
 jobs:
 - job: remote_linux2linux
-  pool: nni-it
+  pool: nni-it-1es-11
   timeoutInMinutes: 120
   steps:
+  - template: templates/fix-apt-1es.yml
   # FIXME: GPU is not supported yet.
   # Change to ubuntu-latest-gpu when it's done.

@@ -97,4 +98,4 @@ jobs:
 - template: templates/save-crashed-info.yml
   parameters:
-    remote: true
+    training_service: remote
pipelines/integration-test-remote-w2w.yml

@@ -11,7 +11,7 @@ variables:
 jobs:
 - job: remote_windows2windows
-  pool: nni-it-windows
+  pool: nni-it-1es-windows
   timeoutInMinutes: 120
   steps:

@@ -49,4 +49,4 @@ jobs:
 - template: templates/save-crashed-info.yml
   parameters:
-    remote: true
+    training_service: remote
pipelines/templates/build-vm-image-template.yml

@@ -8,8 +8,11 @@ steps:
 # 1. Assign the role following the instruction.
 # 2. Assign contributor role of the resource group to the identity.
 # 3. Add the identity to VMSS.
+#
+# Update 2022/7 (running on Microsoft-hosted agents).
+# Use a service principal. This service principal must be assigned contributor access to the resource group.

 - script: |
-    az login --identity --allow-no-subscriptions --username $(identity_id)
+    az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
   displayName: Login to Azure

 # Make sure all these are registered.

@@ -65,7 +68,8 @@ steps:
     export IP_ADDRESS=$(curl -s ifconfig.me)
     export VERSION=$(date "+%Y").$(date "+%m%d").$(date "+%H%M%S")
     export CONFIG_PATH=$(packer_config).json
-    sed -i -e "s/<client_id>/$(identity_id)/g" $CONFIG_PATH
+    sed -i -e "s/<client_id>/$(client_id)/g" $CONFIG_PATH
+    sed -i -e "s/<client_secret>/$(client_secret)/g" $CONFIG_PATH
     sed -i -e "s/<subscription_id>/$(subscription_id)/g" $CONFIG_PATH
     sed -i -e "s/<managed_image_name>/$(managed_image_name)/g" $CONFIG_PATH
     sed -i -e "s/<resource_group>/$(resource_group)/g" $CONFIG_PATH

@@ -113,3 +117,6 @@ steps:
 # az vmss update -n nni-it -g nni --set virtualMachineProfile.storageProfile.osDisk.diskSizeGb=50
 #
 # No need to update the image every time, because it's already set to latest.
+#
+# NOTE: After using 1ES pool, the pool image has to be updated manually to the latest version.
+# However, no successful build has been performed yet, because of resource shortage in Southeast Asia.
pipelines/templates/fix-apt-1es.yml  (new file, mode 100644)

# Fix apt-related issues on 1ES linux pipeline.
# 1ES has an auto-upgraded with apt-get running in the background, periodically.
# This leads to bad consequences:
# 1) apt is locked when install is actually needed
# 2) unattended upgrade could possibly break the GPU driver version, and crash nvidia-smi.
#
# The ultimate solution should be to upgrade the VM image correctly,
# but it's currently infeasible because of a resource group limitation.
# We introduce a workaround here by force disabling the auto-upgrade and,
# fix the broken dependencies if upgrade has already been accidentally run.
#
# This file can be removed after image is updated to latest.

parameters:
- name: check_gpu
  type: boolean
  default: false

steps:

# Don't set -e
# Always make sure the lock is released.
- script: |
    set -x
    sudo bash test/vso_tools/build_vm/disable_apt_daily.sh
    sudo apt-get -o DPkg::Lock::Timeout=120 --fix-broken -y install
  displayName: (1ES) Disable apt upgrade

# Make sure GPU isn't broken.
# Sometimes we can't save the GPU because upgrade runs too early.
# We have to rerun the pipeline if unlucky. But it doesn't matter if we don't intend to use GPU at all.
- script: |
    echo "There can be unlucky cases when we can't save the GPU. If nvidia-smi fails, try to rerun the failed jobs."
    nvidia-smi
  displayName: (1ES) Check GPU status
  condition: and(succeeded(), ${{ parameters.check_gpu }})
pipelines/templates/save-crashed-info.yml

@@ -2,9 +2,9 @@
 # so that further offline investigations are possible.

 parameters:
-- name: remote
-  type: boolean
-  default: false
+- name: training_service
+  type: string
+  default: unknown

 steps:

@@ -16,11 +16,16 @@ steps:
   condition: and(failed(), not(contains(variables['Agent.OS'], 'Windows')))
   displayName: (failed) (POSIX) Latest experiment directory

+- script: |
+    cp -r /tmp/$USER/nni ${EXPERIMENT_DIR}/local && echo "Copy successful" || echo "Copy failed"
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), not(contains(variables['Agent.OS'], 'Windows')))
+  displayName: (failed) (POSIX) Harvest GPU scheduler logs
+
 - script: |
     set -e
     export EXPERIMENT_ID=$(echo ${EXPERIMENT_DIR} | sed -e 's/\/.*\///g')
     sudo docker cp $(Build.BuildId):/tmp/nni-experiments/${EXPERIMENT_ID} ${EXPERIMENT_DIR}/remote && echo "Copy successful" || echo "Copy failed"
-  condition: and(variables['experiment_dir'], ${{ parameters.remote }}, not(contains(variables['Agent.OS'], 'Windows')))
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), not(contains(variables['Agent.OS'], 'Windows')))
   displayName: (failed) (POSIX) Harvest remote trial logs

 - powershell: |

@@ -30,6 +35,21 @@ steps:
   condition: and(failed(), contains(variables['Agent.OS'], 'Windows'))
   displayName: (failed) (Windows) Latest experiment directory

+- powershell: |
+    $latestDir = Get-Item $(experiment_dir)
+    $tmpPath = "${env:Temp}\${env:UserName}\nni"
+    $destPath = "${latestDir}\local"
+    if (Test-Path $tmpPath) {
+      Write-Host "Copying $tmpPath to $destPath"
+      Copy-Item $tmpPath -Destination $destPath -Recurse
+    }
+    else {
+      Write-host "$tmpPath doesn't exist"
+    }
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'local'), contains(variables['Agent.OS'], 'Windows'))
+  displayName: (failed) (Windows) Harvest GPU scheduler logs
+
 - powershell: |
     $latestDir = Get-Item $(experiment_dir)
     $experimentId = $latestDir.name

@@ -43,7 +63,7 @@ steps:
     else {
       Write-host "$remotePath doesn't exist"
     }
-  condition: and(variables['experiment_dir'], ${{ parameters.remote }}, contains(variables['Agent.OS'], 'Windows'))
+  condition: and(variables['experiment_dir'], eq('${{ parameters.training_service }}', 'remote'), contains(variables['Agent.OS'], 'Windows'))
   displayName: (failed) (Windows) Harvest remote trial logs

 - publish: $(experiment_dir)
test/ut/tools/nnictl/test_kill_command.py

@@ -53,8 +53,9 @@ def test_kill_process_slow_no_patience():
     start_time = time.time()
     kill_command(process.pid, timeout=1)  # didn't wait long enough
     end_time = time.time()
-    if sys.platform == 'linux':
-        assert 0.5 < end_time - start_time < 2
+    # FIXME: on non-linux, seems that the time of termination can't be controlled
+    if sys.platform == 'linux':
+        # There was assert 0.5 < end_time - start_time. It's not stable.
+        assert end_time - start_time < 2
         assert process.poll() is None
         assert _check_pid_running(process.pid)
     else:

@@ -73,8 +74,7 @@ def test_kill_process_slow_patiently():
     kill_command(process.pid, timeout=3)  # wait long enough
     end_time = time.time()
     assert end_time - start_time < 5
-    if sys.platform == 'linux':
-        assert end_time - start_time > 1  # I don't know why windows is super fast
+    # assert end_time - start_time > 1  # This check is disabled because it's not stable

 @pytest.mark.skipif(sys.platform != 'linux', reason='Signal issues on non-linux.')