Commit fb26187d: Support adl pipeline (#3233)

Repository: OpenDAS / nni
Authored Jan 04, 2021 by SparkSnail; committed via GitHub on Jan 04, 2021 (unverified signature). Parent commit: ae50ed14.

Showing 8 changed files with 162 additions and 4 deletions (+162 / -4):
pipelines/integration-test-adl.yml                    +63 / -0
test/config/examples/cifar10-pytorch-adl.yml          +23 / -0
test/config/examples/cifar10_adl_search_space.json     +5 / -0
test/config/integration_tests.yml                       +4 / -0
test/config/integration_tests_tf2.yml                   +4 / -0
test/config/training_service.yml                       +19 / -0
test/nni_test/nnitest/generate_ts_config.py            +23 / -1
test/nni_test/nnitest/run_tests.py                     +21 / -3
pipelines/integration-test-adl.yml  (new file, mode 100644)

trigger: none
pr: none
schedules:
- cron: 0 16 * * *
  branches:
    include: [ master ]

jobs:
- job: adl
  pool: NNI CI KUBE CLI
  timeoutInMinutes: 120

  steps:
  - script: |
      export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
      echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
      echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
      echo "Working directory: ${PWD}"
      echo "NNI version: ${NNI_RELEASE}"
      echo "Build docker image: $(build_docker_image)"
      python3 -m pip install --upgrade pip setuptools
    displayName: Prepare

  - script: |
      set -e
      python3 setup.py build_ts
      python3 setup.py bdist_wheel -p manylinux1_x86_64
      python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl[SMAC,BOHB]
    displayName: Build and install NNI

  - script: |
      set -e
      cd examples/tuners/customized_tuner
      python3 setup.py develop --user
      nnictl algo register --meta meta_file.yml
    displayName: Install customized tuner

  - script: |
      set -e
      docker login -u nnidev -p $(docker_hub_password)
      sed -i '$a RUN python3 -m pip install adaptdl tensorboard' Dockerfile
      sed -i '$a COPY examples /examples' Dockerfile
      sed -i '$a COPY test /test' Dockerfile
      echo '## Build docker image ##'
      docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly .
      echo '## Upload docker image ##'
      docker push nnidev/nni-nightly
    condition: eq(variables['build_docker_image'], 'true')
    displayName: Build and upload docker image

  - script: |
      set -e
      cd test
      python3 nni_test/nnitest/generate_ts_config.py \
          --ts adl \
          --nni_docker_image nnidev/nni-nightly \
          --checkpoint_storage_class $(checkpoint_storage_class) \
          --checkpoint_storage_size $(checkpoint_storage_size) \
          --nni_manager_ip $(nni_manager_ip)
      python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts adl
    displayName: Integration test
test/config/examples/cifar10-pytorch-adl.yml  (new file, mode 100644)

authorName: nni
experimentName: default_test
maxExecDuration: 15m
maxTrialNum: 1
trialConcurrency: 1
searchSpacePath: ./cifar10_adl_search_space.json

tuner:
  builtinTunerName: Random
assessor:
  builtinAssessorName: Medianstop
  classArgs:
    optimize_mode: maximize
trial:
  codeDir: /examples/trials/cifar10_pytorch
  command: python3 main_adl.py --epochs 1
  gpuNum: 1

useAnnotation: false
multiPhase: false
multiThread: false

trainingServicePlatform: adl
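Note that codeDir is an absolute path here: in adl mode the code directory refers to a location inside the trial container, which the pipeline's Dockerfile edits above provide via COPY examples /examples and COPY test /test. The run_tests.py change further below applies the same container-path convention to the relative codeDir values used by the other test cases.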
test/config/examples/cifar10_adl_search_space.json  (new file, mode 100644)

{
    "lr":    {"_type": "choice", "_value": [0.1, 0.01, 0.001]},
    "bs":    {"_type": "choice", "_value": [64, 96, 128]},
    "model": {"_type": "choice", "_value": ["ResNet18", "SENet18", "MobileNet"]}
}
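For context, these three keys are what the Random tuner samples and hands to each trial. The trial code itself (examples/trials/cifar10_pytorch/main_adl.py) is not part of this diff; the following is only a minimal sketch, under that assumption, of how a trial consumes one sample through the NNI trial API:

import nni

# One sample drawn from cifar10_adl_search_space.json,
# e.g. {'lr': 0.01, 'bs': 96, 'model': 'ResNet18'}
params = nni.get_next_parameter()
lr, batch_size, model_name = params['lr'], params['bs'], params['model']

# ... build the chosen model and train it with lr / batch_size (omitted) ...

accuracy = 0.0                             # placeholder for the real validation accuracy
nni.report_intermediate_result(accuracy)   # per-epoch metric, consumed by the Medianstop assessor
nni.report_final_result(accuracy)          # final metric recorded for the trial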
test/config/integration_tests.yml  (modified)

@@ -75,6 +75,10 @@ testCases:
       command: python3 main.py --epochs 1 --batches 1
       gpuNum: 0
 
+- name: cifar10-pytorch-adl
+  configFile: test/config/examples/cifar10-pytorch-adl.yml
+  trainingService: adl
+
 #- name: nested-ss
 #  configFile: test/config/examples/mnist-nested-search-space.yml
test/config/integration_tests_tf2.yml  (modified)

@@ -52,6 +52,10 @@ testCases:
       command: python3 main.py --epochs 1 --batches 1
       gpuNum: 0
 
+- name: cifar10-pytorch-adl
+  configFile: test/config/examples/cifar10-pytorch-adl.yml
+  trainingService: adl
+
 - name: classic-nas-gen-ss
   configFile: test/config/examples/classic-nas-tf2.yml
   launchCommand: nnictl ss_gen --trial_command="python3 train.py --epochs 1" --trial_dir=../examples/nas/classic_nas-tf --file=config/examples/nni-nas-search-space-tf2.json
test/config/training_service.yml  (modified)

@@ -103,3 +103,22 @@ remote:
     port:
     username:
   trainingServicePlatform: remote
+
+adl:
+  maxExecDuration: 15m
+  nniManagerIp:
+  # use a small trial number to make IT faster
+  maxTrialNum: 2
+  trialConcurrency: 2
+  trial:
+    namespace: default
+    command:
+    codeDir:
+    gpuNum: 1
+    cpuNum: 1
+    image:
+    memorySize: 1Gi
+    checkpoint:
+      storageClass:
+      storageSize:
+  trainingServicePlatform: adl
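The blank values in this template (nniManagerIp, command, codeDir, image, storageClass, storageSize) are intentional: generate_ts_config.py fills in the image and checkpoint fields from the pipeline arguments, and run_tests.py fills in command and codeDir per test case. A small illustrative sketch, assuming the test utilities' get_yml_content is a thin wrapper over PyYAML, of what loading the untouched template yields:

import yaml  # assumption: get_yml_content behaves like yaml.safe_load on the file

# path relative to the repository root
with open('test/config/training_service.yml') as f:
    adl_template = yaml.safe_load(f)['adl']

print(adl_template['trial']['image'])       # None until --nni_docker_image is applied
print(adl_template['trial']['checkpoint'])  # {'storageClass': None, 'storageSize': None}
print(adl_template['trial']['gpuNum'])      # 1, fixed by the template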
test/nni_test/nnitest/generate_ts_config.py  (modified)

@@ -88,13 +88,28 @@ def update_training_service_config(args):
             config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
         if args.remote_reuse is not None:
             config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
+    elif args.ts == 'adl':
+        if args.nni_docker_image is not None:
+            config[args.ts]['trial']['image'] = args.nni_docker_image
+        if args.checkpoint_storage_class is not None:
+            config[args.ts]['trial']['checkpoint']['storageClass'] = args.checkpoint_storage_class
+        if args.checkpoint_storage_size is not None:
+            config[args.ts]['trial']['checkpoint']['storageSize'] = args.checkpoint_storage_size
+        if args.adaptive is not None:
+            config[args.ts]['trial']['adaptive'] = args.adaptive
+        if args.adl_nfs_server is not None and args.adl_nfs_path is not None and args.adl_nfs_container_mount_path is not None:
+            # default keys in nfs is empty, need to initialize
+            config[args.ts]['trial']['nfs'] = {}
+            config[args.ts]['trial']['nfs']['server'] = args.adl_nfs_server
+            config[args.ts]['trial']['nfs']['path'] = args.adl_nfs_path
+            config[args.ts]['trial']['nfs']['container_mount_path'] = args.adl_nfs_container_mount_path
     dump_yml_content(TRAINING_SERVICE_FILE, config)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
+    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller', 'adl'], default='pai')
     parser.add_argument("--nni_docker_image", type=str)
     parser.add_argument("--nni_manager_ip", type=str)
     # args for PAI

@@ -122,6 +137,13 @@ if __name__ == '__main__':
     parser.add_argument("--remote_host", type=str)
     parser.add_argument("--remote_port", type=int)
     parser.add_argument("--remote_reuse", type=str)
+    # args for adl
+    parser.add_argument("--checkpoint_storage_class", type=str)
+    parser.add_argument("--checkpoint_storage_size", type=str)
+    parser.add_argument("--adaptive", type=str)
+    parser.add_argument("--adl_nfs_server", type=str)
+    parser.add_argument("--adl_nfs_path", type=str)
+    parser.add_argument("--adl_nfs_container_mount_path", type=str)
     args = parser.parse_args()
     update_training_service_config(args)
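One detail worth noting in the new adl branch: the NFS settings are applied only when all three --adl_nfs_* arguments are given, and the nfs mapping must be created before its keys are assigned, because the training_service.yml template above carries no usable nfs sub-dict. A standalone sketch of that constraint, using hypothetical values:

# trial section roughly as loaded from the template: no 'nfs' mapping yet
trial = {'image': 'nnidev/nni-nightly',
         'checkpoint': {'storageClass': None, 'storageSize': None}}

# trial['nfs']['server'] = '10.0.0.1'   # would raise KeyError: 'nfs'
trial['nfs'] = {}                        # initialize the mapping first, as the script does
trial['nfs']['server'] = '10.0.0.1'                 # hypothetical NFS server
trial['nfs']['path'] = '/exports/nni'               # hypothetical exported path
trial['nfs']['container_mount_path'] = '/mnt/nfs'   # hypothetical mount point in the container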
test/nni_test/nnitest/run_tests.py  (modified)

@@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
 it_variables = {}
 
-def update_training_service_config(config, training_service):
+def update_training_service_config(config, training_service, config_file_path):
     it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
     # hack for kubeflow trial config

@@ -38,6 +38,20 @@ def update_training_service_config(config, training_service):
         config['trial'].pop('command')
         if 'gpuNum' in config['trial']:
             config['trial'].pop('gpuNum')
 
+    if training_service == 'adl':
+        # hack for adl trial config, codeDir in adl mode refers to path in container
+        containerCodeDir = config['trial']['codeDir']
+        # replace metric test folders to container folder
+        if config['trial']['codeDir'] == '.':
+            containerCodeDir = '/' + config_file_path[:config_file_path.rfind('/')]
+        elif config['trial']['codeDir'] == '../naive_trial':
+            containerCodeDir = '/test/config/naive_trial'
+        elif '../../../' in config['trial']['codeDir']:
+            # replace example folders to container folder
+            containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
+        it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
+        it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
+
     deep_update(config, it_ts_config['all'])
     deep_update(config, it_ts_config[training_service])

@@ -58,7 +72,7 @@ def prepare_config_file(test_case_config, it_config, args):
     # apply training service config
     # user's gpuNum, logCollection config is overwritten by the config in training_service.yml
     # the hack for kubeflow should be applied at last step
-    update_training_service_config(test_yml_config, args.ts)
+    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
 
     # generate temporary config yml file to launch experiment
     new_config_file = config_path + '.tmp'

@@ -249,6 +263,10 @@ def run(args):
             wait_for_port_available(8080, 180)
         else:
             wait_for_port_available(8080, 30)
 
+        # adl mode need more time to cleanup PVC
+        if args.ts == 'adl' and name == 'nnictl-resume-2':
+            time.sleep(30)
+
         print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
         begin_time = time.time()

@@ -263,7 +281,7 @@ if __name__ == '__main__':
     parser.add_argument("--cases", type=str, default=None)
     parser.add_argument("--exclude", type=str, default=None)
-    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local')
+    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'adl'], default='local')
     args = parser.parse_args()
     run(args)
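The codeDir rewriting added to update_training_service_config is what lets the existing test cases run unmodified on adl: relative paths that are valid on the build machine are translated to the absolute paths baked into the docker image (COPY examples /examples, COPY test /test). A self-contained sketch of that mapping, with a hypothetical example input:

def to_container_code_dir(code_dir, config_file_path):
    """Mirror of the mapping above: host-side codeDir -> path inside the adl trial container."""
    if code_dir == '.':
        # metrics tests: use the directory that holds the test config file
        return '/' + config_file_path[:config_file_path.rfind('/')]
    if code_dir == '../naive_trial':
        return '/test/config/naive_trial'
    if '../../../' in code_dir:
        # example trials live under /examples inside the image
        return code_dir.replace('../../../', '/')
    return code_dir

# hypothetical test-case values, for illustration only
print(to_container_code_dir('../../../examples/trials/mnist-pytorch',
                            'config/examples/mnist-pytorch.yml'))
# -> /examples/trials/mnist-pytorch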