Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
fb26187d
Unverified
Commit
fb26187d
authored
Jan 04, 2021
by
SparkSnail
Committed by
GitHub
Jan 04, 2021
Browse files
Support adl pipeline (#3233)
parent
ae50ed14
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
162 additions
and
4 deletions
+162
-4
pipelines/integration-test-adl.yml
pipelines/integration-test-adl.yml
+63
-0
test/config/examples/cifar10-pytorch-adl.yml
test/config/examples/cifar10-pytorch-adl.yml
+23
-0
test/config/examples/cifar10_adl_search_space.json
test/config/examples/cifar10_adl_search_space.json
+5
-0
test/config/integration_tests.yml
test/config/integration_tests.yml
+4
-0
test/config/integration_tests_tf2.yml
test/config/integration_tests_tf2.yml
+4
-0
test/config/training_service.yml
test/config/training_service.yml
+19
-0
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+23
-1
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+21
-3
No files found.
pipelines/integration-test-adl.yml
0 → 100644
View file @
fb26187d
trigger
:
none
pr
:
none
schedules
:
-
cron
:
0 16 * * *
branches
:
include
:
[
master
]
jobs
:
-
job
:
adl
pool
:
NNI CI KUBE CLI
timeoutInMinutes
:
120
steps
:
-
script
:
|
export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
echo "Working directory: ${PWD}"
echo "NNI version: ${NNI_RELEASE}"
echo "Build docker image: $(build_docker_image)"
python3 -m pip install --upgrade pip setuptools
displayName
:
Prepare
-
script
:
|
set -e
python3 setup.py build_ts
python3 setup.py bdist_wheel -p manylinux1_x86_64
python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl[SMAC,BOHB]
displayName
:
Build and install NNI
-
script
:
|
set -e
cd examples/tuners/customized_tuner
python3 setup.py develop --user
nnictl algo register --meta meta_file.yml
displayName
:
Install customized tuner
-
script
:
|
set -e
docker login -u nnidev -p $(docker_hub_password)
sed -i '$a RUN python3 -m pip install adaptdl tensorboard' Dockerfile
sed -i '$a COPY examples /examples' Dockerfile
sed -i '$a COPY test /test' Dockerfile
echo '## Build docker image ##'
docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-nightly .
echo '## Upload docker image ##'
docker push nnidev/nni-nightly
condition
:
eq(variables['build_docker_image'], 'true')
displayName
:
Build and upload docker image
-
script
:
|
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts adl \
--nni_docker_image nnidev/nni-nightly \
--checkpoint_storage_class $(checkpoint_storage_class) \
--checkpoint_storage_size $(checkpoint_storage_size) \
--nni_manager_ip $(nni_manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts adl
displayName
:
Integration test
test/config/examples/cifar10-pytorch-adl.yml
0 → 100644
View file @
fb26187d
authorName
:
nni
experimentName
:
default_test
maxExecDuration
:
15m
maxTrialNum
:
1
trialConcurrency
:
1
searchSpacePath
:
./cifar10_adl_search_space.json
tuner
:
builtinTunerName
:
Random
assessor
:
builtinAssessorName
:
Medianstop
classArgs
:
optimize_mode
:
maximize
trial
:
codeDir
:
/examples/trials/cifar10_pytorch
command
:
python3 main_adl.py --epochs
1
gpuNum
:
1
useAnnotation
:
false
multiPhase
:
false
multiThread
:
false
trainingServicePlatform
:
adl
test/config/examples/cifar10_adl_search_space.json
0 → 100644
View file @
fb26187d
{
"lr"
:{
"_type"
:
"choice"
,
"_value"
:[
0.1
,
0.01
,
0.001
]},
"bs"
:{
"_type"
:
"choice"
,
"_value"
:[
64
,
96
,
128
]},
"model"
:{
"_type"
:
"choice"
,
"_value"
:[
"ResNet18"
,
"SENet18"
,
"MobileNet"
]}
}
test/config/integration_tests.yml
View file @
fb26187d
...
...
@@ -75,6 +75,10 @@ testCases:
command
:
python3 main.py --epochs 1 --batches
1
gpuNum
:
0
-
name
:
cifar10-pytorch-adl
configFile
:
test/config/examples/cifar10-pytorch-adl.yml
trainingService
:
adl
#- name: nested-ss
# configFile: test/config/examples/mnist-nested-search-space.yml
...
...
test/config/integration_tests_tf2.yml
View file @
fb26187d
...
...
@@ -52,6 +52,10 @@ testCases:
command
:
python3 main.py --epochs 1 --batches
1
gpuNum
:
0
-
name
:
cifar10-pytorch-adl
configFile
:
test/config/examples/cifar10-pytorch-adl.yml
trainingService
:
adl
-
name
:
classic-nas-gen-ss
configFile
:
test/config/examples/classic-nas-tf2.yml
launchCommand
:
nnictl ss_gen --trial_command="python3 train.py --epochs 1" --trial_dir=../examples/nas/classic_nas-tf --file=config/examples/nni-nas-search-space-tf2.json
...
...
test/config/training_service.yml
View file @
fb26187d
...
...
@@ -103,3 +103,22 @@ remote:
port
:
username
:
trainingServicePlatform
:
remote
adl
:
maxExecDuration
:
15m
nniManagerIp
:
# use a small trial number to make IT faster
maxTrialNum
:
2
trialConcurrency
:
2
trial
:
namespace
:
default
command
:
codeDir
:
gpuNum
:
1
cpuNum
:
1
image
:
memorySize
:
1Gi
checkpoint
:
storageClass
:
storageSize
:
trainingServicePlatform
:
adl
test/nni_test/nnitest/generate_ts_config.py
View file @
fb26187d
...
...
@@ -88,13 +88,28 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'machineList'
][
0
][
'passwd'
]
=
args
.
remote_pwd
if
args
.
remote_reuse
is
not
None
:
config
[
args
.
ts
][
'remoteConfig'
][
'reuse'
]
=
args
.
remote_reuse
.
lower
()
==
'true'
elif
args
.
ts
==
'adl'
:
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'image'
]
=
args
.
nni_docker_image
if
args
.
checkpoint_storage_class
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'checkpoint'
][
'storageClass'
]
=
args
.
checkpoint_storage_class
if
args
.
checkpoint_storage_size
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'checkpoint'
][
'storageSize'
]
=
args
.
checkpoint_storage_size
if
args
.
adaptive
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'adaptive'
]
=
args
.
adaptive
if
args
.
adl_nfs_server
is
not
None
and
args
.
adl_nfs_path
is
not
None
and
args
.
adl_nfs_container_mount_path
is
not
None
:
# the nfs section has no keys by default, so initialize it before filling it in
config
[
args
.
ts
][
'trial'
][
'nfs'
]
=
{}
config
[
args
.
ts
][
'trial'
][
'nfs'
][
'server'
]
=
args
.
adl_nfs_server
config
[
args
.
ts
][
'trial'
][
'nfs'
][
'path'
]
=
args
.
adl_nfs_path
config
[
args
.
ts
][
'trial'
][
'nfs'
][
'container_mount_path'
]
=
args
.
adl_nfs_container_mount_path
dump_yml_content
(
TRAINING_SERVICE_FILE
,
config
)
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'pai'
,
'kubeflow'
,
'remote'
,
'local'
,
'frameworkcontroller'
],
default
=
'pai'
)
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'pai'
,
'kubeflow'
,
'remote'
,
'local'
,
'frameworkcontroller'
,
'adl'
],
default
=
'pai'
)
parser
.
add_argument
(
"--nni_docker_image"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_ip"
,
type
=
str
)
# args for PAI
...
...
@@ -122,6 +137,13 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--remote_host"
,
type
=
str
)
parser
.
add_argument
(
"--remote_port"
,
type
=
int
)
parser
.
add_argument
(
"--remote_reuse"
,
type
=
str
)
# args for adl
parser
.
add_argument
(
"--checkpoint_storage_class"
,
type
=
str
)
parser
.
add_argument
(
"--checkpoint_storage_size"
,
type
=
str
)
parser
.
add_argument
(
"--adaptive"
,
type
=
str
)
parser
.
add_argument
(
"--adl_nfs_server"
,
type
=
str
)
parser
.
add_argument
(
"--adl_nfs_path"
,
type
=
str
)
parser
.
add_argument
(
"--adl_nfs_container_mount_path"
,
type
=
str
)
args
=
parser
.
parse_args
()
update_training_service_config
(
args
)
test/nni_test/nnitest/run_tests.py
View file @
fb26187d
...
...
@@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables
=
{}
def
update_training_service_config
(
config
,
training_service
):
def
update_training_service_config
(
config
,
training_service
,
config_file_path
):
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service.yml'
))
# hack for kubeflow trial config
...
...
@@ -38,6 +38,20 @@ def update_training_service_config(config, training_service):
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
if
training_service
==
'adl'
:
# hack for adl trial config: codeDir in adl mode refers to a path inside the container
containerCodeDir
=
config
[
'trial'
][
'codeDir'
]
# map metric test folders to the corresponding container folder
if
config
[
'trial'
][
'codeDir'
]
==
'.'
:
containerCodeDir
=
'/'
+
config_file_path
[:
config_file_path
.
rfind
(
'/'
)]
elif
config
[
'trial'
][
'codeDir'
]
==
'../naive_trial'
:
containerCodeDir
=
'/test/config/naive_trial'
elif
'../../../'
in
config
[
'trial'
][
'codeDir'
]:
# map example folders to the corresponding container folder
containerCodeDir
=
config
[
'trial'
][
'codeDir'
].
replace
(
'../../../'
,
'/'
)
it_ts_config
[
training_service
][
'trial'
][
'codeDir'
]
=
containerCodeDir
it_ts_config
[
training_service
][
'trial'
][
'command'
]
=
'cd {0} && {1}'
.
format
(
containerCodeDir
,
config
[
'trial'
][
'command'
])
deep_update
(
config
,
it_ts_config
[
'all'
])
deep_update
(
config
,
it_ts_config
[
training_service
])
...
...
@@ -58,7 +72,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config
(
test_yml_config
,
args
.
ts
)
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
]
)
# generate temporary config yml file to launch experiment
new_config_file
=
config_path
+
'.tmp'
...
...
@@ -249,6 +263,10 @@ def run(args):
wait_for_port_available
(
8080
,
180
)
else
:
wait_for_port_available
(
8080
,
30
)
# adl mode needs more time to clean up the PVC
if
args
.
ts
==
'adl'
and
name
==
'nnictl-resume-2'
:
time
.
sleep
(
30
)
print
(
'## {}Testing: {}{} ##'
.
format
(
GREEN
,
name
,
CLEAR
))
begin_time
=
time
.
time
()
...
...
@@ -263,7 +281,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--cases"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--exclude"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'local'
,
'remote'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
],
default
=
'local'
)
'kubeflow'
,
'frameworkcontroller'
,
'adl'
],
default
=
'local'
)
args
=
parser
.
parse_args
()
run
(
args
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment