Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
8f01c779
Unverified
Commit
8f01c779
authored
Jun 30, 2021
by
Ni Hao
Committed by
GitHub
Jun 30, 2021
Browse files
Add shared storage integration test (#3455)
parent
32fdd32b
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
208 additions
and
14 deletions
+208
-14
nni/experiment/config/common.py
nni/experiment/config/common.py
+1
-0
pipelines/integration-test-remote-l2l.yml
pipelines/integration-test-remote-l2l.yml
+8
-5
test/config/integration_tests.yml
test/config/integration_tests.yml
+28
-1
test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
...redstorage_test/config_sharedstorage_remote_azureblob.yml
+43
-0
test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
...ig/sharedstorage_test/config_sharedstorage_remote_nfs.yml
+40
-0
test/config/sharedstorage_test/config_sharedstorage_search_space.json
...sharedstorage_test/config_sharedstorage_search_space.json
+7
-0
test/config/sharedstorage_test/config_sharedstorage_trial.py
test/config/sharedstorage_test/config_sharedstorage_trial.py
+24
-0
test/config/training_service.yml
test/config/training_service.yml
+3
-0
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+6
-1
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+29
-3
test/nni_test/nnitest/validators.py
test/nni_test/nnitest/validators.py
+14
-0
test/vso_tools/start_docker.py
test/vso_tools/start_docker.py
+2
-1
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
...rvice/reusable/shared_storages/azureblobStorageService.ts
+3
-3
No files found.
nni/experiment/config/common.py
View file @
8f01c779
...
...
@@ -46,6 +46,7 @@ class CustomAlgorithmConfig(_AlgorithmConfig):
class
TrainingServiceConfig
(
ConfigBase
):
platform
:
str
@
dataclass
(
init
=
False
)
class
SharedStorageConfig
(
ConfigBase
):
storage_type
:
str
local_mount_point
:
str
...
...
pipelines/integration-test-remote-l2l.yml
View file @
8f01c779
...
...
@@ -87,27 +87,30 @@ jobs:
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse
fals
e \
--remote_reuse
tru
e \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip)
--nni_manager_ip $(manager_ip) \
--azurestoragetoken $(azureblob_token_test) \
--nfs_server $(NFS_IP)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName
:
Integration test
displayName
:
Integration test
(reuse mode)
-
script
:
|
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse
tru
e \
--remote_reuse
fals
e \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName
:
Integration test (reuse mode)
displayName
:
Integration test
-
task
:
SSH@0
inputs
:
...
...
test/config/integration_tests.yml
View file @
8f01c779
...
...
@@ -34,6 +34,34 @@ testCases:
# check status of experiment before calling validator
experimentStatusCheck
:
True
-
name
:
shared-storage-remote-azureblob
configFile
:
test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
config
:
sharedStorage
:
localMountPoint
:
/tmp/nnimount/testlocalrootpath
remoteMountPoint
:
/tmp/nnimount/testremoterootpath
storageAccountName
:
nennistorage
storageAccountKey
:
$(azureblob_token_test)
containerName
:
sharedstorage
validator
:
class
:
FileExistValidator
kwargs
:
rootpath
:
/tmp/nnimount/testlocalrootpath
# TODO: Enable this case after nfs server is ready
#- name: shared-storage-remote-nfs
# configFile: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
# config:
# sharedStorage:
# localMountPoint: /tmp/nnimount/testlocalrootpath
# remoteMountPoint: /tmp/nnimount/testremoterootpath
# nfsServer: $(NFS_IP)
# exportedDirectory: /home/nni/mnt/
# validator:
# class: FileExistValidator
# kwargs:
# rootpath: /tmp/nnimount/testlocalrootpath
-
name
:
sklearn-regression
configFile
:
test/config/examples/sklearn-regression.yml
...
...
@@ -227,4 +255,3 @@ testCases:
#########################################################################
-
name
:
customized-tuners-demotuner
configFile
:
test/config/customized_tuners/demotuner-sklearn-classification.yml
test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
0 → 100644
View file @
8f01c779
authorName
:
default
experimentName
:
example_mnist
trialConcurrency
:
1
maxExecDuration
:
1h
maxTrialNum
:
1
trainingServicePlatform
:
remote
searchSpacePath
:
config_sharedstorage_search_space.json
#choice: true, false
useAnnotation
:
false
nniManagerIp
:
127.0.0.1
tuner
:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName
:
TPE
classArgs
:
#choice: maximize, minimize
optimize_mode
:
maximize
trial
:
command
:
python3 config_sharedstorage_trial.py
codeDir
:
.
gpuNum
:
0
sharedStorage
:
storageType
:
AzureBlob
localMountPoint
:
${your/local/mount/point}
remoteMountPoint
:
${your/remote/mount/point}
storageAccountName
:
${replace_to_your_storageAccountName}
storageAccountKey
:
${replace_to_your_storageAccountKey}
# If you did not set storageAccountKey, you need to run `az login` with the Azure CLI first and set resourceGroupName.
# resourceGroupName: ${replace_to_your_resourceGroupName}
containerName
:
${replace_to_your_containerName}
# usermount means you have already mounted this storage on localMountPoint
# nnimount means nni will try to mount this storage on localMountPoint
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
localMounted
:
nnimount
#machineList can be empty if the platform is local
machineList
:
-
ip
:
10.1.1.1
username
:
bob
passwd
:
bob123
#port can be skipped if using the default ssh port 22
#port: 22
remoteConfig
:
reuse
:
true
\ No newline at end of file
test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
0 → 100644
View file @
8f01c779
authorName
:
default
experimentName
:
example_mnist
trialConcurrency
:
1
maxExecDuration
:
1h
maxTrialNum
:
1
trainingServicePlatform
:
remote
searchSpacePath
:
config_sharedstorage_search_space.json
#choice: true, false
useAnnotation
:
false
nniManagerIp
:
127.0.0.1
tuner
:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
#SMAC (SMAC should be installed through nnictl)
builtinTunerName
:
TPE
classArgs
:
#choice: maximize, minimize
optimize_mode
:
maximize
trial
:
command
:
python3 config_sharedstorage_trial.py
codeDir
:
.
gpuNum
:
0
sharedStorage
:
storageType
:
NFS
localMountPoint
:
${your/local/mount/point}
remoteMountPoint
:
${your/remote/mount/point}
nfsServer
:
${nfs-server-ip}
exportedDirectory
:
${nfs/exported/directory}
# usermount means you have already mounted this storage on localMountPoint
# nnimount means nni will try to mount this storage on localMountPoint
# nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
localMounted
:
nnimount
#machineList can be empty if the platform is local
machineList
:
-
ip
:
10.1.1.1
username
:
bob
passwd
:
bob123
#port can be skipped if using the default ssh port 22
#port: 22
remoteConfig
:
reuse
:
true
\ No newline at end of file
test/config/sharedstorage_test/config_sharedstorage_search_space.json
0 → 100644
View file @
8f01c779
{
"dropout_rate"
:{
"_type"
:
"uniform"
,
"_value"
:[
0.5
,
0.9
]},
"conv_size"
:{
"_type"
:
"choice"
,
"_value"
:[
2
,
3
,
5
,
7
]},
"hidden_size"
:{
"_type"
:
"choice"
,
"_value"
:[
124
,
512
,
1024
]},
"batch_size"
:
{
"_type"
:
"choice"
,
"_value"
:
[
16
,
32
]},
"learning_rate"
:{
"_type"
:
"choice"
,
"_value"
:[
0.0001
,
0.001
,
0.01
,
0.1
]}
}
test/config/sharedstorage_test/config_sharedstorage_trial.py
0 → 100644
View file @
8f01c779
"""
Trial script for the shared storage integration test.

It appends the hyper-parameters received from the tuner to
``checkingfile.txt`` inside the directory named by the ``NNI_OUTPUT_DIR``
environment variable, then reports a constant final result of 1.
"""

import os
import logging
import nni

logger = logging.getLogger('mnist_AutoML')

if __name__ == '__main__':
    try:
        logger.debug(os.environ.get('NNI_OUTPUT_DIR'))
        filename = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'checkingfile.txt')
        # The context manager closes the file even if fetching the parameters,
        # writing them, or reporting the result raises.
        with open(filename, "a") as f:
            tuner_params = nni.get_next_parameter()
            f.write(str(tuner_params))
            nni.report_final_result(1)
    except Exception as exception:
        # Log the failure with its traceback, then let the trial fail.
        logger.exception(exception)
        raise
test/config/training_service.yml
View file @
8f01c779
...
...
@@ -87,6 +87,9 @@ remote:
port
:
username
:
trainingServicePlatform
:
remote
sharedStorage
:
storageAccountKey
:
nfsServer
:
hybrid
:
maxExecDuration
:
15m
nniManagerIp
:
...
...
test/nni_test/nnitest/generate_ts_config.py
View file @
8f01c779
...
...
@@ -74,6 +74,10 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'machineList'
][
0
][
'passwd'
]
=
args
.
remote_pwd
if
args
.
remote_reuse
is
not
None
:
config
[
args
.
ts
][
'remoteConfig'
][
'reuse'
]
=
args
.
remote_reuse
.
lower
()
==
'true'
if
args
.
azurestoragetoken
is
not
None
:
config
[
args
.
ts
][
'sharedStorage'
][
'storageAccountKey'
]
=
args
.
azurestoragetoken
if
args
.
nfs_server
is
not
None
:
config
[
args
.
ts
][
'sharedStorage'
][
'nfsServer'
]
=
args
.
nfs_server
elif
args
.
ts
==
'adl'
:
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'image'
]
=
args
.
nni_docker_image
...
...
@@ -118,6 +122,8 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--config_version"
,
type
=
str
,
choices
=
[
'v1'
,
'v2'
],
default
=
'v1'
)
parser
.
add_argument
(
"--nni_docker_image"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_ip"
,
type
=
str
)
parser
.
add_argument
(
"--azurestoragetoken"
,
type
=
str
)
parser
.
add_argument
(
"--nfs_server"
,
type
=
str
)
# args for PAI
parser
.
add_argument
(
"--pai_user"
,
type
=
str
)
parser
.
add_argument
(
"--pai_pwd"
,
type
=
str
)
...
...
@@ -131,7 +137,6 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--nni_manager_nfs_mount_path"
,
type
=
str
)
parser
.
add_argument
(
"--container_nfs_mount_path"
,
type
=
str
)
# args for kubeflow and frameworkController
parser
.
add_argument
(
"--nfs_server"
,
type
=
str
)
parser
.
add_argument
(
"--nfs_path"
,
type
=
str
)
parser
.
add_argument
(
"--keyvault_vaultname"
,
type
=
str
)
parser
.
add_argument
(
"--keyvault_name"
,
type
=
str
)
...
...
test/nni_test/nnitest/run_tests.py
View file @
8f01c779
...
...
@@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables
=
{}
def
update_training_service_config
(
config
,
training_service
,
config_file_path
):
def
update_training_service_config
(
config
,
training_service
,
config_file_path
,
nni_source_dir
):
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service.yml'
))
# hack for kubeflow trial config
...
...
@@ -38,7 +38,7 @@ def update_training_service_config(config, training_service, config_file_path):
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
if
training_service
==
'adl'
:
# hack for adl trial config, codeDir in adl mode refers to path in container
containerCodeDir
=
config
[
'trial'
][
'codeDir'
]
...
...
@@ -52,6 +52,18 @@ def update_training_service_config(config, training_service, config_file_path):
containerCodeDir
=
config
[
'trial'
][
'codeDir'
].
replace
(
'../../../'
,
'/'
)
it_ts_config
[
training_service
][
'trial'
][
'codeDir'
]
=
containerCodeDir
it_ts_config
[
training_service
][
'trial'
][
'command'
]
=
'cd {0} && {1}'
.
format
(
containerCodeDir
,
config
[
'trial'
][
'command'
])
if
training_service
==
'remote'
:
testcase_config
=
get_yml_content
(
nni_source_dir
+
config_file_path
)
sharedStorage
=
testcase_config
.
get
(
'sharedStorage'
)
if
sharedStorage
is
None
:
it_ts_config
[
training_service
].
pop
(
'sharedStorage'
)
elif
str
(
sharedStorage
.
get
(
'storageType'
)).
lower
()
==
'nfs'
:
it_ts_config
[
training_service
].
get
(
'sharedStorage'
).
pop
(
'storageAccountKey'
)
elif
str
(
sharedStorage
.
get
(
'storageType'
)).
lower
()
==
'azureblob'
:
it_ts_config
[
training_service
].
get
(
'sharedStorage'
).
pop
(
'nfsServer'
)
else
:
it_ts_config
[
training_service
].
pop
(
'sharedStorage'
)
if
training_service
==
'hybrid'
:
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service_v2.yml'
))
...
...
@@ -75,7 +87,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
])
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
]
,
args
.
nni_source_dir
)
# generate temporary config yml file to launch experiment
new_config_file
=
config_path
+
'.tmp'
...
...
@@ -238,6 +250,15 @@ def match_training_service(test_case_config, cur_training_service):
return
True
return
False
def match_remoteConfig(test_case_config, nni_source_dir):
    """
    Tell whether a test case is compatible with the current remote reuse mode.

    The reuse flag configured for the ``remote`` training service in
    ``config/training_service.yml`` is compared, case-insensitively as a
    string, with ``remoteConfig.reuse`` of the test case's own config file.

    Parameters
    ----------
    test_case_config : dict
        Test case entry; its ``configFile`` value is appended to ``nni_source_dir``.
    nni_source_dir : str
        Prefix that is concatenated directly with ``configFile`` to locate the file.

    Returns
    -------
    bool
        ``True`` when the test case does not set ``remoteConfig.reuse`` or when
        its value equals the training service setting, ``False`` otherwise.
    """
    ts_yml = get_yml_content(os.path.join('config', 'training_service.yml'))
    expected_reuse = str(ts_yml['remote']['remoteConfig']['reuse']).lower()

    case_yml = get_yml_content(nni_source_dir + test_case_config['configFile'])

    # A test case with no opinion about reuse mode matches every mode.
    remote_section = case_yml.get('remoteConfig')
    if remote_section is None:
        return True
    case_reuse = remote_section.get('reuse')
    if case_reuse is None:
        return True

    return str(case_reuse).lower() == expected_reuse
def
run
(
args
):
it_config
=
get_yml_content
(
args
.
config
)
...
...
@@ -264,8 +285,13 @@ def run(args):
print
(
'skipped {}, training service {} not match [{}]'
.
format
(
name
,
args
.
ts
,
test_case_config
[
'trainingService'
]))
continue
# remote mode need more time to cleanup
if
args
.
ts
==
'remote'
or
args
.
ts
==
'hybrid'
:
if
args
.
ts
==
'remote'
:
if
not
match_remoteConfig
(
test_case_config
,
args
.
nni_source_dir
):
print
(
'skipped {}, remoteConfig not match.'
.
format
(
name
))
continue
wait_for_port_available
(
8080
,
240
)
else
:
wait_for_port_available
(
8080
,
60
)
...
...
test/nni_test/nnitest/validators.py
View file @
8f01c779
...
...
@@ -97,3 +97,17 @@ class NnicliValidator(ITValidator):
print
(
exp
.
get_job_statistics
())
print
(
exp
.
get_experiment_status
())
print
(
exp
.
list_trial_jobs
())
class FileExistValidator(ITValidator):
    """Validator that checks each trial's ``checkingfile.txt`` exists under a root path."""

    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        """
        Check that ``checkingfile.txt`` exists for every trial listed by the metrics endpoint.

        Parameters
        ----------
        rest_endpoint : str
            Only printed; the metrics are fetched from ``METRICS_URL``.
        experiment_dir : str
            Experiment directory; its last path component is used as the experiment id.
        nni_source_dir : str
            Not used by this validator.
        **kwargs
            ``rootpath`` is the directory under which
            ``nni/<exp_id>/trials/<trial_id>/nnioutput/checkingfile.txt`` is looked up.

        Raises
        ------
        AssertionError
            If the file of any trial does not exist.
        """
        print(rest_endpoint)
        exp_id = osp.split(experiment_dir)[-1]
        rootpath = kwargs.get('rootpath')

        metrics = requests.get(METRICS_URL).json()

        for metric in metrics:
            trial_id = metric['trialJobId']
            checkpath = osp.join(rootpath, 'nni', exp_id, 'trials', trial_id, 'nnioutput', 'checkingfile.txt')
            print('Checking shared storage log exists on trial ', trial_id)
            # Raise explicitly instead of using a bare assert: the check survives
            # `python -O` and the failure names the missing path.
            if not osp.exists(checkpath):
                raise AssertionError('shared storage file not found: {}'.format(checkpath))
test/vso_tools/start_docker.py
View file @
8f01c779
...
...
@@ -25,8 +25,9 @@ container = sys.argv[2]
password
=
sys
.
argv
[
3
]
run_command
(
f
'docker build --build-arg NNI_RELEASE=
{
version
}
-t nnidev/nni-nightly .'
)
run_command
(
f
'docker run -d -t -p
{
port
}
:22 --name
{
container
}
nnidev/nni-nightly'
)
run_command
(
f
'docker run
--privileged
-d -t -p
{
port
}
:22 --name
{
container
}
nnidev/nni-nightly'
)
run_command
(
f
'docker exec
{
container
}
useradd --create-home --password
{
password
}
nni'
)
run_command
([
'docker'
,
'exec'
,
container
,
'bash'
,
'-c'
,
f
'echo "nni:
{
password
}
" | chpasswd'
])
run_command
([
'docker'
,
'exec'
,
container
,
'bash'
,
'-c'
,
'echo "nni ALL=(ALL:ALL) NOPASSWD:ALL" >> /etc/sudoers'
])
run_command
(
f
'docker exec
{
container
}
service ssh start'
)
set_variable
(
'docker_port'
,
port
)
ts/nni_manager/training_service/reusable/shared_storages/azureblobStorageService.ts
View file @
8f01c779
...
...
@@ -34,13 +34,13 @@ fi
id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g)
version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g)
if [ $id = "Ubuntu" ]
if [
"
$id
"
= "Ubuntu" ]
then
wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb
sudo dpkg -i packages-microsoft-prod.deb
sudo
DEBIAN_FRONTEND=noninteractive
dpkg -i packages-microsoft-prod.deb
sudo apt-get update
sudo apt-get install -y blobfuse fuse
elif [ $id = "CentOS" ] || [ $id = "RHEL" ]
elif [
"
$id
"
= "CentOS" ] || [
"
$id
"
= "RHEL" ]
then
sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm
sudo yum install -y blobfuse fuse
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment