Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
e50ca8d3
Unverified
Commit
e50ca8d3
authored
Nov 18, 2021
by
SparkSnail
Committed by
GitHub
Nov 18, 2021
Browse files
Support reuse mode for pipeline (#4310)
parent
21256bf9
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
115 additions
and
6 deletions
+115
-6
pipelines/integration-test-frameworkcontroller.yml
pipelines/integration-test-frameworkcontroller.yml
+16
-0
pipelines/integration-test-kubeflow.yml
pipelines/integration-test-kubeflow.yml
+17
-0
test/config/training_service_v2.yml
test/config/training_service_v2.yml
+50
-0
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+21
-2
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+11
-4
No files found.
pipelines/integration-test-frameworkcontroller.yml
View file @
e50ca8d3
...
@@ -50,3 +50,19 @@ jobs:
...
@@ -50,3 +50,19 @@ jobs:
--nni_manager_ip $(manager_ip)
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread
displayName
:
Integration test
displayName
:
Integration test
-
script
:
|
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts frameworkcontroller \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --reuse_mode True --exclude multi-phase,multi-thread
displayName
:
Integration test (reuse mode)
pipelines/integration-test-kubeflow.yml
View file @
e50ca8d3
...
@@ -61,3 +61,20 @@ jobs:
...
@@ -61,3 +61,20 @@ jobs:
--nni_manager_ip $(manager_ip)
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread
displayName
:
Integration test
displayName
:
Integration test
-
script
:
|
set -e
cd test
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
python3 nni_test/nnitest/generate_ts_config.py \
--ts kubeflow \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --reuse_mode True --exclude multi-phase,multi-thread
displayName
:
Integration test (reuse mode)
test/config/training_service_v2.yml
View file @
e50ca8d3
...
@@ -12,3 +12,53 @@ hybrid:
...
@@ -12,3 +12,53 @@ hybrid:
resourceGroup
:
resourceGroup
:
workspaceName
:
workspaceName
:
computeTarget
:
computeTarget
:
kubeflow
:
trialGpuNumber
:
0
trialConcurrency
:
2
maxTrialNumber
:
2
nniManagerIp
:
trainingService
:
reuseMode
:
true
platform
:
kubeflow
worker
:
command
:
code_directory
:
dockerImage
:
cpuNumber
:
1
gpuNumber
:
0
memorySize
:
8192
replicas
:
1
operator
:
tf-operator
storage
:
storageType
:
azureStorage
azureAccount
:
azureShare
:
keyVaultName
:
keyVaultKey
:
apiVersion
:
v1
frameworkcontroller
:
trialGpuNumber
:
0
trialConcurrency
:
2
maxTrialNumber
:
2
nniManagerIp
:
trainingService
:
reuseMode
:
true
platform
:
frameworkcontroller
serviceAccountName
:
frameworkcontroller
taskRoles
:
-
name
:
worker
dockerImage
:
taskNumber
:
1
command
:
gpuNumber
:
0
cpuNumber
:
1
memorySize
:
8192
framework_attempt_completion_policy
:
min_failed_task_count
:
1
minSucceedTaskCount
:
1
storage
:
storageType
:
azureStorage
azureAccount
:
azureShare
:
keyVaultName
:
keyVaultKey
:
\ No newline at end of file
test/nni_test/nnitest/generate_ts_config.py
View file @
e50ca8d3
...
@@ -35,7 +35,7 @@ def update_training_service_config(args):
...
@@ -35,7 +35,7 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'trial'
][
'virtualCluster'
]
=
args
.
vc
config
[
args
.
ts
][
'trial'
][
'virtualCluster'
]
=
args
.
vc
if
args
.
debug
is
not
None
:
if
args
.
debug
is
not
None
:
config
[
args
.
ts
][
'debug'
]
=
args
.
debug
.
lower
()
==
'true'
config
[
args
.
ts
][
'debug'
]
=
args
.
debug
.
lower
()
==
'true'
elif
args
.
ts
==
'kubeflow'
:
elif
args
.
ts
==
'kubeflow'
and
args
.
reuse_mode
==
'False'
:
if
args
.
nfs_server
is
not
None
:
if
args
.
nfs_server
is
not
None
:
config
[
args
.
ts
][
'kubeflowConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
config
[
args
.
ts
][
'kubeflowConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
if
args
.
nfs_path
is
not
None
:
if
args
.
nfs_path
is
not
None
:
...
@@ -50,7 +50,16 @@ def update_training_service_config(args):
...
@@ -50,7 +50,16 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'kubeflowConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'kubeflowConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
if
args
.
nni_docker_image
is
not
None
:
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'worker'
][
'image'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trial'
][
'worker'
][
'image'
]
=
args
.
nni_docker_image
elif
args
.
ts
==
'frameworkcontroller'
:
elif
args
.
ts
==
'kubeflow'
and
args
.
reuse_mode
==
'True'
:
config
=
get_yml_content
(
TRAINING_SERVICE_FILE_V2
)
config
[
args
.
ts
][
'trainingService'
][
'worker'
][
'dockerImage'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureAccount'
]
=
args
.
azs_account
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultName'
]
=
args
.
keyvault_name
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultKey'
]
=
args
.
keyvault_vaultname
config
[
args
.
ts
][
'nni_manager_ip'
]
=
args
.
nni_manager_ip
dump_yml_content
(
TRAINING_SERVICE_FILE_V2
,
config
)
elif
args
.
ts
==
'frameworkcontroller'
and
args
.
reuse_mode
==
'False'
:
if
args
.
nfs_server
is
not
None
:
if
args
.
nfs_server
is
not
None
:
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
if
args
.
nfs_path
is
not
None
:
if
args
.
nfs_path
is
not
None
:
...
@@ -65,6 +74,15 @@ def update_training_service_config(args):
...
@@ -65,6 +74,15 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
if
args
.
nni_docker_image
is
not
None
:
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'taskRoles'
][
0
][
'image'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trial'
][
'taskRoles'
][
0
][
'image'
]
=
args
.
nni_docker_image
elif
args
.
ts
==
'frameworkcontroller'
and
args
.
reuse_mode
==
'True'
:
config
=
get_yml_content
(
TRAINING_SERVICE_FILE_V2
)
config
[
args
.
ts
][
'trainingService'
][
'taskRoles'
][
0
][
'dockerImage'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureAccount'
]
=
args
.
azs_account
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultName'
]
=
args
.
keyvault_name
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultKey'
]
=
args
.
keyvault_vaultname
config
[
args
.
ts
][
'nni_manager_ip'
]
=
args
.
nni_manager_ip
dump_yml_content
(
TRAINING_SERVICE_FILE_V2
,
config
)
elif
args
.
ts
==
'remote'
:
elif
args
.
ts
==
'remote'
:
if
args
.
remote_user
is
not
None
:
if
args
.
remote_user
is
not
None
:
config
[
args
.
ts
][
'machineList'
][
0
][
'username'
]
=
args
.
remote_user
config
[
args
.
ts
][
'machineList'
][
0
][
'username'
]
=
args
.
remote_user
...
@@ -134,6 +152,7 @@ if __name__ == '__main__':
...
@@ -134,6 +152,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--config_version"
,
type
=
str
,
choices
=
[
'v1'
,
'v2'
],
default
=
'v1'
)
parser
.
add_argument
(
"--config_version"
,
type
=
str
,
choices
=
[
'v1'
,
'v2'
],
default
=
'v1'
)
parser
.
add_argument
(
"--nni_docker_image"
,
type
=
str
)
parser
.
add_argument
(
"--nni_docker_image"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_ip"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_ip"
,
type
=
str
)
parser
.
add_argument
(
"--reuse_mode"
,
type
=
str
,
default
=
'False'
)
# args for remote with shared storage
# args for remote with shared storage
parser
.
add_argument
(
"--azurestoragetoken"
,
type
=
str
)
parser
.
add_argument
(
"--azurestoragetoken"
,
type
=
str
)
parser
.
add_argument
(
"--nfs_server"
,
type
=
str
)
parser
.
add_argument
(
"--nfs_server"
,
type
=
str
)
...
...
test/nni_test/nnitest/run_tests.py
View file @
e50ca8d3
...
@@ -23,21 +23,27 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
...
@@ -23,21 +23,27 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables
=
{}
it_variables
=
{}
def
update_training_service_config
(
config
,
training_service
,
config_file_path
,
nni_source_dir
):
def
update_training_service_config
(
config
,
training_service
,
config_file_path
,
nni_source_dir
,
reuse_mode
=
'False'
):
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service.yml'
))
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service.yml'
))
# hack for kubeflow trial config
# hack for kubeflow trial config
if
training_service
==
'kubeflow'
:
if
training_service
==
'kubeflow'
and
reuse_mode
==
'False'
:
it_ts_config
[
training_service
][
'trial'
][
'worker'
][
'command'
]
=
config
[
'trial'
][
'command'
]
it_ts_config
[
training_service
][
'trial'
][
'worker'
][
'command'
]
=
config
[
'trial'
][
'command'
]
config
[
'trial'
].
pop
(
'command'
)
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
config
[
'trial'
].
pop
(
'gpuNum'
)
elif
training_service
==
'kubeflow'
and
reuse_mode
==
'True'
:
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service_v2.yml'
))
it_ts_config
[
'trainingService'
][
'worker'
][
'command'
]
=
config
[
'trialCommand'
]
if
training_service
==
'frameworkcontroller'
:
if
training_service
==
'frameworkcontroller'
and
reuse_mode
==
'False'
:
it_ts_config
[
training_service
][
'trial'
][
'taskRoles'
][
0
][
'command'
]
=
config
[
'trial'
][
'command'
]
it_ts_config
[
training_service
][
'trial'
][
'taskRoles'
][
0
][
'command'
]
=
config
[
'trial'
][
'command'
]
config
[
'trial'
].
pop
(
'command'
)
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
config
[
'trial'
].
pop
(
'gpuNum'
)
elif
training_service
==
'frameworkcontroller'
and
reuse_mode
==
'True'
:
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service_v2.yml'
))
it_ts_config
[
'trainingService'
][
'taskRoles'
][
0
][
'command'
]
=
config
[
'trialCommand'
]
if
training_service
==
'adl'
:
if
training_service
==
'adl'
:
# hack for adl trial config, codeDir in adl mode refers to path in container
# hack for adl trial config, codeDir in adl mode refers to path in container
...
@@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args):
...
@@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
# the hack for kubeflow should be applied at last step
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
],
args
.
nni_source_dir
)
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
],
args
.
nni_source_dir
,
args
.
reuse_mode
)
# generate temporary config yml file to launch experiment
# generate temporary config yml file to launch experiment
new_config_file
=
config_path
+
'.tmp'
new_config_file
=
config_path
+
'.tmp'
...
@@ -313,6 +319,7 @@ if __name__ == '__main__':
...
@@ -313,6 +319,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--nni_source_dir"
,
type
=
str
,
default
=
'../'
)
parser
.
add_argument
(
"--nni_source_dir"
,
type
=
str
,
default
=
'../'
)
parser
.
add_argument
(
"--cases"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--cases"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--exclude"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--exclude"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--reuse_mode"
,
type
=
str
,
default
=
'False'
)
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'local'
,
'remote'
,
'pai'
,
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'local'
,
'remote'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'adl'
,
'aml'
,
'hybrid'
],
default
=
'local'
)
'kubeflow'
,
'frameworkcontroller'
,
'adl'
,
'aml'
,
'hybrid'
],
default
=
'local'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment