Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
e50ca8d3
"...resnet50_tensorflow.git" did not exist on "41f71f6c9798c170fcbfeae220092d7c2824d813"
Unverified
Commit
e50ca8d3
authored
Nov 18, 2021
by
SparkSnail
Committed by
GitHub
Nov 18, 2021
Browse files
Support reuse mode for pipeline (#4310)
parent
21256bf9
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
115 additions
and
6 deletions
+115
-6
pipelines/integration-test-frameworkcontroller.yml
pipelines/integration-test-frameworkcontroller.yml
+16
-0
pipelines/integration-test-kubeflow.yml
pipelines/integration-test-kubeflow.yml
+17
-0
test/config/training_service_v2.yml
test/config/training_service_v2.yml
+50
-0
test/nni_test/nnitest/generate_ts_config.py
test/nni_test/nnitest/generate_ts_config.py
+21
-2
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+11
-4
No files found.
pipelines/integration-test-frameworkcontroller.yml
View file @
e50ca8d3
...
...
@@ -50,3 +50,19 @@ jobs:
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread
displayName
:
Integration test
-
script
:
|
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts frameworkcontroller \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --reuse_mode True --exclude multi-phase,multi-thread
displayName
:
Integration test (reuse mode)
pipelines/integration-test-kubeflow.yml
View file @
e50ca8d3
...
...
@@ -61,3 +61,20 @@ jobs:
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread
displayName
:
Integration test
-
script
:
|
set -e
cd test
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
python3 nni_test/nnitest/generate_ts_config.py \
--ts kubeflow \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --reuse_mode True --exclude multi-phase,multi-thread
displayName
:
Integration test (reuse mode)
test/config/training_service_v2.yml
View file @
e50ca8d3
...
...
@@ -12,3 +12,53 @@ hybrid:
resourceGroup
:
workspaceName
:
computeTarget
:
kubeflow
:
trialGpuNumber
:
0
trialConcurrency
:
2
maxTrialNumber
:
2
nniManagerIp
:
trainingService
:
reuseMode
:
true
platform
:
kubeflow
worker
:
command
:
code_directory
:
dockerImage
:
cpuNumber
:
1
gpuNumber
:
0
memorySize
:
8192
replicas
:
1
operator
:
tf-operator
storage
:
storageType
:
azureStorage
azureAccount
:
azureShare
:
keyVaultName
:
keyVaultKey
:
apiVersion
:
v1
frameworkcontroller
:
trialGpuNumber
:
0
trialConcurrency
:
2
maxTrialNumber
:
2
nniManagerIp
:
trainingService
:
reuseMode
:
true
platform
:
frameworkcontroller
serviceAccountName
:
frameworkcontroller
taskRoles
:
-
name
:
worker
dockerImage
:
taskNumber
:
1
command
:
gpuNumber
:
0
cpuNumber
:
1
memorySize
:
8192
framework_attempt_completion_policy
:
min_failed_task_count
:
1
minSucceedTaskCount
:
1
storage
:
storageType
:
azureStorage
azureAccount
:
azureShare
:
keyVaultName
:
keyVaultKey
:
\ No newline at end of file
test/nni_test/nnitest/generate_ts_config.py
View file @
e50ca8d3
...
...
@@ -35,7 +35,7 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'trial'
][
'virtualCluster'
]
=
args
.
vc
if
args
.
debug
is
not
None
:
config
[
args
.
ts
][
'debug'
]
=
args
.
debug
.
lower
()
==
'true'
elif
args
.
ts
==
'kubeflow'
:
elif
args
.
ts
==
'kubeflow'
and
args
.
reuse_mode
==
'False'
:
if
args
.
nfs_server
is
not
None
:
config
[
args
.
ts
][
'kubeflowConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
if
args
.
nfs_path
is
not
None
:
...
...
@@ -50,7 +50,16 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'kubeflowConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'worker'
][
'image'
]
=
args
.
nni_docker_image
elif
args
.
ts
==
'frameworkcontroller'
:
elif
args
.
ts
==
'kubeflow'
and
args
.
reuse_mode
==
'True'
:
config
=
get_yml_content
(
TRAINING_SERVICE_FILE_V2
)
config
[
args
.
ts
][
'trainingService'
][
'worker'
][
'dockerImage'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureAccount'
]
=
args
.
azs_account
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultName'
]
=
args
.
keyvault_name
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultKey'
]
=
args
.
keyvault_vaultname
config
[
args
.
ts
][
'nni_manager_ip'
]
=
args
.
nni_manager_ip
dump_yml_content
(
TRAINING_SERVICE_FILE_V2
,
config
)
elif
args
.
ts
==
'frameworkcontroller'
and
args
.
reuse_mode
==
'False'
:
if
args
.
nfs_server
is
not
None
:
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'nfs'
][
'server'
]
=
args
.
nfs_server
if
args
.
nfs_path
is
not
None
:
...
...
@@ -65,6 +74,15 @@ def update_training_service_config(args):
config
[
args
.
ts
][
'frameworkcontrollerConfig'
][
'azureStorage'
][
'azureShare'
]
=
args
.
azs_share
if
args
.
nni_docker_image
is
not
None
:
config
[
args
.
ts
][
'trial'
][
'taskRoles'
][
0
][
'image'
]
=
args
.
nni_docker_image
elif
args
.
ts
==
'frameworkcontroller'
and
args
.
reuse_mode
==
'True'
:
config
=
get_yml_content
(
TRAINING_SERVICE_FILE_V2
)
config
[
args
.
ts
][
'trainingService'
][
'taskRoles'
][
0
][
'dockerImage'
]
=
args
.
nni_docker_image
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureAccount'
]
=
args
.
azs_account
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'azureShare'
]
=
args
.
azs_share
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultName'
]
=
args
.
keyvault_name
config
[
args
.
ts
][
'trainingService'
][
'storage'
][
'keyVaultKey'
]
=
args
.
keyvault_vaultname
config
[
args
.
ts
][
'nni_manager_ip'
]
=
args
.
nni_manager_ip
dump_yml_content
(
TRAINING_SERVICE_FILE_V2
,
config
)
elif
args
.
ts
==
'remote'
:
if
args
.
remote_user
is
not
None
:
config
[
args
.
ts
][
'machineList'
][
0
][
'username'
]
=
args
.
remote_user
...
...
@@ -134,6 +152,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--config_version"
,
type
=
str
,
choices
=
[
'v1'
,
'v2'
],
default
=
'v1'
)
parser
.
add_argument
(
"--nni_docker_image"
,
type
=
str
)
parser
.
add_argument
(
"--nni_manager_ip"
,
type
=
str
)
parser
.
add_argument
(
"--reuse_mode"
,
type
=
str
,
default
=
'False'
)
# args for remote with shared storage
parser
.
add_argument
(
"--azurestoragetoken"
,
type
=
str
)
parser
.
add_argument
(
"--nfs_server"
,
type
=
str
)
...
...
test/nni_test/nnitest/run_tests.py
View file @
e50ca8d3
...
...
@@ -23,21 +23,27 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables
=
{}
def
update_training_service_config
(
config
,
training_service
,
config_file_path
,
nni_source_dir
):
def
update_training_service_config
(
config
,
training_service
,
config_file_path
,
nni_source_dir
,
reuse_mode
=
'False'
):
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service.yml'
))
# hack for kubeflow trial config
if
training_service
==
'kubeflow'
:
if
training_service
==
'kubeflow'
and
reuse_mode
==
'False'
:
it_ts_config
[
training_service
][
'trial'
][
'worker'
][
'command'
]
=
config
[
'trial'
][
'command'
]
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
elif
training_service
==
'kubeflow'
and
reuse_mode
==
'True'
:
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service_v2.yml'
))
it_ts_config
[
'trainingService'
][
'worker'
][
'command'
]
=
config
[
'trialCommand'
]
if
training_service
==
'frameworkcontroller'
:
if
training_service
==
'frameworkcontroller'
and
reuse_mode
==
'False'
:
it_ts_config
[
training_service
][
'trial'
][
'taskRoles'
][
0
][
'command'
]
=
config
[
'trial'
][
'command'
]
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
elif
training_service
==
'frameworkcontroller'
and
reuse_mode
==
'True'
:
it_ts_config
=
get_yml_content
(
os
.
path
.
join
(
'config'
,
'training_service_v2.yml'
))
it_ts_config
[
'trainingService'
][
'taskRoles'
][
0
][
'command'
]
=
config
[
'trialCommand'
]
if
training_service
==
'adl'
:
# hack for adl trial config, codeDir in adl mode refers to path in container
...
...
@@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
],
args
.
nni_source_dir
)
update_training_service_config
(
test_yml_config
,
args
.
ts
,
test_case_config
[
'configFile'
],
args
.
nni_source_dir
,
args
.
reuse_mode
)
# generate temporary config yml file to launch experiment
new_config_file
=
config_path
+
'.tmp'
...
...
@@ -313,6 +319,7 @@ if __name__ == '__main__':
parser
.
add_argument
(
"--nni_source_dir"
,
type
=
str
,
default
=
'../'
)
parser
.
add_argument
(
"--cases"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--exclude"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--reuse_mode"
,
type
=
str
,
default
=
'False'
)
parser
.
add_argument
(
"--ts"
,
type
=
str
,
choices
=
[
'local'
,
'remote'
,
'pai'
,
'kubeflow'
,
'frameworkcontroller'
,
'adl'
,
'aml'
,
'hybrid'
],
default
=
'local'
)
args
=
parser
.
parse_args
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment