Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
b177bdc8
Unverified
Commit
b177bdc8
authored
Jan 06, 2021
by
QuanluZhang
Committed by
GitHub
Jan 06, 2021
Browse files
support hybrid training service v2.0 config (#3251)
parent
6330df2f
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
162 additions
and
58 deletions
+162
-58
examples/trials/mnist-tfv1/config_hybrid.yml
examples/trials/mnist-tfv1/config_hybrid.yml
+3
-4
examples/trials/mnist-tfv1/config_hybrid_v2.yml
examples/trials/mnist-tfv1/config_hybrid_v2.yml
+24
-0
examples/trials/mnist-tfv1/launch_hybrid.py
examples/trials/mnist-tfv1/launch_hybrid.py
+35
-0
nni/experiment/config/common.py
nni/experiment/config/common.py
+9
-5
nni/experiment/config/convert.py
nni/experiment/config/convert.py
+52
-30
nni/experiment/config/remote.py
nni/experiment/config/remote.py
+1
-1
nni/experiment/config/util.py
nni/experiment/config/util.py
+20
-6
nni/experiment/experiment.py
nni/experiment/experiment.py
+3
-3
nni/experiment/launcher.py
nni/experiment/launcher.py
+13
-7
nni/retiarii/experiment.py
nni/retiarii/experiment.py
+1
-1
nni/tools/nnictl/launcher.py
nni/tools/nnictl/launcher.py
+1
-1
No files found.
examples/trials/mnist-tfv1/config_h
eterogeneous
.yml
→
examples/trials/mnist-tfv1/config_h
ybrid
.yml
View file @
b177bdc8
...
@@ -26,7 +26,6 @@ remoteConfig:
...
@@ -26,7 +26,6 @@ remoteConfig:
reuse
:
true
reuse
:
true
machineList
:
machineList
:
-
ip
:
10.1.1.1
-
ip
:
10.1.1.1
username
:
bob
username
:
xxx
passwd
:
bob123
passwd
:
xxx
#port can be skip if using default ssh port 22
port
:
22
#port: 22
\ No newline at end of file
examples/trials/mnist-tfv1/config_hybrid_v2.yml
0 → 100644
View file @
b177bdc8
experimentName
:
example_mnist
trialConcurrency
:
3
maxExperimentDuration
:
1h
maxTrialNumber
:
10
searchSpaceFile
:
search_space.json
trialCodeDirectory
:
.
trialCommand
:
python3 mnist.py
trialGpuNumber
:
0
tuner
:
name
:
TPE
classArgs
:
optimize_mode
:
maximize
trainingService
:
-
platform
:
local
-
platform
:
remote
reuseMode
:
true
machineList
:
-
host
:
10.1.1.1
user
:
xxx
password
:
xxx
#port can be skipped if using the default ssh port 22
port
:
22
examples/trials/mnist-tfv1/launch_hybrid.py
0 → 100644
View file @
b177bdc8
# FIXME: For demonstration only. It should not be here
from
pathlib
import
Path
from
nni.experiment
import
Experiment
from
nni.experiment
import
RemoteMachineConfig
from
nni.algorithms.hpo.hyperopt_tuner
import
HyperoptTuner
tuner
=
HyperoptTuner
(
'tpe'
)
search_space
=
{
"dropout_rate"
:
{
"_type"
:
"uniform"
,
"_value"
:
[
0.5
,
0.9
]
},
"conv_size"
:
{
"_type"
:
"choice"
,
"_value"
:
[
2
,
3
,
5
,
7
]
},
"hidden_size"
:
{
"_type"
:
"choice"
,
"_value"
:
[
124
,
512
,
1024
]
},
"batch_size"
:
{
"_type"
:
"choice"
,
"_value"
:
[
16
,
32
]
},
"learning_rate"
:
{
"_type"
:
"choice"
,
"_value"
:
[
0.0001
,
0.001
,
0.01
,
0.1
]
}
}
experiment
=
Experiment
(
tuner
,
[
'local'
,
'remote'
])
experiment
.
config
.
experiment_name
=
'test'
experiment
.
config
.
trial_concurrency
=
3
experiment
.
config
.
max_trial_number
=
10
experiment
.
config
.
search_space
=
search_space
experiment
.
config
.
trial_command
=
'python3 mnist.py'
experiment
.
config
.
trial_code_directory
=
Path
(
__file__
).
parent
experiment
.
config
.
training_service
[
0
].
use_active_gpu
=
True
experiment
.
config
.
training_service
[
1
].
reuse_mode
=
True
rm_conf
=
RemoteMachineConfig
()
rm_conf
.
host
=
'10.1.1.1'
rm_conf
.
user
=
'xxx'
rm_conf
.
password
=
'xxx'
rm_conf
.
port
=
22
experiment
.
config
.
training_service
[
1
].
machine_list
=
[
rm_conf
]
experiment
.
run
(
26780
,
debug
=
True
)
nni/experiment/config/common.py
View file @
b177bdc8
...
@@ -65,15 +65,19 @@ class ExperimentConfig(ConfigBase):
...
@@ -65,15 +65,19 @@ class ExperimentConfig(ConfigBase):
tuner
:
Optional
[
_AlgorithmConfig
]
=
None
tuner
:
Optional
[
_AlgorithmConfig
]
=
None
accessor
:
Optional
[
_AlgorithmConfig
]
=
None
accessor
:
Optional
[
_AlgorithmConfig
]
=
None
advisor
:
Optional
[
_AlgorithmConfig
]
=
None
advisor
:
Optional
[
_AlgorithmConfig
]
=
None
training_service
:
TrainingServiceConfig
training_service
:
Union
[
TrainingServiceConfig
,
List
[
TrainingServiceConfig
]]
def
__init__
(
self
,
training_service_platform
:
Optional
[
str
]
=
None
,
**
kwargs
):
def
__init__
(
self
,
training_service_platform
:
Optional
[
Union
[
str
,
List
[
str
]]
]
=
None
,
**
kwargs
):
kwargs
=
util
.
case_insensitive
(
kwargs
)
kwargs
=
util
.
case_insensitive
(
kwargs
)
if
training_service_platform
is
not
None
:
if
training_service_platform
is
not
None
:
assert
'trainingservice'
not
in
kwargs
assert
'trainingservice'
not
in
kwargs
kwargs
[
'trainingservice'
]
=
util
.
training_service_config_factory
(
training_service_platform
)
kwargs
[
'trainingservice'
]
=
util
.
training_service_config_factory
(
platform
=
training_service_platform
)
elif
isinstance
(
kwargs
.
get
(
'trainingservice'
),
dict
):
elif
isinstance
(
kwargs
.
get
(
'trainingservice'
),
(
dict
,
list
)):
kwargs
[
'trainingservice'
]
=
util
.
training_service_config_factory
(
**
kwargs
[
'trainingservice'
])
# dict means a single training service
# list means hybrid training service
kwargs
[
'trainingservice'
]
=
util
.
training_service_config_factory
(
config
=
kwargs
[
'trainingservice'
])
else
:
raise
RuntimeError
(
'Unsupported Training service configuration!'
)
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
def
validate
(
self
,
initialized_tuner
:
bool
=
False
)
->
None
:
def
validate
(
self
,
initialized_tuner
:
bool
=
False
)
->
None
:
...
...
nni/experiment/config/convert.py
View file @
b177bdc8
...
@@ -18,8 +18,20 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
...
@@ -18,8 +18,20 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
data
=
config
.
json
()
data
=
config
.
json
()
ts
=
data
.
pop
(
'trainingService'
)
ts
=
data
.
pop
(
'trainingService'
)
if
isinstance
(
ts
,
list
):
hybrid_names
=
[]
for
conf
in
ts
:
if
conf
[
'platform'
]
==
'openpai'
:
conf
[
'platform'
]
=
'pai'
hybrid_names
.
append
(
conf
[
'platform'
])
_handle_training_service
(
conf
,
data
)
data
[
'trainingServicePlatform'
]
=
'hybrid'
data
[
'hybridConfig'
]
=
{
'trainingServicePlatforms'
:
hybrid_names
}
else
:
if
ts
[
'platform'
]
==
'openpai'
:
if
ts
[
'platform'
]
==
'openpai'
:
ts
[
'platform'
]
=
'pai'
ts
[
'platform'
]
=
'pai'
data
[
'trainingServicePlatform'
]
=
ts
[
'platform'
]
_handle_training_service
(
ts
,
data
)
data
[
'authorName'
]
=
'N/A'
data
[
'authorName'
]
=
'N/A'
data
[
'experimentName'
]
=
data
.
get
(
'experimentName'
,
'N/A'
)
data
[
'experimentName'
]
=
data
.
get
(
'experimentName'
,
'N/A'
)
...
@@ -27,7 +39,7 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
...
@@ -27,7 +39,7 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
if
data
[
'debug'
]:
if
data
[
'debug'
]:
data
[
'versionCheck'
]
=
False
data
[
'versionCheck'
]
=
False
data
[
'maxTrialNum'
]
=
data
.
pop
(
'maxTrialNumber'
,
99999
)
data
[
'maxTrialNum'
]
=
data
.
pop
(
'maxTrialNumber'
,
99999
)
data
[
'trainingServicePlatform'
]
=
ts
[
'platform'
]
ss
=
data
.
pop
(
'searchSpace'
,
None
)
ss
=
data
.
pop
(
'searchSpace'
,
None
)
ss_file
=
data
.
pop
(
'searchSpaceFile'
,
None
)
ss_file
=
data
.
pop
(
'searchSpaceFile'
,
None
)
if
ss
is
not
None
:
if
ss
is
not
None
:
...
@@ -66,6 +78,9 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
...
@@ -66,6 +78,9 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
if
'trialGpuNumber'
in
data
:
if
'trialGpuNumber'
in
data
:
data
[
'trial'
][
'gpuNum'
]
=
data
.
pop
(
'trialGpuNumber'
)
data
[
'trial'
][
'gpuNum'
]
=
data
.
pop
(
'trialGpuNumber'
)
return
data
def
_handle_training_service
(
ts
,
data
):
if
ts
[
'platform'
]
==
'local'
:
if
ts
[
'platform'
]
==
'local'
:
data
[
'localConfig'
]
=
{
data
[
'localConfig'
]
=
{
'useActiveGpu'
:
ts
.
get
(
'useActiveGpu'
,
False
),
'useActiveGpu'
:
ts
.
get
(
'useActiveGpu'
,
False
),
...
@@ -140,8 +155,6 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
...
@@ -140,8 +155,6 @@ def to_v1_yaml(config: ExperimentConfig, skip_nnictl: bool = False) -> Dict[str,
elif
ts
[
'platform'
]
==
'adl'
:
elif
ts
[
'platform'
]
==
'adl'
:
data
[
'trial'
][
'image'
]
=
ts
[
'dockerImage'
]
data
[
'trial'
][
'image'
]
=
ts
[
'dockerImage'
]
return
data
def
_convert_gpu_indices
(
indices
):
def
_convert_gpu_indices
(
indices
):
return
','
.
join
(
str
(
idx
)
for
idx
in
indices
)
if
indices
is
not
None
else
None
return
','
.
join
(
str
(
idx
)
for
idx
in
indices
)
if
indices
is
not
None
else
None
...
@@ -175,19 +188,34 @@ def to_cluster_metadata(config: ExperimentConfig) -> List[Dict[str, Any]]:
...
@@ -175,19 +188,34 @@ def to_cluster_metadata(config: ExperimentConfig) -> List[Dict[str, Any]]:
experiment_config
=
to_v1_yaml
(
config
,
skip_nnictl
=
True
)
experiment_config
=
to_v1_yaml
(
config
,
skip_nnictl
=
True
)
ret
=
[]
ret
=
[]
if
config
.
training_service
.
platform
==
'local'
:
if
isinstance
(
config
.
training_service
,
list
):
hybrid_conf
=
dict
()
hybrid_conf
[
'hybrid_config'
]
=
experiment_config
[
'hybridConfig'
]
for
conf
in
config
.
training_service
:
metadata
=
_get_cluster_metadata
(
conf
.
platform
,
experiment_config
)
if
metadata
is
not
None
:
hybrid_conf
.
update
(
metadata
)
ret
.
append
(
hybrid_conf
)
else
:
metadata
=
_get_cluster_metadata
(
config
.
training_service
.
platform
,
experiment_config
)
if
metadata
is
not
None
:
ret
.
append
(
metadata
)
if
experiment_config
.
get
(
'nniManagerIp'
)
is
not
None
:
ret
.
append
({
'nni_manager_ip'
:
{
'nniManagerIp'
:
experiment_config
[
'nniManagerIp'
]}})
ret
.
append
({
'trial_config'
:
experiment_config
[
'trial'
]})
return
ret
def
_get_cluster_metadata
(
platform
:
str
,
experiment_config
)
->
Dict
:
if
platform
==
'local'
:
request_data
=
dict
()
request_data
=
dict
()
request_data
[
'local_config'
]
=
experiment_config
[
'localConfig'
]
request_data
[
'local_config'
]
=
experiment_config
[
'localConfig'
]
if
request_data
[
'local_config'
]:
if
request_data
[
'local_config'
]:
if
request_data
[
'local_config'
].
get
(
'gpuIndices'
)
and
isinstance
(
request_data
[
'local_config'
].
get
(
'gpuIndices'
),
int
):
if
request_data
[
'local_config'
].
get
(
'gpuIndices'
)
and
isinstance
(
request_data
[
'local_config'
].
get
(
'gpuIndices'
),
int
):
request_data
[
'local_config'
][
'gpuIndices'
]
=
str
(
request_data
[
'local_config'
].
get
(
'gpuIndices'
))
request_data
[
'local_config'
][
'gpuIndices'
]
=
str
(
request_data
[
'local_config'
].
get
(
'gpuIndices'
))
if
request_data
[
'local_config'
].
get
(
'maxTrialNumOnEachGpu'
):
return
request_data
request_data
[
'local_config'
][
'maxTrialNumOnEachGpu'
]
=
request_data
[
'local_config'
].
get
(
'maxTrialNumOnEachGpu'
)
if
request_data
[
'local_config'
].
get
(
'useActiveGpu'
):
request_data
[
'local_config'
][
'useActiveGpu'
]
=
request_data
[
'local_config'
].
get
(
'useActiveGpu'
)
ret
.
append
(
request_data
)
elif
config
.
training_service
.
platform
==
'remote'
:
elif
platform
==
'remote'
:
request_data
=
dict
()
request_data
=
dict
()
if
experiment_config
.
get
(
'remoteConfig'
):
if
experiment_config
.
get
(
'remoteConfig'
):
request_data
[
'remote_config'
]
=
experiment_config
[
'remoteConfig'
]
request_data
[
'remote_config'
]
=
experiment_config
[
'remoteConfig'
]
...
@@ -198,31 +226,25 @@ def to_cluster_metadata(config: ExperimentConfig) -> List[Dict[str, Any]]:
...
@@ -198,31 +226,25 @@ def to_cluster_metadata(config: ExperimentConfig) -> List[Dict[str, Any]]:
for
i
in
range
(
len
(
request_data
[
'machine_list'
])):
for
i
in
range
(
len
(
request_data
[
'machine_list'
])):
if
isinstance
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
),
int
):
if
isinstance
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
),
int
):
request_data
[
'machine_list'
][
i
][
'gpuIndices'
]
=
str
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
))
request_data
[
'machine_list'
][
i
][
'gpuIndices'
]
=
str
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
))
ret
.
append
(
request_data
)
ret
urn
request_data
elif
config
.
training_service
.
platform
==
'openpai'
:
elif
platform
==
'openpai'
:
ret
.
append
(
{
'pai_config'
:
experiment_config
[
'paiConfig'
]}
)
ret
urn
{
'pai_config'
:
experiment_config
[
'paiConfig'
]}
elif
config
.
training_service
.
platform
==
'aml'
:
elif
platform
==
'aml'
:
ret
.
append
(
{
'aml_config'
:
experiment_config
[
'amlConfig'
]}
)
ret
urn
{
'aml_config'
:
experiment_config
[
'amlConfig'
]}
elif
config
.
training_service
.
platform
==
'kubeflow'
:
elif
platform
==
'kubeflow'
:
ret
.
append
(
{
'kubeflow_config'
:
experiment_config
[
'kubeflowConfig'
]}
)
ret
urn
{
'kubeflow_config'
:
experiment_config
[
'kubeflowConfig'
]}
elif
config
.
training_service
.
platform
==
'frameworkcontroller'
:
elif
platform
==
'frameworkcontroller'
:
ret
.
append
(
{
'frameworkcontroller_config'
:
experiment_config
[
'frameworkcontrollerConfig'
]}
)
ret
urn
{
'frameworkcontroller_config'
:
experiment_config
[
'frameworkcontrollerConfig'
]}
elif
config
.
training_service
.
platform
==
'adl'
:
elif
platform
==
'adl'
:
pass
return
None
else
:
else
:
raise
RuntimeError
(
'Unsupported training service '
+
config
.
training_service
.
platform
)
raise
RuntimeError
(
'Unsupported training service '
+
platform
)
if
experiment_config
.
get
(
'nniManagerIp'
)
is
not
None
:
ret
.
append
({
'nni_manager_ip'
:
{
'nniManagerIp'
:
experiment_config
[
'nniManagerIp'
]}})
ret
.
append
({
'trial_config'
:
experiment_config
[
'trial'
]})
return
ret
def
to_rest_json
(
config
:
ExperimentConfig
)
->
Dict
[
str
,
Any
]:
def
to_rest_json
(
config
:
ExperimentConfig
)
->
Dict
[
str
,
Any
]:
experiment_config
=
to_v1_yaml
(
config
,
skip_nnictl
=
True
)
experiment_config
=
to_v1_yaml
(
config
,
skip_nnictl
=
True
)
...
...
nni/experiment/config/remote.py
View file @
b177bdc8
...
@@ -18,7 +18,7 @@ class RemoteMachineConfig(ConfigBase):
...
@@ -18,7 +18,7 @@ class RemoteMachineConfig(ConfigBase):
port
:
int
=
22
port
:
int
=
22
user
:
str
user
:
str
password
:
Optional
[
str
]
=
None
password
:
Optional
[
str
]
=
None
ssh_key_file
:
PathLike
=
'~/.ssh/id_rsa'
ssh_key_file
:
PathLike
=
None
#
'~/.ssh/id_rsa'
ssh_passphrase
:
Optional
[
str
]
=
None
ssh_passphrase
:
Optional
[
str
]
=
None
use_active_gpu
:
bool
=
False
use_active_gpu
:
bool
=
False
max_trial_number_per_gpu
:
int
=
1
max_trial_number_per_gpu
:
int
=
1
...
...
nni/experiment/config/util.py
View file @
b177bdc8
...
@@ -8,7 +8,7 @@ Miscellaneous utility functions.
...
@@ -8,7 +8,7 @@ Miscellaneous utility functions.
import
math
import
math
import
os.path
import
os.path
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Optional
,
Union
from
typing
import
Any
,
Dict
,
Optional
,
Union
,
List
PathLike
=
Union
[
Path
,
str
]
PathLike
=
Union
[
Path
,
str
]
...
@@ -29,12 +29,26 @@ def canonical_path(path: Optional[PathLike]) -> Optional[str]:
...
@@ -29,12 +29,26 @@ def canonical_path(path: Optional[PathLike]) -> Optional[str]:
def
count
(
*
values
)
->
int
:
def
count
(
*
values
)
->
int
:
return
sum
(
value
is
not
None
and
value
is
not
False
for
value
in
values
)
return
sum
(
value
is
not
None
and
value
is
not
False
for
value
in
values
)
def
training_service_config_factory
(
platform
:
str
,
**
kwargs
):
# -> TrainingServiceConfig
def
training_service_config_factory
(
platform
:
Union
[
str
,
List
[
str
]]
=
None
,
config
:
Union
[
List
,
Dict
]
=
None
):
# -> TrainingServiceConfig
from
.common
import
TrainingServiceConfig
from
.common
import
TrainingServiceConfig
ts_configs
=
[]
if
platform
is
not
None
:
assert
config
is
None
platforms
=
platform
if
isinstance
(
platform
,
list
)
else
[
platform
]
for
cls
in
TrainingServiceConfig
.
__subclasses__
():
for
cls
in
TrainingServiceConfig
.
__subclasses__
():
if
cls
.
platform
==
platform
:
if
cls
.
platform
in
platforms
:
return
cls
(
**
kwargs
)
ts_configs
.
append
(
cls
())
raise
ValueError
(
f
'Unrecognized platform
{
platform
}
'
)
if
len
(
ts_configs
)
<
len
(
platforms
):
raise
RuntimeError
(
'There is unrecognized platform!'
)
else
:
assert
config
is
not
None
supported_platforms
=
{
cls
.
platform
:
cls
for
cls
in
TrainingServiceConfig
.
__subclasses__
()}
configs
=
config
if
isinstance
(
config
,
list
)
else
[
config
]
for
conf
in
configs
:
if
conf
[
'platform'
]
not
in
supported_platforms
:
raise
RuntimeError
(
f
'Unrecognized platform
{
conf
[
"platform"
]
}
'
)
ts_configs
.
append
(
supported_platforms
[
conf
[
'platform'
]](
**
conf
))
return
ts_configs
if
len
(
ts_configs
)
>
1
else
ts_configs
[
0
]
def
load_config
(
Type
,
value
):
def
load_config
(
Type
,
value
):
if
isinstance
(
value
,
list
):
if
isinstance
(
value
,
list
):
...
...
nni/experiment/experiment.py
View file @
b177bdc8
...
@@ -5,7 +5,7 @@ import socket
...
@@ -5,7 +5,7 @@ import socket
from
subprocess
import
Popen
from
subprocess
import
Popen
from
threading
import
Thread
from
threading
import
Thread
import
time
import
time
from
typing
import
Optional
,
overload
from
typing
import
Optional
,
Union
,
List
,
overload
import
colorama
import
colorama
import
psutil
import
psutil
...
@@ -54,7 +54,7 @@ class Experiment:
...
@@ -54,7 +54,7 @@ class Experiment:
...
...
@
overload
@
overload
def
__init__
(
self
,
tuner
:
Tuner
,
training_service
:
str
)
->
None
:
def
__init__
(
self
,
tuner
:
Tuner
,
training_service
:
Union
[
str
,
List
[
str
]]
)
->
None
:
"""
"""
Prepare an experiment, leaving configuration fields to be set later.
Prepare an experiment, leaving configuration fields to be set later.
...
@@ -86,7 +86,7 @@ class Experiment:
...
@@ -86,7 +86,7 @@ class Experiment:
self
.
_dispatcher
:
Optional
[
MsgDispatcher
]
=
None
self
.
_dispatcher
:
Optional
[
MsgDispatcher
]
=
None
self
.
_dispatcher_thread
:
Optional
[
Thread
]
=
None
self
.
_dispatcher_thread
:
Optional
[
Thread
]
=
None
if
isinstance
(
config
,
str
):
if
isinstance
(
config
,
(
str
,
list
)
):
config
,
training_service
=
None
,
config
config
,
training_service
=
None
,
config
if
config
is
None
:
if
config
is
None
:
...
...
nni/experiment/launcher.py
View file @
b177bdc8
...
@@ -27,11 +27,13 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
...
@@ -27,11 +27,13 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
config
.
validate
(
initialized_tuner
=
True
)
config
.
validate
(
initialized_tuner
=
True
)
_ensure_port_idle
(
port
)
_ensure_port_idle
(
port
)
if
config
.
training_service
.
platform
==
'openpai'
:
if
isinstance
(
config
.
training_service
,
list
):
# hybrid training service
_ensure_port_idle
(
port
+
1
,
'OpenPAI requires an additional port'
)
_ensure_port_idle
(
port
+
1
,
'Hybrid training service requires an additional port'
)
elif
config
.
training_service
.
platform
in
[
'remote'
,
'openpai'
,
'kubeflow'
,
'frameworkcontroller'
,
'adl'
]:
_ensure_port_idle
(
port
+
1
,
f
'
{
config
.
training_service
.
platform
}
requires an additional port'
)
try
:
try
:
_logger
.
info
(
'Creating experiment %s'
,
colorama
.
Fore
.
CYAN
+
exp_id
+
colorama
.
Style
.
RESET_ALL
)
_logger
.
info
(
'Creating experiment
, Experiment ID:
%s'
,
colorama
.
Fore
.
CYAN
+
exp_id
+
colorama
.
Style
.
RESET_ALL
)
pipe
=
Pipe
(
exp_id
)
pipe
=
Pipe
(
exp_id
)
start_time
,
proc
=
_start_rest_server
(
config
,
port
,
debug
,
exp_id
,
pipe
.
path
)
start_time
,
proc
=
_start_rest_server
(
config
,
port
,
debug
,
exp_id
,
pipe
.
path
)
_logger
.
info
(
'Connecting IPC pipe...'
)
_logger
.
info
(
'Connecting IPC pipe...'
)
...
@@ -40,7 +42,8 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
...
@@ -40,7 +42,8 @@ def start_experiment(exp_id: str, config: ExperimentConfig, port: int, debug: bo
nni
.
runtime
.
protocol
.
_out_file
=
pipe_file
nni
.
runtime
.
protocol
.
_out_file
=
pipe_file
_logger
.
info
(
'Statring web server...'
)
_logger
.
info
(
'Statring web server...'
)
_check_rest_server
(
port
)
_check_rest_server
(
port
)
_save_experiment_information
(
exp_id
,
port
,
start_time
,
config
.
training_service
.
platform
,
platform
=
'hybrid'
if
isinstance
(
config
.
training_service
,
list
)
else
config
.
training_service
.
platform
_save_experiment_information
(
exp_id
,
port
,
start_time
,
platform
,
config
.
experiment_name
,
proc
.
pid
,
config
.
experiment_working_directory
)
config
.
experiment_name
,
proc
.
pid
,
config
.
experiment_working_directory
)
_logger
.
info
(
'Setting up...'
)
_logger
.
info
(
'Setting up...'
)
_init_experiment
(
config
,
port
,
debug
)
_init_experiment
(
config
,
port
,
debug
)
...
@@ -66,6 +69,9 @@ def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
...
@@ -66,6 +69,9 @@ def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
def
_start_rest_server
(
config
:
ExperimentConfig
,
port
:
int
,
debug
:
bool
,
experiment_id
:
str
,
pipe_path
:
str
)
->
Tuple
[
int
,
Popen
]:
def
_start_rest_server
(
config
:
ExperimentConfig
,
port
:
int
,
debug
:
bool
,
experiment_id
:
str
,
pipe_path
:
str
)
->
Tuple
[
int
,
Popen
]:
if
isinstance
(
config
.
training_service
,
list
):
ts
=
'hybrid'
else
:
ts
=
config
.
training_service
.
platform
ts
=
config
.
training_service
.
platform
if
ts
==
'openpai'
:
if
ts
==
'openpai'
:
ts
=
'pai'
ts
=
'pai'
...
...
nni/retiarii/experiment.py
View file @
b177bdc8
...
@@ -46,7 +46,7 @@ class RetiariiExeConfig(ConfigBase):
...
@@ -46,7 +46,7 @@ class RetiariiExeConfig(ConfigBase):
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
if
training_service_platform
is
not
None
:
if
training_service_platform
is
not
None
:
assert
'training_service'
not
in
kwargs
assert
'training_service'
not
in
kwargs
self
.
training_service
=
util
.
training_service_config_factory
(
training_service_platform
)
self
.
training_service
=
util
.
training_service_config_factory
(
platform
=
training_service_platform
)
def
validate
(
self
,
initialized_tuner
:
bool
=
False
)
->
None
:
def
validate
(
self
,
initialized_tuner
:
bool
=
False
)
->
None
:
super
().
validate
()
super
().
validate
()
...
...
nni/tools/nnictl/launcher.py
View file @
b177bdc8
...
@@ -607,7 +607,7 @@ def create_experiment(args):
...
@@ -607,7 +607,7 @@ def create_experiment(args):
try
:
try
:
validate_all_content
(
experiment_config
,
config_path
)
validate_all_content
(
experiment_config
,
config_path
)
except
Exception
as
e
:
except
Exception
as
e
:
print_error
(
f
'Config validation failed.
{
repr
(
e
)
}
'
)
print_error
(
f
'Config
in v1 format
validation failed.
{
repr
(
e
)
}
'
)
exit
(
1
)
exit
(
1
)
nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment