Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
441267d1
Unverified
Commit
441267d1
authored
Apr 23, 2021
by
SparkSnail
Committed by
GitHub
Apr 23, 2021
Browse files
Hotfox k8s setClusterMetadata (#3567)
parent
22c185c5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
48 additions
and
12 deletions
+48
-12
nni/tools/nnictl/launcher.py
nni/tools/nnictl/launcher.py
+16
-4
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+29
-8
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+3
-0
No files found.
nni/tools/nnictl/launcher.py
View file @
441267d1
...
...
@@ -119,6 +119,18 @@ def set_trial_config(experiment_config, port, config_file_name):
def
set_adl_config
(
experiment_config
,
port
,
config_file_name
):
'''set adl configuration'''
adl_config_data
=
dict
()
# hack for supporting v2 config, need refactor
adl_config_data
[
'adl_config'
]
=
{}
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
adl_config_data
),
REST_TIME_OUT
)
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
err_message
=
response
.
text
_
,
stderr_full_path
=
get_log_path
(
config_file_name
)
with
open
(
stderr_full_path
,
'a+'
)
as
fout
:
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
return
False
,
err_message
result
,
message
=
setNNIManagerIp
(
experiment_config
,
port
,
config_file_name
)
if
not
result
:
return
result
,
message
...
...
@@ -377,6 +389,10 @@ def launch_experiment(args, experiment_config, mode, experiment_id, config_versi
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
if
config_version
==
1
and
mode
!=
'view'
:
# set platform configuration
set_platform_config
(
experiment_config
[
'trainingServicePlatform'
],
experiment_config
,
args
.
port
,
\
experiment_id
,
rest_process
)
# start a new experiment
print_normal
(
'Starting experiment...'
)
...
...
@@ -398,10 +414,6 @@ def launch_experiment(args, experiment_config, mode, experiment_id, config_versi
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
if
config_version
==
1
and
mode
!=
'view'
:
# set platform configuration
set_platform_config
(
experiment_config
[
'trainingServicePlatform'
],
experiment_config
,
args
.
port
,
\
experiment_id
,
rest_process
)
if
experiment_config
.
get
(
'nniManagerIp'
):
web_ui_url_list
=
[
'http://{0}:{1}'
.
format
(
experiment_config
[
'nniManagerIp'
],
str
(
args
.
port
))]
else
:
...
...
ts/nni_manager/core/nnimanager.ts
View file @
441267d1
...
...
@@ -175,12 +175,14 @@ class NNIManager implements Manager {
nextSequenceId
:
0
,
revision
:
0
};
this
.
config
=
config
;
this
.
log
.
info
(
`Starting experiment:
${
this
.
experimentProfile
.
id
}
`
);
await
this
.
storeExperimentProfile
();
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
this
.
trainingService
=
await
this
.
initTrainingService
(
config
);
}
this
.
log
.
info
(
'
Setup tuner...
'
);
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
config
);
...
...
@@ -256,10 +258,30 @@ class NNIManager implements Manager {
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
while
(
this
.
trainingService
===
undefined
)
{
await
delay
(
1000
);
// Hack for supporting v2 config, need refactor
if
(
this
.
trainingService
===
undefined
)
{
this
.
log
.
info
(
'
Setup training service...
'
);
switch
(
key
)
{
case
'
kubeflow_config
'
:
{
const
kubeflowModule
=
await
import
(
'
../training_service/kubernetes/kubeflow/kubeflowTrainingService
'
);
this
.
trainingService
=
new
kubeflowModule
.
KubeflowTrainingService
();
break
;
}
case
'
frameworkcontroller_config
'
:
{
const
fcModule
=
await
import
(
'
../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService
'
);
this
.
trainingService
=
new
fcModule
.
FrameworkControllerTrainingService
();
break
;
}
case
'
adl_config
'
:
{
const
adlModule
=
await
import
(
'
../training_service/kubernetes/adl/adlTrainingService
'
);
this
.
trainingService
=
new
adlModule
.
AdlTrainingService
();
break
;
}
default
:
throw
new
Error
(
"
Setup training service failed.
"
);
}
}
this
.
trainingService
.
setClusterMetadata
(
key
,
value
);
await
this
.
trainingService
.
setClusterMetadata
(
key
,
value
);
}
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
...
...
@@ -408,7 +430,6 @@ class NNIManager implements Manager {
}
private
async
initTrainingService
(
config
:
ExperimentConfig
):
Promise
<
TrainingService
>
{
this
.
config
=
config
;
let
platform
:
string
;
if
(
Array
.
isArray
(
config
.
trainingService
))
{
platform
=
'
hybrid
'
;
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
441267d1
...
...
@@ -131,6 +131,9 @@ export namespace ValidationSchemas {
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
(),
}),
adl_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
// hack for v2 configuration
}),
kubeflow_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
operator
:
joi
.
string
().
min
(
1
).
required
(),
storage
:
joi
.
string
().
min
(
1
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment