Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
afce6d4a
Unverified
Commit
afce6d4a
authored
Apr 03, 2019
by
fishyds
Committed by
GitHub
Apr 03, 2019
Browse files
Merge pull request #950 from Microsoft/v0.6
Merge V0.6 branch to master
parents
6545540d
29a23335
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
131 additions
and
73 deletions
+131
-73
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
+8
-9
src/webui/src/components/Overview.tsx
src/webui/src/components/Overview.tsx
+5
-4
src/webui/src/components/overview/TrialProfile.tsx
src/webui/src/components/overview/TrialProfile.tsx
+28
-16
tools/nni_cmd/config_utils.py
tools/nni_cmd/config_utils.py
+1
-1
tools/nni_cmd/constants.py
tools/nni_cmd/constants.py
+2
-0
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+8
-8
tools/nni_cmd/nnictl_utils.py
tools/nni_cmd/nnictl_utils.py
+50
-23
tools/nni_cmd/rest_utils.py
tools/nni_cmd/rest_utils.py
+2
-1
tools/nni_cmd/tensorboard_utils.py
tools/nni_cmd/tensorboard_utils.py
+1
-1
tools/nni_cmd/updater.py
tools/nni_cmd/updater.py
+3
-2
tools/nni_trial_tool/constants.py
tools/nni_trial_tool/constants.py
+1
-0
tools/nni_trial_tool/trial_keeper.py
tools/nni_trial_tool/trial_keeper.py
+16
-6
tools/nni_trial_tool/url_utils.py
tools/nni_trial_tool/url_utils.py
+6
-2
No files found.
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
View file @
afce6d4a
...
@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
...
@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
Returns
Returns
-------
-------
dict
dict
challenger
dict
dict which stores copy of
challenger
s
"""
"""
converted_dict
=
{}
for
key
,
value
in
challenger_dict
.
items
():
for
key
,
value
in
challenger_dict
.
items
():
# convert to loguniform
# convert to loguniform
if
key
in
self
.
loguniform_key
:
if
key
in
self
.
loguniform_key
:
c
hallenger
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
c
onverted
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
# convert categorical back to original value
# convert categorical back to original value
if
key
in
self
.
categorical_dict
:
el
if
key
in
self
.
categorical_dict
:
idx
=
challenger_dict
[
key
]
idx
=
challenger_dict
[
key
]
challenger_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
converted_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
return
challenger_dict
else
:
converted_dict
[
key
]
=
value
return
converted_dict
def
generate_parameters
(
self
,
parameter_id
):
def
generate_parameters
(
self
,
parameter_id
):
"""generate one instance of hyperparameters
"""generate one instance of hyperparameters
...
@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
...
@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
if
self
.
first_one
:
if
self
.
first_one
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
parameter_id
]
=
init_challenger
self
.
total_data
[
parameter_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
())
else
:
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
for
challenger
in
challengers
:
for
challenger
in
challengers
:
self
.
total_data
[
parameter_id
]
=
challenger
self
.
total_data
[
parameter_id
]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
())
def
generate_multiple_parameters
(
self
,
parameter_id_list
):
def
generate_multiple_parameters
(
self
,
parameter_id_list
):
...
@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
...
@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
for
one_id
in
parameter_id_list
:
for
one_id
in
parameter_id_list
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
one_id
]
=
init_challenger
self
.
total_data
[
one_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
()))
params
.
append
(
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
()))
else
:
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
...
@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
...
@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
if
cnt
>=
len
(
parameter_id_list
):
if
cnt
>=
len
(
parameter_id_list
):
break
break
self
.
total_data
[
parameter_id_list
[
cnt
]]
=
challenger
self
.
total_data
[
parameter_id_list
[
cnt
]]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
()))
params
.
append
(
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
()))
cnt
+=
1
cnt
+=
1
return
params
return
params
src/webui/src/components/Overview.tsx
View file @
afce6d4a
...
@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
...
@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
interface
OverviewState
{
interface
OverviewState
{
tableData
:
Array
<
TableObj
>
;
tableData
:
Array
<
TableObj
>
;
experimentAPI
:
object
;
searchSpace
:
object
;
searchSpace
:
object
;
status
:
string
;
status
:
string
;
errorStr
:
string
;
errorStr
:
string
;
...
@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
super
(
props
);
super
(
props
);
this
.
state
=
{
this
.
state
=
{
searchSpace
:
{},
searchSpace
:
{},
experimentAPI
:
{},
status
:
''
,
status
:
''
,
errorStr
:
''
,
errorStr
:
''
,
trialProfile
:
{
trialProfile
:
{
...
@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
});
});
if
(
this
.
_isMounted
)
{
if
(
this
.
_isMounted
)
{
this
.
setState
({
this
.
setState
({
experimentAPI
:
res
.
data
,
trialProfile
:
trialPro
[
0
],
trialProfile
:
trialPro
[
0
],
searchSpace
:
searchSpace
,
searchSpace
:
searchSpace
,
isLogCollection
:
expLogCollection
isLogCollection
:
expLogCollection
...
@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
const
{
const
{
trialProfile
,
searchSpace
,
tableData
,
accuracyData
,
trialProfile
,
searchSpace
,
tableData
,
accuracyData
,
accNodata
,
status
,
errorStr
,
trialNumber
,
bestAccuracy
,
accNodata
,
status
,
errorStr
,
trialNumber
,
bestAccuracy
,
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
,
experimentAPI
}
=
this
.
state
;
}
=
this
.
state
;
return
(
return
(
...
@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
<
Row
className
=
"experiment"
>
<
Row
className
=
"experiment"
>
{
/* the scroll bar all the trial profile in the searchSpace div*/
}
{
/* the scroll bar all the trial profile in the searchSpace div*/
}
<
div
className
=
"experiment searchSpace"
>
<
div
className
=
"experiment searchSpace"
>
<
TrialPro
<
TrialPro
experiment
=
{
experimentAPI
}
/>
tiralProInfo
=
{
trialProfile
}
/>
</
div
>
</
div
>
</
Row
>
</
Row
>
</
Col
>
</
Col
>
...
...
src/webui/src/components/overview/TrialProfile.tsx
View file @
afce6d4a
import
*
as
React
from
'
react
'
;
import
*
as
React
from
'
react
'
;
import
{
Experiment
}
from
'
../../static/interface
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
interface
TrialInfoProps
{
interface
TrialInfoProps
{
tiralProInfo
:
Experimen
t
;
experiment
:
objec
t
;
}
}
class
TrialInfo
extends
React
.
Component
<
TrialInfoProps
,
{}
>
{
class
TrialInfo
extends
React
.
Component
<
TrialInfoProps
,
{}
>
{
...
@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
super
(
props
);
super
(
props
);
}
}
render
()
{
componentWillReceiveProps
(
nextProps
:
TrialInfoProps
)
{
const
{
tiralProInfo
}
=
this
.
props
;
const
experiments
=
nextProps
.
experiment
;
const
showProInfo
=
[];
Object
.
keys
(
experiments
).
map
(
key
=>
{
showProInfo
.
push
({
switch
(
key
)
{
revision
:
tiralProInfo
.
revision
,
case
'
id
'
:
authorName
:
tiralProInfo
.
author
,
case
'
logDir
'
:
trialConcurrency
:
tiralProInfo
.
runConcurren
,
case
'
startTime
'
:
tuner
:
tiralProInfo
.
tuner
,
case
'
endTime
'
:
assessor
:
tiralProInfo
.
assessor
?
tiralProInfo
.
assessor
:
undefined
,
experiments
[
key
]
=
undefined
;
logCollection
:
tiralProInfo
.
logCollection
?
tiralProInfo
.
logCollection
:
undefined
,
break
;
advisor
:
tiralProInfo
.
advisor
?
tiralProInfo
.
advisor
:
undefined
,
case
'
params
'
:
clusterMetaData
:
tiralProInfo
.
clusterMetaData
?
tiralProInfo
.
clusterMetaData
:
undefined
const
params
=
experiments
[
key
];
Object
.
keys
(
params
).
map
(
item
=>
{
if
(
item
===
'
experimentName
'
||
item
===
'
searchSpace
'
||
item
===
'
trainingServicePlatform
'
)
{
params
[
item
]
=
undefined
;
}
});
break
;
default
:
}
});
});
}
render
()
{
const
{
experiment
}
=
this
.
props
;
return
(
return
(
<
div
className
=
"profile"
>
<
div
className
=
"profile"
>
<
MonacoEditor
<
MonacoEditor
...
@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
height
=
"380"
height
=
"380"
language
=
"json"
language
=
"json"
theme
=
"vs-light"
theme
=
"vs-light"
value
=
{
JSON
.
stringify
(
showProInfo
[
0
]
,
null
,
2
)
}
value
=
{
JSON
.
stringify
(
experiment
,
null
,
2
)
}
options
=
{
MONACO
}
options
=
{
MONACO
}
/>
/>
</
div
>
</
div
>
...
@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
}
}
}
}
export
default
TrialInfo
;
export
default
TrialInfo
;
\ No newline at end of file
tools/nni_cmd/config_utils.py
View file @
afce6d4a
...
@@ -79,7 +79,7 @@ class Experiments:
...
@@ -79,7 +79,7 @@ class Experiments:
self
.
experiments
[
id
][
'port'
]
=
port
self
.
experiments
[
id
][
'port'
]
=
port
self
.
experiments
[
id
][
'startTime'
]
=
time
self
.
experiments
[
id
][
'startTime'
]
=
time
self
.
experiments
[
id
][
'endTime'
]
=
'N/A'
self
.
experiments
[
id
][
'endTime'
]
=
'N/A'
self
.
experiments
[
id
][
'status'
]
=
'
running
'
self
.
experiments
[
id
][
'status'
]
=
'
INITIALIZED
'
self
.
experiments
[
id
][
'fileName'
]
=
file_name
self
.
experiments
[
id
][
'fileName'
]
=
file_name
self
.
experiments
[
id
][
'platform'
]
=
platform
self
.
experiments
[
id
][
'platform'
]
=
platform
self
.
write_file
()
self
.
write_file
()
...
...
tools/nni_cmd/constants.py
View file @
afce6d4a
...
@@ -30,6 +30,8 @@ WARNING_INFO = 'WARNING: %s'
...
@@ -30,6 +30,8 @@ WARNING_INFO = 'WARNING: %s'
DEFAULT_REST_PORT
=
8080
DEFAULT_REST_PORT
=
8080
REST_TIME_OUT
=
20
EXPERIMENT_SUCCESS_INFO
=
'
\033
[1;32;32mSuccessfully started experiment!
\n\033
[0m'
\
EXPERIMENT_SUCCESS_INFO
=
'
\033
[1;32;32mSuccessfully started experiment!
\n\033
[0m'
\
'-----------------------------------------------------------------------
\n
'
\
'-----------------------------------------------------------------------
\n
'
\
'The experiment id is %s
\n
'
\
'The experiment id is %s
\n
'
\
...
...
tools/nni_cmd/launcher.py
View file @
afce6d4a
...
@@ -139,7 +139,7 @@ def set_trial_config(experiment_config, port, config_file_name):
...
@@ -139,7 +139,7 @@ def set_trial_config(experiment_config, port, config_file_name):
'''set trial configuration'''
'''set trial configuration'''
request_data
=
dict
()
request_data
=
dict
()
request_data
[
'trial_config'
]
=
experiment_config
[
'trial'
]
request_data
[
'trial_config'
]
=
experiment_config
[
'trial'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
)
if
check_response
(
response
):
if
check_response
(
response
):
return
True
return
True
else
:
else
:
...
@@ -159,7 +159,7 @@ def set_remote_config(experiment_config, port, config_file_name):
...
@@ -159,7 +159,7 @@ def set_remote_config(experiment_config, port, config_file_name):
#set machine_list
#set machine_list
request_data
=
dict
()
request_data
=
dict
()
request_data
[
'machine_list'
]
=
experiment_config
[
'machineList'
]
request_data
[
'machine_list'
]
=
experiment_config
[
'machineList'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
)
err_message
=
''
err_message
=
''
if
not
response
or
not
check_response
(
response
):
if
not
response
or
not
check_response
(
response
):
if
response
is
not
None
:
if
response
is
not
None
:
...
@@ -180,7 +180,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name):
...
@@ -180,7 +180,7 @@ def setNNIManagerIp(experiment_config, port, config_file_name):
return
True
,
None
return
True
,
None
ip_config_dict
=
dict
()
ip_config_dict
=
dict
()
ip_config_dict
[
'nni_manager_ip'
]
=
{
'nniManagerIp'
:
experiment_config
[
'nniManagerIp'
]
}
ip_config_dict
[
'nni_manager_ip'
]
=
{
'nniManagerIp'
:
experiment_config
[
'nniManagerIp'
]
}
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
ip_config_dict
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
ip_config_dict
),
REST_TIME_OUT
)
err_message
=
None
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
if
response
is
not
None
:
...
@@ -195,7 +195,7 @@ def set_pai_config(experiment_config, port, config_file_name):
...
@@ -195,7 +195,7 @@ def set_pai_config(experiment_config, port, config_file_name):
'''set pai configuration'''
'''set pai configuration'''
pai_config_data
=
dict
()
pai_config_data
=
dict
()
pai_config_data
[
'pai_config'
]
=
experiment_config
[
'paiConfig'
]
pai_config_data
[
'pai_config'
]
=
experiment_config
[
'paiConfig'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
pai_config_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
pai_config_data
),
REST_TIME_OUT
)
err_message
=
None
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
if
response
is
not
None
:
...
@@ -214,7 +214,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name):
...
@@ -214,7 +214,7 @@ def set_kubeflow_config(experiment_config, port, config_file_name):
'''set kubeflow configuration'''
'''set kubeflow configuration'''
kubeflow_config_data
=
dict
()
kubeflow_config_data
=
dict
()
kubeflow_config_data
[
'kubeflow_config'
]
=
experiment_config
[
'kubeflowConfig'
]
kubeflow_config_data
[
'kubeflow_config'
]
=
experiment_config
[
'kubeflowConfig'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
kubeflow_config_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
kubeflow_config_data
),
REST_TIME_OUT
)
err_message
=
None
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
if
response
is
not
None
:
...
@@ -233,7 +233,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name):
...
@@ -233,7 +233,7 @@ def set_frameworkcontroller_config(experiment_config, port, config_file_name):
'''set kubeflow configuration'''
'''set kubeflow configuration'''
frameworkcontroller_config_data
=
dict
()
frameworkcontroller_config_data
=
dict
()
frameworkcontroller_config_data
[
'frameworkcontroller_config'
]
=
experiment_config
[
'frameworkcontrollerConfig'
]
frameworkcontroller_config_data
[
'frameworkcontroller_config'
]
=
experiment_config
[
'frameworkcontrollerConfig'
]
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
frameworkcontroller_config_data
),
20
)
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
frameworkcontroller_config_data
),
REST_TIME_OUT
)
err_message
=
None
err_message
=
None
if
not
response
or
not
response
.
status_code
==
200
:
if
not
response
or
not
response
.
status_code
==
200
:
if
response
is
not
None
:
if
response
is
not
None
:
...
@@ -304,7 +304,7 @@ def set_experiment(experiment_config, mode, port, config_file_name):
...
@@ -304,7 +304,7 @@ def set_experiment(experiment_config, mode, port, config_file_name):
request_data
[
'clusterMetaData'
].
append
(
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
response
=
rest_post
(
experiment_url
(
port
),
json
.
dumps
(
request_data
),
20
)
response
=
rest_post
(
experiment_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
)
if
check_response
(
response
):
if
check_response
(
response
):
return
response
return
response
else
:
else
:
...
@@ -488,7 +488,7 @@ def resume_experiment(args):
...
@@ -488,7 +488,7 @@ def resume_experiment(args):
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
print_error
(
'Id %s not exist!'
%
args
.
id
)
print_error
(
'Id %s not exist!'
%
args
.
id
)
exit
(
1
)
exit
(
1
)
if
experiment_dict
[
args
.
id
][
'status'
]
=
=
'
running
'
:
if
experiment_dict
[
args
.
id
][
'status'
]
!
=
'
STOPPED
'
:
print_error
(
'Experiment %s is running!'
%
args
.
id
)
print_error
(
'Experiment %s is running!'
%
args
.
id
)
exit
(
1
)
exit
(
1
)
experiment_id
=
args
.
id
experiment_id
=
args
.
id
...
...
tools/nni_cmd/nnictl_utils.py
View file @
afce6d4a
...
@@ -28,10 +28,25 @@ from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_re
...
@@ -28,10 +28,25 @@ from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_re
from
.config_utils
import
Config
,
Experiments
from
.config_utils
import
Config
,
Experiments
from
.url_utils
import
trial_jobs_url
,
experiment_url
,
trial_job_id_url
from
.url_utils
import
trial_jobs_url
,
experiment_url
,
trial_job_id_url
from
.constants
import
NNICTL_HOME_DIR
,
EXPERIMENT_INFORMATION_FORMAT
,
EXPERIMENT_DETAIL_FORMAT
,
\
from
.constants
import
NNICTL_HOME_DIR
,
EXPERIMENT_INFORMATION_FORMAT
,
EXPERIMENT_DETAIL_FORMAT
,
\
EXPERIMENT_MONITOR_INFO
,
TRIAL_MONITOR_HEAD
,
TRIAL_MONITOR_CONTENT
,
TRIAL_MONITOR_TAIL
EXPERIMENT_MONITOR_INFO
,
TRIAL_MONITOR_HEAD
,
TRIAL_MONITOR_CONTENT
,
TRIAL_MONITOR_TAIL
,
REST_TIME_OUT
from
.common_utils
import
print_normal
,
print_error
,
print_warning
,
detect_process
from
.common_utils
import
print_normal
,
print_error
,
print_warning
,
detect_process
def
update_experiment_status
():
def
get_experiment_time
(
port
):
'''get the startTime and endTime of an experiment'''
response
=
rest_get
(
experiment_url
(
port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
content
=
convert_time_stamp_to_date
(
json
.
loads
(
response
.
text
))
return
content
.
get
(
'startTime'
),
content
.
get
(
'endTime'
)
return
None
,
None
def
get_experiment_status
(
port
):
'''get the status of an experiment'''
result
,
response
=
check_rest_server_quick
(
port
)
if
result
:
return
json
.
loads
(
response
.
text
).
get
(
'status'
)
return
None
def
update_experiment
():
'''Update the experiment status in config file'''
'''Update the experiment status in config file'''
experiment_config
=
Experiments
()
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
...
@@ -39,16 +54,26 @@ def update_experiment_status():
...
@@ -39,16 +54,26 @@ def update_experiment_status():
return
None
return
None
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
experiment_dict
[
key
].
get
(
'status'
)
=
=
'
running
'
:
if
experiment_dict
[
key
].
get
(
'status'
)
!
=
'
STOPPED
'
:
nni_config
=
Config
(
experiment_dict
[
key
][
'fileName'
])
nni_config
=
Config
(
experiment_dict
[
key
][
'fileName'
])
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
if
not
detect_process
(
rest_pid
):
if
not
detect_process
(
rest_pid
):
experiment_config
.
update_experiment
(
key
,
'status'
,
'stopped'
)
experiment_config
.
update_experiment
(
key
,
'status'
,
'STOPPED'
)
continue
rest_port
=
nni_config
.
get_config
(
'restServerPort'
)
startTime
,
endTime
=
get_experiment_time
(
rest_port
)
if
startTime
:
experiment_config
.
update_experiment
(
key
,
'startTime'
,
startTime
)
if
endTime
:
experiment_config
.
update_experiment
(
key
,
'endTime'
,
endTime
)
status
=
get_experiment_status
(
rest_port
)
if
status
:
experiment_config
.
update_experiment
(
key
,
'status'
,
status
)
def
check_experiment_id
(
args
):
def
check_experiment_id
(
args
):
'''check if the id is valid
'''check if the id is valid
'''
'''
update_experiment
_status
()
update_experiment
()
experiment_config
=
Experiments
()
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
if
not
experiment_dict
:
if
not
experiment_dict
:
...
@@ -58,13 +83,13 @@ def check_experiment_id(args):
...
@@ -58,13 +83,13 @@ def check_experiment_id(args):
running_experiment_list
=
[]
running_experiment_list
=
[]
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
experiment_dict
[
key
].
get
(
'status'
)
=
=
'
running
'
:
if
experiment_dict
[
key
].
get
(
'status'
)
!
=
'
STOPPED
'
:
running_experiment_list
.
append
(
key
)
running_experiment_list
.
append
(
key
)
elif
isinstance
(
experiment_dict
[
key
],
list
):
elif
isinstance
(
experiment_dict
[
key
],
list
):
# if the config file is old version, remove the configuration from file
# if the config file is old version, remove the configuration from file
experiment_config
.
remove_experiment
(
key
)
experiment_config
.
remove_experiment
(
key
)
if
len
(
running_experiment_list
)
>
1
:
if
len
(
running_experiment_list
)
>
1
:
print_error
(
'There are multiple experiments
running
, please set the experiment id...'
)
print_error
(
'There are multiple experiments, please set the experiment id...'
)
experiment_information
=
""
experiment_information
=
""
for
key
in
running_experiment_list
:
for
key
in
running_experiment_list
:
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
\
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
\
...
@@ -94,7 +119,7 @@ def parse_ids(args):
...
@@ -94,7 +119,7 @@ def parse_ids(args):
5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id
5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id
6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information
6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information
'''
'''
update_experiment
_status
()
update_experiment
()
experiment_config
=
Experiments
()
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
if
not
experiment_dict
:
if
not
experiment_dict
:
...
@@ -104,14 +129,14 @@ def parse_ids(args):
...
@@ -104,14 +129,14 @@ def parse_ids(args):
running_experiment_list
=
[]
running_experiment_list
=
[]
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
isinstance
(
experiment_dict
[
key
],
dict
):
if
experiment_dict
[
key
].
get
(
'status'
)
=
=
'
running
'
:
if
experiment_dict
[
key
].
get
(
'status'
)
!
=
'
STOPPED
'
:
running_experiment_list
.
append
(
key
)
running_experiment_list
.
append
(
key
)
elif
isinstance
(
experiment_dict
[
key
],
list
):
elif
isinstance
(
experiment_dict
[
key
],
list
):
# if the config file is old version, remove the configuration from file
# if the config file is old version, remove the configuration from file
experiment_config
.
remove_experiment
(
key
)
experiment_config
.
remove_experiment
(
key
)
if
not
args
.
id
:
if
not
args
.
id
:
if
len
(
running_experiment_list
)
>
1
:
if
len
(
running_experiment_list
)
>
1
:
print_error
(
'There are multiple experiments
running
, please set the experiment id...'
)
print_error
(
'There are multiple experiments, please set the experiment id...'
)
experiment_information
=
""
experiment_information
=
""
for
key
in
running_experiment_list
:
for
key
in
running_experiment_list
:
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
\
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
\
...
@@ -207,7 +232,7 @@ def stop_experiment(args):
...
@@ -207,7 +232,7 @@ def stop_experiment(args):
print_error
(
exception
)
print_error
(
exception
)
nni_config
.
set_config
(
'tensorboardPidList'
,
[])
nni_config
.
set_config
(
'tensorboardPidList'
,
[])
print_normal
(
'Stop experiment success!'
)
print_normal
(
'Stop experiment success!'
)
experiment_config
.
update_experiment
(
experiment_id
,
'status'
,
'
stopped
'
)
experiment_config
.
update_experiment
(
experiment_id
,
'status'
,
'
STOPPED
'
)
time_now
=
time
.
strftime
(
'%Y-%m-%d %H:%M:%S'
,
time
.
localtime
(
time
.
time
()))
time_now
=
time
.
strftime
(
'%Y-%m-%d %H:%M:%S'
,
time
.
localtime
(
time
.
time
()))
experiment_config
.
update_experiment
(
experiment_id
,
'endTime'
,
str
(
time_now
))
experiment_config
.
update_experiment
(
experiment_id
,
'endTime'
,
str
(
time_now
))
...
@@ -221,7 +246,7 @@ def trial_ls(args):
...
@@ -221,7 +246,7 @@ def trial_ls(args):
return
return
running
,
response
=
check_rest_server_quick
(
rest_port
)
running
,
response
=
check_rest_server_quick
(
rest_port
)
if
running
:
if
running
:
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
20
)
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
content
=
json
.
loads
(
response
.
text
)
content
=
json
.
loads
(
response
.
text
)
for
index
,
value
in
enumerate
(
content
):
for
index
,
value
in
enumerate
(
content
):
...
@@ -242,7 +267,7 @@ def trial_kill(args):
...
@@ -242,7 +267,7 @@ def trial_kill(args):
return
return
running
,
_
=
check_rest_server_quick
(
rest_port
)
running
,
_
=
check_rest_server_quick
(
rest_port
)
if
running
:
if
running
:
response
=
rest_delete
(
trial_job_id_url
(
rest_port
,
args
.
id
),
20
)
response
=
rest_delete
(
trial_job_id_url
(
rest_port
,
args
.
id
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
print
(
response
.
text
)
print
(
response
.
text
)
else
:
else
:
...
@@ -260,7 +285,7 @@ def list_experiment(args):
...
@@ -260,7 +285,7 @@ def list_experiment(args):
return
return
running
,
_
=
check_rest_server_quick
(
rest_port
)
running
,
_
=
check_rest_server_quick
(
rest_port
)
if
running
:
if
running
:
response
=
rest_get
(
experiment_url
(
rest_port
),
20
)
response
=
rest_get
(
experiment_url
(
rest_port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
content
=
convert_time_stamp_to_date
(
json
.
loads
(
response
.
text
))
content
=
convert_time_stamp_to_date
(
json
.
loads
(
response
.
text
))
print
(
json
.
dumps
(
content
,
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
print
(
json
.
dumps
(
content
,
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
...
@@ -322,7 +347,7 @@ def log_trial(args):
...
@@ -322,7 +347,7 @@ def log_trial(args):
return
return
running
,
response
=
check_rest_server_quick
(
rest_port
)
running
,
response
=
check_rest_server_quick
(
rest_port
)
if
running
:
if
running
:
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
20
)
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
content
=
json
.
loads
(
response
.
text
)
content
=
json
.
loads
(
response
.
text
)
for
trial
in
content
:
for
trial
in
content
:
...
@@ -362,18 +387,20 @@ def experiment_list(args):
...
@@ -362,18 +387,20 @@ def experiment_list(args):
if
not
experiment_dict
:
if
not
experiment_dict
:
print
(
'There is no experiment running...'
)
print
(
'There is no experiment running...'
)
exit
(
1
)
exit
(
1
)
update_experiment
()
experiment_id_list
=
[]
experiment_id_list
=
[]
if
args
.
all
and
args
.
all
==
'all'
:
if
args
.
all
and
args
.
all
==
'all'
:
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
experiment_id_list
.
append
(
key
)
experiment_id_list
.
append
(
key
)
else
:
else
:
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
if
experiment_dict
[
key
][
'status'
]
=
=
'
running
'
:
if
experiment_dict
[
key
][
'status'
]
!
=
'
STOPPED
'
:
experiment_id_list
.
append
(
key
)
experiment_id_list
.
append
(
key
)
if
not
experiment_id_list
:
if
not
experiment_id_list
:
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list all
\'
to list all stopped experiments!'
)
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list all
\'
to list all stopped experiments!'
)
experiment_information
=
""
experiment_information
=
""
for
key
in
experiment_id_list
:
for
key
in
experiment_id_list
:
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
experiment_dict
[
key
].
get
(
'platform'
),
experiment_dict
[
key
][
'startTime'
],
experiment_dict
[
key
][
'endTime'
]))
experiment_dict
[
key
].
get
(
'platform'
),
experiment_dict
[
key
][
'startTime'
],
experiment_dict
[
key
][
'endTime'
]))
print
(
EXPERIMENT_INFORMATION_FORMAT
%
experiment_information
)
print
(
EXPERIMENT_INFORMATION_FORMAT
%
experiment_information
)
...
@@ -382,8 +409,8 @@ def get_time_interval(time1, time2):
...
@@ -382,8 +409,8 @@ def get_time_interval(time1, time2):
'''get the interval of two times'''
'''get the interval of two times'''
try
:
try
:
#convert time to timestamp
#convert time to timestamp
time1
=
time
.
mktime
(
time
.
strptime
(
time1
,
'%Y
-
%m
-
%d %H:%M:%S'
))
time1
=
time
.
mktime
(
time
.
strptime
(
time1
,
'%Y
/
%m
/
%d %H:%M:%S'
))
time2
=
time
.
mktime
(
time
.
strptime
(
time2
,
'%Y
-
%m
-
%d %H:%M:%S'
))
time2
=
time
.
mktime
(
time
.
strptime
(
time2
,
'%Y
/
%m
/
%d %H:%M:%S'
))
seconds
=
(
datetime
.
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
datetime
.
fromtimestamp
(
time1
)).
seconds
seconds
=
(
datetime
.
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
datetime
.
fromtimestamp
(
time1
)).
seconds
#convert seconds to day:hour:minute:second
#convert seconds to day:hour:minute:second
days
=
seconds
/
86400
days
=
seconds
/
86400
...
@@ -403,21 +430,21 @@ def show_experiment_info():
...
@@ -403,21 +430,21 @@ def show_experiment_info():
if
not
experiment_dict
:
if
not
experiment_dict
:
print
(
'There is no experiment running...'
)
print
(
'There is no experiment running...'
)
exit
(
1
)
exit
(
1
)
update_experiment
()
experiment_id_list
=
[]
experiment_id_list
=
[]
for
key
in
experiment_dict
.
keys
():
for
key
in
experiment_dict
.
keys
():
if
experiment_dict
[
key
][
'status'
]
=
=
'
running
'
:
if
experiment_dict
[
key
][
'status'
]
!
=
'
STOPPED
'
:
experiment_id_list
.
append
(
key
)
experiment_id_list
.
append
(
key
)
if
not
experiment_id_list
:
if
not
experiment_id_list
:
print_warning
(
'There is no experiment running...'
)
print_warning
(
'There is no experiment running...'
)
return
return
for
key
in
experiment_id_list
:
for
key
in
experiment_id_list
:
current_time
=
time
.
strftime
(
'%Y-%m-%d %H:%M:%S'
,
time
.
localtime
(
time
.
time
()))
print
(
EXPERIMENT_MONITOR_INFO
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
print
(
EXPERIMENT_MONITOR_INFO
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
experiment_dict
[
key
].
get
(
'platform'
),
experiment_dict
[
key
][
'startTime'
],
get_time_interval
(
experiment_dict
[
key
][
'startTime'
],
current_t
ime
)))
experiment_dict
[
key
].
get
(
'platform'
),
experiment_dict
[
key
][
'startTime'
],
get_time_interval
(
experiment_dict
[
key
][
'startTime'
],
experiment_dict
[
key
][
'endT
ime
'
]
)))
print
(
TRIAL_MONITOR_HEAD
)
print
(
TRIAL_MONITOR_HEAD
)
running
,
response
=
check_rest_server_quick
(
experiment_dict
[
key
][
'port'
])
running
,
response
=
check_rest_server_quick
(
experiment_dict
[
key
][
'port'
])
if
running
:
if
running
:
response
=
rest_get
(
trial_jobs_url
(
experiment_dict
[
key
][
'port'
]),
20
)
response
=
rest_get
(
trial_jobs_url
(
experiment_dict
[
key
][
'port'
]),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
content
=
json
.
loads
(
response
.
text
)
content
=
json
.
loads
(
response
.
text
)
for
index
,
value
in
enumerate
(
content
):
for
index
,
value
in
enumerate
(
content
):
...
@@ -433,7 +460,7 @@ def monitor_experiment(args):
...
@@ -433,7 +460,7 @@ def monitor_experiment(args):
while
True
:
while
True
:
try
:
try
:
os
.
system
(
'clear'
)
os
.
system
(
'clear'
)
update_experiment
_status
()
update_experiment
()
show_experiment_info
()
show_experiment_info
()
time
.
sleep
(
args
.
time
)
time
.
sleep
(
args
.
time
)
except
KeyboardInterrupt
:
except
KeyboardInterrupt
:
...
...
tools/nni_cmd/rest_utils.py
View file @
afce6d4a
...
@@ -22,6 +22,7 @@
...
@@ -22,6 +22,7 @@
import
time
import
time
import
requests
import
requests
from
.url_utils
import
check_status_url
from
.url_utils
import
check_status_url
from
.constants
import
REST_TIME_OUT
def
rest_put
(
url
,
data
,
timeout
):
def
rest_put
(
url
,
data
,
timeout
):
'''Call rest put method'''
'''Call rest put method'''
...
@@ -61,7 +62,7 @@ def check_rest_server(rest_port):
...
@@ -61,7 +62,7 @@ def check_rest_server(rest_port):
'''Check if restful server is ready'''
'''Check if restful server is ready'''
retry_count
=
5
retry_count
=
5
for
_
in
range
(
retry_count
):
for
_
in
range
(
retry_count
):
response
=
rest_get
(
check_status_url
(
rest_port
),
20
)
response
=
rest_get
(
check_status_url
(
rest_port
),
REST_TIME_OUT
)
if
response
:
if
response
:
if
response
.
status_code
==
200
:
if
response
.
status_code
==
200
:
return
True
,
response
return
True
,
response
...
...
tools/nni_cmd/tensorboard_utils.py
View file @
afce6d4a
...
@@ -144,7 +144,7 @@ def start_tensorboard(args):
...
@@ -144,7 +144,7 @@ def start_tensorboard(args):
running
,
response
=
check_rest_server_quick
(
rest_port
)
running
,
response
=
check_rest_server_quick
(
rest_port
)
trial_content
=
None
trial_content
=
None
if
running
:
if
running
:
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
20
)
response
=
rest_get
(
trial_jobs_url
(
rest_port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
trial_content
=
json
.
loads
(
response
.
text
)
trial_content
=
json
.
loads
(
response
.
text
)
else
:
else
:
...
...
tools/nni_cmd/updater.py
View file @
afce6d4a
...
@@ -27,6 +27,7 @@ from .config_utils import Config
...
@@ -27,6 +27,7 @@ from .config_utils import Config
from
.common_utils
import
get_json_content
from
.common_utils
import
get_json_content
from
.nnictl_utils
import
check_experiment_id
,
get_experiment_port
,
get_config_filename
from
.nnictl_utils
import
check_experiment_id
,
get_experiment_port
,
get_config_filename
from
.launcher_utils
import
parse_time
from
.launcher_utils
import
parse_time
from
.constants
import
REST_TIME_OUT
def
validate_digit
(
value
,
start
,
end
):
def
validate_digit
(
value
,
start
,
end
):
'''validate if a digit is valid'''
'''validate if a digit is valid'''
...
@@ -62,11 +63,11 @@ def update_experiment_profile(args, key, value):
...
@@ -62,11 +63,11 @@ def update_experiment_profile(args, key, value):
rest_port
=
nni_config
.
get_config
(
'restServerPort'
)
rest_port
=
nni_config
.
get_config
(
'restServerPort'
)
running
,
_
=
check_rest_server_quick
(
rest_port
)
running
,
_
=
check_rest_server_quick
(
rest_port
)
if
running
:
if
running
:
response
=
rest_get
(
experiment_url
(
rest_port
),
20
)
response
=
rest_get
(
experiment_url
(
rest_port
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
experiment_profile
=
json
.
loads
(
response
.
text
)
experiment_profile
=
json
.
loads
(
response
.
text
)
experiment_profile
[
'params'
][
key
]
=
value
experiment_profile
[
'params'
][
key
]
=
value
response
=
rest_put
(
experiment_url
(
rest_port
)
+
get_query_type
(
key
),
json
.
dumps
(
experiment_profile
),
20
)
response
=
rest_put
(
experiment_url
(
rest_port
)
+
get_query_type
(
key
),
json
.
dumps
(
experiment_profile
),
REST_TIME_OUT
)
if
response
and
check_response
(
response
):
if
response
and
check_response
(
response
):
return
response
return
response
else
:
else
:
...
...
tools/nni_trial_tool/constants.py
View file @
afce6d4a
...
@@ -35,6 +35,7 @@ STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
...
@@ -35,6 +35,7 @@ STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
STDERR_FULL_PATH
=
os
.
path
.
join
(
LOG_DIR
,
'stderr'
)
STDERR_FULL_PATH
=
os
.
path
.
join
(
LOG_DIR
,
'stderr'
)
STDOUT_API
=
'/stdout'
STDOUT_API
=
'/stdout'
VERSION_API
=
'/version'
NNI_SYS_DIR
=
os
.
environ
[
'NNI_SYS_DIR'
]
NNI_SYS_DIR
=
os
.
environ
[
'NNI_SYS_DIR'
]
NNI_TRIAL_JOB_ID
=
os
.
environ
[
'NNI_TRIAL_JOB_ID'
]
NNI_TRIAL_JOB_ID
=
os
.
environ
[
'NNI_TRIAL_JOB_ID'
]
NNI_EXP_ID
=
os
.
environ
[
'NNI_EXP_ID'
]
NNI_EXP_ID
=
os
.
environ
[
'NNI_EXP_ID'
]
\ No newline at end of file
tools/nni_trial_tool/trial_keeper.py
View file @
afce6d4a
...
@@ -27,14 +27,18 @@ import shlex
...
@@ -27,14 +27,18 @@ import shlex
import
re
import
re
import
sys
import
sys
import
select
import
select
import
json
from
pyhdfs
import
HdfsClient
from
pyhdfs
import
HdfsClient
import
pkg_resources
import
pkg_resources
from
.rest_utils
import
rest_post
from
.url_utils
import
gen_send_stdout_url
,
gen_send_version_url
from
.constants
import
HOME_DIR
,
LOG_DIR
,
NNI_PLATFORM
,
STDOUT_FULL_PATH
,
STDERR_FULL_PATH
from
.constants
import
HOME_DIR
,
LOG_DIR
,
NNI_PLATFORM
,
STDOUT_FULL_PATH
,
STDERR_FULL_PATH
from
.hdfsClientUtility
import
copyDirectoryToHdfs
,
copyHdfsDirectoryToLocal
from
.hdfsClientUtility
import
copyDirectoryToHdfs
,
copyHdfsDirectoryToLocal
from
.log_utils
import
LogType
,
nni_log
,
RemoteLogger
,
PipeLogReader
,
StdOutputType
from
.log_utils
import
LogType
,
nni_log
,
RemoteLogger
,
PipeLogReader
,
StdOutputType
logger
=
logging
.
getLogger
(
'trial_keeper'
)
logger
=
logging
.
getLogger
(
'trial_keeper'
)
regular
=
re
.
compile
(
'v?(?P<version>[0-9](\.[0-9]){0,1}).*'
)
def
main_loop
(
args
):
def
main_loop
(
args
):
'''main loop logic for trial keeper'''
'''main loop logic for trial keeper'''
...
@@ -110,21 +114,27 @@ def check_version(args):
...
@@ -110,21 +114,27 @@ def check_version(args):
#package nni does not exist, try nni-tool package
#package nni does not exist, try nni-tool package
nni_log
(
LogType
.
Error
,
'Package nni does not exist!'
)
nni_log
(
LogType
.
Error
,
'Package nni does not exist!'
)
os
.
_exit
(
1
)
os
.
_exit
(
1
)
if
not
args
.
version
:
if
not
args
.
nni_manager_
version
:
# skip version check
# skip version check
nni_log
(
LogType
.
Warning
,
'Skipping version check!'
)
nni_log
(
LogType
.
Warning
,
'Skipping version check!'
)
else
:
else
:
regular
=
re
.
compile
(
'v?(?P<version>[0-9](\.[0-9]){0,2}).*'
)
try
:
try
:
trial_keeper_version
=
regular
.
search
(
trial_keeper_version
).
group
(
'version'
)
trial_keeper_version
=
regular
.
search
(
trial_keeper_version
).
group
(
'version'
)
nni_log
(
LogType
.
Info
,
'trial_keeper_version is {0}'
.
format
(
trial_keeper_version
))
nni_log
(
LogType
.
Info
,
'trial_keeper_version is {0}'
.
format
(
trial_keeper_version
))
training_service_version
=
regular
.
search
(
args
.
version
).
group
(
'version'
)
nni_manager_version
=
regular
.
search
(
args
.
nni_manager_version
).
group
(
'version'
)
nni_log
(
LogType
.
Info
,
'training_service_version is {0}'
.
format
(
training_service_version
))
nni_log
(
LogType
.
Info
,
'nni_manager_version is {0}'
.
format
(
nni_manager_version
))
if
trial_keeper_version
!=
training_service_version
:
log_entry
=
{}
if
trial_keeper_version
!=
nni_manager_version
:
nni_log
(
LogType
.
Error
,
'Version does not match!'
)
nni_log
(
LogType
.
Error
,
'Version does not match!'
)
error_message
=
'NNIManager version is {0}, TrialKeeper version is {1}, NNI version does not match!'
.
format
(
nni_manager_version
,
trial_keeper_version
)
log_entry
[
'tag'
]
=
'VCFail'
log_entry
[
'msg'
]
=
error_message
rest_post
(
gen_send_version_url
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
),
json
.
dumps
(
log_entry
),
10
,
False
)
os
.
_exit
(
1
)
os
.
_exit
(
1
)
else
:
else
:
nni_log
(
LogType
.
Info
,
'Version match!'
)
nni_log
(
LogType
.
Info
,
'Version match!'
)
log_entry
[
'tag'
]
=
'VCSuccess'
rest_post
(
gen_send_version_url
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
),
json
.
dumps
(
log_entry
),
10
,
False
)
except
AttributeError
as
err
:
except
AttributeError
as
err
:
nni_log
(
LogType
.
Error
,
err
)
nni_log
(
LogType
.
Error
,
err
)
...
@@ -142,7 +152,7 @@ if __name__ == '__main__':
...
@@ -142,7 +152,7 @@ if __name__ == '__main__':
PARSER
.
add_argument
(
'--pai_user_name'
,
type
=
str
,
help
=
'the username of hdfs'
)
PARSER
.
add_argument
(
'--pai_user_name'
,
type
=
str
,
help
=
'the username of hdfs'
)
PARSER
.
add_argument
(
'--nni_hdfs_exp_dir'
,
type
=
str
,
help
=
'nni experiment directory in hdfs'
)
PARSER
.
add_argument
(
'--nni_hdfs_exp_dir'
,
type
=
str
,
help
=
'nni experiment directory in hdfs'
)
PARSER
.
add_argument
(
'--webhdfs_path'
,
type
=
str
,
help
=
'the webhdfs path used in webhdfs URL'
)
PARSER
.
add_argument
(
'--webhdfs_path'
,
type
=
str
,
help
=
'the webhdfs path used in webhdfs URL'
)
PARSER
.
add_argument
(
'--version'
,
type
=
str
,
help
=
'the nni version transmitted from
trainingService
'
)
PARSER
.
add_argument
(
'--
nni_manager_
version'
,
type
=
str
,
help
=
'the nni version transmitted from
nniManager
'
)
PARSER
.
add_argument
(
'--log_collection'
,
type
=
str
,
help
=
'set the way to collect log in trialkeeper'
)
PARSER
.
add_argument
(
'--log_collection'
,
type
=
str
,
help
=
'set the way to collect log in trialkeeper'
)
args
,
unknown
=
PARSER
.
parse_known_args
()
args
,
unknown
=
PARSER
.
parse_known_args
()
if
args
.
trial_command
is
None
:
if
args
.
trial_command
is
None
:
...
...
tools/nni_trial_tool/url_utils.py
View file @
afce6d4a
...
@@ -18,8 +18,12 @@
...
@@ -18,8 +18,12 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from
.constants
import
API_ROOT_URL
,
BASE_URL
,
STDOUT_API
,
NNI_TRIAL_JOB_ID
,
NNI_EXP_ID
from
.constants
import
API_ROOT_URL
,
BASE_URL
,
STDOUT_API
,
NNI_TRIAL_JOB_ID
,
NNI_EXP_ID
,
VERSION_API
def
gen_send_stdout_url
(
ip
,
port
):
def
gen_send_stdout_url
(
ip
,
port
):
'''Generate send stdout url'''
'''Generate send stdout url'''
return
'{0}:{1}{2}{3}/{4}/{5}'
.
format
(
BASE_URL
.
format
(
ip
),
port
,
API_ROOT_URL
,
STDOUT_API
,
NNI_EXP_ID
,
NNI_TRIAL_JOB_ID
)
return
'{0}:{1}{2}{3}/{4}/{5}'
.
format
(
BASE_URL
.
format
(
ip
),
port
,
API_ROOT_URL
,
STDOUT_API
,
NNI_EXP_ID
,
NNI_TRIAL_JOB_ID
)
\ No newline at end of file
def
gen_send_version_url
(
ip
,
port
):
'''Generate send error url'''
return
'{0}:{1}{2}{3}/{4}/{5}'
.
format
(
BASE_URL
.
format
(
ip
),
port
,
API_ROOT_URL
,
VERSION_API
,
NNI_EXP_ID
,
NNI_TRIAL_JOB_ID
)
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment