Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
19173aa4
Commit
19173aa4
authored
Aug 14, 2019
by
Guoxin
Committed by
QuanluZhang
Aug 14, 2019
Browse files
merge v1.0(bug bash) back to master (#1462)
* squash commits in v1.0 first round bug bash
parent
f721b431
Changes
65
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
75 additions
and
21 deletions
+75
-21
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+2
-2
tools/nni_cmd/launcher_utils.py
tools/nni_cmd/launcher_utils.py
+33
-0
tools/nni_cmd/nnictl.py
tools/nni_cmd/nnictl.py
+1
-0
tools/nni_cmd/nnictl_utils.py
tools/nni_cmd/nnictl_utils.py
+11
-8
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+28
-11
No files found.
tools/nni_cmd/launcher.py
View file @
19173aa4
...
@@ -519,14 +519,14 @@ def resume_experiment(args):
...
@@ -519,14 +519,14 @@ def resume_experiment(args):
#find the latest stopped experiment
#find the latest stopped experiment
if
not
args
.
id
:
if
not
args
.
id
:
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl resume {id}
\'
to resume a stopped experiment!
\n
'
\
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl resume {id}
\'
to resume a stopped experiment!
\n
'
\
'You could use
\'
nnictl experiment list all
\'
to show all
of stopped
experiments!'
)
'You could use
\'
nnictl experiment list
--
all
\'
to show all experiments!'
)
exit
(
1
)
exit
(
1
)
else
:
else
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
print_error
(
'Id %s not exist!'
%
args
.
id
)
print_error
(
'Id %s not exist!'
%
args
.
id
)
exit
(
1
)
exit
(
1
)
if
experiment_dict
[
args
.
id
][
'status'
]
!=
'STOPPED'
:
if
experiment_dict
[
args
.
id
][
'status'
]
!=
'STOPPED'
:
print_error
(
'
Experiment %s is running!'
%
args
.
id
)
print_error
(
'
Only stopped experiments can be resumed!'
)
exit
(
1
)
exit
(
1
)
experiment_id
=
args
.
id
experiment_id
=
args
.
id
print_normal
(
'Resuming experiment %s...'
%
experiment_id
)
print_normal
(
'Resuming experiment %s...'
%
experiment_id
)
...
...
tools/nni_cmd/launcher_utils.py
View file @
19173aa4
...
@@ -56,12 +56,30 @@ def parse_path(experiment_config, config_path):
...
@@ -56,12 +56,30 @@ def parse_path(experiment_config, config_path):
expand_path
(
experiment_config
,
'searchSpacePath'
)
expand_path
(
experiment_config
,
'searchSpacePath'
)
if
experiment_config
.
get
(
'trial'
):
if
experiment_config
.
get
(
'trial'
):
expand_path
(
experiment_config
[
'trial'
],
'codeDir'
)
expand_path
(
experiment_config
[
'trial'
],
'codeDir'
)
if
experiment_config
[
'trial'
].
get
(
'authFile'
):
expand_path
(
experiment_config
[
'trial'
],
'authFile'
)
if
experiment_config
[
'trial'
].
get
(
'ps'
):
if
experiment_config
[
'trial'
][
'ps'
].
get
(
'privateRegistryAuthPath'
):
expand_path
(
experiment_config
[
'trial'
][
'ps'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'master'
):
if
experiment_config
[
'trial'
][
'master'
].
get
(
'privateRegistryAuthPath'
):
expand_path
(
experiment_config
[
'trial'
][
'master'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'worker'
):
if
experiment_config
[
'trial'
][
'worker'
].
get
(
'privateRegistryAuthPath'
):
expand_path
(
experiment_config
[
'trial'
][
'worker'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'taskRoles'
):
for
index
in
range
(
len
(
experiment_config
[
'trial'
][
'taskRoles'
])):
if
experiment_config
[
'trial'
][
'taskRoles'
][
index
].
get
(
'privateRegistryAuthPath'
):
expand_path
(
experiment_config
[
'trial'
][
'taskRoles'
][
index
],
'privateRegistryAuthPath'
)
if
experiment_config
.
get
(
'tuner'
):
if
experiment_config
.
get
(
'tuner'
):
expand_path
(
experiment_config
[
'tuner'
],
'codeDir'
)
expand_path
(
experiment_config
[
'tuner'
],
'codeDir'
)
if
experiment_config
.
get
(
'assessor'
):
if
experiment_config
.
get
(
'assessor'
):
expand_path
(
experiment_config
[
'assessor'
],
'codeDir'
)
expand_path
(
experiment_config
[
'assessor'
],
'codeDir'
)
if
experiment_config
.
get
(
'advisor'
):
if
experiment_config
.
get
(
'advisor'
):
expand_path
(
experiment_config
[
'advisor'
],
'codeDir'
)
expand_path
(
experiment_config
[
'advisor'
],
'codeDir'
)
if
experiment_config
.
get
(
'machineList'
):
for
index
in
range
(
len
(
experiment_config
[
'machineList'
])):
expand_path
(
experiment_config
[
'machineList'
][
index
],
'sshKeyPath'
)
#if users use relative path, convert it to absolute path
#if users use relative path, convert it to absolute path
root_path
=
os
.
path
.
dirname
(
config_path
)
root_path
=
os
.
path
.
dirname
(
config_path
)
...
@@ -69,6 +87,21 @@ def parse_path(experiment_config, config_path):
...
@@ -69,6 +87,21 @@ def parse_path(experiment_config, config_path):
parse_relative_path
(
root_path
,
experiment_config
,
'searchSpacePath'
)
parse_relative_path
(
root_path
,
experiment_config
,
'searchSpacePath'
)
if
experiment_config
.
get
(
'trial'
):
if
experiment_config
.
get
(
'trial'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
],
'codeDir'
)
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
],
'codeDir'
)
if
experiment_config
[
'trial'
].
get
(
'authFile'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
],
'authFile'
)
if
experiment_config
[
'trial'
].
get
(
'ps'
):
if
experiment_config
[
'trial'
][
'ps'
].
get
(
'privateRegistryAuthPath'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
][
'ps'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'master'
):
if
experiment_config
[
'trial'
][
'master'
].
get
(
'privateRegistryAuthPath'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
][
'master'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'worker'
):
if
experiment_config
[
'trial'
][
'worker'
].
get
(
'privateRegistryAuthPath'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
][
'worker'
],
'privateRegistryAuthPath'
)
if
experiment_config
[
'trial'
].
get
(
'taskRoles'
):
for
index
in
range
(
len
(
experiment_config
[
'trial'
][
'taskRoles'
])):
if
experiment_config
[
'trial'
][
'taskRoles'
][
index
].
get
(
'privateRegistryAuthPath'
):
parse_relative_path
(
root_path
,
experiment_config
[
'trial'
][
'taskRoles'
][
index
],
'privateRegistryAuthPath'
)
if
experiment_config
.
get
(
'tuner'
):
if
experiment_config
.
get
(
'tuner'
):
parse_relative_path
(
root_path
,
experiment_config
[
'tuner'
],
'codeDir'
)
parse_relative_path
(
root_path
,
experiment_config
[
'tuner'
],
'codeDir'
)
if
experiment_config
.
get
(
'assessor'
):
if
experiment_config
.
get
(
'assessor'
):
...
...
tools/nni_cmd/nnictl.py
View file @
19173aa4
...
@@ -91,6 +91,7 @@ def parse_args():
...
@@ -91,6 +91,7 @@ def parse_args():
parser_stop
=
subparsers
.
add_parser
(
'stop'
,
help
=
'stop the experiment'
)
parser_stop
=
subparsers
.
add_parser
(
'stop'
,
help
=
'stop the experiment'
)
parser_stop
.
add_argument
(
'id'
,
nargs
=
'?'
,
help
=
'the id of experiment, use
\'
all
\'
to stop all running experiments'
)
parser_stop
.
add_argument
(
'id'
,
nargs
=
'?'
,
help
=
'the id of experiment, use
\'
all
\'
to stop all running experiments'
)
parser_stop
.
add_argument
(
'--port'
,
'-p'
,
dest
=
'port'
,
help
=
'the port of restful server'
)
parser_stop
.
add_argument
(
'--port'
,
'-p'
,
dest
=
'port'
,
help
=
'the port of restful server'
)
parser_stop
.
add_argument
(
'--all'
,
'-a'
,
action
=
'store_true'
,
help
=
'stop all of experiments'
)
parser_stop
.
set_defaults
(
func
=
stop_experiment
)
parser_stop
.
set_defaults
(
func
=
stop_experiment
)
#parse trial command
#parse trial command
...
...
tools/nni_cmd/nnictl_utils.py
View file @
19173aa4
...
@@ -22,7 +22,7 @@ import csv
...
@@ -22,7 +22,7 @@ import csv
import
os
import
os
import
psutil
import
psutil
import
json
import
json
import
datetime
from
datetime
import
datetime
,
timezone
import
time
import
time
import
re
import
re
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -142,6 +142,8 @@ def parse_ids(args):
...
@@ -142,6 +142,8 @@ def parse_ids(args):
elif
isinstance
(
experiment_dict
[
key
],
list
):
elif
isinstance
(
experiment_dict
[
key
],
list
):
# if the config file is old version, remove the configuration from file
# if the config file is old version, remove the configuration from file
experiment_config
.
remove_experiment
(
key
)
experiment_config
.
remove_experiment
(
key
)
if
args
.
all
:
return
running_experiment_list
if
args
.
port
is
not
None
:
if
args
.
port
is
not
None
:
for
key
in
running_experiment_list
:
for
key
in
running_experiment_list
:
if
str
(
experiment_dict
[
key
][
'port'
])
==
args
.
port
:
if
str
(
experiment_dict
[
key
][
'port'
])
==
args
.
port
:
...
@@ -160,8 +162,6 @@ def parse_ids(args):
...
@@ -160,8 +162,6 @@ def parse_ids(args):
exit
(
1
)
exit
(
1
)
else
:
else
:
result_list
=
running_experiment_list
result_list
=
running_experiment_list
elif
args
.
id
==
'all'
:
result_list
=
running_experiment_list
elif
args
.
id
.
endswith
(
'*'
):
elif
args
.
id
.
endswith
(
'*'
):
for
id
in
running_experiment_list
:
for
id
in
running_experiment_list
:
if
id
.
startswith
(
args
.
id
[:
-
1
]):
if
id
.
startswith
(
args
.
id
[:
-
1
]):
...
@@ -175,7 +175,7 @@ def parse_ids(args):
...
@@ -175,7 +175,7 @@ def parse_ids(args):
if
len
(
result_list
)
>
1
:
if
len
(
result_list
)
>
1
:
print_error
(
args
.
id
+
' is ambiguous, please choose '
+
' '
.
join
(
result_list
)
)
print_error
(
args
.
id
+
' is ambiguous, please choose '
+
' '
.
join
(
result_list
)
)
return
None
return
None
if
not
result_list
and
(
(
args
.
id
and
args
.
id
!=
'all'
)
or
args
.
port
):
if
not
result_list
and
(
args
.
id
or
args
.
port
):
print_error
(
'There are no experiments matched, please set correct experiment id or restful server port'
)
print_error
(
'There are no experiments matched, please set correct experiment id or restful server port'
)
elif
not
result_list
:
elif
not
result_list
:
print_error
(
'There is no experiment running...'
)
print_error
(
'There is no experiment running...'
)
...
@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
...
@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp
=
content
.
get
(
'startTime'
)
start_time_stamp
=
content
.
get
(
'startTime'
)
end_time_stamp
=
content
.
get
(
'endTime'
)
end_time_stamp
=
content
.
get
(
'endTime'
)
if
start_time_stamp
:
if
start_time_stamp
:
start_time
=
datetime
.
datetime
.
utc
fromtimestamp
(
start_time_stamp
//
1000
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
start_time
=
datetime
.
fromtimestamp
(
start_time_stamp
//
1000
,
timezone
.
utc
).
astimezone
(
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
content
[
'startTime'
]
=
str
(
start_time
)
content
[
'startTime'
]
=
str
(
start_time
)
if
end_time_stamp
:
if
end_time_stamp
:
end_time
=
datetime
.
datetime
.
utc
fromtimestamp
(
end_time_stamp
//
1000
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
end_time
=
datetime
.
fromtimestamp
(
end_time_stamp
//
1000
,
timezone
.
utc
).
astimezone
(
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
content
[
'endTime'
]
=
str
(
end_time
)
content
[
'endTime'
]
=
str
(
end_time
)
return
content
return
content
...
@@ -225,6 +225,9 @@ def check_rest(args):
...
@@ -225,6 +225,9 @@ def check_rest(args):
def
stop_experiment
(
args
):
def
stop_experiment
(
args
):
'''Stop the experiment which is running'''
'''Stop the experiment which is running'''
if
args
.
id
and
args
.
id
==
'all'
:
print_warning
(
'
\'
nnictl stop all
\'
is abolished, please use
\'
nnictl stop --all
\'
to stop all of experiments!'
)
exit
(
1
)
experiment_id_list
=
parse_ids
(
args
)
experiment_id_list
=
parse_ids
(
args
)
if
experiment_id_list
:
if
experiment_id_list
:
experiment_config
=
Experiments
()
experiment_config
=
Experiments
()
...
@@ -568,7 +571,7 @@ def experiment_list(args):
...
@@ -568,7 +571,7 @@ def experiment_list(args):
if
experiment_dict
[
key
][
'status'
]
!=
'STOPPED'
:
if
experiment_dict
[
key
][
'status'
]
!=
'STOPPED'
:
experiment_id_list
.
append
(
key
)
experiment_id_list
.
append
(
key
)
if
not
experiment_id_list
:
if
not
experiment_id_list
:
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list --all
\'
to list all
stopped
experiments.'
)
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list --all
\'
to list all experiments.'
)
experiment_information
=
""
experiment_information
=
""
for
key
in
experiment_id_list
:
for
key
in
experiment_id_list
:
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
...
@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
...
@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp
#convert time to timestamp
time1
=
time
.
mktime
(
time
.
strptime
(
time1
,
'%Y/%m/%d %H:%M:%S'
))
time1
=
time
.
mktime
(
time
.
strptime
(
time1
,
'%Y/%m/%d %H:%M:%S'
))
time2
=
time
.
mktime
(
time
.
strptime
(
time2
,
'%Y/%m/%d %H:%M:%S'
))
time2
=
time
.
mktime
(
time
.
strptime
(
time2
,
'%Y/%m/%d %H:%M:%S'
))
seconds
=
(
datetime
.
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
datetime
.
fromtimestamp
(
time1
)).
seconds
seconds
=
(
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
fromtimestamp
(
time1
)).
seconds
#convert seconds to day:hour:minute:second
#convert seconds to day:hour:minute:second
days
=
seconds
/
86400
days
=
seconds
/
86400
seconds
%=
86400
seconds
%=
86400
...
...
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
19173aa4
...
@@ -21,6 +21,7 @@ import os
...
@@ -21,6 +21,7 @@ import os
import
subprocess
import
subprocess
import
sys
import
sys
import
time
import
time
import
traceback
from
xml.dom
import
minidom
from
xml.dom
import
minidom
...
@@ -33,7 +34,7 @@ def check_ready_to_run():
...
@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList
.
remove
(
os
.
getpid
())
pidList
.
remove
(
os
.
getpid
())
return
len
(
pidList
)
==
0
return
len
(
pidList
)
==
0
else
:
else
:
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pidList
=
[]
pidList
=
[]
for
pid
in
pgrep_output
.
splitlines
():
for
pid
in
pgrep_output
.
splitlines
():
pidList
.
append
(
int
(
pid
))
pidList
.
append
(
int
(
pid
))
...
@@ -45,23 +46,21 @@ def main(argv):
...
@@ -45,23 +46,21 @@ def main(argv):
if
check_ready_to_run
()
==
False
:
if
check_ready_to_run
()
==
False
:
# GPU metrics collector is already running. Exit
# GPU metrics collector is already running. Exit
exit
(
2
)
exit
(
2
)
with
open
(
os
.
path
.
join
(
metrics_output_dir
,
"gpu_metrics"
),
"w"
)
as
outputFile
:
cmd
=
'nvidia-smi -q -x'
.
split
()
pass
os
.
chmod
(
os
.
path
.
join
(
metrics_output_dir
,
"gpu_metrics"
),
0o777
)
cmd
=
'nvidia-smi -q -x'
while
(
True
):
while
(
True
):
try
:
try
:
smi_output
=
subprocess
.
check_output
(
cmd
,
shell
=
True
)
smi_output
=
subprocess
.
check_output
(
cmd
)
parse_nvidia_smi_result
(
smi_output
,
metrics_output_dir
)
except
Exception
:
except
:
traceback
.
print_exc
()
exception
=
sys
.
exc_info
(
)
gen_empty_gpu_metric
(
metrics_output_dir
)
for
e
in
exception
:
break
p
rint
(
"job exporter error {}"
.
format
(
e
)
)
p
arse_nvidia_smi_result
(
smi_output
,
metrics_output_dir
)
# TODO: change to sleep time configurable via arguments
# TODO: change to sleep time configurable via arguments
time
.
sleep
(
5
)
time
.
sleep
(
5
)
def
parse_nvidia_smi_result
(
smi
,
outputDir
):
def
parse_nvidia_smi_result
(
smi
,
outputDir
):
try
:
try
:
old_umask
=
os
.
umask
(
0
)
xmldoc
=
minidom
.
parseString
(
smi
)
xmldoc
=
minidom
.
parseString
(
smi
)
gpuList
=
xmldoc
.
getElementsByTagName
(
'gpu'
)
gpuList
=
xmldoc
.
getElementsByTagName
(
'gpu'
)
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
...
@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
...
@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except
:
except
:
e_info
=
sys
.
exc_info
()
e_info
=
sys
.
exc_info
()
print
(
'xmldoc paring error'
)
print
(
'xmldoc paring error'
)
finally
:
os
.
umask
(
old_umask
)
def
gen_empty_gpu_metric
(
outputDir
):
try
:
old_umask
=
os
.
umask
(
0
)
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
outPut
=
{}
outPut
[
"Timestamp"
]
=
time
.
asctime
(
time
.
localtime
())
outPut
[
"gpuCount"
]
=
0
outPut
[
"gpuInfos"
]
=
[]
print
(
outPut
)
outputFile
.
write
(
"{}
\n
"
.
format
(
json
.
dumps
(
outPut
,
sort_keys
=
True
)))
outputFile
.
flush
()
except
Exception
:
traceback
.
print_exc
()
finally
:
os
.
umask
(
old_umask
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment