Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
5b0034e4
Unverified
Commit
5b0034e4
authored
Aug 20, 2019
by
SparkSnail
Committed by
GitHub
Aug 20, 2019
Browse files
Merge pull request #204 from microsoft/master
merge master
parents
704b50e2
19173aa4
Changes
82
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
39 additions
and
19 deletions
+39
-19
tools/nni_cmd/nnictl_utils.py
tools/nni_cmd/nnictl_utils.py
+11
-8
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+28
-11
No files found.
tools/nni_cmd/nnictl_utils.py
View file @
5b0034e4
...
...
@@ -22,7 +22,7 @@ import csv
import
os
import
psutil
import
json
import
datetime
from
datetime
import
datetime
,
timezone
import
time
import
re
from
pathlib
import
Path
...
...
@@ -142,6 +142,8 @@ def parse_ids(args):
elif
isinstance
(
experiment_dict
[
key
],
list
):
# if the config file is old version, remove the configuration from file
experiment_config
.
remove_experiment
(
key
)
if
args
.
all
:
return
running_experiment_list
if
args
.
port
is
not
None
:
for
key
in
running_experiment_list
:
if
str
(
experiment_dict
[
key
][
'port'
])
==
args
.
port
:
...
...
@@ -160,8 +162,6 @@ def parse_ids(args):
exit
(
1
)
else
:
result_list
=
running_experiment_list
elif
args
.
id
==
'all'
:
result_list
=
running_experiment_list
elif
args
.
id
.
endswith
(
'*'
):
for
id
in
running_experiment_list
:
if
id
.
startswith
(
args
.
id
[:
-
1
]):
...
...
@@ -175,7 +175,7 @@ def parse_ids(args):
if
len
(
result_list
)
>
1
:
print_error
(
args
.
id
+
' is ambiguous, please choose '
+
' '
.
join
(
result_list
)
)
return
None
if
not
result_list
and
(
(
args
.
id
and
args
.
id
!=
'all'
)
or
args
.
port
):
if
not
result_list
and
(
args
.
id
or
args
.
port
):
print_error
(
'There are no experiments matched, please set correct experiment id or restful server port'
)
elif
not
result_list
:
print_error
(
'There is no experiment running...'
)
...
...
@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp
=
content
.
get
(
'startTime'
)
end_time_stamp
=
content
.
get
(
'endTime'
)
if
start_time_stamp
:
start_time
=
datetime
.
datetime
.
utc
fromtimestamp
(
start_time_stamp
//
1000
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
start_time
=
datetime
.
fromtimestamp
(
start_time_stamp
//
1000
,
timezone
.
utc
).
astimezone
(
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
content
[
'startTime'
]
=
str
(
start_time
)
if
end_time_stamp
:
end_time
=
datetime
.
datetime
.
utc
fromtimestamp
(
end_time_stamp
//
1000
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
end_time
=
datetime
.
fromtimestamp
(
end_time_stamp
//
1000
,
timezone
.
utc
).
astimezone
(
).
strftime
(
"%Y/%m/%d %H:%M:%S"
)
content
[
'endTime'
]
=
str
(
end_time
)
return
content
...
...
@@ -225,6 +225,9 @@ def check_rest(args):
def
stop_experiment
(
args
):
'''Stop the experiment which is running'''
if
args
.
id
and
args
.
id
==
'all'
:
print_warning
(
'
\'
nnictl stop all
\'
is abolished, please use
\'
nnictl stop --all
\'
to stop all of experiments!'
)
exit
(
1
)
experiment_id_list
=
parse_ids
(
args
)
if
experiment_id_list
:
experiment_config
=
Experiments
()
...
...
@@ -568,7 +571,7 @@ def experiment_list(args):
if
experiment_dict
[
key
][
'status'
]
!=
'STOPPED'
:
experiment_id_list
.
append
(
key
)
if
not
experiment_id_list
:
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list --all
\'
to list all
stopped
experiments.'
)
print_warning
(
'There is no experiment running...
\n
You can use
\'
nnictl experiment list --all
\'
to list all experiments.'
)
experiment_information
=
""
for
key
in
experiment_id_list
:
experiment_information
+=
(
EXPERIMENT_DETAIL_FORMAT
%
(
key
,
experiment_dict
[
key
][
'status'
],
experiment_dict
[
key
][
'port'
],
\
...
...
@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp
time1
=
time
.
mktime
(
time
.
strptime
(
time1
,
'%Y/%m/%d %H:%M:%S'
))
time2
=
time
.
mktime
(
time
.
strptime
(
time2
,
'%Y/%m/%d %H:%M:%S'
))
seconds
=
(
datetime
.
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
datetime
.
fromtimestamp
(
time1
)).
seconds
seconds
=
(
datetime
.
fromtimestamp
(
time2
)
-
datetime
.
fromtimestamp
(
time1
)).
seconds
#convert seconds to day:hour:minute:second
days
=
seconds
/
86400
seconds
%=
86400
...
...
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
5b0034e4
...
...
@@ -21,6 +21,7 @@ import os
import
subprocess
import
sys
import
time
import
traceback
from
xml.dom
import
minidom
...
...
@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList
.
remove
(
os
.
getpid
())
return
len
(
pidList
)
==
0
else
:
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pidList
=
[]
for
pid
in
pgrep_output
.
splitlines
():
pidList
.
append
(
int
(
pid
))
...
...
@@ -45,23 +46,21 @@ def main(argv):
if
check_ready_to_run
()
==
False
:
# GPU metrics collector is already running. Exit
exit
(
2
)
with
open
(
os
.
path
.
join
(
metrics_output_dir
,
"gpu_metrics"
),
"w"
)
as
outputFile
:
pass
os
.
chmod
(
os
.
path
.
join
(
metrics_output_dir
,
"gpu_metrics"
),
0o777
)
cmd
=
'nvidia-smi -q -x'
cmd
=
'nvidia-smi -q -x'
.
split
()
while
(
True
):
try
:
smi_output
=
subprocess
.
check_output
(
cmd
,
shell
=
True
)
smi_output
=
subprocess
.
check_output
(
cmd
)
except
Exception
:
traceback
.
print_exc
()
gen_empty_gpu_metric
(
metrics_output_dir
)
break
parse_nvidia_smi_result
(
smi_output
,
metrics_output_dir
)
except
:
exception
=
sys
.
exc_info
()
for
e
in
exception
:
print
(
"job exporter error {}"
.
format
(
e
))
# TODO: change to sleep time configurable via arguments
time
.
sleep
(
5
)
def
parse_nvidia_smi_result
(
smi
,
outputDir
):
try
:
old_umask
=
os
.
umask
(
0
)
xmldoc
=
minidom
.
parseString
(
smi
)
gpuList
=
xmldoc
.
getElementsByTagName
(
'gpu'
)
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
...
...
@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except
:
e_info
=
sys
.
exc_info
()
print
(
'xmldoc paring error'
)
finally
:
os
.
umask
(
old_umask
)
def gen_empty_gpu_metric(outputDir):
    """Append a placeholder "no GPU" record to the ``gpu_metrics`` file.

    One JSON object is written as a single line to
    ``<outputDir>/gpu_metrics`` with the keys ``Timestamp`` (local time
    formatted by ``time.asctime``), ``gpuCount`` (always ``0``) and
    ``gpuInfos`` (always an empty list). The record is also echoed to
    stdout.

    The file is opened in append mode while the process umask is
    temporarily cleared, so a newly created file is not restricted by the
    caller's umask. The previous umask is always restored.

    Args:
        outputDir: Directory that holds (or will hold) the ``gpu_metrics``
            file.

    Returns:
        None. Any ``Exception`` raised while writing is caught and its
        traceback printed, so a write failure does not propagate.
    """
    # Taken before the try so the finally clause always has a value to
    # restore, whatever happens inside the guarded region.
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            record = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(record)
            # Leaving the with-block flushes and closes the file, so no
            # explicit flush() is needed.
            outputFile.write("{}\n".format(json.dumps(record, sort_keys=True)))
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(old_umask)
if
__name__
==
"__main__"
:
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment