Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
fefee175
Commit
fefee175
authored
Sep 24, 2020
by
huchen
Browse files
Merge branch 'xuan_dev' into 'develop'
DTK-203 See merge request dcutoolkit/deeplearing/NNI!2
parents
c377abcf
662457ba
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
58 additions
and
41 deletions
+58
-41
examples/trials/mnist-keras/config.yml
examples/trials/mnist-keras/config.yml
+4
-2
examples/trials/mnist-keras/mnist-keras.py
examples/trials/mnist-keras/mnist-keras.py
+2
-1
examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py
...rials/network_morphism/FashionMNIST/FashionMNIST_keras.py
+2
-1
examples/trials/network_morphism/FashionMNIST/config.yml
examples/trials/network_morphism/FashionMNIST/config.yml
+1
-1
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+20
-14
tools/nni_trial_tool/gpu.py
tools/nni_trial_tool/gpu.py
+29
-22
No files found.
examples/trials/mnist-keras/config.yml
View file @
fefee175
...
@@ -8,6 +8,7 @@ trainingServicePlatform: local
...
@@ -8,6 +8,7 @@ trainingServicePlatform: local
searchSpacePath
:
search_space.json
searchSpacePath
:
search_space.json
#choice: true, false
#choice: true, false
useAnnotation
:
false
useAnnotation
:
false
#useActiveGpu: true
tuner
:
tuner
:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner
#SMAC (SMAC should be installed through nnictl)
#SMAC (SMAC should be installed through nnictl)
...
@@ -15,7 +16,8 @@ tuner:
...
@@ -15,7 +16,8 @@ tuner:
classArgs
:
classArgs
:
#choice: maximize, minimize
#choice: maximize, minimize
optimize_mode
:
maximize
optimize_mode
:
maximize
gpuIndices
:
1
trial
:
trial
:
command
:
python3 mnist-keras.py
command
:
HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/
python3 mnist-keras.py
codeDir
:
.
codeDir
:
.
gpuNum
:
0
gpuNum
:
1
examples/trials/mnist-keras/mnist-keras.py
View file @
fefee175
...
@@ -65,7 +65,7 @@ def load_mnist_data(args):
...
@@ -65,7 +65,7 @@ def load_mnist_data(args):
'''
'''
mnist_path
=
os
.
path
.
join
(
os
.
environ
.
get
(
'NNI_OUTPUT_DIR'
),
'mnist.npz'
)
mnist_path
=
os
.
path
.
join
(
os
.
environ
.
get
(
'NNI_OUTPUT_DIR'
),
'mnist.npz'
)
(
x_train
,
y_train
),
(
x_test
,
y_test
)
=
mnist
.
load_data
(
path
=
mnist_path
)
(
x_train
,
y_train
),
(
x_test
,
y_test
)
=
mnist
.
load_data
(
path
=
mnist_path
)
os
.
remove
(
mnist_path
)
#
os.remove(mnist_path)
x_train
=
(
np
.
expand_dims
(
x_train
,
-
1
).
astype
(
np
.
float
)
/
255.
)[:
args
.
num_train
]
x_train
=
(
np
.
expand_dims
(
x_train
,
-
1
).
astype
(
np
.
float
)
/
255.
)[:
args
.
num_train
]
x_test
=
(
np
.
expand_dims
(
x_test
,
-
1
).
astype
(
np
.
float
)
/
255.
)[:
args
.
num_test
]
x_test
=
(
np
.
expand_dims
(
x_test
,
-
1
).
astype
(
np
.
float
)
/
255.
)[:
args
.
num_test
]
...
@@ -128,6 +128,7 @@ if __name__ == '__main__':
...
@@ -128,6 +128,7 @@ if __name__ == '__main__':
# get parameters from tuner
# get parameters from tuner
RECEIVED_PARAMS
=
nni
.
get_next_parameter
()
RECEIVED_PARAMS
=
nni
.
get_next_parameter
()
LOG
.
debug
(
RECEIVED_PARAMS
)
LOG
.
debug
(
RECEIVED_PARAMS
)
print
(
"xuana "
,
RECEIVED_PARAMS
)
PARAMS
=
generate_default_params
()
PARAMS
=
generate_default_params
()
PARAMS
.
update
(
RECEIVED_PARAMS
)
PARAMS
.
update
(
RECEIVED_PARAMS
)
# train
# train
...
...
examples/trials/network_morphism/FashionMNIST/FashionMNIST_keras.py
View file @
fefee175
...
@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg):
...
@@ -113,7 +113,7 @@ def parse_rev_args(receive_msg):
# parallel model
# parallel model
try
:
try
:
available_devices
=
os
.
environ
[
"
CUDA
_VISIBLE_DEVICES"
]
available_devices
=
os
.
environ
[
"
HIP
_VISIBLE_DEVICES"
]
gpus
=
len
(
available_devices
.
split
(
","
))
gpus
=
len
(
available_devices
.
split
(
","
))
if
gpus
>
1
:
if
gpus
>
1
:
net
=
multi_gpu_model
(
net
,
gpus
)
net
=
multi_gpu_model
(
net
,
gpus
)
...
@@ -197,6 +197,7 @@ if __name__ == "__main__":
...
@@ -197,6 +197,7 @@ if __name__ == "__main__":
# trial get next parameter from network morphism tuner
# trial get next parameter from network morphism tuner
RCV_CONFIG
=
nni
.
get_next_parameter
()
RCV_CONFIG
=
nni
.
get_next_parameter
()
logger
.
debug
(
RCV_CONFIG
)
logger
.
debug
(
RCV_CONFIG
)
print
(
RCV_CONFIG
)
parse_rev_args
(
RCV_CONFIG
)
parse_rev_args
(
RCV_CONFIG
)
train_eval
()
train_eval
()
except
Exception
as
exception
:
except
Exception
as
exception
:
...
...
examples/trials/network_morphism/FashionMNIST/config.yml
View file @
fefee175
...
@@ -24,6 +24,6 @@ tuner:
...
@@ -24,6 +24,6 @@ tuner:
#number of classes
#number of classes
n_output_node
:
10
n_output_node
:
10
trial
:
trial
:
command
:
python3 FashionMNIST_keras.py
command
:
HIP_VISIBLE_DEVICES=0 NNI_OUTPUT_DIR=/data_share/xuan/nni/examples/trials/network_morphism/FashionMNIST/output
python3 FashionMNIST_keras.py
codeDir
:
.
codeDir
:
.
gpuNum
:
1
gpuNum
:
1
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
fefee175
...
@@ -9,12 +9,12 @@ import time
...
@@ -9,12 +9,12 @@ import time
import
traceback
import
traceback
from
xml.dom
import
minidom
from
xml.dom
import
minidom
import
json
def
main
(
argv
):
def
main
(
argv
):
metrics_output_dir
=
os
.
environ
[
'METRIC_OUTPUT_DIR'
]
metrics_output_dir
=
os
.
environ
[
'METRIC_OUTPUT_DIR'
]
cmd
=
'
nvidia
-smi -
q
-
x
'
.
split
()
cmd
=
'
rocm
-smi -
a
-
-json
'
.
split
()
while
(
True
):
while
(
True
):
try
:
try
:
smi_output
=
subprocess
.
check_output
(
cmd
)
smi_output
=
subprocess
.
check_output
(
cmd
)
...
@@ -30,25 +30,31 @@ def main(argv):
...
@@ -30,25 +30,31 @@ def main(argv):
def
parse_nvidia_smi_result
(
smi
,
outputDir
):
def
parse_nvidia_smi_result
(
smi
,
outputDir
):
try
:
try
:
old_umask
=
os
.
umask
(
0
)
old_umask
=
os
.
umask
(
0
)
xmldoc
=
minidom
.
parseString
(
smi
)
#xmldoc = minidom.parseString(smi)
gpuList
=
xmldoc
.
getElementsByTagName
(
'gpu'
)
smi
=
json
.
loads
(
smi
)
#gpuList = xmldoc.getElementsByTagName('gpu')
gpuList
=
smi
.
keys
()
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
with
open
(
os
.
path
.
join
(
outputDir
,
"gpu_metrics"
),
'a'
)
as
outputFile
:
outPut
=
{}
outPut
=
{}
outPut
[
"Timestamp"
]
=
time
.
asctime
(
time
.
localtime
())
outPut
[
"Timestamp"
]
=
time
.
asctime
(
time
.
localtime
())
outPut
[
"gpuCount"
]
=
len
(
gpuList
)
outPut
[
"gpuCount"
]
=
len
(
gpuList
)
-
1
outPut
[
"gpuInfos"
]
=
[]
outPut
[
"gpuInfos"
]
=
[]
for
gpuIndex
,
gpu
in
enumerate
(
gpuList
):
for
gpuIndex
,
gpu
in
enumerate
(
gpuList
):
if
gpu
==
'system'
:
continue
gpuInfo
=
{}
gpuInfo
=
{}
gpuInfo
[
'index'
]
=
gpuIndex
gpuInfo
[
'index'
]
=
gpuIndex
gpuInfo
[
'gpuUtil'
]
=
gpu
.
getElementsByTagName
(
'utilization'
)[
0
]
\
gpuInfo
[
'gpuUtil'
]
=
smi
[
gpu
][
"GPU OverDrive value (%)"
]
.
getElementsByTagName
(
'gpu_util'
)[
0
]
\
gpuInfo
[
'gpuMemUtil'
]
=
smi
[
gpu
][
"GPU Memory OverDrive value (%)"
]
.
childNodes
[
0
].
data
.
replace
(
"%"
,
""
).
strip
()
# gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
gpuInfo
[
'gpuMemUtil'
]
=
gpu
.
getElementsByTagName
(
'utilization'
)[
0
]
\
# .getElementsByTagName('gpu_util')[0]\
.
getElementsByTagName
(
'memory_util'
)[
0
]
\
# .childNodes[0].data.replace("%", "").strip()
.
childNodes
[
0
].
data
.
replace
(
"%"
,
""
).
strip
()
# gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
processes
=
gpu
.
getElementsByTagName
(
'processes'
)
# .getElementsByTagName('memory_util')[0]\
runningProNumber
=
len
(
processes
[
0
].
getElementsByTagName
(
'process_info'
))
# .childNodes[0].data.replace("%", "").strip()
gpuInfo
[
'activeProcessNum'
]
=
runningProNumber
# processes = gpu.getElementsByTagName('processes')
# runningProNumber = len(processes[0].getElementsByTagName('process_info'))
# gpuInfo['activeProcessNum'] = runningProNumber
outPut
[
"gpuInfos"
].
append
(
gpuInfo
)
outPut
[
"gpuInfos"
].
append
(
gpuInfo
)
print
(
outPut
)
print
(
outPut
)
...
...
tools/nni_trial_tool/gpu.py
View file @
fefee175
...
@@ -8,7 +8,7 @@ from xml.dom import minidom
...
@@ -8,7 +8,7 @@ from xml.dom import minidom
def
collect_gpu_usage
(
node_id
):
def
collect_gpu_usage
(
node_id
):
cmd
=
'
nvidia
-smi -
q
-
x
'
.
split
()
cmd
=
'
rocm
-smi -
a
-
-json
'
.
split
()
info
=
None
info
=
None
try
:
try
:
smi_output
=
subprocess
.
check_output
(
cmd
)
smi_output
=
subprocess
.
check_output
(
cmd
)
...
@@ -22,33 +22,40 @@ def collect_gpu_usage(node_id):
...
@@ -22,33 +22,40 @@ def collect_gpu_usage(node_id):
def
parse_nvidia_smi_result
(
smi
):
def
parse_nvidia_smi_result
(
smi
):
try
:
try
:
output
=
{}
output
=
{}
xmldoc
=
minidom
.
parseString
(
smi
)
# xmldoc = minidom.parseString(smi)
gpuList
=
xmldoc
.
getElementsByTagName
(
'gpu'
)
# gpuList = xmldoc.getElementsByTagName('gpu')
smi
=
json
.
loads
(
smi
)
gpuList
=
smi
.
keys
()
output
[
"Timestamp"
]
=
time
.
asctime
(
time
.
localtime
())
output
[
"Timestamp"
]
=
time
.
asctime
(
time
.
localtime
())
output
[
"gpuCount"
]
=
len
(
gpuList
)
output
[
"gpuCount"
]
=
len
(
gpuList
)
output
[
"gpuInfos"
]
=
[]
output
[
"gpuInfos"
]
=
[]
for
gpuIndex
,
gpu
in
enumerate
(
gpuList
):
for
gpuIndex
,
gpu
in
enumerate
(
gpuList
):
if
gpu
==
'system'
:
break
gpuInfo
=
{}
gpuInfo
=
{}
gpuInfo
[
'index'
]
=
gpuIndex
gpuInfo
[
'index'
]
=
gpuIndex
gpuInfo
[
'gpuUtil'
]
=
gpu
.
getElementsByTagName
(
'utilization'
)[
0
]
\
gpuInfo
[
'gpuUtil'
]
=
smi
[
gpu
][
"GPU OverDrive value (%)"
]
.
getElementsByTagName
(
'gpu_util'
)[
0
]
\
gpuInfo
[
'gpuMemUtil'
]
=
smi
[
gpu
][
"GPU Memory OverDrive value (%)"
]
.
childNodes
[
0
].
data
.
replace
(
"%"
,
""
).
strip
()
# gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\
gpuInfo
[
'gpuMemUtil'
]
=
gpu
.
getElementsByTagName
(
'utilization'
)[
0
]
\
# .getElementsByTagName('gpu_util')[0]\
.
getElementsByTagName
(
'memory_util'
)[
0
]
\
# .childNodes[0].data.replace("%", "").strip()
.
childNodes
[
0
].
data
.
replace
(
"%"
,
""
).
strip
()
# gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\
processes
=
gpu
.
getElementsByTagName
(
'processes'
)
# .getElementsByTagName('memory_util')[0]\
runningProNumber
=
len
(
processes
[
0
].
getElementsByTagName
(
'process_info'
))
# .childNodes[0].data.replace("%", "").strip()
gpuInfo
[
'activeProcessNum'
]
=
runningProNumber
# processes = gpu.getElementsByTagName('processes')
# runningProNumber = len(processes[0].getElementsByTagName('process_info'))
gpuInfo
[
'gpuType'
]
=
gpu
.
getElementsByTagName
(
'product_name'
)[
0
]
\
# gpuInfo['activeProcessNum'] = runningProNumber
.
childNodes
[
0
].
data
gpuInfo
[
'gpuType'
]
=
smi
[
gpu
][
"GPU ID"
]
memUsage
=
gpu
.
getElementsByTagName
(
'fb_memory_usage'
)[
0
]
# gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\
gpuInfo
[
'gpuMemTotal'
]
=
memUsage
.
getElementsByTagName
(
'total'
)[
0
]
\
# .childNodes[0].data
.
childNodes
[
0
].
data
.
replace
(
"MiB"
,
""
).
strip
()
# memUsage = gpu.getElementsByTagName('fb_memory_usage')[0]
gpuInfo
[
'gpuMemUsed'
]
=
memUsage
.
getElementsByTagName
(
'used'
)[
0
]
\
gpuInfo
[
'gpuMemUsed'
]
=
smi
[
gpu
][
"GPU use (%)"
]
.
childNodes
[
0
].
data
.
replace
(
"MiB"
,
""
).
strip
()
# gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\
gpuInfo
[
'gpuMemFree'
]
=
memUsage
.
getElementsByTagName
(
'free'
)[
0
]
\
# .childNodes[0].data.replace("MiB", "").strip()
.
childNodes
[
0
].
data
.
replace
(
"MiB"
,
""
).
strip
()
# gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\
# .childNodes[0].data.replace("MiB", "").strip()
# gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\
# .childNodes[0].data.replace("MiB", "").strip()
output
[
"gpuInfos"
].
append
(
gpuInfo
)
output
[
"gpuInfos"
].
append
(
gpuInfo
)
except
Exception
:
except
Exception
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment