Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
d95c3513
Unverified
Commit
d95c3513
authored
Apr 03, 2019
by
SparkSnail
Committed by
GitHub
Apr 03, 2019
Browse files
Merge pull request #155 from Microsoft/master
merge master
parents
77526d37
e7d31abd
Changes
70
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
168 additions
and
406 deletions
+168
-406
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+41
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-0
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+1
-1
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+41
-306
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+2
-2
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+1
-1
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+6
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+1
-9
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+8
-3
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+0
-23
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
+2
-2
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
+1
-1
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
+3
-25
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
+1
-1
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
+8
-9
src/webui/src/components/Overview.tsx
src/webui/src/components/Overview.tsx
+5
-4
src/webui/src/components/overview/TrialProfile.tsx
src/webui/src/components/overview/TrialProfile.tsx
+28
-16
No files found.
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
d95c3513
...
...
@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
private
readonly
expId
:
string
=
getExperimentId
();
private
enableVersionCheck
:
boolean
=
true
;
//switch to enable version check
private
versionCheckSuccess
:
boolean
|
undefined
;
private
errorMessage
?:
string
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
...
...
@@ -59,6 +63,14 @@ export abstract class ClusterJobRestServer extends RestServer{
return
this
.
port
;
}
public
get
getErrorMessage
():
string
|
undefined
{
return
this
.
errorMessage
;
}
public
set
setEnableVersionCheck
(
versionCheck
:
boolean
)
{
this
.
enableVersionCheck
=
versionCheck
;
}
/**
* NNIRestServer's own router registration
*/
...
...
@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
next
();
});
router
.
post
(
`/version/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
)
{
try
{
const
checkResultSuccess
:
boolean
=
req
.
body
.
tag
===
'
VCSuccess
'
?
true
:
false
;
if
(
this
.
versionCheckSuccess
!==
undefined
&&
this
.
versionCheckSuccess
!==
checkResultSuccess
)
{
this
.
errorMessage
=
'
Version check error, version check result is inconsistent!
'
;
this
.
log
.
error
(
this
.
errorMessage
);
}
else
if
(
checkResultSuccess
)
{
this
.
log
.
info
(
`Version check in trialKeeper success!`
);
this
.
versionCheckSuccess
=
true
;
}
else
{
this
.
versionCheckSuccess
=
false
;
this
.
errorMessage
=
req
.
body
.
msg
;
}
}
catch
(
err
)
{
this
.
log
.
error
(
`json parse metrics error:
${
err
}
`
);
res
.
status
(
500
);
res
.
send
(
err
.
message
);
}
}
else
{
this
.
log
.
info
(
`Skipping version check!`
);
}
res
.
send
();
});
router
.
post
(
`/update-metrics/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
try
{
this
.
log
.
info
(
`Get update-metrics request, trial job id is
${
req
.
params
.
trialId
}
`
);
...
...
@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
});
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
&&
!
this
.
versionCheckSuccess
&&
!
this
.
errorMessage
)
{
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+
`NNIManager and TrialKeeper!`
}
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
try
{
let
skipLogging
:
boolean
=
false
;
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
d95c3513
...
...
@@ -58,3 +58,11 @@ export class GPUSummary {
this
.
gpuInfos
=
gpuInfos
;
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
d95c3513
...
...
@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
}
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
d95c3513
...
...
@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
if
(
this
.
kubernetesJobRestServer
.
getErrorMessage
)
{
throw
new
Error
(
this
.
kubernetesJobRestServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
}
this
.
log
.
info
(
'
Kubeflow training service exit.
'
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
d95c3513
...
...
@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --version '{11}' --log_collection '{12}'`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --
nni_manager_
version '{11}' --log_collection '{12}'`
+
`1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
d95c3513
...
...
@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
protected
kubernetesCRDClient
?:
KubernetesCRDClient
;
protected
kubernetesJobRestServer
?:
KubernetesJobRestServer
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
?
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
constructor
()
{
...
...
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
d95c3513
...
...
@@ -19,268 +19,16 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
/* Example of nvidia-smi result
{
"nvidia_smi_log": {
"timestamp": "Fri Jul 13 15:17:27 2018",
"driver_version": "396.26",
"attached_gpus": "8",
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
import
*
as
cp
from
'
child_process
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
/**
* GPUScheduler
...
...
@@ -290,29 +38,43 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
private
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
nvdmNotFoundRegex
=
/nvidia-smi: not found/gi
;
this
.
gpuMetricCollectorScriptFolder
=
`
${
os
.
tmpdir
()}
/nni/script`
;
}
public
async
run
():
Promise
<
void
>
{
await
this
.
runGpuMetricsCollectorScript
();
while
(
!
this
.
stopping
)
{
try
{
this
.
gpuSummary
=
await
this
.
read
GPUSummary
();
await
this
.
update
GPUSummary
();
}
catch
(
error
)
{
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if
(
this
.
nvdmNotFoundRegex
.
test
(
error
))
{
break
;
}
}
await
delay
(
5000
);
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
cpp
.
exec
(
`mkdir -p
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
//generate gpu_metrics_collector.sh
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics_collector.sh
'
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT
,
this
.
gpuMetricCollectorScriptFolder
,
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
),
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
cp
.
exec
(
`bash
${
gpuMetricsCollectorScriptPath
}
`
);
}
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
).
map
((
info
:
GPUInfo
)
=>
info
.
index
);
...
...
@@ -321,51 +83,24 @@ class GPUScheduler {
return
[];
}
public
stop
()
:
void
{
public
async
stop
()
{
this
.
stopping
=
true
;
try
{
const
pid
:
string
=
await
fs
.
promises
.
readFile
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
),
'
utf8
'
);
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
await
cpp
.
exec
(
`rm -rf
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
}
catch
(
error
){
this
.
log
.
error
(
`GPU scheduler error:
${
error
}
`
);
}
private
generateEmbededGPUSummary
(
data
:
nodeNvidiaSmi
.
GPUInfo
)
:
GPUInfo
[]
{
let
gpuInfos
:
GPUInfo
[]
=
[];
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
assert
(
gpuNumber
>
0
);
if
(
gpuNumber
==
1
)
{
const
embededGPUSummary
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
.
push
(
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
else
{
const
embededGPUSummaryArray
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
[]
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
=
embededGPUSummaryArray
.
map
(
embededGPUSummary
=>
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
return
gpuInfos
;
}
private
convertGPUSummaryToInfo
(
embededGPUSummary
:
nodeNvidiaSmi
.
EmbededGPUSummary
)
:
GPUInfo
{
return
new
GPUInfo
(
typeof
embededGPUSummary
.
process
===
'
object
'
?
1
:
0
,
parseFloat
(
embededGPUSummary
.
utilization
.
memory_util
),
parseFloat
(
embededGPUSummary
.
utilization
.
gpu_util
),
parseInt
(
embededGPUSummary
.
minor_number
,
10
));
}
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
if
(
error
)
{
reject
(
error
);
private
async
updateGPUSummary
()
{
const
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)}
`
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
gpuNumber
,
Date
().
toString
(),
this
.
generateEmbededGPUSummary
(
data
)
);
resolve
(
gpuSummary
);
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
}
});
});
}
}
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
d95c3513
...
...
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
public
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
await
this
.
gpuScheduler
.
stop
();
}
return
super
.
cleanUp
();
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
d95c3513
...
...
@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --version '{12}' --log_collection '{13}'`
;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --
nni_manager_
version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
d95c3513
...
...
@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
private
paiRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
versionCheck
?
:
boolean
=
true
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
constructor
()
{
...
...
@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
this
.
log
.
info
(
'
Run PAI training service.
'
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`PAI Training service rest server listening on:
${
restServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
await
this
.
updatePaiToken
();
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
if
(
restServer
.
getErrorMessage
)
{
throw
new
Error
(
restServer
.
getErrorMessage
)
this
.
stopping
=
true
;
}
await
delay
(
3000
);
}
this
.
log
.
info
(
'
PAI training service exit.
'
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
d95c3513
...
...
@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --
nni_manager_
version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $?
\`
date +%s%3N
\`
>{3}`
;
export
const
GPU_COLLECTOR_FORMAT
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
d95c3513
...
...
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
GPU_COLLECTOR_FORMAT
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
...
@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
public
async
run
():
Promise
<
void
>
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
'
Run remote machine training service.
'
);
while
(
!
this
.
stopping
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
...
...
@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
}
}
if
(
restServer
.
getErrorMessage
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
}
await
delay
(
3000
);
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
...
...
@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_COLLECTOR_FORMAT
,
GPU_
INFO_
COLLECTOR_FORMAT
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
...
...
src/nni_manager/types/node-nvidia-smi/index.d.ts
deleted
100644 → 0
View file @
77526d37
declare
module
'
node-nvidia-smi
'
{
function
smi
(
callback
:
(
error
:
Error
,
data
:
smi
.
GPUInfo
)
=>
void
):
void
;
namespace
smi
{
interface
EmbededGPUSummary
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}
interface
GPUInfo
{
nvidia_smi_log
:
{
attached_gpus
:
string
;
gpu
:
EmbededGPUSummary
[]
|
EmbededGPUSummary
;
};
}
}
export
=
smi
;
}
\ No newline at end of file
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
View file @
d95c3513
...
...
@@ -143,7 +143,7 @@ class Bracket():
self
.
s_max
=
s_max
self
.
eta
=
eta
self
.
n
=
math
.
ceil
((
s_max
+
1
)
*
(
eta
**
s
)
/
(
s
+
1
)
-
_epsilon
)
# pylint: disable=invalid-name
self
.
r
=
math
.
ceil
(
R
/
eta
**
s
-
_epsilon
)
# pylint: disable=invalid-name
self
.
r
=
R
/
eta
**
s
# pylint: disable=invalid-name
self
.
i
=
0
self
.
hyper_configs
=
[]
# [ {id: params}, {}, ... ]
self
.
configs_perf
=
[]
# [ {id: [seq, acc]}, {}, ... ]
...
...
@@ -158,7 +158,7 @@ class Bracket():
def
get_n_r
(
self
):
"""return the values of n and r for the next round"""
return
math
.
floor
(
self
.
n
/
self
.
eta
**
self
.
i
+
_epsilon
),
self
.
r
*
self
.
eta
**
self
.
i
return
math
.
floor
(
self
.
n
/
self
.
eta
**
self
.
i
+
_epsilon
),
math
.
floor
(
self
.
r
*
self
.
eta
**
self
.
i
+
_epsilon
)
def
increase_i
(
self
):
"""i means the ith round. Increase i by 1"""
...
...
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
View file @
d95c3513
...
...
@@ -40,7 +40,7 @@ def create_model(samples_x, samples_y_aggregation,
regressor
=
gp
.
GaussianProcessRegressor
(
kernel
=
kernel
,
n_restarts_optimizer
=
n_restarts_optimizer
,
normalize_y
=
True
,
alpha
=
0
)
alpha
=
1e-1
0
)
regressor
.
fit
(
numpy
.
array
(
samples_x
),
numpy
.
array
(
samples_y_aggregation
))
model
=
{}
...
...
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
View file @
d95c3513
...
...
@@ -65,7 +65,7 @@ class MetisTuner(Tuner):
"""
def
__init__
(
self
,
optimize_mode
=
"maximize"
,
no_resampling
=
True
,
no_candidates
=
True
,
selection_num_starting_points
=
600
,
cold_start_num
=
10
,
exploration_probability
=
0.
1
):
selection_num_starting_points
=
600
,
cold_start_num
=
10
,
exploration_probability
=
0.
9
):
"""
Parameters
----------
...
...
@@ -126,13 +126,7 @@ class MetisTuner(Tuner):
for
key
in
search_space
:
key_type
=
search_space
[
key
][
'_type'
]
key_range
=
search_space
[
key
][
'_value'
]
try
:
idx
=
self
.
key_order
.
index
(
key
)
except
Exception
as
ex
:
logger
.
exception
(
ex
)
raise
RuntimeError
(
"The format search space contains
\
some key that didn't define in key_order."
)
if
key_type
==
'quniform'
:
if
key_range
[
2
]
==
1
:
self
.
x_bounds
[
idx
]
=
[
key_range
[
0
],
key_range
[
1
]]
...
...
@@ -271,7 +265,6 @@ class MetisTuner(Tuner):
samples_size_unique
=
len
(
samples_y
)
# ===== STEP 1: Compute the current optimum =====
#sys.stderr.write("[%s] Predicting the optimal configuration from the current training dataset...\n" % (os.path.basename(__file__)))
gp_model
=
gp_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
lm_current
=
gp_selection
.
selection
(
"lm"
,
...
...
@@ -291,8 +284,6 @@ class MetisTuner(Tuner):
'reason'
:
"exploitation_gp"
})
# ===== STEP 2: Get recommended configurations for exploration =====
#sys.stderr.write("[%s] Getting candidates for exploration...\n"
#% \(os.path.basename(__file__)))
results_exploration
=
gp_selection
.
selection
(
"lc"
,
samples_y_aggregation
,
...
...
@@ -309,15 +300,11 @@ class MetisTuner(Tuner):
'expected_sigma'
:
results_exploration
[
'expected_sigma'
],
'reason'
:
"exploration"
})
logger
.
info
(
"DEBUG: 1 exploration candidate selected
\n
"
)
#sys.stderr.write("[%s] DEBUG: 1 exploration candidate selected\n" % (os.path.basename(__file__)))
else
:
logger
.
info
(
"DEBUG: No suitable exploration candidates were"
)
# sys.stderr.write("[%s] DEBUG: No suitable exploration candidates were \
# found\n" % (os.path.basename(__file__)))
# ===== STEP 3: Get recommended configurations for exploitation =====
if
samples_size_all
>=
threshold_samplessize_exploitation
:
#sys.stderr.write("[%s] Getting candidates for exploitation...\n" % (os.path.basename(__file__)))
print
(
"Getting candidates for exploitation...
\n
"
)
try
:
gmm
=
gmm_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
...
...
@@ -385,13 +372,6 @@ class MetisTuner(Tuner):
temp_improvement
=
threads_result
[
'expected_lowest_mu'
]
-
lm_current
[
'expected_mu'
]
if
next_improvement
>
temp_improvement
:
logger
.
info
(
"DEBUG:
\"
next_candidate
\"
changed:
\
lowest mu might reduce from %f (%s) to %f (%s), %s
\n
"
%
\
lm_current
[
'expected_mu'
],
str
(
lm_current
[
'hyperparameter'
]),
\
threads_result
[
'expected_lowest_mu'
],
\
str
(
threads_result
[
'candidate'
][
'hyperparameter'
]),
\
threads_result
[
'candidate'
][
'reason'
])
next_improvement
=
temp_improvement
next_candidate
=
threads_result
[
'candidate'
]
else
:
...
...
@@ -415,7 +395,7 @@ class MetisTuner(Tuner):
if
next_candidate
is
not
None
:
outputs
=
self
.
_pack_output
(
next_candidate
[
'hyperparameter'
])
else
:
random_parameter
=
_rand_init
(
self
.
x_bounds
,
self
.
x_types
,
1
)[
0
]
random_parameter
=
_rand_init
(
x_bounds
,
x_types
,
1
)[
0
]
outputs
=
self
.
_pack_output
(
random_parameter
)
self
.
history_parameters
.
append
(
outputs
)
return
outputs
...
...
@@ -443,8 +423,6 @@ def _rand_with_constraints(x_bounds, x_types):
def
_calculate_lowest_mu_threaded
(
inputs
):
[
candidate
,
samples_x
,
samples_y
,
x_bounds
,
x_types
,
minimize_constraints_fun
,
minimize_starting_points
]
=
inputs
sys
.
stderr
.
write
(
"[%s] Evaluating information gain of %s (%s)...
\n
"
%
\
(
os
.
path
.
basename
(
__file__
),
candidate
[
'hyperparameter'
],
candidate
[
'reason'
]))
outputs
=
{
"candidate"
:
candidate
,
"expected_lowest_mu"
:
None
}
for
expected_mu
in
[
candidate
[
'expected_mu'
]
+
1.96
*
candidate
[
'expected_sigma'
],
...
...
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
View file @
d95c3513
...
...
@@ -254,7 +254,7 @@ class StubConv(StubWeightBiasLayer):
keras_layer
.
set_weights
((
self
.
weights
[
0
].
T
,
self
.
weights
[
1
]))
def
size
(
self
):
return
self
.
filters
*
self
.
kernel_size
*
self
.
kernel_size
+
self
.
filters
return
(
self
.
input_channel
*
self
.
kernel_size
*
self
.
kernel_size
+
1
)
*
self
.
filters
@
abstractmethod
def
to_real_layer
(
self
):
...
...
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
View file @
d95c3513
...
...
@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
Returns
-------
dict
challenger
dict
dict which stores copy of
challenger
s
"""
converted_dict
=
{}
for
key
,
value
in
challenger_dict
.
items
():
# convert to loguniform
if
key
in
self
.
loguniform_key
:
c
hallenger
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
c
onverted
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
# convert categorical back to original value
if
key
in
self
.
categorical_dict
:
el
if
key
in
self
.
categorical_dict
:
idx
=
challenger_dict
[
key
]
challenger_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
return
challenger_dict
converted_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
else
:
converted_dict
[
key
]
=
value
return
converted_dict
def
generate_parameters
(
self
,
parameter_id
):
"""generate one instance of hyperparameters
...
...
@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
if
self
.
first_one
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
parameter_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
())
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
for
challenger
in
challengers
:
self
.
total_data
[
parameter_id
]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
())
def
generate_multiple_parameters
(
self
,
parameter_id_list
):
...
...
@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
for
one_id
in
parameter_id_list
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
one_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
()))
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
...
...
@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
if
cnt
>=
len
(
parameter_id_list
):
break
self
.
total_data
[
parameter_id_list
[
cnt
]]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
()))
cnt
+=
1
return
params
src/webui/src/components/Overview.tsx
View file @
d95c3513
...
...
@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
interface
OverviewState
{
tableData
:
Array
<
TableObj
>
;
experimentAPI
:
object
;
searchSpace
:
object
;
status
:
string
;
errorStr
:
string
;
...
...
@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
super
(
props
);
this
.
state
=
{
searchSpace
:
{},
experimentAPI
:
{},
status
:
''
,
errorStr
:
''
,
trialProfile
:
{
...
...
@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
});
if
(
this
.
_isMounted
)
{
this
.
setState
({
experimentAPI
:
res
.
data
,
trialProfile
:
trialPro
[
0
],
searchSpace
:
searchSpace
,
isLogCollection
:
expLogCollection
...
...
@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
const
{
trialProfile
,
searchSpace
,
tableData
,
accuracyData
,
accNodata
,
status
,
errorStr
,
trialNumber
,
bestAccuracy
,
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
,
experimentAPI
}
=
this
.
state
;
return
(
...
...
@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
<
Row
className
=
"experiment"
>
{
/* the scroll bar all the trial profile in the searchSpace div*/
}
<
div
className
=
"experiment searchSpace"
>
<
TrialPro
tiralProInfo
=
{
trialProfile
}
/>
<
TrialPro
experiment
=
{
experimentAPI
}
/>
</
div
>
</
Row
>
</
Col
>
...
...
src/webui/src/components/overview/TrialProfile.tsx
View file @
d95c3513
import
*
as
React
from
'
react
'
;
import
{
Experiment
}
from
'
../../static/interface
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
interface
TrialInfoProps
{
tiralProInfo
:
Experimen
t
;
experiment
:
objec
t
;
}
class
TrialInfo
extends
React
.
Component
<
TrialInfoProps
,
{}
>
{
...
...
@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
super
(
props
);
}
render
()
{
const
{
tiralProInfo
}
=
this
.
props
;
const
showProInfo
=
[];
showProInfo
.
push
({
revision
:
tiralProInfo
.
revision
,
authorName
:
tiralProInfo
.
author
,
trialConcurrency
:
tiralProInfo
.
runConcurren
,
tuner
:
tiralProInfo
.
tuner
,
assessor
:
tiralProInfo
.
assessor
?
tiralProInfo
.
assessor
:
undefined
,
logCollection
:
tiralProInfo
.
logCollection
?
tiralProInfo
.
logCollection
:
undefined
,
advisor
:
tiralProInfo
.
advisor
?
tiralProInfo
.
advisor
:
undefined
,
clusterMetaData
:
tiralProInfo
.
clusterMetaData
?
tiralProInfo
.
clusterMetaData
:
undefined
componentWillReceiveProps
(
nextProps
:
TrialInfoProps
)
{
const
experiments
=
nextProps
.
experiment
;
Object
.
keys
(
experiments
).
map
(
key
=>
{
switch
(
key
)
{
case
'
id
'
:
case
'
logDir
'
:
case
'
startTime
'
:
case
'
endTime
'
:
experiments
[
key
]
=
undefined
;
break
;
case
'
params
'
:
const
params
=
experiments
[
key
];
Object
.
keys
(
params
).
map
(
item
=>
{
if
(
item
===
'
experimentName
'
||
item
===
'
searchSpace
'
||
item
===
'
trainingServicePlatform
'
)
{
params
[
item
]
=
undefined
;
}
});
break
;
default
:
}
});
}
render
()
{
const
{
experiment
}
=
this
.
props
;
return
(
<
div
className
=
"profile"
>
<
MonacoEditor
...
...
@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
height
=
"380"
language
=
"json"
theme
=
"vs-light"
value
=
{
JSON
.
stringify
(
showProInfo
[
0
]
,
null
,
2
)
}
value
=
{
JSON
.
stringify
(
experiment
,
null
,
2
)
}
options
=
{
MONACO
}
/>
</
div
>
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment