Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
d95c3513
Unverified
Commit
d95c3513
authored
Apr 03, 2019
by
SparkSnail
Committed by
GitHub
Apr 03, 2019
Browse files
Merge pull request #155 from Microsoft/master
merge master
parents
77526d37
e7d31abd
Changes
70
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
168 additions
and
406 deletions
+168
-406
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+41
-0
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-0
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+1
-1
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+41
-306
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+2
-2
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+1
-1
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+6
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+1
-9
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+8
-3
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+0
-23
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
+2
-2
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
+1
-1
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
+3
-25
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
+1
-1
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
+8
-9
src/webui/src/components/Overview.tsx
src/webui/src/components/Overview.tsx
+5
-4
src/webui/src/components/overview/TrialProfile.tsx
src/webui/src/components/overview/TrialProfile.tsx
+28
-16
No files found.
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
d95c3513
...
@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
private
readonly
expId
:
string
=
getExperimentId
();
private
readonly
expId
:
string
=
getExperimentId
();
private
enableVersionCheck
:
boolean
=
true
;
//switch to enable version check
private
versionCheckSuccess
:
boolean
|
undefined
;
private
errorMessage
?:
string
;
/**
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
*/
...
@@ -58,6 +62,14 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -58,6 +62,14 @@ export abstract class ClusterJobRestServer extends RestServer{
}
}
return
this
.
port
;
return
this
.
port
;
}
}
/**
 * Accessor for the error message stored on this rest server.
 * Yields undefined while no error message has been recorded.
 */
public get getErrorMessage(): string | undefined {
    const recordedMessage: string | undefined = this.errorMessage;

    return recordedMessage;
}
/**
 * Accessor that switches the version check on or off.
 * The flag is read by the /version and /stdout request handlers.
 */
public set setEnableVersionCheck(versionCheck: boolean) {
    const enabled: boolean = versionCheck;
    this.enableVersionCheck = enabled;
}
/**
/**
* NNIRestServer's own router registration
* NNIRestServer's own router registration
...
@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
next
();
next
();
});
});
// POST /version/<expId>/:trialId — receives a version check result.
// body.tag === 'VCSuccess' marks a successful check; otherwise body.msg is stored as the error message.
// Exactly one response is sent per request: 500 on failure, an empty 200 otherwise.
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
    if (!this.enableVersionCheck) {
        this.log.info(`Skipping version check!`);
        res.send();

        return;
    }
    try {
        const checkResultSuccess: boolean = req.body.tag === 'VCSuccess';
        if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
            // A result that contradicts the one already recorded is treated as an error.
            this.errorMessage = 'Version check error, version check result is inconsistent!';
            this.log.error(this.errorMessage);
        } else if (checkResultSuccess) {
            this.log.info(`Version check in trialKeeper success!`);
            this.versionCheckSuccess = true;
        } else {
            this.versionCheckSuccess = false;
            this.errorMessage = req.body.msg;
        }
    } catch (err) {
        this.log.error(`json parse metrics error: ${err}`);
        res.status(500);
        res.send(err.message);

        // Stop here: falling through to the final res.send() would send a second response.
        return;
    }
    res.send();
});
router
.
post
(
`/update-metrics/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
post
(
`/update-metrics/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
try
{
try
{
this
.
log
.
info
(
`Get update-metrics request, trial job id is
${
req
.
params
.
trialId
}
`
);
this
.
log
.
info
(
`Get update-metrics request, trial job id is
${
req
.
params
.
trialId
}
`
);
...
@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
...
@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
});
});
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
&&
!
this
.
versionCheckSuccess
&&
!
this
.
errorMessage
)
{
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+
`NNIManager and TrialKeeper!`
}
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
try
{
try
{
let
skipLogging
:
boolean
=
false
;
let
skipLogging
:
boolean
=
false
;
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
d95c3513
...
@@ -58,3 +58,11 @@ export class GPUSummary {
...
@@ -58,3 +58,11 @@ export class GPUSummary {
this
.
gpuInfos
=
gpuInfos
;
this
.
gpuInfos
=
gpuInfos
;
}
}
}
}
/**
 * Shell script template for the GPU metrics collector.
 * Placeholder {0} is exported as METRIC_OUTPUT_DIR; placeholder {1} is the file
 * that receives the script's own process id ($$).
 */
export const GPU_INFO_COLLECTOR_FORMAT: string = `
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`;
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
d95c3513
...
@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
...
@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
}
await
this
.
kubernetesJobRestServer
.
start
();
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
// Abort the polling loop once the rest server has recorded an error message.
// The stopping flag is set before throwing because a statement placed after `throw` never runs.
if (this.kubernetesJobRestServer.getErrorMessage) {
    this.stopping = true;
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
}
}
}
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
d95c3513
...
@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
...
@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
}
await
this
.
kubernetesJobRestServer
.
start
();
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
// Abort the polling loop once the rest server has recorded an error message.
// The stopping flag is set before throwing because a statement placed after `throw` never runs.
if (this.kubernetesJobRestServer.getErrorMessage) {
    this.stopping = true;
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
}
}
this
.
log
.
info
(
'
Kubeflow training service exit.
'
);
this
.
log
.
info
(
'
Kubeflow training service exit.
'
);
}
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
d95c3513
...
@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
...
@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --version '{11}' --log_collection '{12}'`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --
nni_manager_
version '{11}' --log_collection '{12}'`
+
`1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
+
`1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
d95c3513
...
@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
...
@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
protected
kubernetesCRDClient
?:
KubernetesCRDClient
;
protected
kubernetesCRDClient
?:
KubernetesCRDClient
;
protected
kubernetesJobRestServer
?:
KubernetesJobRestServer
;
protected
kubernetesJobRestServer
?:
KubernetesJobRestServer
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
?
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
protected
logCollection
:
string
;
constructor
()
{
constructor
()
{
...
...
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
d95c3513
...
@@ -19,268 +19,16 @@
...
@@ -19,268 +19,16 @@
'
use strict
'
;
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
*
as
cp
from
'
child_process
'
;
/* Example of nvidia-smi result
import
*
as
cpp
from
'
child-process-promise
'
;
{
import
*
as
path
from
'
path
'
;
"nvidia_smi_log": {
import
*
as
os
from
'
os
'
;
"timestamp": "Fri Jul 13 15:17:27 2018",
import
*
as
fs
from
'
fs
'
;
"driver_version": "396.26",
import
{
String
}
from
'
typescript-string-operations
'
;
"attached_gpus": "8",
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
/**
/**
* GPUScheduler
* GPUScheduler
...
@@ -290,29 +38,43 @@ class GPUScheduler {
...
@@ -290,29 +38,43 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
private
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
constructor
()
{
this
.
stopping
=
false
;
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
nvdmNotFoundRegex
=
/nvidia-smi: not found/gi
;
this
.
gpuMetricCollectorScriptFolder
=
`
${
os
.
tmpdir
()}
/nni/script`
;
}
}
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
await
this
.
runGpuMetricsCollectorScript
();
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
try
{
try
{
this
.
gpuSummary
=
await
this
.
read
GPUSummary
();
await
this
.
update
GPUSummary
();
}
catch
(
error
)
{
}
catch
(
error
)
{
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if
(
this
.
nvdmNotFoundRegex
.
test
(
error
))
{
break
;
}
}
}
await
delay
(
5000
);
await
delay
(
5000
);
}
}
}
}
/**
 * Write gpu_metrics_collector.sh into the GPU metric collector script folder
 * and launch it with bash.
 * The script text is GPU_INFO_COLLECTOR_FORMAT filled with the script folder
 * and the path of the 'pid' file inside that folder.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    const scriptFolder: string = this.gpuMetricCollectorScriptFolder;
    await cpp.exec(`mkdir -p ${scriptFolder}`);

    // generate gpu_metrics_collector.sh
    const scriptPath: string = path.join(scriptFolder, 'gpu_metrics_collector.sh');
    const pidFilePath: string = path.join(scriptFolder, 'pid');
    const scriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        scriptFolder,
        pidFilePath
    );
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });

    // Launched without awaiting the result; stop() later reads the pid file to kill it.
    cp.exec(`bash ${scriptPath}`);
}
public
getAvailableGPUIndices
():
number
[]
{
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
).
map
((
info
:
GPUInfo
)
=>
info
.
index
);
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
).
map
((
info
:
GPUInfo
)
=>
info
.
index
);
...
@@ -321,51 +83,24 @@ class GPUScheduler {
...
@@ -321,51 +83,24 @@ class GPUScheduler {
return
[];
return
[];
}
}
public
stop
()
:
void
{
public
async
stop
()
{
this
.
stopping
=
true
;
this
.
stopping
=
true
;
try
{
const
pid
:
string
=
await
fs
.
promises
.
readFile
(
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
pid
'
),
'
utf8
'
);
await
cpp
.
exec
(
`pkill -P
${
pid
}
`
);
await
cpp
.
exec
(
`rm -rf
${
this
.
gpuMetricCollectorScriptFolder
}
`
);
}
catch
(
error
){
this
.
log
.
error
(
`GPU scheduler error:
${
error
}
`
);
}
}
}
private
async
updateGPUSummary
()
{
private
generateEmbededGPUSummary
(
data
:
nodeNvidiaSmi
.
GPUInfo
)
:
GPUInfo
[]
{
const
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)}
`
);
let
gpuInfos
:
GPUInfo
[]
=
[];
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
assert
(
gpuNumber
>
0
);
if
(
gpuNumber
==
1
)
{
const
embededGPUSummary
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
.
push
(
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
else
{
}
else
{
const
embededGPUSummaryArray
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
[]
>
data
.
nvidia_smi_log
.
gpu
;
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
gpuInfos
=
embededGPUSummaryArray
.
map
(
embededGPUSummary
=>
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
}
}
return
gpuInfos
;
}
private
convertGPUSummaryToInfo
(
embededGPUSummary
:
nodeNvidiaSmi
.
EmbededGPUSummary
)
:
GPUInfo
{
return
new
GPUInfo
(
typeof
embededGPUSummary
.
process
===
'
object
'
?
1
:
0
,
parseFloat
(
embededGPUSummary
.
utilization
.
memory_util
),
parseFloat
(
embededGPUSummary
.
utilization
.
gpu_util
),
parseInt
(
embededGPUSummary
.
minor_number
,
10
));
}
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
if
(
error
)
{
reject
(
error
);
}
else
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
gpuNumber
,
Date
().
toString
(),
this
.
generateEmbededGPUSummary
(
data
)
);
resolve
(
gpuSummary
);
}
});
});
}
}
}
}
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
d95c3513
...
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
...
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
}
}
public
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
await
this
.
gpuScheduler
.
stop
();
}
}
return
super
.
cleanUp
();
return
super
.
cleanUp
();
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
d95c3513
...
@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
...
@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --version '{12}' --log_collection '{13}'`
;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --
nni_manager_
version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
`hdfs://{0}:9000/`
;
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
d95c3513
...
@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
...
@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
private
paiRestServerPort
?:
number
;
private
paiRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
versionCheck
?
:
boolean
=
true
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
private
logCollection
:
string
;
constructor
()
{
constructor
()
{
...
@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
...
@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
this
.
log
.
info
(
'
Run PAI training service.
'
);
this
.
log
.
info
(
'
Run PAI training service.
'
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
await
restServer
.
start
();
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`PAI Training service rest server listening on:
${
restServer
.
endPoint
}
`
);
this
.
log
.
info
(
`PAI Training service rest server listening on:
${
restServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
await
this
.
updatePaiToken
();
await
this
.
updatePaiToken
();
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
// Abort the polling loop once the rest server has recorded an error message.
// The stopping flag is set before throwing because a statement placed after `throw` never runs.
if (restServer.getErrorMessage) {
    this.stopping = true;
    throw new Error(restServer.getErrorMessage);
}
await
delay
(
3000
);
await
delay
(
3000
);
}
}
this
.
log
.
info
(
'
PAI training service exit.
'
);
this
.
log
.
info
(
'
PAI training service exit.
'
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
d95c3513
...
@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
...
@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
sh install_nni.sh
echo $$ >{6}
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --
nni_manager_
version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
@@ -259,11 +259,3 @@ cd {0}
...
@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1}
echo $$ >{1}
eval {2} >stdout 2>stderr
eval {2} >stdout 2>stderr
echo $?
\`
date +%s%3N
\`
>{3}`
;
echo $?
\`
date +%s%3N
\`
>{3}`
;
export
const
GPU_COLLECTOR_FORMAT
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
d95c3513
...
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
...
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import
{
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
GPU_COLLECTOR_FORMAT
}
from
'
./remoteMachineData
'
;
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
await
restServer
.
start
();
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
'
Run remote machine training service.
'
);
this
.
log
.
info
(
'
Run remote machine training service.
'
);
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
...
@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
break
;
}
}
}
}
// Abort the scheduling loop once the rest server has recorded an error message.
// The stopping flag is set before throwing because a statement placed after `throw` never runs.
if (restServer.getErrorMessage) {
    this.stopping = true;
    throw new Error(restServer.getErrorMessage);
}
await
delay
(
3000
);
await
delay
(
3000
);
}
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
...
@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_COLLECTOR_FORMAT
,
GPU_
INFO_
COLLECTOR_FORMAT
,
remoteGPUScriptsDir
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
);
...
...
src/nni_manager/types/node-nvidia-smi/index.d.ts
deleted
100644 → 0
View file @
77526d37
declare
module
'
node-nvidia-smi
'
{
function
smi
(
callback
:
(
error
:
Error
,
data
:
smi
.
GPUInfo
)
=>
void
):
void
;
namespace
smi
{
interface
EmbededGPUSummary
{
minor_number
:
string
;
utilization
:
{
gpu_util
:
string
;
memory_util
:
string
;
};
process
:
string
|
object
;
}
interface
GPUInfo
{
nvidia_smi_log
:
{
attached_gpus
:
string
;
gpu
:
EmbededGPUSummary
[]
|
EmbededGPUSummary
;
};
}
}
export
=
smi
;
}
\ No newline at end of file
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py
View file @
d95c3513
...
@@ -143,7 +143,7 @@ class Bracket():
...
@@ -143,7 +143,7 @@ class Bracket():
self
.
s_max
=
s_max
self
.
s_max
=
s_max
self
.
eta
=
eta
self
.
eta
=
eta
self
.
n
=
math
.
ceil
((
s_max
+
1
)
*
(
eta
**
s
)
/
(
s
+
1
)
-
_epsilon
)
# pylint: disable=invalid-name
self
.
n
=
math
.
ceil
((
s_max
+
1
)
*
(
eta
**
s
)
/
(
s
+
1
)
-
_epsilon
)
# pylint: disable=invalid-name
self
.
r
=
math
.
ceil
(
R
/
eta
**
s
-
_epsilon
)
# pylint: disable=invalid-name
self
.
r
=
R
/
eta
**
s
# pylint: disable=invalid-name
self
.
i
=
0
self
.
i
=
0
self
.
hyper_configs
=
[]
# [ {id: params}, {}, ... ]
self
.
hyper_configs
=
[]
# [ {id: params}, {}, ... ]
self
.
configs_perf
=
[]
# [ {id: [seq, acc]}, {}, ... ]
self
.
configs_perf
=
[]
# [ {id: [seq, acc]}, {}, ... ]
...
@@ -158,7 +158,7 @@ class Bracket():
...
@@ -158,7 +158,7 @@ class Bracket():
def
get_n_r
(
self
):
def
get_n_r
(
self
):
"""return the values of n and r for the next round"""
"""return the values of n and r for the next round"""
return
math
.
floor
(
self
.
n
/
self
.
eta
**
self
.
i
+
_epsilon
),
self
.
r
*
self
.
eta
**
self
.
i
return
math
.
floor
(
self
.
n
/
self
.
eta
**
self
.
i
+
_epsilon
),
math
.
floor
(
self
.
r
*
self
.
eta
**
self
.
i
+
_epsilon
)
def
increase_i
(
self
):
def
increase_i
(
self
):
"""i means the ith round. Increase i by 1"""
"""i means the ith round. Increase i by 1"""
...
...
src/sdk/pynni/nni/metis_tuner/Regression_GP/CreateModel.py
View file @
d95c3513
...
@@ -40,7 +40,7 @@ def create_model(samples_x, samples_y_aggregation,
...
@@ -40,7 +40,7 @@ def create_model(samples_x, samples_y_aggregation,
regressor
=
gp
.
GaussianProcessRegressor
(
kernel
=
kernel
,
regressor
=
gp
.
GaussianProcessRegressor
(
kernel
=
kernel
,
n_restarts_optimizer
=
n_restarts_optimizer
,
n_restarts_optimizer
=
n_restarts_optimizer
,
normalize_y
=
True
,
normalize_y
=
True
,
alpha
=
0
)
alpha
=
1e-1
0
)
regressor
.
fit
(
numpy
.
array
(
samples_x
),
numpy
.
array
(
samples_y_aggregation
))
regressor
.
fit
(
numpy
.
array
(
samples_x
),
numpy
.
array
(
samples_y_aggregation
))
model
=
{}
model
=
{}
...
...
src/sdk/pynni/nni/metis_tuner/metis_tuner.py
View file @
d95c3513
...
@@ -65,7 +65,7 @@ class MetisTuner(Tuner):
...
@@ -65,7 +65,7 @@ class MetisTuner(Tuner):
"""
"""
def
__init__
(
self
,
optimize_mode
=
"maximize"
,
no_resampling
=
True
,
no_candidates
=
True
,
def
__init__
(
self
,
optimize_mode
=
"maximize"
,
no_resampling
=
True
,
no_candidates
=
True
,
selection_num_starting_points
=
600
,
cold_start_num
=
10
,
exploration_probability
=
0.
1
):
selection_num_starting_points
=
600
,
cold_start_num
=
10
,
exploration_probability
=
0.
9
):
"""
"""
Parameters
Parameters
----------
----------
...
@@ -126,13 +126,7 @@ class MetisTuner(Tuner):
...
@@ -126,13 +126,7 @@ class MetisTuner(Tuner):
for
key
in
search_space
:
for
key
in
search_space
:
key_type
=
search_space
[
key
][
'_type'
]
key_type
=
search_space
[
key
][
'_type'
]
key_range
=
search_space
[
key
][
'_value'
]
key_range
=
search_space
[
key
][
'_value'
]
try
:
idx
=
self
.
key_order
.
index
(
key
)
idx
=
self
.
key_order
.
index
(
key
)
except
Exception
as
ex
:
logger
.
exception
(
ex
)
raise
RuntimeError
(
"The format search space contains
\
some key that didn't define in key_order."
)
if
key_type
==
'quniform'
:
if
key_type
==
'quniform'
:
if
key_range
[
2
]
==
1
:
if
key_range
[
2
]
==
1
:
self
.
x_bounds
[
idx
]
=
[
key_range
[
0
],
key_range
[
1
]]
self
.
x_bounds
[
idx
]
=
[
key_range
[
0
],
key_range
[
1
]]
...
@@ -271,7 +265,6 @@ class MetisTuner(Tuner):
...
@@ -271,7 +265,6 @@ class MetisTuner(Tuner):
samples_size_unique
=
len
(
samples_y
)
samples_size_unique
=
len
(
samples_y
)
# ===== STEP 1: Compute the current optimum =====
# ===== STEP 1: Compute the current optimum =====
#sys.stderr.write("[%s] Predicting the optimal configuration from the current training dataset...\n" % (os.path.basename(__file__)))
gp_model
=
gp_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
gp_model
=
gp_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
lm_current
=
gp_selection
.
selection
(
lm_current
=
gp_selection
.
selection
(
"lm"
,
"lm"
,
...
@@ -291,8 +284,6 @@ class MetisTuner(Tuner):
...
@@ -291,8 +284,6 @@ class MetisTuner(Tuner):
'reason'
:
"exploitation_gp"
})
'reason'
:
"exploitation_gp"
})
# ===== STEP 2: Get recommended configurations for exploration =====
# ===== STEP 2: Get recommended configurations for exploration =====
#sys.stderr.write("[%s] Getting candidates for exploration...\n"
#% \(os.path.basename(__file__)))
results_exploration
=
gp_selection
.
selection
(
results_exploration
=
gp_selection
.
selection
(
"lc"
,
"lc"
,
samples_y_aggregation
,
samples_y_aggregation
,
...
@@ -309,15 +300,11 @@ class MetisTuner(Tuner):
...
@@ -309,15 +300,11 @@ class MetisTuner(Tuner):
'expected_sigma'
:
results_exploration
[
'expected_sigma'
],
'expected_sigma'
:
results_exploration
[
'expected_sigma'
],
'reason'
:
"exploration"
})
'reason'
:
"exploration"
})
logger
.
info
(
"DEBUG: 1 exploration candidate selected
\n
"
)
logger
.
info
(
"DEBUG: 1 exploration candidate selected
\n
"
)
#sys.stderr.write("[%s] DEBUG: 1 exploration candidate selected\n" % (os.path.basename(__file__)))
else
:
else
:
logger
.
info
(
"DEBUG: No suitable exploration candidates were"
)
logger
.
info
(
"DEBUG: No suitable exploration candidates were"
)
# sys.stderr.write("[%s] DEBUG: No suitable exploration candidates were \
# found\n" % (os.path.basename(__file__)))
# ===== STEP 3: Get recommended configurations for exploitation =====
# ===== STEP 3: Get recommended configurations for exploitation =====
if
samples_size_all
>=
threshold_samplessize_exploitation
:
if
samples_size_all
>=
threshold_samplessize_exploitation
:
#sys.stderr.write("[%s] Getting candidates for exploitation...\n" % (os.path.basename(__file__)))
print
(
"Getting candidates for exploitation...
\n
"
)
print
(
"Getting candidates for exploitation...
\n
"
)
try
:
try
:
gmm
=
gmm_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
gmm
=
gmm_create_model
.
create_model
(
samples_x
,
samples_y_aggregation
)
...
@@ -385,13 +372,6 @@ class MetisTuner(Tuner):
...
@@ -385,13 +372,6 @@ class MetisTuner(Tuner):
temp_improvement
=
threads_result
[
'expected_lowest_mu'
]
-
lm_current
[
'expected_mu'
]
temp_improvement
=
threads_result
[
'expected_lowest_mu'
]
-
lm_current
[
'expected_mu'
]
if
next_improvement
>
temp_improvement
:
if
next_improvement
>
temp_improvement
:
logger
.
info
(
"DEBUG:
\"
next_candidate
\"
changed:
\
lowest mu might reduce from %f (%s) to %f (%s), %s
\n
"
%
\
lm_current
[
'expected_mu'
],
str
(
lm_current
[
'hyperparameter'
]),
\
threads_result
[
'expected_lowest_mu'
],
\
str
(
threads_result
[
'candidate'
][
'hyperparameter'
]),
\
threads_result
[
'candidate'
][
'reason'
])
next_improvement
=
temp_improvement
next_improvement
=
temp_improvement
next_candidate
=
threads_result
[
'candidate'
]
next_candidate
=
threads_result
[
'candidate'
]
else
:
else
:
...
@@ -415,7 +395,7 @@ class MetisTuner(Tuner):
...
@@ -415,7 +395,7 @@ class MetisTuner(Tuner):
if
next_candidate
is
not
None
:
if
next_candidate
is
not
None
:
outputs
=
self
.
_pack_output
(
next_candidate
[
'hyperparameter'
])
outputs
=
self
.
_pack_output
(
next_candidate
[
'hyperparameter'
])
else
:
else
:
random_parameter
=
_rand_init
(
self
.
x_bounds
,
self
.
x_types
,
1
)[
0
]
random_parameter
=
_rand_init
(
x_bounds
,
x_types
,
1
)[
0
]
outputs
=
self
.
_pack_output
(
random_parameter
)
outputs
=
self
.
_pack_output
(
random_parameter
)
self
.
history_parameters
.
append
(
outputs
)
self
.
history_parameters
.
append
(
outputs
)
return
outputs
return
outputs
...
@@ -443,8 +423,6 @@ def _rand_with_constraints(x_bounds, x_types):
...
@@ -443,8 +423,6 @@ def _rand_with_constraints(x_bounds, x_types):
def
_calculate_lowest_mu_threaded
(
inputs
):
def
_calculate_lowest_mu_threaded
(
inputs
):
[
candidate
,
samples_x
,
samples_y
,
x_bounds
,
x_types
,
minimize_constraints_fun
,
minimize_starting_points
]
=
inputs
[
candidate
,
samples_x
,
samples_y
,
x_bounds
,
x_types
,
minimize_constraints_fun
,
minimize_starting_points
]
=
inputs
sys
.
stderr
.
write
(
"[%s] Evaluating information gain of %s (%s)...
\n
"
%
\
(
os
.
path
.
basename
(
__file__
),
candidate
[
'hyperparameter'
],
candidate
[
'reason'
]))
outputs
=
{
"candidate"
:
candidate
,
"expected_lowest_mu"
:
None
}
outputs
=
{
"candidate"
:
candidate
,
"expected_lowest_mu"
:
None
}
for
expected_mu
in
[
candidate
[
'expected_mu'
]
+
1.96
*
candidate
[
'expected_sigma'
],
for
expected_mu
in
[
candidate
[
'expected_mu'
]
+
1.96
*
candidate
[
'expected_sigma'
],
...
...
src/sdk/pynni/nni/networkmorphism_tuner/layers.py
View file @
d95c3513
...
@@ -254,7 +254,7 @@ class StubConv(StubWeightBiasLayer):
...
@@ -254,7 +254,7 @@ class StubConv(StubWeightBiasLayer):
keras_layer
.
set_weights
((
self
.
weights
[
0
].
T
,
self
.
weights
[
1
]))
keras_layer
.
set_weights
((
self
.
weights
[
0
].
T
,
self
.
weights
[
1
]))
def
size
(
self
):
def
size
(
self
):
return
self
.
filters
*
self
.
kernel_size
*
self
.
kernel_size
+
self
.
filters
return
(
self
.
input_channel
*
self
.
kernel_size
*
self
.
kernel_size
+
1
)
*
self
.
filters
@
abstractmethod
@
abstractmethod
def
to_real_layer
(
self
):
def
to_real_layer
(
self
):
...
...
src/sdk/pynni/nni/smac_tuner/smac_tuner.py
View file @
d95c3513
...
@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
...
@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
Returns
Returns
-------
-------
dict
dict
challenger
dict
dict which stores copy of
challenger
s
"""
"""
converted_dict
=
{}
for
key
,
value
in
challenger_dict
.
items
():
for
key
,
value
in
challenger_dict
.
items
():
# convert to loguniform
# convert to loguniform
if
key
in
self
.
loguniform_key
:
if
key
in
self
.
loguniform_key
:
c
hallenger
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
c
onverted
_dict
[
key
]
=
np
.
exp
(
challenger_dict
[
key
])
# convert categorical back to original value
# convert categorical back to original value
if
key
in
self
.
categorical_dict
:
el
if
key
in
self
.
categorical_dict
:
idx
=
challenger_dict
[
key
]
idx
=
challenger_dict
[
key
]
challenger_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
converted_dict
[
key
]
=
self
.
categorical_dict
[
key
][
idx
]
return
challenger_dict
else
:
converted_dict
[
key
]
=
value
return
converted_dict
def
generate_parameters
(
self
,
parameter_id
):
def
generate_parameters
(
self
,
parameter_id
):
"""generate one instance of hyperparameters
"""generate one instance of hyperparameters
...
@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
...
@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
if
self
.
first_one
:
if
self
.
first_one
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
parameter_id
]
=
init_challenger
self
.
total_data
[
parameter_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
())
else
:
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
for
challenger
in
challengers
:
for
challenger
in
challengers
:
self
.
total_data
[
parameter_id
]
=
challenger
self
.
total_data
[
parameter_id
]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
())
return
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
())
def
generate_multiple_parameters
(
self
,
parameter_id_list
):
def
generate_multiple_parameters
(
self
,
parameter_id_list
):
...
@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
...
@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
for
one_id
in
parameter_id_list
:
for
one_id
in
parameter_id_list
:
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
init_challenger
=
self
.
smbo_solver
.
nni_smac_start
()
self
.
total_data
[
one_id
]
=
init_challenger
self
.
total_data
[
one_id
]
=
init_challenger
json_tricks
.
dumps
(
init_challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
()))
params
.
append
(
self
.
convert_loguniform_categorical
(
init_challenger
.
get_dictionary
()))
else
:
else
:
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
challengers
=
self
.
smbo_solver
.
nni_smac_request_challengers
()
...
@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
...
@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
if
cnt
>=
len
(
parameter_id_list
):
if
cnt
>=
len
(
parameter_id_list
):
break
break
self
.
total_data
[
parameter_id_list
[
cnt
]]
=
challenger
self
.
total_data
[
parameter_id_list
[
cnt
]]
=
challenger
json_tricks
.
dumps
(
challenger
.
get_dictionary
())
params
.
append
(
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
()))
params
.
append
(
self
.
convert_loguniform_categorical
(
challenger
.
get_dictionary
()))
cnt
+=
1
cnt
+=
1
return
params
return
params
src/webui/src/components/Overview.tsx
View file @
d95c3513
...
@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
...
@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
interface
OverviewState
{
interface
OverviewState
{
tableData
:
Array
<
TableObj
>
;
tableData
:
Array
<
TableObj
>
;
experimentAPI
:
object
;
searchSpace
:
object
;
searchSpace
:
object
;
status
:
string
;
status
:
string
;
errorStr
:
string
;
errorStr
:
string
;
...
@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
super
(
props
);
super
(
props
);
this
.
state
=
{
this
.
state
=
{
searchSpace
:
{},
searchSpace
:
{},
experimentAPI
:
{},
status
:
''
,
status
:
''
,
errorStr
:
''
,
errorStr
:
''
,
trialProfile
:
{
trialProfile
:
{
...
@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
});
});
if
(
this
.
_isMounted
)
{
if
(
this
.
_isMounted
)
{
this
.
setState
({
this
.
setState
({
experimentAPI
:
res
.
data
,
trialProfile
:
trialPro
[
0
],
trialProfile
:
trialPro
[
0
],
searchSpace
:
searchSpace
,
searchSpace
:
searchSpace
,
isLogCollection
:
expLogCollection
isLogCollection
:
expLogCollection
...
@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
const
{
const
{
trialProfile
,
searchSpace
,
tableData
,
accuracyData
,
trialProfile
,
searchSpace
,
tableData
,
accuracyData
,
accNodata
,
status
,
errorStr
,
trialNumber
,
bestAccuracy
,
accNodata
,
status
,
errorStr
,
trialNumber
,
bestAccuracy
,
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
titleMaxbgcolor
,
titleMinbgcolor
,
isLogCollection
,
experimentAPI
}
=
this
.
state
;
}
=
this
.
state
;
return
(
return
(
...
@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
...
@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
<
Row
className
=
"experiment"
>
<
Row
className
=
"experiment"
>
{
/* the scroll bar all the trial profile in the searchSpace div*/
}
{
/* the scroll bar all the trial profile in the searchSpace div*/
}
<
div
className
=
"experiment searchSpace"
>
<
div
className
=
"experiment searchSpace"
>
<
TrialPro
<
TrialPro
experiment
=
{
experimentAPI
}
/>
tiralProInfo
=
{
trialProfile
}
/>
</
div
>
</
div
>
</
Row
>
</
Row
>
</
Col
>
</
Col
>
...
...
src/webui/src/components/overview/TrialProfile.tsx
View file @
d95c3513
import
*
as
React
from
'
react
'
;
import
*
as
React
from
'
react
'
;
import
{
Experiment
}
from
'
../../static/interface
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
MonacoEditor
from
'
react-monaco-editor
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
import
{
MONACO
}
from
'
../../static/const
'
;
interface
TrialInfoProps
{
interface
TrialInfoProps
{
tiralProInfo
:
Experimen
t
;
experiment
:
objec
t
;
}
}
class
TrialInfo
extends
React
.
Component
<
TrialInfoProps
,
{}
>
{
class
TrialInfo
extends
React
.
Component
<
TrialInfoProps
,
{}
>
{
...
@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
super
(
props
);
super
(
props
);
}
}
render
()
{
componentWillReceiveProps
(
nextProps
:
TrialInfoProps
)
{
const
{
tiralProInfo
}
=
this
.
props
;
const
experiments
=
nextProps
.
experiment
;
const
showProInfo
=
[];
Object
.
keys
(
experiments
).
map
(
key
=>
{
showProInfo
.
push
({
switch
(
key
)
{
revision
:
tiralProInfo
.
revision
,
case
'
id
'
:
authorName
:
tiralProInfo
.
author
,
case
'
logDir
'
:
trialConcurrency
:
tiralProInfo
.
runConcurren
,
case
'
startTime
'
:
tuner
:
tiralProInfo
.
tuner
,
case
'
endTime
'
:
assessor
:
tiralProInfo
.
assessor
?
tiralProInfo
.
assessor
:
undefined
,
experiments
[
key
]
=
undefined
;
logCollection
:
tiralProInfo
.
logCollection
?
tiralProInfo
.
logCollection
:
undefined
,
break
;
advisor
:
tiralProInfo
.
advisor
?
tiralProInfo
.
advisor
:
undefined
,
case
'
params
'
:
clusterMetaData
:
tiralProInfo
.
clusterMetaData
?
tiralProInfo
.
clusterMetaData
:
undefined
const
params
=
experiments
[
key
];
Object
.
keys
(
params
).
map
(
item
=>
{
if
(
item
===
'
experimentName
'
||
item
===
'
searchSpace
'
||
item
===
'
trainingServicePlatform
'
)
{
params
[
item
]
=
undefined
;
}
});
break
;
default
:
}
});
});
}
render
()
{
const
{
experiment
}
=
this
.
props
;
return
(
return
(
<
div
className
=
"profile"
>
<
div
className
=
"profile"
>
<
MonacoEditor
<
MonacoEditor
...
@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
height
=
"380"
height
=
"380"
language
=
"json"
language
=
"json"
theme
=
"vs-light"
theme
=
"vs-light"
value
=
{
JSON
.
stringify
(
showProInfo
[
0
]
,
null
,
2
)
}
value
=
{
JSON
.
stringify
(
experiment
,
null
,
2
)
}
options
=
{
MONACO
}
options
=
{
MONACO
}
/>
/>
</
div
>
</
div
>
...
@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
...
@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
}
}
}
}
export
default
TrialInfo
;
export
default
TrialInfo
;
\ No newline at end of file
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment