Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
f05e685f
Unverified
Commit
f05e685f
authored
Apr 01, 2019
by
SparkSnail
Committed by
GitHub
Apr 01, 2019
Browse files
Refactor local gpu scheduler (#943)
parent
f075aab0
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
50 additions
and
343 deletions
+50
-343
src/nni_manager/package.json
src/nni_manager/package.json
+0
-1
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-0
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+37
-306
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+2
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+0
-8
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+3
-3
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+0
-23
No files found.
src/nni_manager/package.json
View file @
f05e685f
...
...
@@ -18,7 +18,6 @@
"express-joi-validator"
:
"^2.0.0"
,
"js-base64"
:
"^2.4.9"
,
"kubernetes-client"
:
"^6.5.0"
,
"node-nvidia-smi"
:
"^1.0.0"
,
"rx"
:
"^4.1.0"
,
"sqlite3"
:
"^4.0.2"
,
"ssh2"
:
"^0.6.1"
,
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
f05e685f
...
...
@@ -58,3 +58,11 @@ export class GPUSummary {
this
.
gpuInfos
=
gpuInfos
;
}
}
/**
 * Template of the shell script that launches the GPU metrics collector.
 *
 * Placeholders are filled in by callers through `String.Format`:
 *   {0} - value exported as METRIC_OUTPUT_DIR
 *   {1} - file that receives the PID of the shell running the script (`echo $$`)
 *
 * The template intentionally starts and ends with a newline.
 */
export const GPU_INFO_COLLECTOR_FORMAT: string = [
    '',
    '#!/bin/bash',
    'export METRIC_OUTPUT_DIR={0}',
    'echo $$ >{1}',
    'python3 -m nni_gpu_tool.gpu_metrics_collector',
    ''
].join('\n');
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
f05e685f
...
...
@@ -19,268 +19,16 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
/* Example of nvidia-smi result
{
"nvidia_smi_log": {
"timestamp": "Fri Jul 13 15:17:27 2018",
"driver_version": "396.26",
"attached_gpus": "8",
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
import
*
as
cp
from
'
child_process
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
/**
* GPUScheduler
...
...
@@ -290,29 +38,43 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
private
gpuMetricCollectorScriptFolder
:
string
;
constructor
()
{
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
nvdmNotFoundRegex
=
/nvidia-smi: not found/gi
;
this
.
gpuMetricCollectorScriptFolder
=
`
${
os
.
tmpdir
()}
/nni/script`
;
}
public
async
run
():
Promise
<
void
>
{
await
this
.
runGpuMetricsCollectorScript
();
while
(
!
this
.
stopping
)
{
try
{
this
.
gpuSummary
=
await
this
.
read
GPUSummary
();
await
this
.
update
GPUSummary
();
}
catch
(
error
)
{
this
.
log
.
error
(
'
Read GPU summary failed with error:
'
,
error
);
// If the nvidia-smi command is not found, break out of the GPU summary reading loop to avoid unnecessary periodic checking
if
(
this
.
nvdmNotFoundRegex
.
test
(
error
))
{
break
;
}
}
await
delay
(
5000
);
}
}
/**
 * Generate the GPU metrics collector shell script in the local script folder
 * and launch it in the background on this machine.
 *
 * The generated script exports METRIC_OUTPUT_DIR (set to the script folder),
 * writes the PID of its shell to `<script folder>/pid`, and then runs
 * `python3 -m nni_gpu_tool.gpu_metrics_collector`.
 *
 * @returns a promise that resolves once the script has been written and
 *          launched; it does not wait for the collector process to exit
 * @throws rejects if the script folder cannot be created or the script file
 *         cannot be written
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    const scriptFolder: string = this.gpuMetricCollectorScriptFolder;
    // Quote the path so a temp directory containing spaces does not split the command.
    await cpp.exec(`mkdir -p "${scriptFolder}"`);
    // generate gpu_metrics_collector.sh
    const gpuMetricsCollectorScriptPath: string = path.join(scriptFolder, 'gpu_metrics_collector.sh');
    const gpuMetricsCollectorScriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        scriptFolder,
        path.join(scriptFolder, 'pid')
    );
    await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
    // The collector is long-running, so it is started without awaiting it.
    // The callback only reports an unexpected exit; a non-zero exit while
    // stopping is expected because stop() kills the collector.
    cp.exec(`bash "${gpuMetricsCollectorScriptPath}"`, (error: Error | null): void => {
        if (error && !this.stopping) {
            this.log.error('GPU metrics collector script exited with error: ', error);
        }
    });
}
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
).
map
((
info
:
GPUInfo
)
=>
info
.
index
);
...
...
@@ -321,51 +83,20 @@ class GPUScheduler {
return
[];
}
public
stop
()
:
void
{
/**
 * Stop the scheduler loop and shut down the local GPU metrics collector.
 *
 * Sets the stopping flag, kills the child processes of the collector script
 * (whose shell PID is stored in `<script folder>/pid`), and removes the
 * script folder.
 *
 * Killing the collector is best-effort: a missing or malformed pid file, or
 * `pkill` exiting non-zero, is logged and does not prevent the folder from
 * being removed.
 *
 * @returns a promise that resolves once the script folder has been removed
 * @throws rejects only if removing the script folder fails
 */
public async stop(): Promise<void> {
    this.stopping = true;
    const pidFilePath: string = path.join(this.gpuMetricCollectorScriptFolder, 'pid');
    try {
        // The script writes the PID with `echo`, so strip the trailing newline.
        const pid: string = (await fs.promises.readFile(pidFilePath, 'utf8')).trim();
        // Only interpolate a purely numeric PID into the shell command.
        if (/^\d+$/.test(pid)) {
            await cpp.exec(`pkill -P ${pid}`);
        } else {
            this.log.error(`Unexpected content in GPU metrics collector pid file: ${pidFilePath}`);
        }
    } catch (error) {
        this.log.error('Failed to stop GPU metrics collector: ', error);
    } finally {
        // Always clean up the generated script, pid file and metrics output.
        await cpp.exec(`rm -rf "${this.gpuMetricCollectorScriptFolder}"`);
    }
}
private
generateEmbededGPUSummary
(
data
:
nodeNvidiaSmi
.
GPUInfo
)
:
GPUInfo
[]
{
let
gpuInfos
:
GPUInfo
[]
=
[];
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
assert
(
gpuNumber
>
0
);
if
(
gpuNumber
==
1
)
{
const
embededGPUSummary
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
.
push
(
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
private
async
updateGPUSummary
()
{
const
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)}
`
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
const
embededGPUSummaryArray
=
<
nodeNvidiaSmi
.
EmbededGPUSummary
[]
>
data
.
nvidia_smi_log
.
gpu
;
gpuInfos
=
embededGPUSummaryArray
.
map
(
embededGPUSummary
=>
this
.
convertGPUSummaryToInfo
(
embededGPUSummary
));
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
}
return
gpuInfos
;
}
/**
 * Build a GPUInfo from a single per-GPU entry of the nvidia-smi result.
 *
 * The GPUInfo constructor receives, in order:
 *   1. 1 when the entry's `process` field is an object, otherwise 0
 *   2. `utilization.memory_util` parsed as a float
 *   3. `utilization.gpu_util` parsed as a float
 *   4. `minor_number` parsed as a base-10 integer
 *
 * @param embededGPUSummary one GPU entry from `nvidia_smi_log.gpu`
 * @returns the converted GPUInfo
 */
private convertGPUSummaryToInfo(embededGPUSummary: nodeNvidiaSmi.EmbededGPUSummary): GPUInfo {
    const processFlag: number = typeof embededGPUSummary.process === 'object' ? 1 : 0;
    const usage = embededGPUSummary.utilization;
    const memoryUtil: number = parseFloat(usage.memory_util);
    const gpuUtil: number = parseFloat(usage.gpu_util);
    const minorNumber: number = parseInt(embededGPUSummary.minor_number, 10);

    return new GPUInfo(processFlag, memoryUtil, gpuUtil, minorNumber);
}
/**
 * Query GPU state through the node-nvidia-smi module and wrap it in a GPUSummary.
 *
 * @returns a promise resolving to a GPUSummary built from the attached GPU
 *          count, the current date string, and the converted per-GPU infos;
 *          it rejects with the error reported by node-nvidia-smi
 */
private readGPUSummary(): Promise<GPUSummary> {
    return new Promise<GPUSummary>((resolve, reject): void => {
        nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
            if (error) {
                reject(error);

                return;
            }
            const attachedGpuCount: number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
            // Take the timestamp before converting the entries, as the original did.
            const timestamp: string = Date().toString();
            const gpuInfos: GPUInfo[] = this.generateEmbededGPUSummary(data);
            resolve(new GPUSummary(attachedGpuCount, timestamp, gpuInfos));
        });
    });
}
}
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
f05e685f
...
...
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
public
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
await
this
.
gpuScheduler
.
stop
();
}
return
super
.
cleanUp
();
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
f05e685f
...
...
@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $?
\`
date +%s%3N
\`
>{3}`
;
export
const
GPU_COLLECTOR_FORMAT
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
f05e685f
...
...
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
GPU_COLLECTOR_FORMAT
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
...
@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_COLLECTOR_FORMAT
,
GPU_
INFO_
COLLECTOR_FORMAT
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
...
...
src/nni_manager/types/node-nvidia-smi/index.d.ts
deleted
100644 → 0
View file @
f075aab0
/**
 * Minimal ambient typings for the `node-nvidia-smi` package, covering only
 * the fields declared below.
 */
declare module 'node-nvidia-smi' {
    /**
     * Run the nvidia-smi query and hand the result to `callback`.
     *
     * @param callback receives an error, or the parsed result as `data`
     */
    function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;

    namespace smi {
        /** One per-GPU entry of the nvidia-smi result. */
        interface EmbededGPUSummary {
            minor_number: string;
            utilization: {
                gpu_util: string;
                memory_util: string;
            };
            // NOTE(review): the sample nvidia-smi output shows this key as
            // "processes" - confirm which name the package actually emits.
            process: string | object;
        }

        /** Top-level shape of the nvidia-smi result. */
        interface GPUInfo {
            nvidia_smi_log: {
                attached_gpus: string;
                // Declared as either an array of entries or a single entry.
                gpu: EmbededGPUSummary[] | EmbededGPUSummary;
            };
        }
    }

    export = smi;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment