Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
f05e685f
"vscode:/vscode.git/clone" did not exist on "700daa3463c6045b1ddc22ab917220f98cac82fe"
Unverified
Commit
f05e685f
authored
Apr 01, 2019
by
SparkSnail
Committed by
GitHub
Apr 01, 2019
Browse files
Refactor local gpu scheduler (#943)
parent
f075aab0
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
50 additions
and
343 deletions
+50
-343
src/nni_manager/package.json
src/nni_manager/package.json
+0
-1
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+8
-0
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+37
-306
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
...ager/training_service/local/localTrainingServiceForGPU.ts
+2
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+0
-8
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+3
-3
src/nni_manager/types/node-nvidia-smi/index.d.ts
src/nni_manager/types/node-nvidia-smi/index.d.ts
+0
-23
No files found.
src/nni_manager/package.json
View file @
f05e685f
...
...
@@ -18,7 +18,6 @@
"express-joi-validator"
:
"^2.0.0"
,
"js-base64"
:
"^2.4.9"
,
"kubernetes-client"
:
"^6.5.0"
,
"node-nvidia-smi"
:
"^1.0.0"
,
"rx"
:
"^4.1.0"
,
"sqlite3"
:
"^4.0.2"
,
"ssh2"
:
"^0.6.1"
,
...
...
src/nni_manager/training_service/common/gpuData.ts
View file @
f05e685f
...
...
@@ -58,3 +58,11 @@ export class GPUSummary {
this
.
gpuInfos
=
gpuInfos
;
}
}
/**
 * Template of the bash script that launches the GPU metrics collector.
 *
 * Placeholders:
 *   {0} - value exported as METRIC_OUTPUT_DIR before the collector starts
 *   {1} - file that receives the pid of the shell running the script (`$$`)
 *
 * The text deliberately starts and ends with a newline, exactly as the
 * original template literal did.
 */
export const GPU_INFO_COLLECTOR_FORMAT: string = [
    '',
    '#!/bin/bash',
    'export METRIC_OUTPUT_DIR={0}',
    'echo $$ >{1}',
    'python3 -m nni_gpu_tool.gpu_metrics_collector',
    ''
].join('\n');
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
f05e685f
...
...
@@ -19,268 +19,16 @@
'
use strict
'
;
import
*
as
assert
from
'
assert
'
;
import
*
as
nodeNvidiaSmi
from
'
node-nvidia-smi
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
/* Example of nvidia-smi result
{
"nvidia_smi_log": {
"timestamp": "Fri Jul 13 15:17:27 2018",
"driver_version": "396.26",
"attached_gpus": "8",
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
import
*
as
cp
from
'
child_process
'
;
import
*
as
cpp
from
'
child-process-promise
'
;
import
*
as
path
from
'
path
'
;
import
*
as
os
from
'
os
'
;
import
*
as
fs
from
'
fs
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
/**
* GPUScheduler
...
...
@@ -290,29 +38,43 @@ class GPUScheduler {
private
gpuSummary
!
:
GPUSummary
;
private
stopping
:
boolean
;
private
log
:
Logger
;
private
nvdmNotFoundRegex
:
RegExp
;
private
gpuMetricCollectorScriptFolder
:
string
;
/**
 * Initialise the scheduler: not stopping, logger acquired, the pattern used to
 * recognise a missing nvidia-smi binary, and the folder (under the OS temp
 * directory) that holds the GPU metrics collector script.
 */
constructor() {
    this.log = getLogger();
    this.stopping = false;
    this.gpuMetricCollectorScriptFolder = os.tmpdir() + '/nni/script';
    this.nvdmNotFoundRegex = /nvidia-smi: not found/gi;
}
/**
 * Launch the GPU metrics collector script, then refresh the GPU summary
 * every 5000 ms until `stopping` is set.
 *
 * A failed refresh is logged and retried on the next tick, unless the error
 * matches `nvdmNotFoundRegex`, in which case polling ends.
 */
public async run(): Promise<void> {
    await this.runGpuMetricsCollectorScript();
    while (true) {
        if (this.stopping) {
            return;
        }
        try {
            this.gpuSummary = await this.readGPUSummary();
            await this.updateGPUSummary();
        } catch (err) {
            this.log.error('Read GPU summary failed with error: ', err);
            // If the nvidia-smi command is not found, stop polling to avoid pointless periodic checks
            if (this.nvdmNotFoundRegex.test(err)) {
                return;
            }
        }
        await delay(5000);
    }
}
/**
 * Generate the GPU metrics collector shell script on the local machine and
 * start it in the background.
 *
 * The script text is rendered from GPU_INFO_COLLECTOR_FORMAT with the script
 * folder as placeholder {0} and `<folder>/pid` as placeholder {1}, written to
 * `<folder>/gpu_metrics_collector.sh`, and launched with bash.
 *
 * @returns a promise that resolves once the script file has been written and
 *          the bash process has been spawned (the process itself is not awaited)
 * @throws rejects if the folder cannot be created or the script cannot be written
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    // Create the folder through the fs API instead of spawning `mkdir -p`, so a
    // temp path containing spaces or shell metacharacters cannot break the command.
    await fs.promises.mkdir(this.gpuMetricCollectorScriptFolder, { recursive: true });
    // generate gpu_metrics_collector.sh
    const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
    const gpuMetricsCollectorScriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        this.gpuMetricCollectorScriptFolder,
        path.join(this.gpuMetricCollectorScriptFolder, 'pid')
    );
    await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
    // Fire and forget: the result of the bash process is intentionally not awaited.
    // The path is quoted so that whitespace in it does not split the argument.
    cp.exec(`bash "${gpuMetricsCollectorScriptPath}"`);
}
public
getAvailableGPUIndices
():
number
[]
{
if
(
this
.
gpuSummary
!==
undefined
)
{
return
this
.
gpuSummary
.
gpuInfos
.
filter
((
info
:
GPUInfo
)
=>
info
.
activeProcessNum
===
0
).
map
((
info
:
GPUInfo
)
=>
info
.
index
);
...
...
@@ -321,51 +83,20 @@ class GPUScheduler {
return
[];
}
public
stop
()
:
void
{
/**
 * Stop the scheduler: end the polling loop, kill the child processes of the
 * process whose pid is stored in `<script folder>/pid`, and delete the script folder.
 *
 * Cleanup is best-effort. A missing pid file, a `pkill` that matches nothing,
 * or a failed removal is logged instead of rejecting, so one failed step does
 * not prevent the remaining steps (or the caller's own cleanup) from running.
 *
 * @returns a promise that resolves when all cleanup steps have been attempted
 */
public async stop(): Promise<void> {
    this.stopping = true;
    const pidFilePath: string = path.join(this.gpuMetricCollectorScriptFolder, 'pid');
    try {
        const pid: string = (await fs.promises.readFile(pidFilePath, 'utf8')).trim();
        // The pid is interpolated into a shell command and the file lives in a
        // temp folder, so only a purely numeric value is accepted.
        if (/^\d+$/.test(pid)) {
            await cpp.exec(`pkill -P ${pid}`);
        } else {
            this.log.error(`Invalid pid read from ${pidFilePath}, skip killing gpu metrics collector`);
        }
    } catch (error) {
        this.log.error('Kill gpu metrics collector failed with error: ', error);
    }
    try {
        await cpp.exec(`rm -rf "${this.gpuMetricCollectorScriptFolder}"`);
    } catch (error) {
        this.log.error('Remove gpu metrics collector folder failed with error: ', error);
    }
}
/**
 * Convert the `nvidia_smi_log` section of an nvidia-smi result into GPUInfo entries.
 *
 * `attached_gpus` is parsed as a base-10 integer and must be positive. When it
 * is exactly 1 the `gpu` field is treated as a single summary object;
 * otherwise it is treated as an array of summaries.
 *
 * @param data result object handed over by node-nvidia-smi
 * @returns one GPUInfo per summary
 */
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo): GPUInfo[] {
    const smiLog = data.nvidia_smi_log;
    const attachedCount: number = parseInt(smiLog.attached_gpus, 10);
    assert(attachedCount > 0);

    if (attachedCount === 1) {
        const single = <nodeNvidiaSmi.EmbededGPUSummary>smiLog.gpu;

        return [this.convertGPUSummaryToInfo(single)];
    }

    const summaries = <nodeNvidiaSmi.EmbededGPUSummary[]>smiLog.gpu;

    return summaries.map((summary: nodeNvidiaSmi.EmbededGPUSummary) => this.convertGPUSummaryToInfo(summary));
}
/**
 * Build a GPUInfo from one per-GPU summary of the nvidia-smi result.
 *
 * The GPUInfo constructor receives, in order: 1 if the summary's `process`
 * field is an object and 0 otherwise, the parsed `utilization.memory_util`,
 * the parsed `utilization.gpu_util`, and `minor_number` parsed as a base-10 integer.
 *
 * NOTE(review): the sample nvidia-smi output kept in this file's header comment
 * shows a `processes` key, while this method reads `process` — confirm which
 * key the parsed result really carries.
 *
 * @param embededGPUSummary per-GPU entry of `nvidia_smi_log.gpu`
 * @returns the corresponding GPUInfo
 */
private convertGPUSummaryToInfo(embededGPUSummary: nodeNvidiaSmi.EmbededGPUSummary): GPUInfo {
    const processFlag: number = typeof embededGPUSummary.process === 'object' ? 1 : 0;
    const memoryUtil: number = parseFloat(embededGPUSummary.utilization.memory_util);
    const gpuUtil: number = parseFloat(embededGPUSummary.utilization.gpu_util);
    const minorNumber: number = parseInt(embededGPUSummary.minor_number, 10);

    return new GPUInfo(processFlag, memoryUtil, gpuUtil, minorNumber);
}
private
readGPUSummary
():
Promise
<
GPUSummary
>
{
return
new
Promise
((
resolve
:
Function
,
reject
:
Function
):
void
=>
{
nodeNvidiaSmi
((
error
:
Error
,
data
:
nodeNvidiaSmi
.
GPUInfo
)
=>
{
if
(
error
)
{
reject
(
error
);
private
async
updateGPUSummary
()
{
const
cmdresult
=
await
cpp
.
exec
(
`tail -n 1
${
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
'
gpu_metrics
'
)}
`
);
if
(
cmdresult
&&
cmdresult
.
stdout
)
{
this
.
gpuSummary
=
<
GPUSummary
>
JSON
.
parse
(
cmdresult
.
stdout
);
}
else
{
const
gpuNumber
:
number
=
parseInt
(
data
.
nvidia_smi_log
.
attached_gpus
,
10
);
const
gpuSummary
:
GPUSummary
=
new
GPUSummary
(
gpuNumber
,
Date
().
toString
(),
this
.
generateEmbededGPUSummary
(
data
)
);
resolve
(
gpuSummary
);
this
.
log
.
error
(
'
Could not get gpu metrics information!
'
);
}
});
});
}
}
...
...
src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
View file @
f05e685f
...
...
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
public
cleanUp
():
Promise
<
void
>
{
public
async
cleanUp
():
Promise
<
void
>
{
if
(
this
.
gpuScheduler
!==
undefined
)
{
this
.
gpuScheduler
.
stop
();
await
this
.
gpuScheduler
.
stop
();
}
return
super
.
cleanUp
();
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
f05e685f
...
...
@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $?
\`
date +%s%3N
\`
>{3}`
;
/**
 * Template of the bash script that launches the GPU metrics collector.
 *
 * Placeholders:
 *   {0} - value exported as METRIC_OUTPUT_DIR before the collector starts
 *   {1} - file that receives the pid of the shell running the script (`$$`)
 *
 * The text deliberately starts and ends with a newline, exactly as the
 * original template literal did.
 */
export const GPU_COLLECTOR_FORMAT: string = [
    '',
    '#!/bin/bash',
    'export METRIC_OUTPUT_DIR={0}',
    'echo $$ >{1}',
    'python3 -m nni_gpu_tool.gpu_metrics_collector',
    ''
].join('\n');
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
f05e685f
...
...
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
RemoteMachineMeta
,
RemoteMachineScheduleInfo
,
RemoteMachineScheduleResult
,
SSHClient
,
SSHClientManager
,
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
GPU_COLLECTOR_FORMAT
RemoteMachineTrialJobDetail
,
ScheduleResultType
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
}
from
'
./remoteMachineData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT
}
from
'
../common/gpuData
'
;
import
{
SSHClientUtility
}
from
'
./sshClientUtility
'
;
import
{
validateCodeDir
}
from
'
../common/util
'
;
import
{
RemoteMachineJobRestServer
}
from
'
./remoteMachineJobRestServer
'
;
...
...
@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
// This directory is used to store gpu_metrics and pid created by script
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_COLLECTOR_FORMAT
,
GPU_
INFO_
COLLECTOR_FORMAT
,
remoteGPUScriptsDir
,
path
.
join
(
remoteGPUScriptsDir
,
'
pid
'
),
);
...
...
src/nni_manager/types/node-nvidia-smi/index.d.ts
deleted
100644 → 0
View file @
f075aab0
/**
 * Ambient type declarations for the `node-nvidia-smi` package.
 * Only the fields listed here are typed; every value is declared as a string
 * except `process`, which is either a string or an object.
 */
declare module 'node-nvidia-smi' {
    namespace smi {
        /** Top-level result object passed to the callback. */
        interface GPUInfo {
            nvidia_smi_log: {
                // Declared as a string, not a number.
                attached_gpus: string;
                // Either a single summary or an array of summaries.
                gpu: EmbededGPUSummary | EmbededGPUSummary[];
            };
        }

        /** Per-GPU entry found under `nvidia_smi_log.gpu`. */
        interface EmbededGPUSummary {
            minor_number: string;
            utilization: {
                gpu_util: string;
                memory_util: string;
            };
            process: string | object;
        }
    }

    /**
     * Query nvidia-smi.
     * @param callback receives an error and the result object
     */
    function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;

    export = smi;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment