Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
6a5864cd
Unverified
Commit
6a5864cd
authored
Nov 21, 2019
by
liuzhe-lz
Committed by
GitHub
Nov 21, 2019
Browse files
fix gpu script permission issue (#1707)
* fix gpu script permission issue * make gpu tool local to user
parent
cb52d441
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
19 additions
and
74 deletions
+19
-74
src/nni_manager/training_service/common/gpuData.ts
src/nni_manager/training_service/common/gpuData.ts
+0
-8
src/nni_manager/training_service/common/util.ts
src/nni_manager/training_service/common/util.ts
+10
-16
src/nni_manager/training_service/local/gpuScheduler.ts
src/nni_manager/training_service/local/gpuScheduler.ts
+3
-8
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+5
-41
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+1
-1
No files found.
src/nni_manager/training_service/common/gpuData.ts
View file @
6a5864cd
...
...
@@ -59,14 +59,6 @@ export class GPUSummary {
}
}
export
const
GPU_INFO_COLLECTOR_FORMAT_LINUX
:
string
=
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
;
export
const
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
:
string
=
`
$env:METRIC_OUTPUT_DIR="{0}"
...
...
src/nni_manager/training_service/common/util.ts
View file @
6a5864cd
...
...
@@ -27,7 +27,7 @@ import * as path from 'path';
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
countFilesRecursively
,
getNewLine
,
validateFileNameRecursively
}
from
'
../../common/utils
'
;
import
{
file
}
from
'
../../node_modules/@types/tmp
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
}
from
'
./gpuData
'
;
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...
...
@@ -219,22 +219,16 @@ export function getScriptName(fileNamePrefix: string): string {
}
}
/**
* generate script file
* @param gpuMetricCollectorScriptFolder
*/
export
function
getg
puMetricsCollector
ScriptContent
(
gpuMetricCollectorS
criptFolder
:
string
):
string
{
export
function
getGpuMetricsCollectorBashScriptContent
(
scriptFolder
:
string
):
string
{
return
`echo $$ >
${
scriptFolder
}
/pid ; METRIC_OUTPUT_DIR=
${
scriptFolder
}
python3 -m nni_gpu_tool.gpu_metrics_collector`
;
}
export
function
runG
puMetricsCollector
(
s
criptFolder
:
string
):
void
{
if
(
process
.
platform
===
'
win32
'
)
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
const
scriptPath
=
path
.
join
(
scriptFolder
,
'
gpu_metrics_collector.ps1
'
);
const
content
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS
,
scriptFolder
,
path
.
join
(
scriptFolder
,
'
pid
'
));
fs
.
writeFile
(
scriptPath
,
content
,
{
encoding
:
'
utf8
'
},
()
=>
{
runScript
(
scriptPath
);
});
}
else
{
return
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
gpuMetricCollectorScriptFolder
,
path
.
join
(
gpuMetricCollectorScriptFolder
,
'
pid
'
)
);
cp
.
exec
(
getGpuMetricsCollectorBashScriptContent
(
scriptFolder
),
{
shell
:
'
/bin/bash
'
});
}
}
src/nni_manager/training_service/local/gpuScheduler.ts
View file @
6a5864cd
...
...
@@ -28,7 +28,7 @@ import { String } from 'typescript-string-operations';
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
delay
}
from
'
../../common/utils
'
;
import
{
GPUInfo
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
execKill
,
execMkdir
,
execRemove
,
execTail
,
getg
puMetricsCollector
ScriptContent
,
getScriptName
,
runScript
}
from
'
../common/util
'
;
import
{
execKill
,
execMkdir
,
execRemove
,
execTail
,
runG
puMetricsCollector
}
from
'
../common/util
'
;
/**
* GPUScheduler for local training service
...
...
@@ -43,7 +43,7 @@ class GPUScheduler {
constructor
()
{
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
gpuMetricCollectorScriptFolder
=
`
${
os
.
tmpdir
()}
/nni/script`
;
this
.
gpuMetricCollectorScriptFolder
=
`
${
os
.
tmpdir
()}
/
${
os
.
userInfo
().
username
}
/
nni/script`
;
}
public
async
run
():
Promise
<
void
>
{
...
...
@@ -101,12 +101,7 @@ class GPUScheduler {
*/
private
async
runGpuMetricsCollectorScript
():
Promise
<
void
>
{
await
execMkdir
(
this
.
gpuMetricCollectorScriptFolder
,
true
);
//generate gpu_metrics_collector script
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
this
.
gpuMetricCollectorScriptFolder
,
getScriptName
(
'
gpu_metrics_collector
'
));
const
gpuMetricsCollectorScriptContent
:
string
=
getgpuMetricsCollectorScriptContent
(
this
.
gpuMetricCollectorScriptFolder
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
runScript
(
gpuMetricsCollectorScriptPath
);
runGpuMetricsCollector
(
this
.
gpuMetricCollectorScriptFolder
);
}
// tslint:disable:non-literal-fs-path
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
6a5864cd
...
...
@@ -42,10 +42,10 @@ import {
getVersion
,
uniqueString
,
unixPathJoin
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
GPUSummary
}
from
'
../common/gpuData
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
execCopydir
,
execMkdir
,
execRemove
,
validateCodeDir
}
from
'
../common/util
'
;
import
{
execCopydir
,
execMkdir
,
execRemove
,
validateCodeDir
,
getGpuMetricsCollectorBashScriptContent
}
from
'
../common/util
'
;
import
{
GPUScheduler
}
from
'
./gpuScheduler
'
;
import
{
HOST_JOB_SHELL_FORMAT
,
RemoteCommandResult
,
REMOTEMACHINE_TRIAL_COMMAND_FORMAT
,
RemoteMachineMeta
,
...
...
@@ -334,8 +334,6 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
//remove local temp files
await
execRemove
(
this
.
getLocalGpuMetricCollectorDir
());
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
...
...
@@ -428,34 +426,6 @@ class RemoteMachineTrainingService implements TrainingService {
return
Promise
.
resolve
();
}
/**
* Generate gpu metric collector directory to store temp gpu metric collector script files
*/
private
getLocalGpuMetricCollectorDir
():
string
{
const
userName
:
string
=
path
.
basename
(
os
.
homedir
());
//get current user name of os
return
path
.
join
(
os
.
tmpdir
(),
userName
,
'
nni
'
,
'
scripts
'
);
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private
async
generateGpuMetricsCollectorScript
(
userName
:
string
):
Promise
<
void
>
{
const
gpuMetricCollectorScriptFolder
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
await
execMkdir
(
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
));
//generate gpu_metrics_collector.sh
const
gpuMetricsCollectorScriptPath
:
string
=
path
.
join
(
gpuMetricCollectorScriptFolder
,
userName
,
'
gpu_metrics_collector.sh
'
);
// This directory is used to store gpu_metrics and pid created by script
const
remoteGPUScriptsDir
:
string
=
this
.
getRemoteScriptsPath
(
userName
);
const
gpuMetricsCollectorScriptContent
:
string
=
String
.
Format
(
GPU_INFO_COLLECTOR_FORMAT_LINUX
,
remoteGPUScriptsDir
,
unixPathJoin
(
remoteGPUScriptsDir
,
'
pid
'
)
);
await
fs
.
promises
.
writeFile
(
gpuMetricsCollectorScriptPath
,
gpuMetricsCollectorScriptContent
,
{
encoding
:
'
utf8
'
});
}
private
async
setupConnections
(
machineList
:
string
):
Promise
<
void
>
{
this
.
log
.
debug
(
`Connecting to remote machines:
${
machineList
}
`
);
const
deferred
:
Deferred
<
void
>
=
new
Deferred
<
void
>
();
...
...
@@ -479,24 +449,18 @@ class RemoteMachineTrainingService implements TrainingService {
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
conn
:
Client
):
Promise
<
void
>
{
// Create root working directory after ssh connection is ready
// generate gpu script in local machine first, will copy to remote machine later
await
this
.
generateGpuMetricsCollectorScript
(
rmMeta
.
username
);
const
nniRootDir
:
string
=
unixPathJoin
(
getRemoteTmpDir
(
this
.
remoteOS
),
'
nni
'
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
this
.
remoteExpRootDir
}
`
,
conn
);
// Copy NNI scripts to remote expeirment working directory
const
localGpuScriptCollectorDir
:
string
=
this
.
getLocalGpuMetricCollectorDir
();
// the directory to store temp scripts in remote machine
const
remoteGpuScriptCollectorDir
:
string
=
this
.
getRemoteScriptsPath
(
rmMeta
.
username
);
await
SSHClientUtility
.
remoteExeCommand
(
`mkdir -p
${
remoteGpuScriptCollectorDir
}
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`
(umask 0 ;
mkdir -p
${
remoteGpuScriptCollectorDir
}
)
`
,
conn
);
await
SSHClientUtility
.
remoteExeCommand
(
`chmod 777
${
nniRootDir
}
${
nniRootDir
}
/*
${
nniRootDir
}
/scripts/*`
,
conn
);
//copy gpu_metrics_collector.sh to remote
await
SSHClientUtility
.
copyFileToRemote
(
path
.
join
(
localGpuScriptCollectorDir
,
rmMeta
.
username
,
'
gpu_metrics_collector.sh
'
),
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
),
conn
);
//Begin to execute gpu_metrics_collection scripts
// tslint:disable-next-line: no-floating-promises
SSHClientUtility
.
remoteExeCommand
(
`bash
${
unixPathJoin
(
remoteGpuScriptCollectorDir
,
'
gpu_metrics_collector.sh
'
)}
`
,
conn
);
const
script
=
getGpuMetricsCollectorBashScriptContent
(
remoteGpuScriptCollectorDir
);
SSHClientUtility
.
remoteExeCommand
(
`bash -c '
${
script
}
'`
,
conn
);
const
disposable
:
Rx
.
IDisposable
=
this
.
timer
.
subscribe
(
async
(
tick
:
number
)
=>
{
...
...
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
6a5864cd
...
...
@@ -35,7 +35,7 @@ def check_ready_to_run():
pidList
.
remove
(
os
.
getpid
())
return
not
pidList
else
:
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pgrep_output
=
subprocess
.
check_output
(
'pgrep -fx
u "$(whoami)"
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pidList
=
[]
for
pid
in
pgrep_output
.
splitlines
():
pidList
.
append
(
int
(
pid
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment