Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
be09f11c
Unverified
Commit
be09f11c
authored
May 25, 2020
by
Chi Song
Committed by
GitHub
May 25, 2020
Browse files
Improve stability of remote training service. (#2474)
parent
e640ad6f
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
22 additions
and
34 deletions
+22
-34
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+1
-1
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
.../training_service/remote_machine/extends/linuxCommands.ts
+5
-2
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
...raining_service/remote_machine/extends/windowsCommands.ts
+6
-3
src/nni_manager/training_service/remote_machine/osCommands.ts
...nni_manager/training_service/remote_machine/osCommands.ts
+1
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+6
-2
src/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+2
-2
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+1
-23
No files found.
src/nni_manager/core/nnimanager.ts
View file @
be09f11c
...
...
@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert
(
this
.
status
.
status
===
'
RUNNING
'
||
this
.
status
.
status
===
'
DONE
'
||
this
.
status
.
status
===
'
NO_MORE_TRIAL
'
||
this
.
status
.
status
===
'
TUNER_NO_MORE_TRIAL
'
);
this
.
status
.
status
===
'
TUNER_NO_MORE_TRIAL
'
,
`Actual status:
${
this
.
status
.
status
}
`
);
if
(
this
.
experimentProfile
.
execDuration
>
this
.
experimentProfile
.
params
.
maxExecDuration
||
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
if
(
this
.
status
.
status
!==
'
DONE
'
)
{
...
...
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
View file @
be09f11c
...
...
@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return
result
;
}
public
killChildProcesses
(
pidFileName
:
string
):
string
{
public
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
{
// prevent trialkeeper to be killed, so it can save exit code.
cons
t
command
=
`list_descendants ()
le
t
command
=
`list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")
...
...
@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants
\`
cat '
${
pidFileName
}
'
\`
)`
if
(
killSelf
)
{
command
+=
`\nkill
\`
cat '
${
pidFileName
}
'
\`
`
}
return
command
;
}
...
...
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
View file @
be09f11c
...
...
@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return
result
;
}
public
killChildProcesses
(
pidFileName
:
string
):
string
{
cons
t
command
=
`powershell "$ppid=(type
${
pidFileName
}
); function Kill-Tree {Param([int]$subppid);`
+
public
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
{
le
t
command
=
`powershell "$ppid=(type
${
pidFileName
}
); function Kill-Tree {Param([int]$subppid);`
+
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; `
+
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}`
+
`if ($subppid -ne $ppid){Stop-Process -Id $subppid
-Force"
}}`
+
`kill-tree $ppid"`
;
if
(
killSelf
){
command
+=
`;Stop-Process -Id $ppid`
;
}
return
command
;
}
...
...
src/nni_manager/training_service/remote_machine/osCommands.ts
View file @
be09f11c
...
...
@@ -25,7 +25,7 @@ abstract class OsCommands {
public
abstract
readLastLines
(
fileName
:
string
,
lineCount
:
number
):
string
;
public
abstract
isProcessAliveCommand
(
pidFileName
:
string
):
string
;
public
abstract
isProcessAliveProcessOutput
(
result
:
RemoteCommandResult
):
boolean
;
public
abstract
killChildProcesses
(
pidFileName
:
string
):
string
;
public
abstract
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
;
public
abstract
extractFile
(
tarFileName
:
string
,
targetFolder
:
string
):
string
;
public
abstract
executeScript
(
script
:
string
,
isFile
:
boolean
):
string
;
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
be09f11c
...
...
@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
throw
new
Error
(
restServer
.
getErrorMessage
);
}
await
delay
(
3000
);
}
...
...
@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
executor
!==
undefined
)
{
this
.
log
.
info
(
`killing gpu metric collector on
${
executor
.
name
}
`
);
const
gpuJobPidPath
:
string
=
executor
.
joinPath
(
executor
.
getRemoteScriptsPath
(
getExperimentId
()),
'
pid
'
);
await
executor
.
killChildProcesses
(
gpuJobPidPath
);
await
executor
.
killChildProcesses
(
gpuJobPidPath
,
true
);
}
executorManager
.
releaseAllExecutor
();
}
...
...
@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
timer
.
unsubscribe
(
disposable
);
}
}
if
(
this
.
stopping
){
this
.
timer
.
unsubscribe
(
disposable
);
this
.
log
.
debug
(
`Stopped GPU collector on
${
rmMeta
.
ip
}
, since experiment is exiting.`
);
}
collectingCount
.
pop
();
}
}
...
...
src/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
be09f11c
...
...
@@ -230,8 +230,8 @@ class ShellExecutor {
return
result
!==
undefined
?
result
:
false
;
}
public
async
killChildProcesses
(
pidFileName
:
string
):
Promise
<
boolean
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
killChildProcesses
(
pidFileName
);
public
async
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
=
false
):
Promise
<
boolean
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
killChildProcesses
(
pidFileName
,
killSelf
);
const
commandResult
=
await
this
.
execute
(
commandText
);
return
commandResult
.
exitCode
==
0
;
}
...
...
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
be09f11c
...
...
@@ -11,31 +11,9 @@ import traceback
from
xml.dom
import
minidom
def
check_ready_to_run
():
if
sys
.
platform
==
'win32'
:
pgrep_output
=
subprocess
.
check_output
(
'wmic process where "CommandLine like
\'
%nni_gpu_tool.gpu_metrics_collector%
\'
and name like
\'
%python%
\'
" get processId'
)
pidList
=
pgrep_output
.
decode
(
"utf-8"
).
strip
().
split
()
pidList
.
pop
(
0
)
# remove the key word 'ProcessId'
pidList
=
list
(
map
(
int
,
pidList
))
pidList
.
remove
(
os
.
getpid
())
return
not
pidList
else
:
pgrep_output
=
subprocess
.
check_output
(
'pgrep -afu "$(whoami)"
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pidList
=
[]
for
pid
in
pgrep_output
.
splitlines
():
pid
=
pid
.
decode
()
if
"pgrep "
in
pid
or
pid
.
startswith
(
'%s '
%
os
.
getpid
())
or
pid
.
startswith
(
'%s '
%
os
.
getppid
()):
continue
pidList
.
append
(
pid
)
return
not
pidList
def
main
(
argv
):
metrics_output_dir
=
os
.
environ
[
'METRIC_OUTPUT_DIR'
]
if
check_ready_to_run
()
==
False
:
print
(
"GPU metrics collector is already running. exiting..."
)
exit
(
2
)
cmd
=
'nvidia-smi -q -x'
.
split
()
while
(
True
):
try
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment