Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
be09f11c
"results/vscode:/vscode.git/clone" did not exist on "aac98df8686bdec3e546ddd8c8d467054d1b72d3"
Unverified
Commit
be09f11c
authored
May 25, 2020
by
Chi Song
Committed by
GitHub
May 25, 2020
Browse files
Improve stability of remote training service. (#2474)
parent
e640ad6f
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
22 additions
and
34 deletions
+22
-34
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+1
-1
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
.../training_service/remote_machine/extends/linuxCommands.ts
+5
-2
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
...raining_service/remote_machine/extends/windowsCommands.ts
+6
-3
src/nni_manager/training_service/remote_machine/osCommands.ts
...nni_manager/training_service/remote_machine/osCommands.ts
+1
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+6
-2
src/nni_manager/training_service/remote_machine/shellExecutor.ts
..._manager/training_service/remote_machine/shellExecutor.ts
+2
-2
tools/nni_gpu_tool/gpu_metrics_collector.py
tools/nni_gpu_tool/gpu_metrics_collector.py
+1
-23
No files found.
src/nni_manager/core/nnimanager.ts
View file @
be09f11c
...
...
@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert
(
this
.
status
.
status
===
'
RUNNING
'
||
this
.
status
.
status
===
'
DONE
'
||
this
.
status
.
status
===
'
NO_MORE_TRIAL
'
||
this
.
status
.
status
===
'
TUNER_NO_MORE_TRIAL
'
);
this
.
status
.
status
===
'
TUNER_NO_MORE_TRIAL
'
,
`Actual status:
${
this
.
status
.
status
}
`
);
if
(
this
.
experimentProfile
.
execDuration
>
this
.
experimentProfile
.
params
.
maxExecDuration
||
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
if
(
this
.
status
.
status
!==
'
DONE
'
)
{
...
...
src/nni_manager/training_service/remote_machine/extends/linuxCommands.ts
View file @
be09f11c
...
...
@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return
result
;
}
public
killChildProcesses
(
pidFileName
:
string
):
string
{
public
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
{
// prevent trialkeeper from being killed, so it can save the exit code.
cons
t
command
=
`list_descendants ()
le
t
command
=
`list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")
...
...
@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants
\`
cat '
${
pidFileName
}
'
\`
)`
if
(
killSelf
)
{
command
+=
`\nkill
\`
cat '
${
pidFileName
}
'
\`
`
}
return
command
;
}
...
...
src/nni_manager/training_service/remote_machine/extends/windowsCommands.ts
View file @
be09f11c
...
...
@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return
result
;
}
public
killChildProcesses
(
pidFileName
:
string
):
string
{
cons
t
command
=
`powershell "$ppid=(type
${
pidFileName
}
); function Kill-Tree {Param([int]$subppid);`
+
public
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
{
le
t
command
=
`powershell "$ppid=(type
${
pidFileName
}
); function Kill-Tree {Param([int]$subppid);`
+
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; `
+
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}`
+
`if ($subppid -ne $ppid){Stop-Process -Id $subppid
-Force"
}}`
+
`kill-tree $ppid"`
;
if
(
killSelf
){
command
+=
`;Stop-Process -Id $ppid`
;
}
return
command
;
}
...
...
src/nni_manager/training_service/remote_machine/osCommands.ts
View file @
be09f11c
...
...
@@ -25,7 +25,7 @@ abstract class OsCommands {
public
abstract
readLastLines
(
fileName
:
string
,
lineCount
:
number
):
string
;
public
abstract
isProcessAliveCommand
(
pidFileName
:
string
):
string
;
public
abstract
isProcessAliveProcessOutput
(
result
:
RemoteCommandResult
):
boolean
;
public
abstract
killChildProcesses
(
pidFileName
:
string
):
string
;
public
abstract
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
):
string
;
public
abstract
extractFile
(
tarFileName
:
string
,
targetFolder
:
string
):
string
;
public
abstract
executeScript
(
script
:
string
,
isFile
:
boolean
):
string
;
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
be09f11c
...
...
@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if
(
restServer
.
getErrorMessage
!==
undefined
)
{
throw
new
Error
(
restServer
.
getErrorMessage
);
this
.
stopping
=
true
;
throw
new
Error
(
restServer
.
getErrorMessage
);
}
await
delay
(
3000
);
}
...
...
@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
executor
!==
undefined
)
{
this
.
log
.
info
(
`killing gpu metric collector on
${
executor
.
name
}
`
);
const
gpuJobPidPath
:
string
=
executor
.
joinPath
(
executor
.
getRemoteScriptsPath
(
getExperimentId
()),
'
pid
'
);
await
executor
.
killChildProcesses
(
gpuJobPidPath
);
await
executor
.
killChildProcesses
(
gpuJobPidPath
,
true
);
}
executorManager
.
releaseAllExecutor
();
}
...
...
@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
timer
.
unsubscribe
(
disposable
);
}
}
if
(
this
.
stopping
){
this
.
timer
.
unsubscribe
(
disposable
);
this
.
log
.
debug
(
`Stopped GPU collector on
${
rmMeta
.
ip
}
, since experiment is exiting.`
);
}
collectingCount
.
pop
();
}
}
...
...
src/nni_manager/training_service/remote_machine/shellExecutor.ts
View file @
be09f11c
...
...
@@ -230,8 +230,8 @@ class ShellExecutor {
return
result
!==
undefined
?
result
:
false
;
}
public
async
killChildProcesses
(
pidFileName
:
string
):
Promise
<
boolean
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
killChildProcesses
(
pidFileName
);
public
async
killChildProcesses
(
pidFileName
:
string
,
killSelf
:
boolean
=
false
):
Promise
<
boolean
>
{
const
commandText
=
this
.
osCommands
&&
this
.
osCommands
.
killChildProcesses
(
pidFileName
,
killSelf
);
const
commandResult
=
await
this
.
execute
(
commandText
);
return
commandResult
.
exitCode
==
0
;
}
...
...
tools/nni_gpu_tool/gpu_metrics_collector.py
View file @
be09f11c
...
...
@@ -11,31 +11,9 @@ import traceback
from
xml.dom
import
minidom
def
check_ready_to_run
():
if
sys
.
platform
==
'win32'
:
pgrep_output
=
subprocess
.
check_output
(
'wmic process where "CommandLine like
\'
%nni_gpu_tool.gpu_metrics_collector%
\'
and name like
\'
%python%
\'
" get processId'
)
pidList
=
pgrep_output
.
decode
(
"utf-8"
).
strip
().
split
()
pidList
.
pop
(
0
)
# remove the key word 'ProcessId'
pidList
=
list
(
map
(
int
,
pidList
))
pidList
.
remove
(
os
.
getpid
())
return
not
pidList
else
:
pgrep_output
=
subprocess
.
check_output
(
'pgrep -afu "$(whoami)"
\'
python3 -m nni_gpu_tool.gpu_metrics_collector
\'
'
,
shell
=
True
)
pidList
=
[]
for
pid
in
pgrep_output
.
splitlines
():
pid
=
pid
.
decode
()
if
"pgrep "
in
pid
or
pid
.
startswith
(
'%s '
%
os
.
getpid
())
or
pid
.
startswith
(
'%s '
%
os
.
getppid
()):
continue
pidList
.
append
(
pid
)
return
not
pidList
def
main
(
argv
):
metrics_output_dir
=
os
.
environ
[
'METRIC_OUTPUT_DIR'
]
if
check_ready_to_run
()
==
False
:
print
(
"GPU metrics collector is already running. exiting..."
)
exit
(
2
)
cmd
=
'nvidia-smi -q -x'
.
split
()
while
(
True
):
try
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment