Unverified Commit be09f11c authored by Chi Song's avatar Chi Song Committed by GitHub
Browse files

Improve stability of remote training service. (#2474)

parent e640ad6f
...@@ -566,7 +566,7 @@ class NNIManager implements Manager { ...@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' || assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' || this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' || this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL'); this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration || if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') { if (this.status.status !== 'DONE') {
......
...@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands { ...@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result; return result;
} }
public killChildProcesses(pidFileName: string): string { public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code. // prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants () let command = `list_descendants ()
{ {
local children=$(ps -o pid= --ppid "$1") local children=$(ps -o pid= --ppid "$1")
...@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands { ...@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children" echo "$children"
} }
kill $(list_descendants \`cat '${pidFileName}'\`)` kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command; return command;
} }
......
...@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands { ...@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result; return result;
} }
public killChildProcesses(pidFileName: string): string { public killChildProcesses(pidFileName: string, killSelf: boolean): string {
const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` + let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` + `Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` + `if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force"}}` +
`kill-tree $ppid"`; `kill-tree $ppid"`;
if (killSelf){
command += `;Stop-Process -Id $ppid`;
}
return command; return command;
} }
......
...@@ -25,7 +25,7 @@ abstract class OsCommands { ...@@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string; public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string; public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean; public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string; public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string; public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string; public abstract executeScript(script: string, isFile: boolean): string;
......
...@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
} }
} }
if (restServer.getErrorMessage !== undefined) { if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true; this.stopping = true;
throw new Error(restServer.getErrorMessage);
} }
await delay(3000); await delay(3000);
} }
...@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) { if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`); this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid'); const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath); await executor.killChildProcesses(gpuJobPidPath, true);
} }
executorManager.releaseAllExecutor(); executorManager.releaseAllExecutor();
} }
...@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable); this.timer.unsubscribe(disposable);
} }
} }
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop(); collectingCount.pop();
} }
} }
......
...@@ -230,8 +230,8 @@ class ShellExecutor { ...@@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false; return result !== undefined ? result : false;
} }
public async killChildProcesses(pidFileName: string): Promise<boolean> { public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName); const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
const commandResult = await this.execute(commandText); const commandResult = await this.execute(commandText);
return commandResult.exitCode == 0; return commandResult.exitCode == 0;
} }
......
...@@ -11,31 +11,9 @@ import traceback ...@@ -11,31 +11,9 @@ import traceback
from xml.dom import minidom from xml.dom import minidom
def check_ready_to_run():
    """Report whether no other GPU metrics collector process is running.

    On Windows the process list is queried with ``wmic``; elsewhere with
    ``pgrep`` restricted to the current user. The current process (and, on
    non-Windows platforms, its parent) is ignored.

    Returns:
        bool: True when no other collector process was found, False otherwise.

    Raises:
        subprocess.CalledProcessError: if the query command fails for a reason
            other than pgrep reporting "no process matched".
    """
    if sys.platform == 'win32':
        wmic_output = subprocess.check_output(
            'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
        tokens = wmic_output.decode("utf-8").strip().split()
        own_pid = os.getpid()
        # Keep numeric tokens only: this drops the 'ProcessId' header and copes
        # with empty output, and filtering (instead of list.remove) does not
        # fail when the current pid is absent from the listing.
        other_pids = [int(token) for token in tokens
                      if token.isdigit() and int(token) != own_pid]
        return not other_pids

    try:
        pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
    except subprocess.CalledProcessError as err:
        # pgrep exits with status 1 when nothing matched, which simply means
        # no collector is running; any other failure is propagated.
        if err.returncode == 1:
            return True
        raise

    # Each output line starts with "<pid> "; skip this process and its parent.
    ignored_prefixes = ('%s ' % os.getpid(), '%s ' % os.getppid())
    for raw_line in pgrep_output.splitlines():
        line = raw_line.decode()
        # Lines mentioning "pgrep " belong to the query command itself.
        if "pgrep " in line or line.startswith(ignored_prefixes):
            continue
        return False
    return True
def main(argv): def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR'] metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split() cmd = 'nvidia-smi -q -x'.split()
while(True): while(True):
try: try:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment