Commit 21a1dd8b (unverified), authored by J-shang, committed by GitHub
Browse files

Fix AML outputs and Python process not being killed (#3321)

* fix AML outputs and Python process not being killed

* change the stop-environment logic used during cleanup

* fix bug
parent a0aa12f9
...@@ -114,7 +114,7 @@ export class AMLEnvironmentService extends EnvironmentService { ...@@ -114,7 +114,7 @@ export class AMLEnvironmentService extends EnvironmentService {
} }
const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp");
environment.command = `import os\nos.system('${amlEnvironment.command}')`; environment.command = `import os\nos.system('mv envs outputs/envs && cd outputs && ${amlEnvironment.command}')`;
environment.useActiveGpu = this.amlClusterConfig.useActiveGpu; environment.useActiveGpu = this.amlClusterConfig.useActiveGpu;
environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu; environment.maxTrialNumberPerGpu = this.amlClusterConfig.maxTrialNumPerGpu;
await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' });
......
...@@ -111,6 +111,10 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ ...@@ -111,6 +111,10 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
} }
public async stopEnvironment(environment: EnvironmentInformation): Promise<void> { public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
if (environment.isAlive === false) {
return Promise.resolve();
}
const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`; const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
const pid: string = await fs.promises.readFile(jobpidPath, 'utf8'); const pid: string = await fs.promises.readFile(jobpidPath, 'utf8');
tkill(Number(pid), 'SIGKILL'); tkill(Number(pid), 'SIGKILL');
......
...@@ -219,6 +219,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -219,6 +219,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
public async stopEnvironment(environment: EnvironmentInformation): Promise<void> { public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
if (environment.isAlive === false) {
return Promise.resolve();
}
if (this.paiClusterConfig === undefined) { if (this.paiClusterConfig === undefined) {
return Promise.reject(new Error('PAI Cluster config is not initialized')); return Promise.reject(new Error('PAI Cluster config is not initialized'));
} }
......
...@@ -289,6 +289,10 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ ...@@ -289,6 +289,10 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
} }
public async stopEnvironment(environment: EnvironmentInformation): Promise<void> { public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
if (environment.isAlive === false) {
return Promise.resolve();
}
const executor = await this.getExecutor(environment.id); const executor = await this.getExecutor(environment.id);
if (environment.status === 'UNKNOWN') { if (environment.status === 'UNKNOWN') {
......
...@@ -310,14 +310,12 @@ class TrialDispatcher implements TrainingService { ...@@ -310,14 +310,12 @@ class TrialDispatcher implements TrainingService {
for (let index = 0; index < environments.length; index++) { for (let index = 0; index < environments.length; index++) {
const environment = environments[index]; const environment = environments[index];
if (environment.isAlive === true) { this.log.info(`stopping environment ${environment.id}...`);
this.log.info(`stopping environment ${environment.id}...`); if (environment.environmentService === undefined) {
if (environment.environmentService === undefined) { throw new Error(`${environment.id} do not have environmentService!`);
throw new Error(`${environment.id} do not have environmentService!`);
}
await environment.environmentService.stopEnvironment(environment);
this.log.info(`stopped environment ${environment.id}.`);
} }
await environment.environmentService.stopEnvironment(environment);
this.log.info(`stopped environment ${environment.id}.`);
} }
this.commandEmitter.off("command", this.handleCommand); this.commandEmitter.off("command", this.handleCommand);
for (const commandChannel of this.commandChannelSet) { for (const commandChannel of this.commandChannelSet) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment