Commit a922f9f0 (unverified), authored by chicm-ms and committed by GitHub
Browse files

Cancel unknown trial job (#1943)

parent 7565d3c0
...@@ -151,18 +151,20 @@ abstract class PAITrainingService implements TrainingService { ...@@ -151,18 +151,20 @@ abstract class PAITrainingService implements TrainingService {
public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> { public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
const deferred: Deferred<void> = new Deferred<void>();
if (trialJobDetail === undefined) { if (trialJobDetail === undefined) {
this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`); return Promise.reject(new Error(`cancelTrialJob: trial job id ${trialJobId} not found`));
return Promise.reject();
} }
if (this.paiClusterConfig === undefined) { if (this.paiClusterConfig === undefined) {
throw new Error('PAI Cluster config is not initialized'); return Promise.reject(new Error('PAI Cluster config is not initialized'));
} }
if (this.paiToken === undefined) { if (this.paiToken === undefined) {
throw new Error('PAI token is not initialized'); return Promise.reject(new Error('PAI token is not initialized'));
}
if (trialJobDetail.status === 'UNKNOWN') {
trialJobDetail.status = 'USER_CANCELED';
return Promise.resolve();
} }
const stopJobRequest: request.Options = { const stopJobRequest: request.Options = {
...@@ -179,6 +181,7 @@ abstract class PAITrainingService implements TrainingService { ...@@ -179,6 +181,7 @@ abstract class PAITrainingService implements TrainingService {
// Set trialjobDetail's early stopped field, to mark the job's cancellation source // Set trialjobDetail's early stopped field, to mark the job's cancellation source
trialJobDetail.isEarlyStopped = isEarlyStopped; trialJobDetail.isEarlyStopped = isEarlyStopped;
const deferred: Deferred<void> = new Deferred<void>();
request(stopJobRequest, (error: Error, response: request.Response, body: any) => { request(stopJobRequest, (error: Error, response: request.Response, body: any) => {
if ((error !== undefined && error !== null) || response.statusCode >= 400) { if ((error !== undefined && error !== null) || response.statusCode >= 400) {
......
...@@ -277,6 +277,12 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -277,6 +277,12 @@ class RemoteMachineTrainingService implements TrainingService {
throw new Error(`Invalid job id ${trialJobId}, cannot find ssh client`); throw new Error(`Invalid job id ${trialJobId}, cannot find ssh client`);
} }
if (trialJob.status === 'UNKNOWN') {
this.releaseTrialSSHClient(trialJob);
trialJob.status = 'USER_CANCELED';
return
}
const jobpidPath: string = this.getJobPidPath(trialJob.id); const jobpidPath: string = this.getJobPidPath(trialJob.id);
try { try {
// Mark the toEarlyStop tag here // Mark the toEarlyStop tag here
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment