Commit 14c1b31c authored by demianzhang, committed by chicm-ms
Browse files

Fix failed to connect to PAI with http code:500 (#1176)

* Catch the error in pai training service

* no retry
parent 2039c1c6
......@@ -489,7 +489,10 @@ class PAITrainingService implements TrainingService {
await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient);
} catch (error) {
this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`);
throw new Error(error.message);
trialJobDetail.status = 'FAILED';
deferred.resolve(true);
return deferred.promise;
}
// Step 3. Submit PAI job via Rest call
......@@ -510,7 +513,7 @@ class PAITrainingService implements TrainingService {
`Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body}`;
this.log.error(errorMessage);
trialJobDetail.status = 'FAILED';
deferred.reject(new Error(errorMessage));
deferred.resolve(true);
} else {
trialJobDetail.submitTime = Date.now();
deferred.resolve(true);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment