Commit 14c1b31c, authored by demianzhang, committed by chicm-ms
Browse files

Fix failed to connect to PAI with http code:500 (#1176)

* Catch the error in pai training service

* no retry
parent 2039c1c6
...@@ -489,7 +489,10 @@ class PAITrainingService implements TrainingService { ...@@ -489,7 +489,10 @@ class PAITrainingService implements TrainingService {
await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient); await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient);
} catch (error) { } catch (error) {
this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`);
throw new Error(error.message); trialJobDetail.status = 'FAILED';
deferred.resolve(true);
return deferred.promise;
} }
// Step 3. Submit PAI job via Rest call // Step 3. Submit PAI job via Rest call
...@@ -510,7 +513,7 @@ class PAITrainingService implements TrainingService { ...@@ -510,7 +513,7 @@ class PAITrainingService implements TrainingService {
`Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body}`; `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body}`;
this.log.error(errorMessage); this.log.error(errorMessage);
trialJobDetail.status = 'FAILED'; trialJobDetail.status = 'FAILED';
deferred.reject(new Error(errorMessage)); deferred.resolve(true);
} else { } else {
trialJobDetail.submitTime = Date.now(); trialJobDetail.submitTime = Date.now();
deferred.resolve(true); deferred.resolve(true);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment