Unverified commit 80624de7, authored by SparkSnail and committed by GitHub
Browse files

Update pai token by time interval (#434)

Update pai token every 2 hours.
parent 23530bb6
...@@ -64,6 +64,8 @@ class PAITrainingService implements TrainingService { ...@@ -64,6 +64,8 @@ class PAITrainingService implements TrainingService {
private stopping: boolean = false; private stopping: boolean = false;
private hdfsClient: any; private hdfsClient: any;
private paiToken? : string; private paiToken? : string;
private paiTokenUpdateTime?: number;
private paiTokenUpdateInterval: number;
private experimentId! : string; private experimentId! : string;
private readonly paiJobCollector : PAIJobInfoCollector; private readonly paiJobCollector : PAIJobInfoCollector;
private readonly hdfsDirPattern: string; private readonly hdfsDirPattern: string;
...@@ -83,6 +85,7 @@ class PAITrainingService implements TrainingService { ...@@ -83,6 +85,7 @@ class PAITrainingService implements TrainingService {
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?'; this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
this.paiTokenUpdateInterval = 7200000; //2hours
} }
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -90,6 +93,7 @@ class PAITrainingService implements TrainingService { ...@@ -90,6 +93,7 @@ class PAITrainingService implements TrainingService {
await restServer.start(); await restServer.start();
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
await this.updatePaiToken();
await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig); await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
await delay(3000); await delay(3000);
} }
...@@ -347,40 +351,8 @@ class PAITrainingService implements TrainingService { ...@@ -347,40 +351,8 @@ class PAITrainingService implements TrainingService {
}); });
// Get PAI authentication token // Get PAI authentication token
const authentication_req: request.Options = { await this.updatePaiToken();
uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/token`, break;
method: 'POST',
json: true,
body: {
username: this.paiClusterConfig.userName,
password: this.paiClusterConfig.passWord
}
};
request(authentication_req, (error: Error, response: request.Response, body: any) => {
if (error) {
this.log.error(`Get PAI token failed: ${error.message}`);
deferred.reject(new Error(`Get PAI token failed: ${error.message}`));
} else {
if(response.statusCode !== 200){
this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`);
deferred.reject(new Error(`Get PAI token failed, please check paiConfig username or password`));
}
this.paiToken = body.token;
deferred.resolve();
}
});
let timeoutId: NodeJS.Timer;
const timeoutDelay: Promise<void> = new Promise<void>((resolve: Function, reject: Function): void => {
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId = setTimeout(
() => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')),
5000);
});
return Promise.race([timeoutDelay, deferred.promise]).finally(() => clearTimeout(timeoutId));
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
if (!this.paiClusterConfig){ if (!this.paiClusterConfig){
...@@ -487,6 +459,60 @@ class PAITrainingService implements TrainingService { ...@@ -487,6 +459,60 @@ class PAITrainingService implements TrainingService {
return this.nextTrialSequenceId++; return this.nextTrialSequenceId++;
} }
/**
 * Fetch the PAI authentication token, or refresh the cached one once it is older
 * than `paiTokenUpdateInterval` milliseconds.
 *
 * The token is requested from the PAI rest server with the username/password stored
 * in `paiClusterConfig`. On success `paiToken` and `paiTokenUpdateTime` are updated.
 * On any failure both fields are left untouched, so the next call retries instead of
 * waiting for a full interval with an invalid token.
 *
 * @returns a promise resolved once the token is up to date (immediately when the
 *          cached token is still within the update interval)
 * @throws Error when `paiClusterConfig` has not been set, when the token request
 *         fails or returns a non-200 status code, or when no answer arrives within
 *         5 seconds
 */
private async updatePaiToken(): Promise<void> {
    const currentTime: number = new Date().getTime();
    // If pai token is initialized and the interval has not elapsed, do not update
    if (this.paiTokenUpdateTime && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) {
        return;
    }

    if (!this.paiClusterConfig) {
        const paiClusterConfigError: string = `pai cluster config not initialized!`;
        this.log.error(paiClusterConfigError);
        throw new Error(paiClusterConfigError);
    }

    // Only allocated once we know a request is really going to be sent
    const deferred: Deferred<void> = new Deferred<void>();
    const authentication_req: request.Options = {
        uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/token`,
        method: 'POST',
        json: true,
        body: {
            username: this.paiClusterConfig.userName,
            password: this.paiClusterConfig.passWord
        }
    };

    request(authentication_req, (error: Error, response: request.Response, body: any) => {
        if (error) {
            this.log.error(`Get PAI token failed: ${error.message}`);
            deferred.reject(new Error(`Get PAI token failed: ${error.message}`));

            return;
        }
        if (response.statusCode !== 200) {
            this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`);
            deferred.reject(new Error(`Get PAI token failed, please check paiConfig username or password`));

            // Stop here: the body of a failed login carries no token, and recording an
            // update time would suppress any retry until the interval has elapsed.
            return;
        }
        this.paiToken = body.token;
        this.paiTokenUpdateTime = new Date().getTime();
        deferred.resolve();
    });

    let timeoutId: NodeJS.Timer;
    const timeoutDelay: Promise<void> = new Promise<void>((resolve: Function, reject: Function): void => {
        // Set timeout and reject the promise once reach timeout (5 seconds)
        timeoutId = setTimeout(
            () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')),
            5000);
    });

    // Whichever settles first wins; the timer is always cleared afterwards.
    // The request itself is not aborted on timeout, so a late successful answer
    // can still store the token for the next call.
    return Promise.race([timeoutDelay, deferred.promise]).finally(() => clearTimeout(timeoutId));
}
} }
export { PAITrainingService } export { PAITrainingService }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment