Unverified Commit fba2dd5c authored by George Cheng's avatar George Cheng Committed by GitHub
Browse files

DLTS: Get NNI manager IP / port from endpoints (#2305)

parent 62d74565
...@@ -38,7 +38,9 @@ class DLTSTrainingService implements TrainingService { ...@@ -38,7 +38,9 @@ class DLTSTrainingService implements TrainingService {
private versionCheck: boolean = true; private versionCheck: boolean = true;
private logCollection: string = 'none'; private logCollection: string = 'none';
private isMultiPhase: boolean = false; private isMultiPhase: boolean = false;
private dltsRestServerHost: string;
private dltsRestServerPort?: number; private dltsRestServerPort?: number;
private jobMode: boolean;
private readonly trialJobsMap: Map<string, DLTSTrialJobDetail>; private readonly trialJobsMap: Map<string, DLTSTrialJobDetail>;
private nniManagerIpConfig?: NNIManagerIpConfig; private nniManagerIpConfig?: NNIManagerIpConfig;
...@@ -51,7 +53,9 @@ class DLTSTrainingService implements TrainingService { ...@@ -51,7 +53,9 @@ class DLTSTrainingService implements TrainingService {
this.trialJobsMap = new Map(); this.trialJobsMap = new Map();
this.jobQueue = []; this.jobQueue = [];
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.log.info('Construct DLTS training service.'); this.dltsRestServerHost = getIPV4Address();
this.jobMode = 'DLTS_JOB_ID' in process.env;
this.log.info(`Construct DLTS training service in ${this.jobMode ? 'job mode' : 'local mode'}.`);
} }
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -60,12 +64,70 @@ class DLTSTrainingService implements TrainingService { ...@@ -60,12 +64,70 @@ class DLTSTrainingService implements TrainingService {
await restServer.start(); await restServer.start();
restServer.setEnableVersionCheck = this.versionCheck; restServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`DLTS Training service rest server listening on: ${restServer.endPoint}`); this.log.info(`DLTS Training service rest server listening on: ${restServer.endPoint}`);
if (this.jobMode) {
await this.exposeRestServerPort(restServer.clusterRestServerPort);
} else {
this.dltsRestServerPort = restServer.clusterRestServerPort
}
await Promise.all([ await Promise.all([
this.statusCheckingLoop(), this.statusCheckingLoop(),
this.submitJobLoop()]); this.submitJobLoop()]);
this.log.info('DLTS training service exit.'); this.log.info('DLTS training service exit.');
} }
/**
 * Expose the NNI REST server port through the DLTS job endpoint API and
 * record the externally reachable address.
 *
 * About once per second the job's endpoint list is fetched from the DLTS
 * dashboard:
 *  - if no endpoint exists for `port`, one named "nni-rest-server" is requested;
 *  - if the endpoint exists and its status is 'running', its `nodeName` and
 *    `port` are stored in `dltsRestServerHost` / `dltsRestServerPort` and the
 *    method returns;
 *  - otherwise (endpoint not running yet, or a non-array response body) the
 *    list is simply polled again.
 *
 * Polling also ends, without setting host/port, once the service is stopping.
 *
 * @param port Pod-local port the REST server listens on.
 * @throws Error if the cluster config or the DLTS_JOB_ID environment variable
 *         is not set. Errors reported by a dashboard request are propagated
 *         as a rejection.
 */
private async exposeRestServerPort(port: number): Promise<void> {
    if (this.dltsClusterConfig == null) {
        throw Error('Cluster config is not set');
    }
    // Narrow instead of coercing with `+ ''`, which would silently put the
    // literal string "undefined" into the request URL.
    const jobId = process.env['DLTS_JOB_ID'];
    if (jobId === undefined) {
        throw Error('DLTS_JOB_ID is not set');
    }
    const { dashboard, cluster, email, password } = this.dltsClusterConfig;
    const uri = `${dashboard}api/clusters/${cluster}/jobs/${jobId}/endpoints`;
    const qs = { email, password };

    // Stop polling when the service is shutting down so that run() can return.
    while (!this.stopping) {
        this.log.debug('Checking endpoints');
        const endpoints = await new Promise<unknown>((resolve, reject) => {
            request.get(uri, { qs, json: true }, (error, _response, body) => {
                if (error) {
                    reject(error);
                } else {
                    resolve(body);
                }
            });
        });
        this.log.debug('Endpoints: %o', endpoints);

        if (Array.isArray(endpoints)) {
            const restServerEndpoint = endpoints.find(({ podPort }) => podPort === port);
            if (restServerEndpoint == null) {
                // Nothing registered for this port yet: ask DLTS to expose it.
                this.log.debug('Exposing %d', port);
                await new Promise<void>((resolve, reject) => {
                    request.post(uri, {
                        qs,
                        json: true,
                        body: {
                            endpoints: [{
                                name: "nni-rest-server",
                                podPort: port
                            }]
                        }
                    }, (error) => {
                        if (error) {
                            reject(error);
                        } else {
                            resolve();
                        }
                    });
                });
            } else if (restServerEndpoint['status'] === 'running') {
                // We get an exposed restserver port
                this.dltsRestServerHost = restServerEndpoint['nodeName'];
                this.dltsRestServerPort = restServerEndpoint['port'];
                return;
            }
        }

        // Wait one second before polling the endpoint list again.
        await new Promise<void>(resolve => setTimeout(resolve, 1000));
    }
}
private async statusCheckingLoop(): Promise<void> { private async statusCheckingLoop(): Promise<void> {
while (!this.stopping) { while (!this.stopping) {
const updateDLTSTrialJobs: Promise<void>[] = []; const updateDLTSTrialJobs: Promise<void>[] = [];
...@@ -400,7 +462,7 @@ class DLTSTrainingService implements TrainingService { ...@@ -400,7 +462,7 @@ class DLTSTrainingService implements TrainingService {
); );
} }
// tslint:disable-next-line: strict-boolean-expressions // tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : this.dltsRestServerHost;
const version: string = this.versionCheck ? await getVersion() : ''; const version: string = this.versionCheck ? await getVersion() : '';
const nniDLTSTrialCommand: string = String.Format( const nniDLTSTrialCommand: string = String.Format(
DLTS_TRIAL_COMMAND_FORMAT, DLTS_TRIAL_COMMAND_FORMAT,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment