nnimanager.ts 25.2 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import { ChildProcess, spawn } from 'child_process';
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
28
import { NNIError } from '../common/errors';
Deshui Yu's avatar
Deshui Yu committed
29
30
31
32
import { getExperimentId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
    ExperimentParams, ExperimentProfile, Manager,
33
    NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
Deshui Yu's avatar
Deshui Yu committed
34
35
36
37
} from '../common/manager';
import {
    TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
38
import { delay , getLogDir, getMsgDispatcherCommand} from '../common/utils';
Deshui Yu's avatar
Deshui Yu committed
39
40
import {
    ADD_CUSTOMIZED_TRIAL_JOB, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, REPORT_METRIC_DATA,
chicm-ms's avatar
chicm-ms committed
41
    REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
Deshui Yu's avatar
Deshui Yu committed
42
} from './commands';
43
import { createDispatcherInterface, IpcInterface } from './ipcInterface';
Deshui Yu's avatar
Deshui Yu committed
44
45
46
47
48
49

/**
 * NNIManager
 */
class NNIManager implements Manager {
    /** Training service used to submit, query and cancel trial jobs. Assigned once in the constructor. */
    private readonly trainingService: TrainingService;
    /** IPC channel to the dispatcher process; stays undefined until setupTuner() has run. */
    private dispatcher: IpcInterface | undefined;
    /** Number of trial jobs submitted so far. */
    private currSubmittedTrialNum: number; // need to be recovered
    /** Pending change of trial concurrency that manageTrials() still has to apply. */
    private trialConcurrencyChange: number; // >0: increase, <0: decrease
    /** Hyper-parameter strings added by the user, consumed in FIFO order by manageTrials(). */
    private readonly customizedTrials: string[]; // need to be recovered
    private readonly log: Logger;
    private readonly dataStore: DataStore;
    /** Current experiment profile; replaced as a whole by resumeExperiment(). */
    private experimentProfile: ExperimentProfile;
    /** PID of the spawned dispatcher process, 0 until setupTuner() has run. */
    private dispatcherPid: number;
    /** Status object handed out by getStatus(); its members are mutated in place. */
    private readonly status: NNIManagerStatus;
    /** Hyper-parameter strings received from the tuner that have not been submitted yet. */
    private readonly waitingTrials: string[];
    /** Last known detail of every trial job that has not reached a final status, keyed by job id. */
    private readonly trialJobs: Map<string, TrialJobDetail>;
    /** Seconds the experiment has spent in the 'DONE' status; subtracted from the execution duration. */
    private suspendDuration: number;
Deshui Yu's avatar
Deshui Yu committed
62
63
64

    /**
     * Resolve the collaborating components and put every bookkeeping field
     * into its initial state. The manager starts in status 'INITIALIZED'.
     */
    constructor() {
        // Collaborators (same resolution order as the field assignments always had).
        this.trainingService = component.get(TrainingService);
        assert(this.trainingService);
        this.log = getLogger();
        this.dataStore = component.get(DataStore);

        // Counters.
        this.currSubmittedTrialNum = 0;
        this.trialConcurrencyChange = 0;
        this.suspendDuration = 0;
        this.dispatcherPid = 0;

        // Queues and job table.
        this.customizedTrials = [];
        this.waitingTrials = [];
        this.trialJobs = new Map<string, TrialJobDetail>();

        // Experiment state.
        this.experimentProfile = this.createEmptyExperimentProfile();
        this.status = { status: 'INITIALIZED', errors: [] };
    }

    /**
     * Apply one kind of update to the running experiment and persist the profile.
     *
     * The method is `async` so that every failure (an unrecognized update type,
     * or a search-space update before the dispatcher exists) is reported as a
     * rejected promise instead of a synchronous throw that `.catch()` callers
     * would never see.
     *
     * @param experimentProfile profile carrying the new parameter value
     * @param updateType which parameter of `experimentProfile.params` to take over
     * @returns a promise that settles once the updated profile has been stored
     */
    public async updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        switch (updateType) {
            case 'TRIAL_CONCURRENCY':
                this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
                break;
            case 'MAX_EXEC_DURATION':
                this.updateMaxExecDuration(experimentProfile.params.maxExecDuration);
                break;
            case 'SEARCH_SPACE':
                this.updateSearchSpace(experimentProfile.params.searchSpace);
                break;
            case 'MAX_TRIAL_NUM':
                this.updateMaxTrialNum(experimentProfile.params.maxTrialNum);
                break;
            default:
                throw new Error('Error: unrecognized updateType');
        }

        await this.storeExperimentProfile();
    }

    /**
     * Queue a user-supplied hyper-parameter set and record the request in the data store.
     *
     * @param hyperParams hyper-parameters of the customized trial
     * @returns a promise rejected with 'reach maxTrialNum' when the submitted trial
     *          count has already reached the configured maximum
     */
    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        const limitReached: boolean = this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum;
        if (limitReached) {
            return Promise.reject(new Error('reach maxTrialNum'));
        }

        this.customizedTrials.push(hyperParams);

        // No trial id exists at this point, so the event is stored with an empty id.
        const placeholderId: string = '';

        return this.dataStore.storeTrialJobEvent('ADD_CUSTOMIZED', placeholderId, hyperParams);
    }

    /**
     * Cancel a trial job on behalf of the user, then record a 'USER_TO_CANCEL' event for it.
     *
     * @param trialJobId id of the trial job to cancel
     */
    public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
        const noHyperParams: string = '';

        // The event is only stored after the training service accepted the cancellation.
        await this.trainingService.cancelTrialJob(trialJobId);
        await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, noHyperParams);
    }

    /**
     * Start a new experiment: persist its parameters, launch the dispatcher
     * process and kick off the main loops in the background.
     *
     * @param expParams parameters of the experiment to start
     * @returns the id of the experiment
     */
    public async startExperiment(expParams: ExperimentParams): Promise<string> {
        this.log.debug(`Starting experiment: ${this.experimentProfile.id}`);
        this.experimentProfile.params = expParams;
        await this.storeExperimentProfile();
        this.log.debug('Setup tuner...');

        // Set up multiphase config.
        // The call is awaited so that a failure rejects this method instead of
        // surfacing as an unhandled rejection, and so that the setting is in
        // place before the dispatcher is started.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'start',
            expParams.tuner.checkpointDir);

        this.experimentProfile.startTime = Date.now();
        this.status.status = 'EXPERIMENT_RUNNING';
        await this.storeExperimentProfile();

        // run() is intentionally not awaited; its failures are routed to criticalError().
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });

        return this.experimentProfile.id;
    }

    /**
     * Resume the experiment identified by getExperimentId(): reload its profile
     * from the data store, restart the dispatcher in 'resume' mode, mark trial
     * jobs that were still WAITING or RUNNING as FAILED, and start the main
     * loops in the background.
     */
    public async resumeExperiment(): Promise<void> {
        //Fetch back the experiment profile
        const experimentId: string = getExperimentId();
        this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
        const expParams: ExperimentParams = this.experimentProfile.params;

        // Set up multiphase config.
        // Awaited so that a failure rejects this method instead of becoming an
        // unhandled rejection.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'resume',
            expParams.tuner.checkpointDir);

        const allTrialJobs: TrialJobInfo[] = await this.dataStore.listTrialJobs();

        // Resume currSubmittedTrialNum
        this.currSubmittedTrialNum = allTrialJobs.length;

        // Check the final status for WAITING and RUNNING jobs
        await Promise.all(allTrialJobs
            .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING')
            .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id)));

        this.status.status = 'EXPERIMENT_RUNNING';

        // TO DO: update database record for resume event
        // Same failure handling as startExperiment(): the error is logged and
        // the manager status switches to 'ERROR'.
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });
    }

    /**
     * Look up a trial job in the training service.
     *
     * @param trialJobId id of the trial job
     * @returns a promise for the job detail reported by the training service
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const lookup = this.trainingService.getTrialJob(trialJobId);

        return Promise.resolve(lookup);
    }

    /**
     * Forward a metadata key/value pair to the training service, failing if the
     * training service has not answered within 10 seconds.
     *
     * @param key metadata key
     * @param value metadata value
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        // TO DO: move timeout value to constants file
        const timeoutMs: number = 10000;
        let timer: NodeJS.Timer;

        // Promise that never resolves and rejects once the timeout elapses.
        const deadline: Promise<{}> = new Promise<{}>((_resolve, reject): void => {
            timer = setTimeout(
                () => { reject(new Error('TrainingService setClusterMetadata timeout. Please check your config file.')); },
                timeoutMs);
        });
        // Whichever side wins the race, the timer must not be left running.
        const stopTimer = (): void => {
            clearTimeout(timer);
        };

        const update = this.trainingService.setClusterMetadata(key, value);
        await Promise.race([deadline, update]).finally(stopTimer);
    }

    /**
     * Read a metadata value from the training service.
     *
     * @param key metadata key
     * @returns a promise for the value reported by the training service
     */
    public getClusterMetadata(key: string): Promise<string> {
        const lookup = this.trainingService.getClusterMetadata(key);

        return Promise.resolve(lookup);
    }

    /**
     * Fetch the trial job statistics from the data store.
     */
    public async getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = await this.dataStore.getTrialJobStatistics();

        return statistics;
    }

    /**
     * Request the experiment to stop by switching the manager status to 'STOPPING'.
     * manageTrials() leaves its loop and runs the clean-up when it sees this status.
     *
     * @returns an already resolved promise
     */
    public stopExperiment(): Promise<void> {
        const managerStatus: NNIManagerStatus = this.status;
        managerStatus.status = 'STOPPING';

        return Promise.resolve();
    }

221
    /**
     * Fetch metric records from the data store.
     *
     * @param trialJobId optional trial job id passed through to the data store
     * @param metricType optional metric type passed through to the data store
     */
    public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
        const records: MetricDataRecord[] = await this.dataStore.getMetricData(trialJobId, metricType);

        return records;
    }

    /**
     * Get the experiment profile currently held in memory.
     *
     * @returns an already resolved promise for the profile object
     */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        return Promise.resolve(this.experimentProfile);
    }

233
234
235
236
    /**
     * Get the manager status object (the live object, not a copy).
     */
    public getStatus(): NNIManagerStatus {
        const current: NNIManagerStatus = this.status;

        return current;
    }

Deshui Yu's avatar
Deshui Yu committed
237
238
239
240
    /**
     * List trial jobs recorded in the data store.
     *
     * @param status optional status passed through to the data store
     */
    public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = await this.dataStore.listTrialJobs(status);

        return jobs;
    }

241
242
    /**
     * Spawn the dispatcher process and open the IPC interface to it.
     * Does nothing when a dispatcher has already been set up.
     *
     * @param command shell command line that starts the dispatcher
     * @param cwd working directory of the process; the log directory is used when undefined or empty
     * @param mode exported to the process as NNI_MODE
     * @param dataDirectory exported to the process as NNI_CHECKPOINT_DIRECTORY
     */
    private setupTuner(command: string, cwd: string | undefined, mode: 'start' | 'resume', dataDirectory: string): void {
        if (this.dispatcher !== undefined) {
            return;
        }

        // stdin is ignored, stdout/stderr are shared with this process, and two
        // additional pipes (fd 3 and 4) are opened for the child.
        const stdio: (string | NodeJS.WriteStream)[] = ['ignore', process.stdout, process.stderr, 'pipe', 'pipe'];
        const workingDir: string = (cwd === undefined || cwd === '') ? getLogDir() : cwd;

        // TO DO: add CUDA_VISIBLE_DEVICES
        // The child inherits this process' environment plus the NNI specific variables.
        const childEnv = Object.assign({}, process.env, {
            NNI_MODE: mode,
            NNI_CHECKPOINT_DIRECTORY: dataDirectory,
            NNI_LOG_DIRECTORY: getLogDir()
        });

        const dispatcherProc: ChildProcess = spawn(command, [], {
            stdio,
            cwd: workingDir,
            env: childEnv,
            shell: true
        });
        this.dispatcherPid = dispatcherProc.pid;
        this.dispatcher = createDispatcherInterface(dispatcherProc);
    }

    /**
     * Take over a new trial concurrency and remember the difference to the
     * previous value so that manageTrials() can adjust its requests to the tuner.
     *
     * @param trialConcurrency new concurrency
     */
    private updateTrialConcurrency(trialConcurrency: number): void {
        // we assume trialConcurrency >= 0, which is checked by restserver
        const params: ExperimentParams = this.experimentProfile.params;
        const delta: number = trialConcurrency - params.trialConcurrency;

        this.trialConcurrencyChange += delta;
        params.trialConcurrency = trialConcurrency;
    }

    /**
     * Take over a new maximum execution duration.
     *
     * @param duration new value for `params.maxExecDuration`
     */
    private updateMaxExecDuration(duration: number): void {
        const params: ExperimentParams = this.experimentProfile.params;
        params.maxExecDuration = duration;
    }

    /**
     * Send a new search space to the dispatcher and store it in the profile.
     *
     * @param searchSpace new search space
     * @throws Error when the dispatcher has not been set up
     */
    private updateSearchSpace(searchSpace: string): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        // The profile is only changed after the command has been handed to the dispatcher.
        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, searchSpace);
        this.experimentProfile.params.searchSpace = searchSpace;
    }

QuanluZhang's avatar
QuanluZhang committed
295
296
297
298
299
300
    /**
     * Take over a new maximum number of trials.
     *
     * @param maxTrialNum new value for `params.maxTrialNum`
     */
    private updateMaxTrialNum(maxTrialNum: number): void {
        const params: ExperimentParams = this.experimentProfile.params;
        params.maxTrialNum = maxTrialNum;
    }

Deshui Yu's avatar
Deshui Yu committed
301
    /**
     * Shut the experiment down: ask the dispatcher to terminate, kill it if
     * needed, cancel all RUNNING/WAITING trial jobs, clean up the training
     * service, persist the end time and set the status to 'STOPPED'.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private async experimentDoneCleanUp(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(TERMINATE);

        // gracefully terminate tuner and assessor here, wait at most 30 seconds.
        for (let i: number = 0; i < 30; i++) {
            try {
                // `kill -0` sends no signal; the command fails once the process is gone.
                await cpp.exec(`kill -0 ${this.dispatcherPid}`);
            } catch (error) {
                // The dispatcher has exited: stop polling right away instead of
                // sleeping for another second.
                break;
            }
            await delay(1000);
        }
        try {
            await cpp.exec(`kill ${this.dispatcherPid}`);
        } catch (error) {
            // the process with dispatcherPid does not exist any more, do nothing here
        }

        const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
        // TO DO: to promise all
        for (const trialJob of trialJobList) {
            if (trialJob.status === 'RUNNING' ||
                trialJob.status === 'WAITING') {
                try {
                    await this.trainingService.cancelTrialJob(trialJob.id);
                } catch (error) {
                    // pid does not exist, do nothing here
                }
            }
        }
        await this.trainingService.cleanUp();
        this.experimentProfile.endTime = Date.now();
        await this.storeExperimentProfile();
        this.status.status = 'STOPPED';
    }

    /**
     * Every 10 minutes, recompute the execution duration of the experiment and
     * persist the profile. The loop never terminates on its own.
     */
    private async periodicallyUpdateExecDuration(): Promise<void> {
        const tenMinutesMs: number = 1000 * 60 * 10;
        const loopStart: number = Date.now();
        // Duration accumulated before this loop was started.
        const baseDuration: number = this.experimentProfile.execDuration;

        while (true) {
            await delay(tenMinutesMs); // 10 minutes
            const elapsedSeconds: number = (Date.now() - loopStart) / 1000;
            // Time spent suspended does not count as execution time.
            this.experimentProfile.execDuration = baseDuration + elapsedSeconds - this.suspendDuration;
            await this.storeExperimentProfile();
        }
    }

QuanluZhang's avatar
QuanluZhang committed
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
    /**
     * Refresh the status of every tracked trial job from the training service.
     * Status changes are recorded in the data store, and jobs that reached a
     * final status are removed from the tracking table.
     *
     * @returns the number of jobs found in a final status during this pass
     */
    private async requestTrialJobsStatus(): Promise<number> {
        const result: Deferred<number> = new Deferred<number>();
        let finishedCount: number = 0;

        // Iterate over a snapshot of the ids because entries are deleted while looping.
        const trackedIds: string[] = Array.from(this.trialJobs.keys());
        for (const jobId of trackedIds) {
            const latest: TrialJobDetail = await this.trainingService.getTrialJob(jobId);
            const cached: TrialJobDetail | undefined = this.trialJobs.get(jobId);
            const statusChanged: boolean = cached !== undefined && cached.status !== latest.status;
            if (statusChanged) {
                this.trialJobs.set(jobId, Object.assign({}, latest));
                await this.dataStore.storeTrialJobEvent(latest.status, latest.id, undefined, latest.url);
            }

            switch (latest.status) {
                case 'SUCCEEDED':
                case 'USER_CANCELED':
                case 'FAILED':
                case 'SYS_CANCELED':
                    // In the current version, failed jobs are not retried
                    // TO DO: push FAILED / SYS_CANCELED jobs to queue for retry
                    this.trialJobs.delete(jobId);
                    finishedCount++;
                    break;
                default:
                    // 'WAITING', 'RUNNING', 'UNKNOWN': nothing to do
                    // TO DO: add warning in log for any other status
            }
        }
        result.resolve(finishedCount);

        return result.promise;
    }

    /**
     * Main scheduling loop of the experiment.
     *
     * Every 5 seconds it refreshes the status of the tracked trial jobs, asks the
     * dispatcher for new trials, switches the manager status between
     * 'EXPERIMENT_RUNNING' and 'DONE' depending on the duration / trial-number
     * limits, and submits waiting trials up to the configured concurrency.
     * The loop ends when the status is 'STOPPING'; the experiment is then cleaned up.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private async manageTrials(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        let allFinishedTrialJobNum: number = 0;
        const startTime: number = Date.now();
        // Timestamp at which the status last switched from 'EXPERIMENT_RUNNING' to 'DONE'; 0 if it never did.
        let suspendStartTime: number = 0;
        for (; ;) {
            if (this.status.status === 'STOPPING') {
                break;
            }
            const finishedTrialJobNum: number = await this.requestTrialJobsStatus();

            allFinishedTrialJobNum += finishedTrialJobNum;
            if (allFinishedTrialJobNum >= this.experimentProfile.params.maxTrialNum) {
                // write this log for travis CI
                this.log.info('Experiment done.');
            }

            // requestTrialNum is the number of trials that will be requested from tuner.
            // If trialConcurrency does not change, requestTrialNum equals finishedTrialJobNum.
            // If trialConcurrency changes, for example, trialConcurrency increases by 2 (trialConcurrencyChange=2), then
            // requestTrialNum equals 2 + finishedTrialJobNum and trialConcurrencyChange becomes 0.
            // If trialConcurrency changes, for example, trialConcurrency decreases by 4 (trialConcurrencyChange=-4) and 
            // finishedTrialJobNum is 2, then requestTrialNum becomes -2. No trial will be requested from tuner,
            // and trialConcurrencyChange becomes -2.
            const requestTrialNum: number = this.trialConcurrencyChange + finishedTrialJobNum;
            if (requestTrialNum >= 0) {
                this.trialConcurrencyChange = 0;
            } else {
                this.trialConcurrencyChange = requestTrialNum;
            }
            for (let i: number = 0; i < requestTrialNum; i++) {
                // ask tuner for more trials
                // customized trials queued by the user are sent before plain requests
                if (this.customizedTrials.length > 0) {
                    const hyperParams: string | undefined = this.customizedTrials.shift();
                    this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, hyperParams);
                } else {
                    this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
                }
            }

            // check maxtrialnum and maxduration here
            if ((Date.now() - startTime) / 1000 + this.experimentProfile.execDuration - this.suspendDuration 
                > this.experimentProfile.params.maxExecDuration ||
                this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                // A limit is reached: no new trial is submitted in this iteration.
                assert(this.status.status === 'EXPERIMENT_RUNNING' || this.status.status === 'DONE');
                if (this.status.status === 'EXPERIMENT_RUNNING') {
                    // remember when the experiment entered 'DONE' so the suspended time can be measured later
                    suspendStartTime = Date.now();
                }
                this.status.status = 'DONE';
            } else {
                if (this.status.status === 'DONE') {
                    // Leaving 'DONE' (limits are no longer exceeded): account for the time spent suspended.
                    assert(suspendStartTime !== 0);
                    this.suspendDuration += (Date.now() - suspendStartTime) / 1000;
                }
                this.status.status = 'EXPERIMENT_RUNNING';
                // Submit waiting trials until the number of tracked jobs reaches trialConcurrency.
                for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
                    if (this.waitingTrials.length === 0 ||
                        this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                        break;
                    }
                    const hyperParams: string | undefined = this.waitingTrials.shift();
                    if (hyperParams === undefined) {
                        throw new Error(`Error: invalid hyper-parameters for job submission: ${hyperParams}`);
                    }
                    this.currSubmittedTrialNum++;
                    const trialJobAppForm: TrialJobApplicationForm = {
                        jobType: 'TRIAL',
                        hyperParameters: {
                            value: hyperParams,
                            index: 0
                        }
                    };
                    const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
                    // A shallow copy is tracked, not the object returned by the training service.
                    this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
                    const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id);
                    if (trialJobDetailSnapshot != undefined) {
                        await this.dataStore.storeTrialJobEvent(
                            trialJobDetailSnapshot.status, trialJobDetailSnapshot.id, hyperParams, trialJobDetailSnapshot.url);
                    } else {
                        assert(false, `undefined trialJobDetail in trialJobs: ${trialJobDetail.id}`);
                    }
                }
            }
            await delay(1000 * 5); // 5 seconds
        }
        
        // Only reached after the status was set to 'STOPPING'.
        this.log.info('Experiment done, cleaning up...');
        await this.experimentDoneCleanUp();
        this.log.info('Experiment done.');
    }

Deshui Yu's avatar
Deshui Yu committed
479
480
481
482
483
484
    /**
     * Bump the profile revision by one and persist the profile in the data store.
     *
     * @returns the promise returned by the data store
     */
    private storeExperimentProfile(): Promise<void> {
        const profile: ExperimentProfile = this.experimentProfile;
        profile.revision = profile.revision + 1;

        return this.dataStore.storeExperimentProfile(profile);
    }

485
    /**
     * Wire up the event listeners, send the initial commands to the dispatcher
     * and run the three long-running loops (duration bookkeeping, training
     * service, trial management) concurrently. Errors of the training service
     * and of the trial management are wrapped into NNIError.
     */
    private async run(): Promise<void> {
        assert(this.dispatcher !== undefined);

        this.addEventListeners();
        this.sendInitTunerCommands();

        // The loops are started in this order; Promise.all rejects as soon as one of them fails.
        const durationLoop: Promise<void> = this.periodicallyUpdateExecDuration();
        const trainingServiceLoop = this.trainingService.run().catch((err: Error) => {
            throw new NNIError('Training service error', `Training service error: ${err.message}`, err);
        });
        const trialLoop = this.manageTrials().catch((err: Error) => {
            throw new NNIError('Job management error', `Job management error: ${err.message}`, err);
        });

        await Promise.all([durationLoop, trainingServiceLoop, trialLoop]);
    }

QuanluZhang's avatar
QuanluZhang committed
502
    /**
     * Subscribe to trial metrics from the training service and to commands from
     * the dispatcher. Failures inside either handler are reported through
     * criticalError().
     *
     * @throws Error when the dispatcher has not been set up
     */
    private addEventListeners(): void {
        // TO DO: cannot run this method more than once in one NNIManager instance
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner or job maintainer have not been setup');
        }

        const onMetric = (metric: TrialJobMetric): void => {
            this.onTrialJobMetrics(metric).catch((err: Error) => {
                this.criticalError(new NNIError('Job metrics error', `Job metrics error: ${err.message}`, err));
            });
        };
        const onCommand = (commandType: string, content: string): void => {
            this.onTunerCommand(commandType, content).catch((err: Error) => {
                this.criticalError(new NNIError('Tuner command event error', `Tuner command event error: ${err.message}`, err));
            });
        };

        this.trainingService.addTrialJobMetricListener(onMetric);
        dispatcher.onCommand(onCommand);
    }

    /**
     * Send the initial commands to the dispatcher: the search space, followed
     * by a request for as many trials as the configured concurrency.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private sendInitTunerCommands(): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }
        // TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
        const params: ExperimentParams = this.experimentProfile.params;

        this.log.debug(`Send tuner command: update search space: ${params.searchSpace}`);
        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, params.searchSpace);

        this.log.debug(`Send tuner command: ${params.trialConcurrency}`);
        dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(params.trialConcurrency));
    }

    /**
     * Handle a metric reported by a trial job: store it in the data store, then
     * forward it to the dispatcher.
     *
     * @param metric metric reported by the training service
     * @throws Error when the dispatcher has not been set up (the metric has already been stored by then)
     */
    private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
        const { id, data } = metric;
        await this.dataStore.storeMetricData(id, data);

        // The dispatcher is looked up after the await, exactly when it is needed.
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        dispatcher.sendCommand(REPORT_METRIC_DATA, data);
    }

    /**
     * Handle a command received from the dispatcher.
     *
     * @param commandType command identifier (one of the constants from './commands')
     * @param content payload of the command
     * @throws Error for a command type that is not handled here
     */
    private async onTunerCommand(commandType: string, content: string): Promise<void> {
        this.log.info(`Command from tuner: ${commandType}, ${content}`);
        switch (commandType) {
            case NEW_TRIAL_JOB:
                // Queued here; manageTrials() performs the actual submission.
                this.waitingTrials.push(content);
                break;
            case SEND_TRIAL_JOB_PARAMETER: {
                const parsed: any = JSON.parse(content);
                assert(parsed.parameter_index >= 0);
                assert(parsed.trial_job_id !== undefined);

                // The raw command content is passed on as the hyper-parameter value.
                const updateForm: TrialJobApplicationForm = {
                    jobType: 'TRIAL',
                    hyperParameters: {
                        value: content,
                        index: parsed.parameter_index
                    }
                };
                await this.trainingService.updateTrialJob(parsed.trial_job_id, updateForm);
                await this.dataStore.storeTrialJobEvent(
                    'ADD_HYPERPARAMETER', parsed.trial_job_id, content, undefined);
                break;
            }
            case NO_MORE_TRIAL_JOBS:
                // ignore this event for now
                break;
            case KILL_TRIAL_JOB:
                await this.trainingService.cancelTrialJob(JSON.parse(content));
                break;
            default:
                throw new Error('Error: unsupported command type from tuner');
        }
    }

573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
    /**
     * Report an error that the manager cannot recover from: record it through
     * logError() and print it to stderr.
     *
     * @param err the error to report
     */
    private criticalError(err: Error): void {
        const failure: Error = err;
        this.logError(failure);
        console.error(failure);
    }

    /**
     * Write the stack trace of an error (if it has one) to the log, append its
     * message to the status errors and switch the manager status to 'ERROR'.
     *
     * @param err the error to record
     */
    private logError(err: Error): void {
        const { stack, message } = err;
        if (stack !== undefined) {
            this.log.error(stack);
        }

        const managerStatus: NNIManagerStatus = this.status;
        managerStatus.errors.push(message);
        managerStatus.status = 'ERROR';
    }

    /**
     * Build the placeholder profile used until real experiment parameters are
     * supplied: revision 0, no execution time and empty / zero parameters.
     *
     * @returns a new profile carrying the id returned by getExperimentId()
     */
    private createEmptyExperimentProfile(): ExperimentProfile {
        const emptyParams: ExperimentParams = {
            authorName: '',
            experimentName: '',
            trialConcurrency: 0,
            maxExecDuration: 0, // unit: second
            maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs
            trainingServicePlatform: '',
            searchSpace: '',
            tuner: {
                className: '',
                classArgs: {},
                checkpointDir: ''
            }
        };
        const profile: ExperimentProfile = {
            id: getExperimentId(),
            revision: 0,
            execDuration: 0,
            params: emptyParams
        };

        return profile;
    }
}

export { NNIManager };