nnimanager.ts 30.3 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
goooxu's avatar
goooxu committed
24
import { ChildProcess, spawn, StdioOptions } from 'child_process';
Deshui Yu's avatar
Deshui Yu committed
25
26
27
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
28
import { NNIError } from '../common/errors';
29
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo';
Deshui Yu's avatar
Deshui Yu committed
30
31
32
import { getLogger, Logger } from '../common/log';
import {
    ExperimentParams, ExperimentProfile, Manager,
33
    NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
Deshui Yu's avatar
Deshui Yu committed
34
35
36
37
} from '../common/manager';
import {
    TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
38
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils';
Deshui Yu's avatar
Deshui Yu committed
39
import {
chicm-ms's avatar
chicm-ms committed
40
    ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
chicm-ms's avatar
chicm-ms committed
41
    REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
Deshui Yu's avatar
Deshui Yu committed
42
} from './commands';
43
import { createDispatcherInterface, IpcInterface } from './ipcInterface';
Deshui Yu's avatar
Deshui Yu committed
44
45

/**
chicm-ms's avatar
chicm-ms committed
46
 * NNIManager which implements Manager interface
Deshui Yu's avatar
Deshui Yu committed
47
48
49
 */
class NNIManager implements Manager {
    private trainingService: TrainingService;
50
    private dispatcher: IpcInterface | undefined;
51
    private currSubmittedTrialNum: number;  // need to be recovered
QuanluZhang's avatar
QuanluZhang committed
52
    private trialConcurrencyChange: number; // >0: increase, <0: decrease
Deshui Yu's avatar
Deshui Yu committed
53
54
55
56
    private customizedTrials: string[]; // need to be recovered
    private log: Logger;
    private dataStore: DataStore;
    private experimentProfile: ExperimentProfile;
57
    private dispatcherPid: number;
58
    private status: NNIManagerStatus;
QuanluZhang's avatar
QuanluZhang committed
59
60
    private waitingTrials: string[];
    private trialJobs: Map<string, TrialJobDetail>;
Deshui Yu's avatar
Deshui Yu committed
61
62
63

    /**
     * Builds a manager in its initial state: zeroed counters, empty trial queues,
     * an empty experiment profile and an 'INITIALIZED' status without errors.
     * The TrainingService and DataStore instances are obtained through component.get().
     */
    constructor() {
        // Plain in-memory bookkeeping.
        this.currSubmittedTrialNum = 0;
        this.trialConcurrencyChange = 0;
        this.dispatcherPid = 0;
        this.customizedTrials = [];
        this.waitingTrials = [];
        this.trialJobs = new Map<string, TrialJobDetail>();

        // Collaborators; a missing training service is treated as a programming error.
        this.trainingService = component.get(TrainingService);
        assert(this.trainingService);
        this.log = getLogger();
        this.dataStore = component.get(DataStore);

        this.experimentProfile = this.createEmptyExperimentProfile();
        this.status = { status: 'INITIALIZED', errors: [] };
    }

    /**
     * Copies one parameter from the supplied profile into the managed experiment profile
     * and then persists the managed profile.
     *
     * @param experimentProfile profile carrying the new parameter value
     * @param updateType selects which parameter of experimentProfile.params is applied
     * @returns the promise produced by storeExperimentProfile()
     * @throws Error (synchronously) when updateType is not one of the handled kinds
     */
    public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        if (updateType === 'TRIAL_CONCURRENCY') {
            this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
        } else if (updateType === 'MAX_EXEC_DURATION') {
            this.updateMaxExecDuration(experimentProfile.params.maxExecDuration);
        } else if (updateType === 'SEARCH_SPACE') {
            this.updateSearchSpace(experimentProfile.params.searchSpace);
        } else if (updateType === 'MAX_TRIAL_NUM') {
            this.updateMaxTrialNum(experimentProfile.params.maxTrialNum);
        } else {
            throw new Error('Error: unrecognized updateType');
        }

        return this.storeExperimentProfile();
    }

    /**
     * Queues user-provided hyper-parameters as a customized trial and records the request
     * in the data store.
     *
     * @param hyperParams hyper-parameters of the customized trial, as a string
     * @returns the promise of the data store write; rejected with 'reach maxTrialNum'
     *          when the number of submitted trials already reached maxTrialNum
     */
    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        const limitReached: boolean = this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum;
        if (limitReached) {
            return Promise.reject(new Error('reach maxTrialNum'));
        }

        this.customizedTrials.push(hyperParams);

        // No trial id exists at this point, so the event is stored with an empty id.
        return this.dataStore.storeTrialJobEvent('ADD_CUSTOMIZED', '', hyperParams);
    }

    /**
     * Cancels a trial job on behalf of the user.
     *
     * @param trialJobId id of the trial job to cancel
     * @returns a promise settled after the 'USER_TO_CANCEL' event has been stored
     */
    public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
        // The event is only recorded once the training service accepted the cancellation.
        await this.trainingService.cancelTrialJob(trialJobId);

        return this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
    }

    /**
     * Starts a new experiment: stores the parameters, launches the dispatcher process,
     * marks the experiment as RUNNING and kicks off the background loops.
     *
     * The background loops (run()) are not awaited; their failure is routed to criticalError().
     *
     * @param expParams parameters of the experiment to start
     * @returns the id of the experiment profile
     */
    public async startExperiment(expParams: ExperimentParams): Promise<string> {
        this.log.debug(`Starting experiment: ${this.experimentProfile.id}`);
        this.experimentProfile.params = expParams;
        await this.storeExperimentProfile();
        this.log.debug('Setup tuner...');

        // Set up multiphase config.
        // Awaited so that a failure surfaces to the caller instead of becoming an unhandled
        // rejection, and so that the setting is applied before the dispatcher is launched.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.advisor,
            expParams.multiPhase, expParams.multiThread);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);
        const checkpointDir: string = await this.createCheckpointDir();
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'start',
            checkpointDir);

        this.experimentProfile.startTime = Date.now();
        this.status.status = 'RUNNING';
        await this.storeExperimentProfile();

        // Fire and forget: run() only settles when the experiment loops end.
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });

        return this.experimentProfile.id;
    }

    /**
     * Resumes a previously stored experiment: reloads its profile from the data store,
     * relaunches the dispatcher in 'resume' mode, marks trial jobs that were still
     * WAITING or RUNNING as FAILED, and restarts the background loops.
     *
     * The background loops (run()) are not awaited; their failure is routed to criticalError().
     */
    public async resumeExperiment(): Promise<void> {
        // Fetch back the experiment profile
        const experimentId: string = getExperimentId();
        this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
        const expParams: ExperimentParams = this.experimentProfile.params;

        setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1);

        // Set up multiphase config.
        // Awaited so that a failure surfaces to the caller instead of becoming an unhandled
        // rejection, and so that the setting is applied before the dispatcher is launched.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.advisor,
            expParams.multiPhase, expParams.multiThread);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);
        const checkpointDir: string = await this.createCheckpointDir();
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'resume',
            checkpointDir);

        const allTrialJobs: TrialJobInfo[] = await this.dataStore.listTrialJobs();

        // Resume currSubmittedTrialNum
        this.currSubmittedTrialNum = allTrialJobs.length;

        // Jobs recorded as WAITING or RUNNING did not reach a final state before the
        // previous run ended; record them as FAILED.
        await Promise.all(allTrialJobs
            .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING')
            .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id)));

        // An experiment that still has duration and trial budget left is not finished,
        // so a previously recorded end time is dropped.
        if (this.experimentProfile.execDuration < this.experimentProfile.params.maxExecDuration &&
            this.currSubmittedTrialNum < this.experimentProfile.params.maxTrialNum &&
            this.experimentProfile.endTime) {
            delete this.experimentProfile.endTime;
        }
        this.status.status = 'RUNNING';

        // TO DO: update database record for resume event
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });
    }

    /**
     * Looks up a trial job through the training service.
     *
     * @param trialJobId id of the trial job
     * @returns a promise for the trial job detail reported by the training service
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const lookup = this.trainingService.getTrialJob(trialJobId);

        return Promise.resolve(lookup);
    }

    /**
     * Forwards a cluster metadata entry to the training service, failing if the training
     * service has not answered within 10 seconds.
     *
     * @param key metadata key
     * @param value metadata value
     * @throws Error with a timeout message when the 10 second limit elapses first
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        let timer: NodeJS.Timer;
        // TO DO: move timeout value to constants file
        const timeoutMs: number = 10000;
        const deadline: Promise<{}> = new Promise((resolve: Function, reject: Function): void => {
            timer = setTimeout(
                () => { reject(new Error('TrainingService setClusterMetadata timeout. Please check your config file.')); },
                timeoutMs);
        });
        const update = this.trainingService.setClusterMetadata(key, value);

        // Whichever settles first decides the outcome; the timer is always cleared afterwards.
        await Promise.race([deadline, update]).finally(() => {
            clearTimeout(timer);
        });
    }

    /**
     * Reads a cluster metadata entry from the training service.
     *
     * @param key metadata key
     * @returns a promise for the value reported by the training service
     */
    public getClusterMetadata(key: string): Promise<string> {
        const lookup = this.trainingService.getClusterMetadata(key);

        return Promise.resolve(lookup);
    }

    /**
     * Fetches the trial job statistics from the data store.
     *
     * @returns the statistics as returned by the data store
     */
    public async getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = await this.dataStore.getTrialJobStatistics();

        return statistics;
    }

225
    /**
     * Stops the experiment: switches the status to 'STOPPING' and then runs the
     * clean-up routine, logging before and after it.
     */
    public async stopExperiment(): Promise<void> {
        // Set first so the polling loops observe the state change while clean-up runs.
        this.status.status = 'STOPPING';

        this.log.info('Experiment done, cleaning up...');
        await this.experimentDoneCleanUp();
        this.log.info('Experiment done.');
    }

232
    /**
     * Fetches metric records from the data store.
     *
     * @param trialJobId optional trial job id, passed through to the data store
     * @param metricType optional metric type, passed through to the data store
     * @returns the metric records returned by the data store
     */
    public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
        const records: MetricDataRecord[] = await this.dataStore.getMetricData(trialJobId, metricType);

        return records;
    }

    /**
     * Returns the experiment profile currently held in memory.
     *
     * @returns an already-resolved promise for the in-memory profile object
     */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        // TO DO: using Promise.resolve()
        const settled: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>();
        settled.resolve(this.experimentProfile);

        return settled.promise;
    }

244
245
246
247
    /**
     * Returns the manager's status object (the live object, not a copy).
     */
    public getStatus(): NNIManagerStatus {
        const current: NNIManagerStatus = this.status;

        return current;
    }

Deshui Yu's avatar
Deshui Yu committed
248
249
250
251
    /**
     * Lists trial jobs recorded in the data store.
     *
     * @param status optional status, passed through to the data store
     * @returns the trial job records returned by the data store
     */
    public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = await this.dataStore.listTrialJobs(status);

        return jobs;
    }

252
253
    /**
     * Spawns the dispatcher process and wires it to an IPC interface. Does nothing when a
     * dispatcher has already been set up.
     *
     * @param command command line to run; executed through a shell
     * @param cwd working directory of the process; the log directory is used when it is
     *            undefined or empty
     * @param mode exported to the process as NNI_MODE
     * @param dataDirectory exported to the process as NNI_CHECKPOINT_DIRECTORY
     */
    private setupTuner(command: string, cwd: string | undefined, mode: 'start' | 'resume', dataDirectory: string): void {
        if (this.dispatcher !== undefined) {
            return;
        }

        // stdin is ignored, stdout/stderr are shared with this process, and two extra pipes
        // are opened (presumably the channel used by the IPC interface - confirm in ipcInterface).
        const stdio: StdioOptions = ['ignore', process.stdout, process.stderr, 'pipe', 'pipe'];
        const workingDir: string = (cwd === undefined || cwd === '') ? getLogDir() : cwd;

        // TO DO: add CUDA_VISIBLE_DEVICES
        // The child inherits the current environment plus the NNI specific variables.
        const dispatcherEnv = Object.assign({}, process.env, {
            NNI_MODE: mode,
            NNI_CHECKPOINT_DIRECTORY: dataDirectory,
            NNI_LOG_DIRECTORY: getLogDir()
        });

        const dispatcherProc: ChildProcess = spawn(command, [], {
            stdio,
            cwd: workingDir,
            env: dispatcherEnv,
            shell: true
        });
        this.dispatcherPid = dispatcherProc.pid;
        this.dispatcher = createDispatcherInterface(dispatcherProc);
    }

    /**
     * Applies a new trial concurrency. The difference to the previous value is accumulated
     * in trialConcurrencyChange (>0: increase, <0: decrease).
     *
     * @param trialConcurrency new concurrency; assumed to be >= 0, which is checked by restserver
     */
    private updateTrialConcurrency(trialConcurrency: number): void {
        const delta: number = trialConcurrency - this.experimentProfile.params.trialConcurrency;
        this.trialConcurrencyChange += delta;
        this.experimentProfile.params.trialConcurrency = trialConcurrency;
    }

    /**
     * Overwrites the maximum execution duration of the in-memory experiment profile.
     *
     * @param duration new value for params.maxExecDuration
     */
    private updateMaxExecDuration(duration: number): void {
        const params: ExperimentParams = this.experimentProfile.params;
        params.maxExecDuration = duration;
    }

    /**
     * Sends the new search space to the dispatcher and records it in the in-memory
     * experiment profile.
     *
     * @param searchSpace new search space, as a string
     * @throws Error when the dispatcher has not been set up
     */
    private updateSearchSpace(searchSpace: string): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, searchSpace);
        this.experimentProfile.params.searchSpace = searchSpace;
    }

QuanluZhang's avatar
QuanluZhang committed
306
307
308
309
310
311
    /**
     * Overwrites the maximum number of trials of the in-memory experiment profile.
     *
     * @param maxTrialNum new value for params.maxTrialNum
     */
    private updateMaxTrialNum(maxTrialNum: number): void {
        const params: ExperimentParams = this.experimentProfile.params;
        params.maxTrialNum = maxTrialNum;
    }

Deshui Yu's avatar
Deshui Yu committed
312
    /**
     * Shuts the experiment down: asks the dispatcher to terminate, waits up to 30 seconds
     * for its process to exit (killing it afterwards if it is still alive), cancels trial
     * jobs that are still RUNNING or WAITING, cleans up the training service, stores the
     * end time and finally sets the status to 'STOPPED'.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private async experimentDoneCleanUp(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(TERMINATE);

        // gracefully terminate tuner and assessor here, wait at most 30 seconds.
        let tunerAlive: boolean = true;
        for (let i: number = 0; i < 30 && tunerAlive; i++) {
            try {
                // Signal 0 only probes for the existence of the process.
                await cpp.exec(`kill -0 ${this.dispatcherPid}`);
            } catch (error) {
                tunerAlive = false;
            }
            if (tunerAlive) {
                await delay(1000);
            }
        }

        // Only signal the pid while the process is still known to exist; once the probe
        // failed, the pid may have been reused by an unrelated process.
        if (tunerAlive) {
            try {
                await cpp.exec(`kill ${this.dispatcherPid}`);
            } catch (error) {
                // the process exited between the last probe and the kill, do nothing here
            }
        }

        const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
        // TO DO: to promise all
        for (const trialJob of trialJobList) {
            if (trialJob.status === 'RUNNING' ||
                trialJob.status === 'WAITING') {
                try {
                    await this.trainingService.cancelTrialJob(trialJob.id);
                } catch (error) {
                    // Best effort: the job may already be gone. Keep a trace for diagnosis.
                    this.log.debug(`Failed to cancel trial job ${trialJob.id} during clean up: ${error}`);
                }
            }
        }
        await this.trainingService.cleanUp();
        this.experimentProfile.endTime = Date.now();
        await this.storeExperimentProfile();
        this.status.status = 'STOPPED';
    }

    /**
     * Background loop that, once per second, adds one to execDuration while the status is
     * 'RUNNING' and persists the profile on every 10th tick. Ends when the status becomes
     * 'STOPPING' or 'STOPPED'.
     */
    private async periodicallyUpdateExecDuration(): Promise<void> {
        for (let tick: number = 1; this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED'; tick++) {
            await delay(1000 * 1); // 1 seconds
            if (this.status.status !== 'RUNNING') {
                // Time spent in other states is not counted, but the tick still advances.
                continue;
            }
            this.experimentProfile.execDuration += 1;
            if (tick % 10 === 0) {
                await this.storeExperimentProfile();
            }
        }
    }

chicm-ms's avatar
chicm-ms committed
363
364
365
366
367
368
369
370
371
372
    /**
     * Background loop that sends a PING command to the dispatcher every 5 seconds until
     * the status is 'ERROR', 'STOPPING' or 'STOPPED'.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private async pingDispatcher(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        const finalStates: string[] = ['ERROR', 'STOPPING', 'STOPPED'];
        while (!finalStates.includes(this.status.status)) {
            await delay(1000 * 5);
            this.dispatcher.sendCommand(PING);
        }
    }

QuanluZhang's avatar
QuanluZhang committed
373
374
    /**
     * Polls the training service for every tracked trial job, stores status changes in the
     * data store, and reports jobs that reached a final status to the dispatcher with a
     * TRIAL_END command. Finished jobs are removed from the tracked set.
     *
     * @returns the number of tracked jobs found in a final status during this poll
     * @throws Error when the dispatcher has not been set up, or when a finished job's
     *         form is not of job type 'TRIAL'
     */
    private async requestTrialJobsStatus(): Promise<number> {
        let finishedCount: number = 0;
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        // Iterate over a copy of the keys because entries are deleted inside the loop.
        for (const trialJobId of Array.from(this.trialJobs.keys())) {
            const latest: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
            const known: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
            if (known !== undefined && known.status !== latest.status) {
                this.trialJobs.set(trialJobId, Object.assign({}, latest));
                await this.dataStore.storeTrialJobEvent(latest.status, latest.id, undefined, latest);
            }

            switch (latest.status) {
                case 'SUCCEEDED':
                case 'USER_CANCELED':
                case 'EARLY_STOPPED':
                case 'FAILED':
                case 'SYS_CANCELED': {
                    // In the current version, failed or system-canceled jobs are not retried
                    // TO DO: push this job to queue for retry
                    this.trialJobs.delete(trialJobId);
                    finishedCount++;
                    if (latest.form.jobType !== 'TRIAL') {
                        throw new Error('Error: jobType error, not TRIAL');
                    }
                    const hyperParams: string | undefined = (<TrialJobApplicationForm>latest.form).hyperParameters.value;
                    this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({
                        trial_job_id: latest.id,
                        event: latest.status,
                        hyper_params: hyperParams
                    }));
                    break;
                }
                case 'WAITING':
                case 'RUNNING':
                case 'UNKNOWN':
                    // Do nothing
                    break;
                default:
                // TO DO: add warning in log
            }
        }

        return finishedCount;
    }

    /**
     * Main trial scheduling loop. Every 5 seconds, until the status is 'STOPPING' or
     * 'STOPPED', it:
     *  1. polls the status of tracked trial jobs,
     *  2. requests new trials (customized ones first, then from the tuner),
     *  3. updates the experiment status against maxExecDuration / maxTrialNum,
     *  4. submits waiting trials up to the configured concurrency.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private async manageTrials(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        let totalFinished: number = this.currSubmittedTrialNum;
        while (this.status.status !== 'STOPPING' && this.status.status !== 'STOPPED') {
            const newlyFinished: number = await this.requestTrialJobsStatus();
            totalFinished += newlyFinished;

            // requestTrialNum is the number of trials that will be requested from tuner.
            // Without a concurrency change it equals the number of newly finished trials.
            // A pending increase (e.g. trialConcurrencyChange=2) is added on top and then consumed.
            // A pending decrease (e.g. trialConcurrencyChange=-4 with 2 newly finished trials) yields
            // a negative number (-2): nothing is requested and the remainder stays pending.
            const requestTrialNum: number = this.trialConcurrencyChange + newlyFinished;
            this.trialConcurrencyChange = requestTrialNum >= 0 ? 0 : requestTrialNum;

            // Customized trials are served before asking the tuner for generated ones.
            // The count never exceeds the queue length, so every shift() yields an entry.
            const requestCustomTrialNum: number = Math.min(requestTrialNum, this.customizedTrials.length);
            for (let served: number = 0; served < requestCustomTrialNum; served++) {
                const customParams: string | undefined = this.customizedTrials.shift();
                this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, customParams);
            }

            const tunerTrialNum: number = requestTrialNum - requestCustomTrialNum;
            if (tunerTrialNum > 0) {
                this.requestTrialJobs(tunerTrialNum);
            }

            // check maxtrialnum and maxduration here
            // NO_MORE_TRIAL is more like a subset of RUNNING, because during RUNNING the tuner
            // might tell nnimanager that there are no more trials. In NO_MORE_TRIAL state, the
            // experiment is viewed as still running. DONE can be reached from RUNNING or NO_MORE_TRIAL.
            assert(this.status.status === 'RUNNING' ||
                this.status.status === 'DONE' ||
                this.status.status === 'NO_MORE_TRIAL' ||
                this.status.status === 'TUNER_NO_MORE_TRIAL');
            if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
                this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                if (this.status.status !== 'DONE') {
                    this.status.status = 'NO_MORE_TRIAL';
                    const waitSubmittedToFinish: number = this.currSubmittedTrialNum;

                    assert(totalFinished <= waitSubmittedToFinish);
                    if (totalFinished >= waitSubmittedToFinish) {
                        // Every submitted trial has finished: the experiment is complete.
                        this.status.status = 'DONE';
                        this.experimentProfile.endTime = Date.now();
                        await this.storeExperimentProfile();
                        // write this log for travis CI
                        this.log.info('Experiment done.');
                    }
                }
            } else {
                if (this.status.status === 'DONE') {
                    // The limits no longer hold (they can be updated at runtime), so the
                    // recorded end time is dropped again.
                    delete this.experimentProfile.endTime;
                    await this.storeExperimentProfile();
                }
                if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
                    this.status.status = 'RUNNING';
                }

                // Fill free slots; the concurrency limit is re-read on every iteration.
                for (let slot: number = this.trialJobs.size; slot < this.experimentProfile.params.trialConcurrency; slot++) {
                    if (this.waitingTrials.length === 0 ||
                        this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                        break;
                    }
                    const hyperParams: string | undefined = this.waitingTrials.shift();
                    if (hyperParams === undefined) {
                        throw new Error(`Error: invalid hyper-parameters for job submission: ${hyperParams}`);
                    }
                    this.currSubmittedTrialNum++;
                    const submissionForm: TrialJobApplicationForm = {
                        jobType: 'TRIAL',
                        hyperParameters: {
                            value: hyperParams,
                            index: 0
                        }
                    };
                    const submitted: TrialJobDetail = await this.trainingService.submitTrialJob(submissionForm);
                    await this.storeMaxSequenceId(submitted.sequenceId);

                    // Track a shallow copy and store the event from that tracked snapshot.
                    this.trialJobs.set(submitted.id, Object.assign({}, submitted));
                    const snapshot: TrialJobDetail | undefined = this.trialJobs.get(submitted.id);
                    if (snapshot === undefined) {
                        assert(false, `undefined trialJobDetail in trialJobs: ${submitted.id}`);
                    } else {
                        await this.dataStore.storeTrialJobEvent(
                            snapshot.status, snapshot.id, hyperParams, snapshot);
                    }
                }
            }
            await delay(1000 * 5); // 5 seconds
        }
    }

Deshui Yu's avatar
Deshui Yu committed
534
535
536
537
538
539
    /**
     * Increments the profile revision and writes the in-memory experiment profile to the
     * data store.
     *
     * @returns the promise of the data store write
     */
    private storeExperimentProfile(): Promise<void> {
        const profile: ExperimentProfile = this.experimentProfile;
        profile.revision += 1;

        return this.dataStore.storeExperimentProfile(profile);
    }

540
    /**
     * Registers the event listeners, sends the initial command to the dispatcher and then
     * runs the background loops concurrently. Failures of the ping, training service and
     * trial management loops are re-thrown as NNIError tagged with their origin.
     */
    private async run(): Promise<void> {
        assert(this.dispatcher !== undefined);

        this.addEventListeners();
        this.sendInitTunerCommands();

        // Builds a rejection handler that re-throws the failure as an NNIError named `title`.
        const rethrowAs = (title: string) => (err: Error): never => {
            throw new NNIError(title, `${title}: ${err.message}`, err);
        };

        // The loops are started in this order and awaited together.
        const durationLoop = this.periodicallyUpdateExecDuration();
        const pingLoop = this.pingDispatcher().catch(rethrowAs('Dispatcher error'));
        const trainingLoop = this.trainingService.run().catch(rethrowAs('Training service error'));
        const trialLoop = this.manageTrials().catch(rethrowAs('Job management error'));

        await Promise.all([durationLoop, pingLoop, trainingLoop, trialLoop]);
    }

QuanluZhang's avatar
QuanluZhang committed
560
    /**
     * Subscribes to trial metrics from the training service and to commands from the
     * dispatcher. A failure inside either handler is reported through criticalError().
     *
     * @throws Error when the dispatcher has not been set up
     */
    private addEventListeners(): void {
        // TO DO: cannot run this method more than once in one NNIManager instance
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner or job maintainer have not been setup');
        }

        const handleMetric = (metric: TrialJobMetric): void => {
            this.onTrialJobMetrics(metric).catch((err: Error) => {
                this.criticalError(new NNIError('Job metrics error', `Job metrics error: ${err.message}`, err));
            });
        };
        const handleCommand = (commandType: string, content: string): void => {
            this.onTunerCommand(commandType, content).catch((err: Error) => {
                this.criticalError(new NNIError('Tuner command event error', `Tuner command event error: ${err.message}`, err));
            });
        };

        this.trainingService.addTrialJobMetricListener(handleMetric);
        this.dispatcher.onCommand(handleCommand);
    }

    /**
     * Sends the INITIALIZE command, carrying the experiment's search space, to the
     * dispatcher.
     *
     * @throws Error when the dispatcher has not been set up
     */
    private sendInitTunerCommands(): void {
        if (this.dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }
        this.log.debug(`Send tuner command: INITIALIZE: ${this.experimentProfile.params.searchSpace}`);
        // Tuner needs to be initialized with the search space before generating any hyper parameters
        this.dispatcher.sendCommand(INITIALIZE, this.experimentProfile.params.searchSpace);
    }

    /**
     * Handles a metric reported by a trial job: the metric is first written to the
     * data store and then forwarded to the dispatcher as REPORT_METRIC_DATA.
     *
     * @param metric the reported metric; its id and data fields are used
     * @throws Error when the dispatcher has not been set up (checked only after the
     *         metric has been stored)
     */
    private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
        await this.dataStore.storeMetricData(metric.id, metric.data);

        if (this.dispatcher !== undefined) {
            this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);

            return;
        }

        throw new Error('Error: tuner has not been setup');
    }

chicm-ms's avatar
chicm-ms committed
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
    /**
     * Asks the dispatcher to generate hyper parameters for new trial jobs.
     *
     * Does nothing when jobNum is below 1. In multi-thread mode one
     * REQUEST_TRIAL_JOBS command with content '1' is sent per requested job;
     * otherwise a single command carrying the whole count is sent.
     *
     * @param jobNum number of trial jobs to request
     * @throws Error when the dispatcher has not been set up
     */
    private requestTrialJobs(jobNum: number): void {
        if (jobNum < 1) {
            return;
        }
        if (this.dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }

        if (!this.experimentProfile.params.multiThread) {
            this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(jobNum));

            return;
        }

        // A single REQUEST_TRIAL_JOBS request generates its hyper parameters one by one,
        // sequentially; issuing separate single-job requests lets them be generated in a
        // non-blocking way.
        let sent: number = 0;
        while (sent < jobNum) {
            this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
            sent += 1;
        }
    }

614
615
616
    /**
     * Dispatches a command received from the tuner side.
     *
     * - INITIALIZED: requests trialConcurrency trial jobs.
     * - NEW_TRIAL_JOB: queues the content in waitingTrials; if the status was
     *   TUNER_NO_MORE_TRIAL it is reset to RUNNING and a warning is logged.
     * - SEND_TRIAL_JOB_PARAMETER: parses the content as JSON, updates the referenced
     *   trial job with the new hyper parameters and records an ADD_HYPERPARAMETER event.
     * - NO_MORE_TRIAL_JOBS: sets the status to TUNER_NO_MORE_TRIAL.
     * - KILL_TRIAL_JOB: cancels the trial job identified by the JSON-parsed content.
     *
     * @param commandType command identifier received from the tuner
     * @param content command payload as a string
     * @throws Error when the command type is not one of the cases above
     */
    private async onTunerCommand(commandType: string, content: string): Promise<void> {
        this.log.info(`Command from tuner: ${commandType}, ${content}`);
        switch (commandType) {
            case INITIALIZED:
                // Tuner is initialized, search space is set, request tuner to generate hyper parameters
                this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
                break;
            case NEW_TRIAL_JOB:
                if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
                    this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
                    this.status.status = 'RUNNING';
                }
                this.waitingTrials.push(content);
                break;
            case SEND_TRIAL_JOB_PARAMETER: {
                // Braces keep the declarations below scoped to this case only.
                const tunerCommand: any = JSON.parse(content);
                assert(tunerCommand.parameter_index >= 0);
                assert(tunerCommand.trial_job_id !== undefined);

                const trialJobForm: TrialJobApplicationForm = {
                    jobType: 'TRIAL',
                    hyperParameters: {
                        value: content,
                        index: tunerCommand.parameter_index
                    }
                };
                await this.trainingService.updateTrialJob(tunerCommand.trial_job_id, trialJobForm);
                await this.dataStore.storeTrialJobEvent(
                    'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
                break;
            }
            case NO_MORE_TRIAL_JOBS:
                this.status.status = 'TUNER_NO_MORE_TRIAL';
                break;
            case KILL_TRIAL_JOB:
                await this.trainingService.cancelTrialJob(JSON.parse(content), true);
                break;
            default:
                throw new Error('Error: unsupported command type from tuner');
        }
    }

655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
    /**
     * Reports an error through logError and additionally prints it with
     * console.error.
     *
     * @param err the error to report
     */
    private criticalError(err: Error): void {
        this.logError(err);
        console.error(err);
    }

    /**
     * Records an error on the manager status.
     *
     * The stack trace is written to the log when the error has one. The error
     * message is always appended to status.errors and the status is set to 'ERROR'.
     *
     * @param err the error to record
     */
    private logError(err: Error): void {
        const trace: string | undefined = err.stack;
        if (trace !== undefined) {
            this.log.error(trace);
        }

        const status = this.status;
        status.errors.push(err.message);
        status.status = 'ERROR';
    }

    /**
     * Builds a blank experiment profile.
     *
     * The id comes from getExperimentId() and the log directory from
     * getExperimentRootDir(); every numeric field is 0 and every string parameter
     * is the empty string.
     *
     * @returns a freshly created ExperimentProfile object
     */
    private createEmptyExperimentProfile(): ExperimentProfile {
        return {
            id: getExperimentId(),
            revision: 0,
            execDuration: 0,
            logDir: getExperimentRootDir(),
            maxSequenceId: 0,
            params: {
                authorName: '',
                experimentName: '',
                trialConcurrency: 0,
                maxExecDuration: 0, // unit: second
                maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs
                trainingServicePlatform: '',
                searchSpace: ''
            }
        };
    }
686

QuanluZhang's avatar
QuanluZhang committed
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
    /**
     * Creates the checkpoint directory and records its path on every configured
     * component (advisor, tuner, assessor) of the experiment parameters.
     *
     * @returns the checkpoint directory path obtained from getCheckpointDir()
     */
    private async createCheckpointDir(): Promise<string> {
        // TODO: test
        const chkpDir: string = getCheckpointDir();
        // create checkpoint directory
        await mkDirP(chkpDir);
        // assign this directory to exp profile's checkpointDir
        if (this.experimentProfile.params.advisor) {
            this.experimentProfile.params.advisor.checkpointDir = chkpDir;
        }
        if (this.experimentProfile.params.tuner) {
            this.experimentProfile.params.tuner.checkpointDir = chkpDir;
        }
        if (this.experimentProfile.params.assessor) {
            this.experimentProfile.params.assessor.checkpointDir = chkpDir;
        }

        // An async function already wraps its return value in a promise.
        return chkpDir;
    }
goooxu's avatar
goooxu committed
705

706
707
708
709
710
711
    /**
     * Raises the profile's maxSequenceId to the given value and persists the
     * profile, but only when the given value is greater than the stored one.
     *
     * @param sequenceId candidate sequence id
     */
    private async storeMaxSequenceId(sequenceId: number): Promise<void> {
        const isNewMaximum: boolean = sequenceId > this.experimentProfile.maxSequenceId;
        if (!isNewMaximum) {
            return;
        }

        this.experimentProfile.maxSequenceId = sequenceId;
        await this.storeExperimentProfile();
    }
Deshui Yu's avatar
Deshui Yu committed
712
713
714
}

export { NNIManager };