nnimanager.ts 36 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
goooxu's avatar
goooxu committed
24
import { ChildProcess, spawn, StdioOptions } from 'child_process';
Deshui Yu's avatar
Deshui Yu committed
25
26
27
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
28
import { NNIError } from '../common/errors';
29
import { getExperimentId } from '../common/experimentStartupInfo';
Deshui Yu's avatar
Deshui Yu committed
30
31
import { getLogger, Logger } from '../common/log';
import {
chicm-ms's avatar
chicm-ms committed
32
    ExperimentParams, ExperimentProfile, Manager, ExperimentStatus,
33
    NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
Deshui Yu's avatar
Deshui Yu committed
34
35
36
37
} from '../common/manager';
import {
    TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
38
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
Deshui Yu's avatar
Deshui Yu committed
39
import {
chicm-ms's avatar
chicm-ms committed
40
    ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
41
    REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE, IMPORT_DATA
Deshui Yu's avatar
Deshui Yu committed
42
} from './commands';
43
import { createDispatcherInterface, IpcInterface } from './ipcInterface';
Deshui Yu's avatar
Deshui Yu committed
44
45

/**
chicm-ms's avatar
chicm-ms committed
46
 * NNIManager which implements Manager interface
Deshui Yu's avatar
Deshui Yu committed
47
48
49
 */
class NNIManager implements Manager {
    private trainingService: TrainingService;
50
    private dispatcher: IpcInterface | undefined;
51
    private currSubmittedTrialNum: number;  // need to be recovered
QuanluZhang's avatar
QuanluZhang committed
52
    private trialConcurrencyChange: number; // >0: increase, <0: decrease
Deshui Yu's avatar
Deshui Yu committed
53
54
55
56
    private customizedTrials: string[]; // need to be recovered
    private log: Logger;
    private dataStore: DataStore;
    private experimentProfile: ExperimentProfile;
57
    private dispatcherPid: number;
58
    private status: NNIManagerStatus;
QuanluZhang's avatar
QuanluZhang committed
59
60
    private waitingTrials: string[];
    private trialJobs: Map<string, TrialJobDetail>;
61
    private trialDataForTuner: string;
SparkSnail's avatar
SparkSnail committed
62
    private readonly: boolean;
63

64
    private trialJobMetricListener: (metric: TrialJobMetric) => void;
65

Deshui Yu's avatar
Deshui Yu committed
66
67
    /**
     * Builds a manager in the 'INITIALIZED' state: resets the trial bookkeeping,
     * resolves the training service and data store through the component container,
     * and prepares the listener that is used for trial metrics.
     */
    constructor() {
        // Trial bookkeeping starts empty.
        this.currSubmittedTrialNum = 0;
        this.trialConcurrencyChange = 0;
        this.customizedTrials = [];

        this.trainingService = component.get(TrainingService);
        assert(this.trainingService);

        this.dispatcherPid = 0;
        this.waitingTrials = [];
        this.trialJobs = new Map<string, TrialJobDetail>();
        this.trialDataForTuner = '';
        this.readonly = false;

        this.log = getLogger();
        this.dataStore = component.get(DataStore);
        this.experimentProfile = this.createEmptyExperimentProfile();
        this.status = { status: 'INITIALIZED', errors: [] };

        // A failure while handling a metric is escalated as a critical error.
        const reportMetricFailure = (err: Error): void => {
            this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
        };
        this.trialJobMetricListener = (metric: TrialJobMetric): void => {
            this.onTrialJobMetrics(metric).catch(reportMetricFailure);
        };
    }

    /**
     * Applies one setting from `experimentProfile.params` to the current experiment
     * and then persists the experiment profile.
     *
     * Declared `async` so that every failure (readonly mode, an unrecognized
     * `updateType`, or an error thrown while applying the update) is delivered as a
     * rejected promise instead of a synchronous exception.
     *
     * @param experimentProfile profile carrying the new value in its `params`
     * @param updateType which parameter to update
     * @returns a promise that settles once the profile has been stored
     */
    public async updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        if (this.readonly) {
            throw new Error('Error: can not update experiment profile in readonly mode!');
        }
        switch (updateType) {
            case 'TRIAL_CONCURRENCY':
                this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
                break;
            case 'MAX_EXEC_DURATION':
                this.updateMaxExecDuration(experimentProfile.params.maxExecDuration);
                break;
            case 'SEARCH_SPACE':
                this.updateSearchSpace(experimentProfile.params.searchSpace);
                break;
            case 'MAX_TRIAL_NUM':
                this.updateMaxTrialNum(experimentProfile.params.maxTrialNum);
                break;
            default:
                throw new Error('Error: unrecognized updateType');
        }

        return this.storeExperimentProfile();
    }

116
    /**
     * Sends `data` to the dispatcher with the IMPORT_DATA command and records an
     * 'IMPORT_DATA' event in the data store.
     *
     * @param data payload forwarded unchanged to the dispatcher and the data store
     * @returns the data store promise; a rejected promise in readonly mode or when
     *          the dispatcher has not been created yet
     */
    public importData(data: string): Promise<void> {
        if (this.readonly) {
            return Promise.reject(new Error('Error: can not import data in readonly mode!'));
        }

        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            return Promise.reject(new Error('tuner has not been setup'));
        }

        dispatcher.sendCommand(IMPORT_DATA, data);

        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

130
131
132
133
    /**
     * Returns whatever the data store exports as trial hyper-parameter configurations.
     */
    public async exportData(): Promise<string> {
        const exported: string = await this.dataStore.exportTrialHpConfigs();

        return exported;
    }

Deshui Yu's avatar
Deshui Yu committed
134
    /**
     * Queues a user-supplied hyper-parameter set as a customized trial and records
     * an 'ADD_CUSTOMIZED' event.
     *
     * @param hyperParams hyper-parameters of the customized trial
     * @returns the data store promise; a rejected promise in readonly mode or when
     *          the number of submitted trials has reached `maxTrialNum`
     */
    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        if (this.readonly) {
            return Promise.reject(new Error('Error: can not add customized trial job in readonly mode!'));
        }

        const quotaReached: boolean = this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum;
        if (quotaReached) {
            return Promise.reject(new Error('reach maxTrialNum'));
        }

        this.customizedTrials.push(hyperParams);

        // No trial id exists yet for this trial, so an empty id is stored.
        return this.dataStore.storeTrialJobEvent('ADD_CUSTOMIZED', '', hyperParams);
    }

    /**
     * Cancels a trial job on behalf of the user: asks the training service to cancel
     * it, then records a 'USER_TO_CANCEL' event.
     *
     * @param trialJobId id of the trial job to cancel
     * @throws (as a rejection) when the manager is in readonly mode
     */
    public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
        if (this.readonly) {
            throw new Error('Error: can not cancel trial job in readonly mode!');
        }

        this.log.info(`User cancelTrialJob: ${trialJobId}`);

        await this.trainingService.cancelTrialJob(trialJobId);
        await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
    }

    /**
     * Starts a new experiment: stores the parameters, forwards optional settings to
     * the training service, launches the dispatcher in 'start' mode, marks the
     * experiment as RUNNING and kicks off the main loop.
     *
     * @param expParams parameters of the experiment to start
     * @returns the id of the experiment profile
     */
    public async startExperiment(expParams: ExperimentParams): Promise<string> {
        this.log.info(`Starting experiment: ${this.experimentProfile.id}`);
        this.experimentProfile.params = expParams;
        await this.storeExperimentProfile();
        this.log.debug('Setup tuner...');

        // NOTE(review): the results of the setClusterMetadata calls below are not
        // awaited — confirm this is intentional.

        // Multi-phase is only forwarded when the training service supports it.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        // versionCheck and logCollection are forwarded only when they are defined.
        const optionalMetadata: [string, { toString(): string } | undefined][] = [
            ['version_check', expParams.versionCheck],
            ['log_collection', expParams.logCollection]
        ];
        for (const [key, value] of optionalMetadata) {
            if (value !== undefined) {
                this.trainingService.setClusterMetadata(key, value.toString());
            }
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(
            expParams.tuner, expParams.assessor, expParams.advisor, expParams.multiPhase, expParams.multiThread);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);

        const checkpointDir: string = await this.createCheckpointDir();
        this.setupTuner(dispatcherCommand, undefined, 'start', checkpointDir);

        this.experimentProfile.startTime = Date.now();
        this.setStatus('RUNNING');
        await this.storeExperimentProfile();

        // The main loop runs in the background; its failure is a critical error.
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });

        return this.experimentProfile.id;
    }

SparkSnail's avatar
SparkSnail committed
197
    /**
     * Resumes the experiment identified by `getExperimentId()`.
     *
     * The stored profile is always loaded. In readonly mode nothing else happens.
     * Otherwise the dispatcher is launched in 'resume' mode, trials that were still
     * WAITING or RUNNING are recorded as FAILED, previously generated and imported
     * trial data is collected for the tuner, and the main loop is started.
     *
     * @param readonly when true, only the profile is loaded and all mutating
     *                 operations of this manager are rejected afterwards
     */
    public async resumeExperiment(readonly: boolean): Promise<void> {
        this.log.info(`Resuming experiment: ${this.experimentProfile.id}`);

        // Fetch back the experiment profile
        const experimentId: string = getExperimentId();
        this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);

        this.readonly = readonly;
        if (readonly) {
            return;
        }

        const expParams: ExperimentParams = this.experimentProfile.params;

        // NOTE(review): the results of the setClusterMetadata calls below are not
        // awaited — confirm this is intentional.

        // Multi-phase is only forwarded when the training service supports it.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        // versionCheck is forwarded only when it is defined.
        if (expParams.versionCheck !== undefined) {
            this.trainingService.setClusterMetadata('version_check', expParams.versionCheck.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(
            expParams.tuner, expParams.assessor, expParams.advisor, expParams.multiPhase, expParams.multiThread);
        this.log.debug(`dispatcher command: ${dispatcherCommand}`);

        const checkpointDir: string = await this.createCheckpointDir();
        this.setupTuner(dispatcherCommand, undefined, 'resume', checkpointDir);

        const allTrialJobs: TrialJobInfo[] = await this.dataStore.listTrialJobs();

        // Every trial known to the data store counts as already submitted.
        this.currSubmittedTrialNum = allTrialJobs.length;

        // Trials that were still WAITING or RUNNING are recorded as FAILED.
        const unfinishedJobs: TrialJobInfo[] = allTrialJobs.filter(
            (job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING');
        await Promise.all(
            unfinishedJobs.map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id)));

        // Collect generated trials and imported trials (no de-duplication).
        const finishedTrialData: string = await this.exportData();
        const importedData: string[] = await this.dataStore.getImportedData();
        const trialData: Object[] = importedData.reduce(
            (merged: Object[], oneImportedData: string) => merged.concat(<Object[]>JSON.parse(oneImportedData)),
            <Object[]>JSON.parse(finishedTrialData));
        this.trialDataForTuner = JSON.stringify(trialData);

        // Drop a previously stored end time when neither the duration limit nor the
        // trial limit has been reached.
        const profile: ExperimentProfile = this.experimentProfile;
        const hasTimeLeft: boolean = profile.execDuration < profile.params.maxExecDuration;
        const hasTrialsLeft: boolean = this.currSubmittedTrialNum < profile.params.maxTrialNum;
        if (hasTimeLeft && hasTrialsLeft && profile.endTime) {
            delete profile.endTime;
        }
        this.setStatus('RUNNING');

        // TO DO: update database record for resume event
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });
    }

261
262
    /**
     * Looks up a single trial job in the data store.
     *
     * @param trialJobId id of the trial job
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
        const lookup: Promise<TrialJobInfo> = this.dataStore.getTrialJob(trialJobId);

        return lookup;
    }

    /**
     * Forwards a cluster metadata entry to the training service, failing if the
     * training service has not answered within 10 seconds.
     *
     * @param key metadata key
     * @param value metadata value
     * @throws (as a rejection) in readonly mode, on timeout, or when the training
     *         service itself rejects
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        if (this.readonly) {
            throw new Error('Error: can not set cluster metadata in readonly mode!');
        }
        this.log.info(`NNIManager setClusterMetadata, key: ${key}, value: ${value}`);

        // TO DO: move timeout value to constants file
        const timeoutMs: number = 10000;
        let timer: NodeJS.Timer;
        const timeout: Promise<{}> = new Promise((_resolve: Function, reject: Function): void => {
            timer = setTimeout(
                () => { reject(new Error('TrainingService setClusterMetadata timeout. Please check your config file.')); },
                timeoutMs);
        });
        const disarmTimer = (): void => {
            clearTimeout(timer);
        };

        // Whichever settles first wins; the timer is cleared in either case.
        try {
            await Promise.race([timeout, this.trainingService.setClusterMetadata(key, value)]);
        } finally {
            disarmTimer();
        }
    }

    /**
     * Reads a cluster metadata entry from the training service.
     *
     * @param key metadata key
     * @returns a promise resolved with the training service's answer
     */
    public getClusterMetadata(key: string): Promise<string> {
        const metadata = this.trainingService.getClusterMetadata(key);

        return Promise.resolve(metadata);
    }

    /**
     * Returns the trial job statistics kept by the data store.
     */
    public async getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = await this.dataStore.getTrialJobStatistics();

        return statistics;
    }

292
    /**
     * Stops the experiment: switches the status to 'STOPPING', then runs the
     * end-of-experiment clean-up and logs completion.
     */
    public async stopExperiment(): Promise<void> {
        // Status is switched first, before the clean-up starts.
        this.setStatus('STOPPING');

        this.log.info('Stopping experiment, cleaning up ...');
        await this.experimentDoneCleanUp();
        this.log.info('Experiment stopped.');
    }

299
    /**
     * Fetches metric records from the data store.
     *
     * @param trialJobId optional trial job id passed through to the data store
     * @param metricType optional metric type passed through to the data store
     */
    public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
        const records: MetricDataRecord[] = await this.dataStore.getMetricData(trialJobId, metricType);

        return records;
    }

303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
    /**
     * Returns the metric records of all trials whose sequence id lies in
     * [minSeqId, maxSeqId] (both bounds inclusive).
     *
     * @param minSeqId lowest sequence id to include
     * @param maxSeqId highest sequence id to include
     */
    public async getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]> {
        const trialJobs = await this.dataStore.listTrialJobs();

        // Ids of the trials inside the requested range.
        const wantedTrialIds = new Set<string>();
        for (const trial of trialJobs) {
            // FIXME: can this be undefined?
            if (trial.sequenceId !== undefined && minSeqId <= trial.sequenceId && trial.sequenceId <= maxSeqId) {
                wantedTrialIds.add(trial.id);
            }
        }

        const allMetrics = await this.dataStore.getMetricData();

        return allMetrics.filter(record => wantedTrialIds.has(record.trialJobId));
    }

    /**
     * Returns every metric whose type is not 'PERIODICAL', followed by one
     * 'PERIODICAL' metric per trial: the one with the highest `sequence`
     * (on equal sequence the record seen later wins).
     */
    public async getLatestMetricData(): Promise<MetricDataRecord[]> {
        // FIXME: this can take a long time
        const allMetrics: MetricDataRecord[] = await this.dataStore.getMetricData();

        const nonPeriodical: MetricDataRecord[] = [];
        const newestPeriodical: Map<string, MetricDataRecord> = new Map<string, MetricDataRecord>();
        for (const record of allMetrics) {
            if (record.type !== 'PERIODICAL') {
                nonPeriodical.push(record);
                continue;
            }
            const known: MetricDataRecord | undefined = newestPeriodical.get(record.trialJobId);
            if (known === undefined || known.sequence <= record.sequence) {
                newestPeriodical.set(record.trialJobId, record);
            }
        }

        const latestPerTrial: MetricDataRecord[] = Array.from(newestPeriodical.values());

        return nonPeriodical.concat(latestPerTrial);
        // FIXME: unit test
    }

Deshui Yu's avatar
Deshui Yu committed
334
335
336
337
338
339
340
341
    /**
     * Returns the in-memory experiment profile.
     *
     * @returns a promise already resolved with the current profile object
     */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        return Promise.resolve(this.experimentProfile);
    }

342
343
344
345
    /**
     * Returns the manager's current status object (not a copy).
     */
    public getStatus(): NNIManagerStatus {
        const current: NNIManagerStatus = this.status;

        return current;
    }

Deshui Yu's avatar
Deshui Yu committed
346
347
348
349
    /**
     * Lists trial jobs from the data store.
     *
     * @param status optional status passed through to the data store
     */
    public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = await this.dataStore.listTrialJobs(status);

        return jobs;
    }

350
351
    /**
     * Launches the dispatcher process and creates the IPC interface to it.
     * Does nothing when a dispatcher interface already exists.
     *
     * @param command command line used to launch the dispatcher
     * @param cwd working directory; the log directory is used when undefined or empty
     * @param mode exported to the child as NNI_MODE
     * @param dataDirectory exported to the child as NNI_CHECKPOINT_DIRECTORY
     */
    private setupTuner(command: string, cwd: string | undefined, mode: 'start' | 'resume', dataDirectory: string): void {
        if (this.dispatcher !== undefined) {
            return;
        }

        // Two extra pipes are opened in addition to stdin/stdout/stderr.
        const stdio: StdioOptions = ['ignore', process.stdout, process.stderr, 'pipe', 'pipe'];
        const workingDir: string = (cwd === undefined || cwd === '') ? getLogDir() : cwd;

        // Defaults to false when no tuner is configured; otherwise the tuner's
        // setting is used as is (it may be undefined).
        const tunerParams = this.experimentProfile.params.tuner;
        const includeIntermediateResultsEnv: boolean | undefined =
            tunerParams !== undefined ? tunerParams.includeIntermediateResults : false;

        const nniEnv = {
            NNI_MODE: mode,
            NNI_CHECKPOINT_DIRECTORY: dataDirectory,
            NNI_LOG_DIRECTORY: getLogDir(),
            NNI_LOG_LEVEL: getLogLevel(),
            NNI_INCLUDE_INTERMEDIATE_RESULTS: includeIntermediateResultsEnv,
            CUDA_VISIBLE_DEVICES: this.getGpuEnvvarValue()
        };
        // NNI-specific variables override the ones inherited from this process.
        const childEnv = Object.assign({}, process.env, nniEnv);

        const tunerProc: ChildProcess = getTunerProc(command, stdio, workingDir, childEnv);
        this.dispatcherPid = tunerProc.pid;
        this.dispatcher = createDispatcherInterface(tunerProc);
    }

383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
    /**
     * Picks the value for CUDA_VISIBLE_DEVICES: the advisor's `gpuIndices` when an
     * advisor is configured, otherwise the tuner's; an empty string when neither
     * provides one.
     */
    private getGpuEnvvarValue(): string {
        const { advisor, tuner } = this.experimentProfile.params;

        let gpuIndices: string | undefined;
        if (advisor !== undefined) {
            gpuIndices = advisor.gpuIndices;
        } else if (tuner !== undefined) {
            gpuIndices = tuner.gpuIndices;
        }

        return gpuIndices === undefined ? '' : gpuIndices;
    }

Deshui Yu's avatar
Deshui Yu committed
399
    /**
     * Stores a new trial concurrency and accumulates the difference to the previous
     * value in `trialConcurrencyChange` (>0: increase, <0: decrease).
     *
     * @param trialConcurrency new concurrency; assumed to be >= 0, which is checked by restserver
     */
    private updateTrialConcurrency(trialConcurrency: number): void {
        const params: ExperimentParams = this.experimentProfile.params;
        const delta: number = trialConcurrency - params.trialConcurrency;

        this.trialConcurrencyChange += delta;
        params.trialConcurrency = trialConcurrency;
    }

    /**
     * Overwrites `maxExecDuration` in the in-memory experiment parameters.
     *
     * @param duration new maximum execution duration
     */
    private updateMaxExecDuration(duration: number): void {
        const { params } = this.experimentProfile;
        params.maxExecDuration = duration;
    }

    /**
     * Sends the new search space to the dispatcher and stores it in the in-memory
     * experiment parameters.
     *
     * @param searchSpace new search space
     * @throws Error when the dispatcher has not been created yet
     */
    private updateSearchSpace(searchSpace: string): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, searchSpace);
        this.experimentProfile.params.searchSpace = searchSpace;
    }

QuanluZhang's avatar
QuanluZhang committed
423
424
425
426
427
428
    /**
     * Overwrites `maxTrialNum` in the in-memory experiment parameters.
     *
     * @param maxTrialNum new maximum number of trials
     */
    private updateMaxTrialNum(maxTrialNum: number): void {
        const { params } = this.experimentProfile;
        params.maxTrialNum = maxTrialNum;
    }

Deshui Yu's avatar
Deshui Yu committed
429
    /**
     * Shuts the experiment down: detaches the metric listener, asks the dispatcher
     * to terminate, cancels trial jobs that are still RUNNING or WAITING, cleans up
     * the training service, stores the end time and sets the status to 'STOPPED'.
     *
     * @throws (as a rejection) when the dispatcher has not been created yet
     */
    private async experimentDoneCleanUp(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener);
        this.dispatcher.sendCommand(TERMINATE);

        // Gracefully terminate tuner and assessor here: poll once per second for at
        // most 30 seconds and stop waiting as soon as the process is gone.
        for (let i: number = 0; i < 30; i++) {
            const tunerAlive: boolean = await isAlive(this.dispatcherPid);
            if (!tunerAlive) {
                break;
            }
            await delay(1000);
        }
        await killPid(this.dispatcherPid);

        const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
        // TO DO: to promise all
        for (const trialJob of trialJobList) {
            if (trialJob.status === 'RUNNING' ||
                trialJob.status === 'WAITING') {
                try {
                    this.log.info(`cancelTrialJob: ${trialJob.id}`);
                    await this.trainingService.cancelTrialJob(trialJob.id);
                } catch (error) {
                    // Best effort: a failure here (e.g. the pid does not exist any
                    // more) must not abort the clean-up, but is kept visible in the log.
                    this.log.debug(`cancelTrialJob ${trialJob.id} failed during clean-up: ${error}`);
                }
            }
        }
        await this.trainingService.cleanUp();
        this.experimentProfile.endTime = Date.now();
        await this.storeExperimentProfile();
        this.setStatus('STOPPED');
    }

    /**
     * Ticks once per second until the status becomes 'ERROR', 'STOPPING' or
     * 'STOPPED'. While the status is 'RUNNING' each tick adds 1 to `execDuration`,
     * and the profile is persisted on every tenth tick.
     */
    private async periodicallyUpdateExecDuration(): Promise<void> {
        const haltStatuses: string[] = ['ERROR', 'STOPPING', 'STOPPED'];

        // `tick` counts all iterations, including those spent outside 'RUNNING'.
        for (let tick: number = 1; !haltStatuses.includes(this.status.status); tick += 1) {
            await delay(1000); // 1 second
            if (this.status.status !== 'RUNNING') {
                continue;
            }
            this.experimentProfile.execDuration += 1;
            if (tick % 10 === 0) {
                await this.storeExperimentProfile();
            }
        }
    }

chicm-ms's avatar
chicm-ms committed
476
477
478
479
480
481
    /**
     * Sends a PING command to the dispatcher every 5 seconds until the status
     * becomes 'ERROR', 'STOPPING' or 'STOPPED'.
     *
     * @throws (as a rejection) when the dispatcher has not been created yet
     */
    private async pingDispatcher(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        const haltStatuses: string[] = ['ERROR', 'STOPPING', 'STOPPED'];
        for (;;) {
            if (haltStatuses.includes(this.status.status)) {
                break;
            }
            this.dispatcher.sendCommand(PING);
            await delay(5000);
        }
    }

QuanluZhang's avatar
QuanluZhang committed
486
487
    /**
     * Refreshes every tracked trial job from the training service.
     *
     * A status change is logged, stored in `trialJobs` and recorded in the data
     * store. Jobs that reached SUCCEEDED, USER_CANCELED, EARLY_STOPPED, FAILED or
     * SYS_CANCELED are removed from `trialJobs` and reported to the dispatcher with
     * a TRIAL_END command.
     *
     * @returns the number of jobs that were found finished during this pass
     * @throws (as a rejection) when the dispatcher has not been created yet
     */
    private async requestTrialJobsStatus(): Promise<number> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        let finishedCount: number = 0;
        // Iterate over a snapshot of the ids because entries are deleted inside the loop.
        for (const trialJobId of Array.from(this.trialJobs.keys())) {
            const latest: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
            const previous: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);

            const statusChanged: boolean = previous !== undefined && previous.status !== latest.status;
            if (previous !== undefined && statusChanged) {
                this.log.info(`Trial job ${latest.id} status changed from ${previous.status} to ${latest.status}`);
                this.trialJobs.set(trialJobId, Object.assign({}, latest));
                await this.dataStore.storeTrialJobEvent(latest.status, latest.id, undefined, latest);
            }

            switch (latest.status) {
                case 'SUCCEEDED':
                case 'USER_CANCELED':
                case 'EARLY_STOPPED':
                case 'FAILED':
                case 'SYS_CANCELED': {
                    // In the current version, failed jobs are not retried.
                    // TO DO: push failed jobs to queue for retry
                    this.trialJobs.delete(trialJobId);
                    finishedCount += 1;
                    const hyperParams: string | undefined = latest.form.hyperParameters.value;
                    const payload: string = JSON.stringify({
                        trial_job_id: latest.id,
                        event: latest.status,
                        hyper_params: hyperParams
                    });
                    this.dispatcher.sendCommand(TRIAL_END, payload);
                    break;
                }
                case 'WAITING':
                case 'RUNNING':
                case 'UNKNOWN':
                    // Still in flight: nothing to do
                    break;
                default:
                // TO DO: add warning in log
            }
        }

        return finishedCount;
    }

    private async manageTrials(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
QuanluZhang's avatar
QuanluZhang committed
543
        let allFinishedTrialJobNum: number = this.currSubmittedTrialNum;
QuanluZhang's avatar
QuanluZhang committed
544
        let waitSubmittedToFinish: number;
545
        while (!['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status)) {
QuanluZhang's avatar
QuanluZhang committed
546
547
548
549
550
551
552
            const finishedTrialJobNum: number = await this.requestTrialJobsStatus();
            allFinishedTrialJobNum += finishedTrialJobNum;

            // requestTrialNum is the number of trials that will be requested from tuner.
            // If trialConcurrency does not change, requestTrialNum equals finishedTrialJobNum.
            // If trialConcurrency changes, for example, trialConcurrency increases by 2 (trialConcurrencyChange=2), then
            // requestTrialNum equals 2 + finishedTrialJobNum and trialConcurrencyChange becomes 0.
553
            // If trialConcurrency changes, for example, trialConcurrency decreases by 4 (trialConcurrencyChange=-4) and
QuanluZhang's avatar
QuanluZhang committed
554
555
556
557
558
559
560
561
            // finishedTrialJobNum is 2, then requestTrialNum becomes -2. No trial will be requested from tuner,
            // and trialConcurrencyChange becomes -2.
            const requestTrialNum: number = this.trialConcurrencyChange + finishedTrialJobNum;
            if (requestTrialNum >= 0) {
                this.trialConcurrencyChange = 0;
            } else {
                this.trialConcurrencyChange = requestTrialNum;
            }
chicm-ms's avatar
chicm-ms committed
562
563
564

            const requestCustomTrialNum: number = Math.min(requestTrialNum, this.customizedTrials.length);
            for (let i: number = 0; i < requestCustomTrialNum; i++) {
QuanluZhang's avatar
QuanluZhang committed
565
566
567
568
569
570
571
                // ask tuner for more trials
                if (this.customizedTrials.length > 0) {
                    const hyperParams: string | undefined = this.customizedTrials.shift();
                    this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, hyperParams);
                }
            }

chicm-ms's avatar
chicm-ms committed
572
573
574
575
            if (requestTrialNum - requestCustomTrialNum > 0) {
                this.requestTrialJobs(requestTrialNum - requestCustomTrialNum);
            }

QuanluZhang's avatar
QuanluZhang committed
576
            // check maxtrialnum and maxduration here
577
            // NO_MORE_TRIAL is more like a subset of RUNNING, because during RUNNING tuner
578
            // might tell nnimanager that there are no more trials. In NO_MORE_TRIAL state, the experiment is viewed
579
580
            // as still running. DONE could be transferred from RUNNING or NO_MORE_TRIAL.
            assert(this.status.status === 'RUNNING' ||
581
                this.status.status === 'DONE' ||
QuanluZhang's avatar
QuanluZhang committed
582
583
                this.status.status === 'NO_MORE_TRIAL' ||
                this.status.status === 'TUNER_NO_MORE_TRIAL');
584
            if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
QuanluZhang's avatar
QuanluZhang committed
585
                this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
QuanluZhang's avatar
QuanluZhang committed
586
                if (this.status.status !== 'DONE') {
chicm-ms's avatar
chicm-ms committed
587
                    this.setStatus('NO_MORE_TRIAL');
QuanluZhang's avatar
QuanluZhang committed
588
589
590
591
                    waitSubmittedToFinish = this.currSubmittedTrialNum;

                    assert(allFinishedTrialJobNum <= waitSubmittedToFinish);
                    if (allFinishedTrialJobNum >= waitSubmittedToFinish) {
chicm-ms's avatar
chicm-ms committed
592
                        this.setStatus('DONE');
QuanluZhang's avatar
QuanluZhang committed
593
594
595
596
597
                        this.experimentProfile.endTime = Date.now();
                        await this.storeExperimentProfile();
                        // write this log for travis CI
                        this.log.info('Experiment done.');
                    }
QuanluZhang's avatar
QuanluZhang committed
598
599
600
                }
            } else {
                if (this.status.status === 'DONE') {
601
602
                    delete this.experimentProfile.endTime;
                    await this.storeExperimentProfile();
QuanluZhang's avatar
QuanluZhang committed
603
                }
QuanluZhang's avatar
QuanluZhang committed
604
                if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
chicm-ms's avatar
chicm-ms committed
605
                    this.setStatus('RUNNING');
606
                }
QuanluZhang's avatar
QuanluZhang committed
607
608
609
610
611
612
613
614
615
616
617
                for (let i: number = this.trialJobs.size; i < this.experimentProfile.params.trialConcurrency; i++) {
                    if (this.waitingTrials.length === 0 ||
                        this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
                        break;
                    }
                    const hyperParams: string | undefined = this.waitingTrials.shift();
                    if (hyperParams === undefined) {
                        throw new Error(`Error: invalid hyper-parameters for job submission: ${hyperParams}`);
                    }
                    this.currSubmittedTrialNum++;
                    const trialJobAppForm: TrialJobApplicationForm = {
618
                        sequenceId: this.experimentProfile.nextSequenceId++,
QuanluZhang's avatar
QuanluZhang committed
619
620
621
622
623
                        hyperParameters: {
                            value: hyperParams,
                            index: 0
                        }
                    };
chicm-ms's avatar
chicm-ms committed
624
                    this.log.info(`submitTrialJob: form: ${JSON.stringify(trialJobAppForm)}`);
QuanluZhang's avatar
QuanluZhang committed
625
                    const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
626
                    await this.storeExperimentProfile();
QuanluZhang's avatar
QuanluZhang committed
627
628
629
630
                    this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
                    const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id);
                    if (trialJobDetailSnapshot != undefined) {
                        await this.dataStore.storeTrialJobEvent(
631
                            trialJobDetailSnapshot.status, trialJobDetailSnapshot.id, hyperParams, trialJobDetailSnapshot);
QuanluZhang's avatar
QuanluZhang committed
632
633
634
635
636
637
638
639
640
                    } else {
                        assert(false, `undefined trialJobDetail in trialJobs: ${trialJobDetail.id}`);
                    }
                }
            }
            await delay(1000 * 5); // 5 seconds
        }
    }

Deshui Yu's avatar
Deshui Yu committed
641
642
643
644
645
646
    /**
     * Bumps the revision counter of the in-memory experiment profile and
     * hands the profile to the data store.
     *
     * @returns the promise returned by `dataStore.storeExperimentProfile`.
     */
    private storeExperimentProfile(): Promise<void> {
        const profile: ExperimentProfile = this.experimentProfile;
        // Every store call is recorded as a new revision of the profile.
        profile.revision = profile.revision + 1;

        return this.dataStore.storeExperimentProfile(profile);
    }

647
    /**
     * Registers the event listeners, sends the initial tuner commands and then
     * waits on the manager's background activities: exec-duration updates,
     * dispatcher pinging, the training service and trial management.
     *
     * A rejection from the dispatcher ping, the training service or trial
     * management is re-thrown through `NNIError.FromError` with a prefix naming
     * the failing activity.
     */
    private async run(): Promise<void> {
        assert(this.dispatcher !== undefined);

        this.addEventListeners();

        this.sendInitTunerCommands();

        // Converts a rejection of `task` into a rejection carrying the NNIError
        // built from the original error and `prefix`.
        const withErrorPrefix = <T>(task: Promise<T>, prefix: string): Promise<T> =>
            task.catch((err: Error) => {
                throw NNIError.FromError(err, prefix);
            });

        await Promise.all([
            this.periodicallyUpdateExecDuration(),
            withErrorPrefix(this.pingDispatcher(), 'Dispatcher error: '),
            withErrorPrefix(this.trainingService.run(), 'Training service error: '),
            withErrorPrefix(this.manageTrials(), 'Job management error: ')
        ]);
    }

QuanluZhang's avatar
QuanluZhang committed
667
    /**
     * Hooks the manager up to its event sources: the training service's trial
     * metric stream and the dispatcher's command stream.
     *
     * @throws Error when the dispatcher has not been set up.
     */
    private addEventListeners(): void {
        this.log.info('Add event listeners');
        // TO DO: cannot run this method more than once in one NNIManager instance
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner or job maintainer have not been setup');
        }

        // A failure while handling a dispatcher command is reported as a critical error.
        const reportCommandFailure = (err: Error): void => {
            this.criticalError(NNIError.FromError(err, 'Tuner command event error: '));
        };
        // The command handler is async; its promise is not awaited here, only
        // its rejection is observed.
        const handleCommand = (commandType: string, content: string): void => {
            this.onTunerCommand(commandType, content).catch(reportCommandFailure);
        };

        this.trainingService.addTrialJobMetricListener(this.trialJobMetricListener);
        this.dispatcher.onCommand(handleCommand);
    }

    /**
     * Sends the INITIALIZE command, carrying the experiment's search space, to
     * the dispatcher.
     *
     * @throws Error when the dispatcher has not been set up.
     */
    private sendInitTunerCommands(): void {
        if (this.dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }

        const searchSpace = this.experimentProfile.params.searchSpace;
        this.log.debug(`Send tuner command: INITIALIZE: ${searchSpace}`);
        // The tuner must know the search space before it can generate any hyper parameters.
        this.dispatcher.sendCommand(INITIALIZE, searchSpace);
    }

    /**
     * Handles a metric reported by a trial job: stores it in the data store,
     * then forwards the metric data to the dispatcher as REPORT_METRIC_DATA.
     *
     * @param metric the reported metric; its `id` and `data` fields are used.
     * @throws Error when the dispatcher has not been set up (the metric has
     *         already been stored at that point).
     */
    private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
        // Serialize explicitly: interpolating the object itself would log "[object Object]".
        this.log.debug(`NNIManager received trial job metrics: ${JSON.stringify(metric)}`);
        await this.dataStore.storeMetricData(metric.id, metric.data);
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
    }

chicm-ms's avatar
chicm-ms committed
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
    /**
     * Asks the dispatcher for new trial jobs.
     *
     * @param jobNum number of trials to request; values below 1 are ignored.
     * @throws Error when the dispatcher has not been set up.
     */
    private requestTrialJobs(jobNum: number): void {
        if (jobNum < 1) {
            return;
        }
        if (this.dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }

        if (!this.experimentProfile.params.multiThread) {
            // Single request carrying the whole count.
            this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(jobNum));

            return;
        }

        // Multi-thread mode: one request per trial, so that hyper parameters are
        // generated in a non-blocking way. Within a single REQUEST_TRIAL_JOBS
        // request, hyper parameters are generated one by one sequentially.
        let sent: number = 0;
        while (sent < jobNum) {
            this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
            sent++;
        }
    }

719
    /**
     * Dispatches a command received from the dispatcher.
     *
     * - INITIALIZED: sends IMPORT_DATA when `trialDataForTuner` is non-empty,
     *   then requests `trialConcurrency` trials.
     * - NEW_TRIAL_JOB: queues `content` in `waitingTrials`.
     * - SEND_TRIAL_JOB_PARAMETER: updates the trial job named in the command and
     *   records an ADD_HYPERPARAMETER event unless `parameters` is null.
     * - NO_MORE_TRIAL_JOBS: switches the status to TUNER_NO_MORE_TRIAL unless
     *   the experiment is in ERROR, STOPPING or STOPPED.
     * - KILL_TRIAL_JOB: cancels the trial job whose id is the JSON-decoded `content`.
     *
     * @param commandType command identifier received from the dispatcher.
     * @param content command payload; parsed as JSON for SEND_TRIAL_JOB_PARAMETER
     *                and KILL_TRIAL_JOB.
     * @throws Error on an unsupported command type, or when the dispatcher is
     *         missing while IMPORT_DATA has to be sent.
     */
    private async onTunerCommand(commandType: string, content: string): Promise<void> {
        this.log.info(`NNIManager received command from dispatcher: ${commandType}, ${content}`);
        switch (commandType) {
            case INITIALIZED:
                // Tuner is initialized, search space is set, request tuner to generate hyper parameters
                if (this.trialDataForTuner.length > 0) {
                    if (this.dispatcher === undefined) {
                        throw new Error('Dispatcher error: tuner has not been setup');
                    }
                    this.dispatcher.sendCommand(IMPORT_DATA, this.trialDataForTuner);
                }
                this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
                break;
            case NEW_TRIAL_JOB:
                if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
                    this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
                    this.setStatus('RUNNING');
                }
                this.waitingTrials.push(content);
                break;
            // Braces give the declarations below their own scope instead of
            // leaking them into the whole switch statement.
            case SEND_TRIAL_JOB_PARAMETER: {
                const tunerCommand: any = JSON.parse(content);
                assert(tunerCommand.parameter_index >= 0);
                assert(tunerCommand.trial_job_id !== undefined);

                const trialJobForm: TrialJobApplicationForm = {
                    sequenceId: -1,  // FIXME: multi-phase tuner should use sequence ID instead of trial job ID
                    hyperParameters: {
                        value: content,
                        index: tunerCommand.parameter_index
                    }
                };
                this.log.info(`updateTrialJob: job id: ${tunerCommand.trial_job_id}, form: ${JSON.stringify(trialJobForm)}`);
                await this.trainingService.updateTrialJob(tunerCommand.trial_job_id, trialJobForm);
                if (tunerCommand['parameters'] !== null) {
                    // parameters field is set as empty string if no more hyper parameter can be generated by tuner.
                    await this.dataStore.storeTrialJobEvent(
                        'ADD_HYPERPARAMETER', tunerCommand.trial_job_id, content, undefined);
                }
                break;
            }
            case NO_MORE_TRIAL_JOBS:
                if (!['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status)) {
                    this.setStatus('TUNER_NO_MORE_TRIAL');
                }
                break;
            case KILL_TRIAL_JOB: {
                // Parse once and reuse the value for both the log line and the cancel call.
                const trialJobId: any = JSON.parse(content);
                this.log.info(`cancelTrialJob: ${trialJobId}`);
                await this.trainingService.cancelTrialJob(trialJobId, true);
                break;
            }
            default:
                throw new Error('Error: unsupported command type from tuner');
        }
    }

773
774
775
776
777
778
779
780
781
782
    /**
     * Reports an error through `logError` and additionally prints it to stderr
     * via `console.error`.
     *
     * @param err the error to report.
     */
    private criticalError(err: Error): void {
        // Record the error first, then echo it to the console.
        this.logError(err);
        console.error(err);
    }

    /**
     * Records an error: logs its stack trace when one is present, appends its
     * message to `status.errors`, and switches the manager status to ERROR.
     *
     * @param err the error to record.
     */
    private logError(err: Error): void {
        const { stack, message } = err;
        if (stack !== undefined) {
            this.log.error(stack);
        }
        this.status.errors.push(message);
        this.setStatus('ERROR');
    }

    /**
     * Updates the manager status, logging the transition. Setting the status
     * it already has is a no-op (nothing is logged).
     *
     * @param status the status to switch to.
     */
    private setStatus(status: ExperimentStatus): void {
        const previous: ExperimentStatus = this.status.status;
        if (previous === status) {
            return;
        }
        this.log.info(`Change NNIManager status from: ${previous} to: ${status}`);
        this.status.status = status;
    }

    /**
     * Builds a blank experiment profile: the id and log directory come from
     * `getExperimentId()` and `getExperimentRootDir()`, all counters start at
     * zero, and every parameter is empty or zero.
     *
     * @returns a freshly created profile object.
     */
    private createEmptyExperimentProfile(): ExperimentProfile {
        const emptyParams: ExperimentParams = {
            authorName: '',
            experimentName: '',
            trialConcurrency: 0,
            maxExecDuration: 0, // unit: second
            maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs
            trainingServicePlatform: '',
            searchSpace: ''
        };

        return {
            id: getExperimentId(),
            revision: 0,
            execDuration: 0,
            logDir: getExperimentRootDir(),
            nextSequenceId: 0,
            params: emptyParams
        };
    }
811

QuanluZhang's avatar
QuanluZhang committed
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
    /**
     * Creates the checkpoint directory and records its path on whichever of
     * the advisor, tuner and assessor parameter sections are present in the
     * experiment profile.
     *
     * @returns the checkpoint directory path.
     */
    private async createCheckpointDir(): Promise<string> {
        // TODO: test
        const chkpDir: string = getCheckpointDir();
        await mkDirP(chkpDir);

        // Point every configured component at the same checkpoint directory.
        const { advisor, tuner, assessor } = this.experimentProfile.params;
        if (advisor) {
            advisor.checkpointDir = chkpDir;
        }
        if (tuner) {
            tuner.checkpointDir = chkpDir;
        }
        if (assessor) {
            assessor.checkpointDir = chkpDir;
        }

        return chkpDir;
    }
Deshui Yu's avatar
Deshui Yu committed
830
831
832
}

export { NNIManager };