nnimanager.ts 22.3 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import { ChildProcess, spawn } from 'child_process';
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
import { getExperimentId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
    ExperimentParams, ExperimentProfile, Manager,
32
    NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
Deshui Yu's avatar
Deshui Yu committed
33
34
35
36
} from '../common/manager';
import {
    TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
37
import { delay , getLogDir, getMsgDispatcherCommand} from '../common/utils';
Deshui Yu's avatar
Deshui Yu committed
38
39
import {
    ADD_CUSTOMIZED_TRIAL_JOB, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, REPORT_METRIC_DATA,
chicm-ms's avatar
chicm-ms committed
40
    REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
Deshui Yu's avatar
Deshui Yu committed
41
} from './commands';
42
import { createDispatcherInterface, IpcInterface } from './ipcInterface';
Deshui Yu's avatar
Deshui Yu committed
43
44
45
46
47
48
49
import { TrialJobMaintainerEvent, TrialJobs } from './trialJobs';

/**
 * NNIManager
 */
class NNIManager implements Manager {
    private trainingService: TrainingService;
50
    private dispatcher: IpcInterface | undefined;
Deshui Yu's avatar
Deshui Yu committed
51
52
53
54
55
56
57
    private trialJobsMaintainer: TrialJobs | undefined;
    private currSubmittedTrialNum: number; // need to be recovered
    private trialConcurrencyReduction: number;
    private customizedTrials: string[]; // need to be recovered
    private log: Logger;
    private dataStore: DataStore;
    private experimentProfile: ExperimentProfile;
58
    private dispatcherPid: number;
59
    private status: NNIManagerStatus;
Deshui Yu's avatar
Deshui Yu committed
60
61
62
63
64
65
66

    /**
     * Builds a manager with zeroed counters, no dispatcher process, an empty
     * experiment profile and the 'INITIALIZED' status.
     * The training service and data store are obtained through `component.get`.
     */
    constructor() {
        // Plain bookkeeping state: nothing submitted, nothing spawned yet.
        this.customizedTrials = [];
        this.currSubmittedTrialNum = 0;
        this.trialConcurrencyReduction = 0;
        this.dispatcherPid = 0;

        // Collaborators; the training service must resolve to something truthy.
        this.trainingService = component.get(TrainingService);
        assert(this.trainingService);
        this.log = getLogger();
        this.dataStore = component.get(DataStore);

        this.experimentProfile = this.createEmptyExperimentProfile();
        const initialStatus: NNIManagerStatus = { status: 'INITIALIZED', errors: [] };
        this.status = initialStatus;
    }

    /**
     * Applies one kind of change from `experimentProfile` to the current
     * experiment and then persists the profile.
     *
     * The method is `async` so that every failure (an unrecognized update type,
     * or an error thrown by one of the private update helpers) reaches the
     * caller as a rejected promise rather than a synchronous throw from a
     * Promise-returning method.
     *
     * @param experimentProfile profile carrying the new parameter value
     * @param updateType which parameter to take from `experimentProfile.params`
     * @returns a promise settled once the profile has been stored
     */
    public async updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        switch (updateType) {
            case 'TRIAL_CONCURRENCY':
                this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
                break;
            case 'MAX_EXEC_DURATION':
                this.updateMaxExecDuration(experimentProfile.params.maxExecDuration);
                break;
            case 'SEARCH_SPACE':
                this.updateSearchSpace(experimentProfile.params.searchSpace);
                break;
            default:
                throw new Error('Error: unrecognized updateType');
        }

        await this.storeExperimentProfile();
    }

    /**
     * Queues user-supplied hyper-parameters as a customized trial and records
     * an 'ADD_CUSTOMIZED' event in the data store.
     *
     * @param hyperParams hyper-parameter payload of the customized trial
     * @returns the data store's promise, or a rejection when the number of
     *          submitted trials has already reached `maxTrialNum`
     */
    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        const quotaExhausted: boolean = this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum;
        if (quotaExhausted) {
            return Promise.reject(new Error('reach maxTrialNum'));
        }

        this.customizedTrials.push(hyperParams);

        // No trial id exists at this point, so the event is stored with an empty id.
        const pendingTrialId: string = '';

        return this.dataStore.storeTrialJobEvent('ADD_CUSTOMIZED', pendingTrialId, hyperParams);
    }

    /**
     * Cancels a trial job through the training service and then records a
     * 'USER_TO_CANCEL' event for it.
     *
     * @param trialJobId id of the trial job to cancel
     */
    public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
        // The event is only recorded if the cancellation itself did not fail.
        await this.trainingService.cancelTrialJob(trialJobId);

        const emptyPayload: string = '';
        await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, emptyPayload);
    }

    /**
     * Starts a new experiment: stores the parameters, configures multi-phase
     * mode when requested and supported, spawns the dispatcher process in
     * 'start' mode, marks the experiment as running and kicks off the main loop.
     *
     * @param expParams parameters of the experiment to start
     * @returns the id of the experiment profile
     */
    public async startExperiment(expParams: ExperimentParams): Promise<string> {
        this.log.debug(`Starting experiment: ${this.experimentProfile.id}`);
        this.experimentProfile.params = expParams;
        await this.storeExperimentProfile();
        this.log.debug('Setup tuner...');

        // Set up multiphase config.
        // The metadata update is awaited (through the timeout-guarded wrapper of this class)
        // so that a failure surfaces here instead of as an unhandled rejection, and so the
        // dispatcher is not started before the training service has been configured.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
        console.log(`dispatcher command: ${dispatcherCommand}`);
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'start',
            expParams.tuner.checkpointDir);

        this.experimentProfile.startTime = Date.now();
        this.status.status = 'EXPERIMENT_RUNNING';
        await this.storeExperimentProfile();

        // The main loop runs in the background; its failure is reported, not awaited.
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });

        return this.experimentProfile.id;
    }

    /**
     * Resumes the experiment identified by `getExperimentId()`: reloads its
     * profile from the data store, configures multi-phase mode when requested
     * and supported, spawns the dispatcher in 'resume' mode, restores the
     * submitted-trial counter and restarts the main loop.
     *
     * Trial jobs that the data store still lists as WAITING or RUNNING get a
     * 'FAILED' event stored for them.
     */
    public async resumeExperiment(): Promise<void> {
        // Fetch back the experiment profile
        const experimentId: string = getExperimentId();
        this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
        const expParams: ExperimentParams = this.experimentProfile.params;

        // Set up multiphase config.
        // Awaited through the timeout-guarded wrapper so a failure is reported to the caller
        // instead of becoming an unhandled rejection.
        if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
            await this.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
        }

        const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
        console.log(`dispatcher command: ${dispatcherCommand}`);
        this.setupTuner(
            dispatcherCommand,
            undefined,
            'resume',
            expParams.tuner.checkpointDir);

        const allTrialJobs: TrialJobInfo[] = await this.dataStore.listTrialJobs();

        // Resume currSubmittedTrialNum
        this.currSubmittedTrialNum = allTrialJobs.length;

        // Check the final status for WAITING and RUNNING jobs
        await Promise.all(allTrialJobs
            .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING')
            .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id)));

        this.status.status = 'EXPERIMENT_RUNNING';

        // TO DO: update database record for resume event
        // Report main-loop failures the same way startExperiment does, so the error is
        // logged and reflected in the manager status rather than only printed.
        this.run().catch((err: Error) => {
            this.criticalError(err);
        });
    }

    /**
     * Looks up a trial job in the training service.
     *
     * @param trialJobId id of the trial job
     * @returns a promise for whatever the training service returns for that id
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const lookup: TrialJobDetail | Promise<TrialJobDetail> = this.trainingService.getTrialJob(trialJobId);

        return Promise.resolve(lookup);
    }

    /**
     * Forwards a metadata key/value pair to the training service, failing if
     * the training service has not answered within 10 seconds.
     *
     * @param key metadata key
     * @param value metadata value
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        // TO DO: move timeout value to constants file
        const timeoutMs: number = 10000;
        let timeoutId: NodeJS.Timer;
        const cancelTimeout: () => void = (): void => {
            clearTimeout(timeoutId);
        };

        // Rejects after the timeout; it never resolves on its own.
        const timeoutGuard: Promise<{}> = new Promise((resolve: Function, reject: Function): void => {
            timeoutId = setTimeout(
                () => { reject(new Error('TrainingService setClusterMetadata timeout. Please check your config file.')); },
                timeoutMs);
        });
        const pendingUpdate: Promise<void> = this.trainingService.setClusterMetadata(key, value);

        try {
            await Promise.race([timeoutGuard, pendingUpdate]);
        } finally {
            // Whichever side won, the timer must not outlive the call.
            cancelTimeout();
        }
    }

    /**
     * Reads a metadata value from the training service.
     *
     * @param key metadata key
     * @returns a promise for the value the training service returns for `key`
     */
    public getClusterMetadata(key: string): Promise<string> {
        const metadataValue: string | Promise<string> = this.trainingService.getClusterMetadata(key);

        return Promise.resolve(metadataValue);
    }

    /**
     * Returns the trial job statistics kept by the data store.
     */
    public async getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = await this.dataStore.getTrialJobStatistics();

        return statistics;
    }

    /**
     * Marks the manager as 'STOPPING' and asks the trial job maintainer to
     * leave its loop.
     *
     * @returns a resolved promise, or a rejection when no trial job maintainer
     *          exists (the status is set to 'STOPPING' in both cases)
     */
    public stopExperiment(): Promise<void> {
        this.status.status = 'STOPPING';

        const maintainer: TrialJobs | undefined = this.trialJobsMaintainer;
        if (maintainer === undefined) {
            return Promise.reject(new Error('Error: undefined trialJobsMaintainer'));
        }
        maintainer.setStopLoop();

        return Promise.resolve();
    }

217
    /**
     * Fetches metric records from the data store.
     *
     * @param trialJobId optional trial job id, passed through to the data store
     * @param metricType optional metric type, passed through to the data store
     */
    public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
        const records: MetricDataRecord[] = await this.dataStore.getMetricData(trialJobId, metricType);

        return records;
    }

    /**
     * Returns the in-memory experiment profile, wrapped in an already-resolved promise.
     */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        // TO DO: using Promise.resolve()
        const settled: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>();
        settled.resolve(this.experimentProfile);

        return settled.promise;
    }

229
230
231
232
    /**
     * Returns the manager's status object (the same instance that is kept
     * internally, not a copy).
     */
    public getStatus(): NNIManagerStatus {
        const current: NNIManagerStatus = this.status;

        return current;
    }

Deshui Yu's avatar
Deshui Yu committed
233
234
235
236
    /**
     * Lists trial jobs recorded in the data store.
     *
     * @param status optional status, passed through to the data store
     */
    public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = await this.dataStore.listTrialJobs(status);

        return jobs;
    }

237
238
    /**
     * Spawns the dispatcher process through a shell and wraps it in an IPC
     * interface. Does nothing when a dispatcher already exists.
     *
     * The child inherits the current environment plus NNI_MODE,
     * NNI_CHECKPOINT_DIRECTORY and NNI_LOG_DIRECTORY, writes to this process'
     * stdout/stderr, and gets two extra piped file descriptors.
     *
     * @param command shell command line that starts the dispatcher
     * @param cwd working directory; the log directory is used when undefined or empty
     * @param mode value exported as NNI_MODE
     * @param dataDirectory value exported as NNI_CHECKPOINT_DIRECTORY
     */
    private setupTuner(command: string, cwd: string | undefined, mode: 'start' | 'resume', dataDirectory: string): void {
        if (this.dispatcher !== undefined) {
            return;
        }

        const childStdio: (string | NodeJS.WriteStream)[] = ['ignore', process.stdout, process.stderr, 'pipe', 'pipe'];
        const workingDir: string = (cwd === undefined || cwd === '') ? getLogDir() : cwd;

        // TO DO: add CUDA_VISIBLE_DEVICES
        const childEnv = Object.assign({}, process.env, {
            NNI_MODE: mode,
            NNI_CHECKPOINT_DIRECTORY: dataDirectory,
            NNI_LOG_DIRECTORY: getLogDir()
        });

        const dispatcherProc: ChildProcess = spawn(command, [], {
            stdio: childStdio,
            cwd: workingDir,
            env: childEnv,
            shell: true
        });

        // The pid is kept so the process can be probed and killed during clean-up.
        this.dispatcherPid = dispatcherProc.pid;
        this.dispatcher = createDispatcherInterface(dispatcherProc);
    }

    /**
     * Changes the trial concurrency of the running experiment.
     *
     * An increase immediately requests the additional trials from the
     * dispatcher; otherwise the difference is added to
     * `trialConcurrencyReduction`, which is consumed as trials finish.
     *
     * @param trialConcurrency new concurrency value
     * @throws Error when concurrency is raised before a dispatcher exists
     */
    private updateTrialConcurrency(trialConcurrency: number): void {
        // TO DO: this method can only be called after startExperiment/resumeExperiment
        const previous: number = this.experimentProfile.params.trialConcurrency;

        if (trialConcurrency > previous) {
            if (this.dispatcher === undefined) {
                throw new Error('Error: tuner has to be initialized');
            }
            const extraTrials: number = trialConcurrency - previous;
            this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(extraTrials));
        } else {
            // we assume trialConcurrency >= 0, which is checked by restserver
            const fewerTrials: number = previous - trialConcurrency;
            this.trialConcurrencyReduction += fewerTrials;
        }

        this.experimentProfile.params.trialConcurrency = trialConcurrency;
    }

    /**
     * Stores a new maximum execution duration in the profile and, when a trial
     * job maintainer exists, forwards it there first.
     *
     * @param duration new maximum execution duration
     */
    private updateMaxExecDuration(duration: number): void {
        const maintainer: TrialJobs | undefined = this.trialJobsMaintainer;
        if (maintainer !== undefined) {
            maintainer.updateMaxExecDuration(duration);
        }

        this.experimentProfile.params.maxExecDuration = duration;
    }

    /**
     * Sends a new search space to the dispatcher and stores it in the profile.
     *
     * @param searchSpace new search space, forwarded verbatim
     * @throws Error when no dispatcher exists
     */
    private updateSearchSpace(searchSpace: string): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, searchSpace);
        this.experimentProfile.params.searchSpace = searchSpace;
    }

    /**
     * Shuts the experiment down: asks the dispatcher to terminate, waits for
     * its process to exit (killing it if it does not), cancels trial jobs that
     * are still RUNNING or WAITING, cleans up the training service, stamps the
     * end time on the profile and sets the status to 'STOPPED'.
     *
     * @throws Error when no dispatcher exists
     */
    private async experimentDoneCleanUp(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(TERMINATE);

        // gracefully terminate tuner and assessor here, wait at most 30 seconds.
        let tunerAlive: boolean = true;
        for (let i: number = 0; i < 30; i++) {
            try {
                // `kill -0` sends no signal; it only fails when the process cannot be signalled.
                await cpp.exec(`kill -0 ${this.dispatcherPid}`);
            } catch (error) {
                // The probe failed: stop waiting right away instead of sleeping one more second.
                tunerAlive = false;
                break;
            }
            await delay(1000);
        }

        // Only signal a process that was still answering the probe; a PID that is
        // already gone must not be targeted.
        if (tunerAlive) {
            try {
                await cpp.exec(`kill ${this.dispatcherPid}`);
            } catch (error) {
                // this.tunerPid does not exist, do nothing here
            }
        }

        const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
        // TO DO: to promise all
        for (const trialJob of trialJobList) {
            if (trialJob.status === 'RUNNING' ||
                trialJob.status === 'WAITING') {
                try {
                    await this.trainingService.cancelTrialJob(trialJob.id);
                } catch (error) {
                    // pid does not exist, do nothing here
                }
            }
        }
        await this.trainingService.cleanUp();

        this.experimentProfile.endTime = Date.now();
        await this.storeExperimentProfile();
        this.status.status = 'STOPPED';
    }

    /**
     * Every 10 minutes, sets the profile's `execDuration` to its value at the
     * time of the call plus the seconds elapsed since then, and stores the profile.
     * The loop has no exit condition, so the returned promise only settles if
     * storing the profile (or the delay) fails.
     */
    private async periodicallyUpdateExecDuration(): Promise<void> {
        const loopStartedAt: number = Date.now();
        const durationAtStart: number = this.experimentProfile.execDuration;
        const tenMinutesMs: number = 1000 * 60 * 10;

        for (; ;) {
            await delay(tenMinutesMs);
            const elapsedSeconds: number = (Date.now() - loopStartedAt) / 1000;
            this.experimentProfile.execDuration = durationAtStart + elapsedSeconds;
            await this.storeExperimentProfile();
        }
    }

    /**
     * Increments the profile revision and hands the profile to the data store.
     *
     * @returns the data store's promise
     */
    private storeExperimentProfile(): Promise<void> {
        const profile: ExperimentProfile = this.experimentProfile;
        profile.revision += 1;

        return this.dataStore.storeExperimentProfile(profile);
    }

358
    /**
     * Main loop of the experiment: creates the trial job maintainer, wires the
     * event listeners, sends the initial commands to the dispatcher, then waits
     * on the duration updater, the training service and the maintainer together.
     */
    private async run(): Promise<void> {
        const profile: ExperimentProfile = this.experimentProfile;
        const maintainer: TrialJobs = new TrialJobs(
            this.trainingService,
            profile.execDuration,
            profile.params.maxExecDuration);
        this.trialJobsMaintainer = maintainer;

        // The dispatcher must have been created by setupTuner before this point.
        assert(this.dispatcher !== undefined && this.trialJobsMaintainer !== undefined);

        this.addEventListeners();
        this.sendInitTunerCommands();

        await Promise.all([
            this.periodicallyUpdateExecDuration(),
            this.trainingService.run(),
            maintainer.run()
        ]);
    }

     /**
      * Subscribes to trial metrics (training service), trial job events
      * (maintainer) and commands (dispatcher). A failure inside any handler is
      * routed to `criticalError`.
      *
      * @throws Error when the dispatcher or the maintainer does not exist yet
      */
     private addEventListeners(): void {
        // TO DO: cannot run this method more than once in one NNIManager instance
        if (this.dispatcher === undefined || this.trialJobsMaintainer === undefined) {
            throw new Error('Error: tuner or job maintainer have not been setup');
        }

        // Shared sink for rejections coming out of the asynchronous handlers.
        const reportFailure: (err: Error) => void = (err: Error): void => {
            this.criticalError(err);
        };

        this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => {
            this.onTrialJobMetrics(metric).catch(reportFailure);
        });

        this.trialJobsMaintainer.on(async (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => {
            this.onTrialJobEvent(event, trialJobDetail).catch(reportFailure);
        });

        this.dispatcher.onCommand((commandType: string, content: string) => {
            this.onTunerCommand(commandType, content).catch(reportFailure);
        });
    }

    /**
     * Sends the two commands that open a session with the dispatcher: the
     * current search space, then a request for `trialConcurrency` trial jobs.
     *
     * @throws Error when no dispatcher exists, or when the trial concurrency
     *         was reduced before this point (the search space has already been
     *         sent by then)
     */
    private sendInitTunerCommands(): void {
        const dispatcher: IpcInterface | undefined = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        const params: ExperimentParams = this.experimentProfile.params;

        // TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
        this.log.debug(`Send tuner command: update search space: ${params.searchSpace}`);
        dispatcher.sendCommand(UPDATE_SEARCH_SPACE, params.searchSpace);

        if (this.trialConcurrencyReduction !== 0) {
            throw new Error('Error: cannot modify trialConcurrency before startExperiment');
        }

        this.log.debug(`Send tuner command: ${params.trialConcurrency}`);
        dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(params.trialConcurrency));
    }

    /**
     * Persists a reported metric and then forwards its data to the dispatcher.
     *
     * @param metric metric reported for a trial job
     * @throws Error when no dispatcher exists (the metric has already been stored by then)
     */
    private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
        const { id, data } = metric;
        await this.dataStore.storeMetricData(id, data);

        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
    }

    /**
     * Reacts to an event from the trial job maintainer.
     *
     * - Terminal trial states (SUCCEEDED, FAILED, USER_CANCELED, SYS_CANCELED):
     *   either consume one unit of pending concurrency reduction or, while
     *   `maxTrialNum` has not been reached, ask for a replacement trial (a queued
     *   customized trial first, otherwise a generated one); then notify the
     *   dispatcher with TRIAL_END and store the event.
     * - RUNNING: store the event.
     * - EXPERIMENT_DONE: run the clean-up.
     *
     * @param event event name
     * @param trialJobDetail trial job the event refers to; may be undefined for
     *        events that are not tied to a trial (the logging handles that case)
     * @throws Error when no dispatcher exists or the event is not recognized
     */
    private async onTrialJobEvent(event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail): Promise<void> {
        const logLine: string = trialJobDetail !== undefined
            ? `Job event: ${event}, id: ${trialJobDetail.id}`
            : `Job event: ${event}`;
        this.log.debug(logLine);

        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        switch (event) {
            case 'SUCCEEDED':
            case 'FAILED':
            case 'USER_CANCELED':
            case 'SYS_CANCELED': {
                if (this.trialConcurrencyReduction > 0) {
                    // A finished trial pays off one unit of the requested reduction.
                    this.trialConcurrencyReduction--;
                } else if (this.currSubmittedTrialNum < this.experimentProfile.params.maxTrialNum) {
                    if (this.customizedTrials.length > 0) {
                        const queuedParams: string | undefined = this.customizedTrials.shift();
                        this.dispatcher.sendCommand(ADD_CUSTOMIZED_TRIAL_JOB, queuedParams);
                    } else {
                        this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
                    }
                }
                const trialEndPayload: string = JSON.stringify({ trial_job_id: trialJobDetail.id, event });
                this.dispatcher.sendCommand(TRIAL_END, trialEndPayload);
                await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url);
                break;
            }
            case 'RUNNING': {
                await this.dataStore.storeTrialJobEvent(event, trialJobDetail.id, undefined, trialJobDetail.url);
                break;
            }
            case 'EXPERIMENT_DONE': {
                this.log.info('Experiment done, cleaning up...');
                await this.experimentDoneCleanUp();
                this.log.info('Experiment done.');
                break;
            }
            default:
                throw new Error('Error: unrecognized event from trialJobsMaintainer');
        }
    }
Deshui Yu's avatar
Deshui Yu committed
463

464
465
466
467
468
469
470
471
472
473
474
    /**
     * Handles a command received from the dispatcher.
     *
     * - NEW_TRIAL_JOB: while `maxTrialNum` has not been reached, submit a trial
     *   with `content` as its hyper-parameters, register it with the maintainer
     *   and store its initial event; flag "no more trials" once the limit is hit.
     * - SEND_TRIAL_JOB_PARAMETER: parse `content` as JSON and push a further
     *   hyper-parameter set to an existing trial job.
     * - NO_MORE_TRIAL_JOBS: flag "no more trials" on the maintainer.
     * - KILL_TRIAL_JOB: cancel the trial job whose id is the JSON-decoded `content`.
     *
     * @param commandType command identifier
     * @param content command payload as sent by the dispatcher
     * @throws Error when the maintainer does not exist or the command is unsupported
     */
    private async onTunerCommand(commandType: string, content: string): Promise<void> {
        this.log.info(`Command from tuner: ${commandType}, ${content}`);
        if (this.trialJobsMaintainer === undefined) {
            throw new Error('Error: trialJobsMaintainer not initialized');
        }

        switch (commandType) {
            case NEW_TRIAL_JOB: {
                const maxTrialNum: number = this.experimentProfile.params.maxTrialNum;
                if (this.currSubmittedTrialNum >= maxTrialNum) {
                    break;
                }
                // Counted before submission, so the limit check above sees it immediately.
                this.currSubmittedTrialNum++;
                const newTrialForm: TrialJobApplicationForm = {
                    jobType: 'TRIAL',
                    hyperParameters: {
                        value: content,
                        index: 0
                    }
                };
                const submitted: TrialJobDetail = await this.trainingService.submitTrialJob(newTrialForm);
                // The maintainer receives a shallow copy of the submitted job detail.
                this.trialJobsMaintainer.setTrialJob(submitted.id, Object.assign({}, submitted));
                assert(submitted.status === 'WAITING');
                await this.dataStore.storeTrialJobEvent(submitted.status, submitted.id, content, submitted.url);
                if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) {
                    this.trialJobsMaintainer.setNoMoreTrials();
                }
                break;
            }
            case SEND_TRIAL_JOB_PARAMETER: {
                const parsedCommand: any = JSON.parse(content);
                assert(parsedCommand.parameter_index >= 0);
                assert(parsedCommand.trial_job_id !== undefined);

                const extraParameterForm: TrialJobApplicationForm = {
                    jobType: 'TRIAL',
                    hyperParameters: {
                        value: content,
                        index: parsedCommand.parameter_index
                    }
                };
                await this.trainingService.updateTrialJob(parsedCommand.trial_job_id, extraParameterForm);
                await this.dataStore.storeTrialJobEvent(
                    'ADD_HYPERPARAMETER', parsedCommand.trial_job_id, content, undefined);
                break;
            }
            case NO_MORE_TRIAL_JOBS: {
                this.trialJobsMaintainer.setNoMoreTrials();
                break;
            }
            case KILL_TRIAL_JOB: {
                await this.trainingService.cancelTrialJob(JSON.parse(content));
                break;
            }
            default:
                throw new Error('Error: unsupported command type from tuner');
        }
    }

517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
    /**
     * Reports a failure of a background task: records it through `logError`
     * and also prints it with `console.error`.
     *
     * @param err the failure to report
     */
    private criticalError(err: Error): void {
        const failure: Error = err;
        this.logError(failure);
        console.error(failure);
    }

    /**
     * Writes an error to the log, appends its message to the status' error
     * list and switches the status to 'ERROR'.
     *
     * @param err the error to record
     */
    private logError(err: Error): void {
        // `stack` is optional on Error objects; fall back to the message so that an
        // error without a stack still reaches the log instead of being dropped.
        this.log.error(err.stack !== undefined ? err.stack : err.message);
        this.status.errors.push(err.message);
        this.status.status = 'ERROR';
    }

    /**
     * Builds the placeholder profile used before an experiment is started or
     * resumed: the current experiment id, revision 0, no execution time, and
     * blank/zero parameters.
     */
    private createEmptyExperimentProfile(): ExperimentProfile {
        const blankParams: ExperimentParams = {
            authorName: '',
            experimentName: '',
            trialConcurrency: 0,
            maxExecDuration: 0, // unit: second
            maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs
            trainingServicePlatform: '',
            searchSpace: '',
            tuner: {
                className: '',
                classArgs: {},
                checkpointDir: ''
            }
        };
        const blankProfile: ExperimentProfile = {
            id: getExperimentId(),
            revision: 0,
            execDuration: 0,
            params: blankParams
        };

        return blankProfile;
    }
}

export { NNIManager };