localTrainingService.ts 22.8 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5
6
7
8
9
10

'use strict';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import * as ts from 'tail-stream';
11
import * as tkill from 'tree-kill';
12
import { NNIError, NNIErrorNames } from '../../common/errors';
13
import { getExperimentId } from '../../common/experimentStartupInfo';
14
import { getLogger, Logger } from '../../common/log';
Deshui Yu's avatar
Deshui Yu committed
15
import {
16
    HyperParameters, TrainingService, TrialJobApplicationForm,
Deshui Yu's avatar
Deshui Yu committed
17
18
    TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
19
20
21
import {
    delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString
} from '../../common/utils';
22
23
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
24
import { execMkdir, execNewFile, getScriptName, runScript, setEnvironmentVariable } from '../common/util';
25
import { GPUScheduler } from './gpuScheduler';
Deshui Yu's avatar
Deshui Yu committed
26
27
28
29
30
31
32
33

/**
 * Decode the first command from a buffer of incoming data.
 *
 * A command is an 8-byte header followed by its content: bytes 0-1 hold the command type and
 * bytes 2-7 hold the content length in bytes, written as a decimal number.
 *
 * @param data binary incoming data
 * @returns a tuple of (success, commandType, content, remain)
 *          success: true if the buffer contains at least one complete command; otherwise false
 *          remain: remaining data after the first command (the unchanged input when success is false)
 */
// tslint:disable:newline-per-chained-call informative-docs
function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
    const headerLength: number = 8;
    if (data.length < headerLength) {
        return [false, '', '', data];
    }
    const commandType: string = data.slice(0, 2).toString();
    const contentLength: number = parseInt(data.slice(2, headerLength).toString(), 10);
    // A non-numeric or negative length cannot describe a command. Reporting success for it would
    // hand back a `remain` that does not advance, so a caller looping until the buffer is drained
    // would never terminate.
    if (Number.isNaN(contentLength) || contentLength < 0) {
        return [false, '', '', data];
    }
    const commandEnd: number = headerLength + contentLength;
    if (data.length < commandEnd) {
        // The content has not fully arrived yet.
        return [false, '', '', data];
    }
    const content: string = data.slice(headerLength, commandEnd).toString();
    const remain: Buffer = data.slice(commandEnd);

    return [true, commandType, content, remain];
}
49
// tslint:enable:newline-per-chained-call informative-docs
Deshui Yu's avatar
Deshui Yu committed
50
51
52
53
54
55
56

/**
 * Detail record of a trial job run by the local training service.
 * Besides the TrialJobDetail fields it tracks the process id of the trial
 * and the GPU indices assigned to it.
 */
class LocalTrialJobDetail implements TrialJobDetail {
    public id: string;
    public status: TrialJobStatus;
    public submitTime: number;
    public startTime?: number;
    public endTime?: number;
    public tags?: string[];
    public url?: string;
    public workingDirectory: string;
    public form: TrialJobApplicationForm;
    // Process id of the trial; stays undefined until a process has been assigned.
    public pid?: number;
    // GPU indices assigned to the trial; starts out empty.
    public gpuIndices?: number[];

    /**
     * @param id trial job id
     * @param status initial job status
     * @param submitTime submission timestamp
     * @param workingDirectory working directory of the trial
     * @param form job application form
     */
    constructor(
        id: string, status: TrialJobStatus, submitTime: number,
        workingDirectory: string, form: TrialJobApplicationForm) {
        // Values supplied by the caller.
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;

        // Values derived here.
        const fileUrl: string = `file://localhost:${workingDirectory}`;
        this.url = fileUrl;
        this.gpuIndices = [];
    }
}

/**
 * Configuration of the local training service.
 * Every option is optional; an option that is not supplied is left unset.
 */
class LocalConfig {
    public maxTrialNumPerGpu?: number;
    public gpuIndices?: string;
    public useActiveGpu?: boolean;

    /**
     * @param gpuIndices GPU indices as a single string
     * @param maxTrialNumPerGpu maximum number of trials per GPU
     * @param useActiveGpu value for the useActiveGpu option
     */
    constructor(gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
        // Assign only what was explicitly passed, so omitted options never become own properties.
        if (gpuIndices !== undefined) { this.gpuIndices = gpuIndices; }
        if (maxTrialNumPerGpu !== undefined) { this.maxTrialNumPerGpu = maxTrialNumPerGpu; }
        if (useActiveGpu !== undefined) { this.useActiveGpu = useActiveGpu; }
    }
}

/**
 * Local machine training service
 */
class LocalTrainingService implements TrainingService {
104
105
106
    private readonly eventEmitter: EventEmitter;
    private readonly jobMap: Map<string, LocalTrialJobDetail>;
    private readonly jobQueue: string[];
Deshui Yu's avatar
Deshui Yu committed
107
108
109
    private initialized: boolean;
    private stopping: boolean;
    private rootDir!: string;
suiguoxin's avatar
suiguoxin committed
110
    private readonly experimentId! : string;
111
    private gpuScheduler!: GPUScheduler;
112
    private readonly occupiedGpuIndexNumMap: Map<number, number>;
113
    private designatedGpuIndices!: Set<number>;
114
    private readonly log: Logger;
115
    private localTrialConfig?: TrialConfig;
116
    private localConfig?: LocalConfig;
117
    private isMultiPhase: boolean;
118
    private readonly jobStreamMap: Map<string, ts.Stream>;
119
120
    private maxTrialNumPerGpu: number;
    private useActiveGpu: boolean;
Deshui Yu's avatar
Deshui Yu committed
121
122
123
124
125
126
127
128

    /**
     * Create the service with empty job bookkeeping and default GPU settings.
     */
    constructor() {
        // Collaborators obtained from the common modules.
        this.log = getLogger();
        this.experimentId = getExperimentId();
        this.eventEmitter = new EventEmitter();

        // Job and GPU bookkeeping, all empty to begin with.
        this.jobMap = new Map<string, LocalTrialJobDetail>();
        this.jobQueue = [];
        this.jobStreamMap = new Map<string, ts.Stream>();
        this.occupiedGpuIndexNumMap = new Map<number, number>();

        // Flags and defaults.
        this.initialized = false;
        this.stopping = false;
        this.isMultiPhase = false;
        this.maxTrialNumPerGpu = 1;
        this.useActiveGpu = false;

        this.log.info('Construct local machine training service.');
    }

    /**
     * Run the service: start the job loop and, when a GPU scheduler exists, the scheduler as well,
     * then wait until all of them have finished.
     */
    public async run(): Promise<void> {
        this.log.info('Run local machine training service.');

        const jobLoop: Promise<void> = this.runJobLoop();
        const longRunningTasks: Promise<void>[] = this.gpuScheduler === undefined
            ? [jobLoop]
            : [jobLoop, this.gpuScheduler.run()];
        await Promise.all(longRunningTasks);

        this.log.info('Local machine training service exit.');
    }

    /**
     * List every known trial job, refreshing each one through getTrialJob.
     * @returns details of all trial jobs, in the iteration order of the job map
     */
    public async listTrialJobs(): Promise<TrialJobDetail[]> {
        const details: TrialJobDetail[] = [];
        // Jobs are refreshed one after another, not in parallel.
        for (const jobId of this.jobMap.keys()) {
            details.push(await this.getTrialJob(jobId));
        }

        return details;
    }

    /**
     * Get the detail of one trial job, updating its status first when it is marked RUNNING
     * but its process is no longer alive.
     * @param trialJobId trial job id
     * @returns the (possibly updated) trial job detail
     * @throws NNIError (NOT_FOUND) when the job id is unknown
     */
    public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const job: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (job === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
        }
        // Nothing to refresh unless the job claims to be running and its process is gone.
        if (job.status !== 'RUNNING' || await isAlive(job.pid)) {
            return job;
        }

        // Assume failure until the state file says otherwise.
        job.endTime = Date.now();
        this.setTrialJobStatus(job, 'FAILED');
        try {
            const statePath: string = path.join(job.workingDirectory, '.nni', 'state');
            const stateText: string = await fs.promises.readFile(statePath, 'utf8');
            // Expected content: "<exit code> <timestamp>".
            const parsed: RegExpExecArray | null = /^(\d+)\s+(\d+)/.exec(stateText.trim());
            if (parsed !== null) {
                const exitCode: string = parsed[1];
                const finishedAt: string = parsed[2];
                if (parseInt(exitCode, 10) === 0) {
                    this.setTrialJobStatus(job, 'SUCCEEDED');
                }
                job.endTime = parseInt(finishedAt, 10);
            }
        } catch (error) {
            // Best effort: a missing or unreadable state file leaves the job as FAILED.
        }
        this.log.debug(`trialJob status update: ${trialJobId}, ${job.status}`);

        return job;
    }

    /**
     * Subscribe a listener to the 'metric' events emitted by this service.
     * @param listener callback receiving each trial job metric
     */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        const metricEvent: string = 'metric';
        this.eventEmitter.on(metricEvent, listener);
    }

    /**
     * Unsubscribe a listener from the 'metric' events emitted by this service.
     * @param listener callback previously passed to addTrialJobMetricListener
     */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        const metricEvent: string = 'metric';
        this.eventEmitter.off(metricEvent, listener);
    }

197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
    /**
     * Register a new trial job in WAITING state and append it to the job queue.
     * The job gets a freshly generated id and a working directory under <rootDir>/trials/<id>.
     * @param form job application form
     * @returns a promise resolved with the detail of the new job
     */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const newJobId: string = uniqueString(5);
        const submittedAt: number = Date.now();
        const workingDirectory: string = path.join(this.rootDir, 'trials', newJobId);
        const newJob: LocalTrialJobDetail = new LocalTrialJobDetail(newJobId, 'WAITING', submittedAt, workingDirectory, form);

        this.jobMap.set(newJobId, newJob);
        this.jobQueue.push(newJobId);

        this.log.debug(`submitTrialJob: return: ${JSON.stringify(newJob)} `);

        return Promise.resolve(newJob);
    }

214
215
216
217
218
    /**
     * Update a trial job for multi-phase: write the hyper-parameters of the given form
     * as a parameter file into the job's working directory.
     * @param trialJobId trial job id
     * @param form job application form
     * @returns the detail of the updated job
     * @throws Error when the job id is unknown
     */
    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const existingJob: undefined | TrialJobDetail = this.jobMap.get(trialJobId);
        if (existingJob !== undefined) {
            await this.writeParameterFile(existingJob.workingDirectory, form.hyperParameters);

            return existingJob;
        }
        throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
    }

    /**
     * Whether the current training service supports multi-phase jobs.
     * @returns always true for this service
     */
    public get isMultiPhaseJobSupported(): boolean {
        const supported: boolean = true;

        return supported;
    }

QuanluZhang's avatar
QuanluZhang committed
236
    /**
     * Cancel a trial job.
     * A job without a process id is simply marked USER_CANCELED; otherwise its process tree is
     * killed with SIGKILL and the status is taken from getJobCancelStatus(isEarlyStopped).
     * @param trialJobId trial job id
     * @param isEarlyStopped forwarded to getJobCancelStatus to pick the resulting status
     * @throws NNIError (NOT_FOUND) when the job id is unknown
     */
    public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
        const job: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (job === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
        }

        if (job.pid !== undefined) {
            tkill(job.pid, 'SIGKILL');
            this.setTrialJobStatus(job, getJobCancelStatus(isEarlyStopped));
        } else {
            // No process id recorded, so there is nothing to kill.
            this.setTrialJobStatus(job, 'USER_CANCELED');
        }
    }

    /**
     * Apply one piece of cluster metadata to this training service.
     *
     * On the first call the experiment root directory is resolved and created if it is missing.
     * Recognized keys:
     *  - TRIAL_CONFIG: JSON trial config; creates the GPU scheduler when gpuNum is greater than 0
     *  - LOCAL_CONFIG: JSON local config (gpuIndices, maxTrialNumPerGpu, useActiveGpu)
     *  - MULTI_PHASE: 'true' / 'True' enables multi-phase, anything else disables it
     * Any other key is ignored.
     *
     * @param key metadata key
     * @param value metadata value (JSON text for the config keys)
     * @throws Error when the trial config cannot be used, or when gpuIndices is specified
     *         but contains no usable index
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        if (!this.initialized) {
            this.rootDir = getExperimentRootDir();
            // tslint:disable-next-line:non-literal-fs-path
            if (!fs.existsSync(this.rootDir)) {
                // Create the directory through the fs API instead of shelling out to PowerShell:
                // this works on every platform and is not affected by spaces in the path.
                // tslint:disable-next-line:non-literal-fs-path
                await fs.promises.mkdir(this.rootDir, { recursive: true });
            }
            this.initialized = true;
        }
        switch (key) {
            case TrialConfigMetadataKey.TRIAL_CONFIG:
                this.localTrialConfig = <TrialConfig>JSON.parse(value);
                // Parse trial config failed, throw Error
                if (this.localTrialConfig === undefined) {
                    throw new Error('trial config parsed failed');
                }
                if (this.localTrialConfig.gpuNum !== undefined) {
                    this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
                    // The scheduler is created at most once, and only when GPUs are requested.
                    if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
                        this.gpuScheduler = new GPUScheduler();
                    }
                }
                break;
            case TrialConfigMetadataKey.LOCAL_CONFIG:
                this.localConfig = <LocalConfig>JSON.parse(value);
                this.log.info(`Specified GPU indices: ${this.localConfig.gpuIndices}`);
                if (this.localConfig.gpuIndices !== undefined) {
                    // ''.split(',') yields [''], which parses to NaN. Drop entries that are not
                    // numbers so that an empty or blank specification is actually detected below.
                    const parsedIndices: number[] = this.localConfig.gpuIndices.split(',')
                        .map((x: string) => parseInt(x, 10))
                        .filter((index: number) => !Number.isNaN(index));
                    if (parsedIndices.length === 0) {
                        throw new Error('gpuIndices can not be empty if specified.');
                    }
                    this.designatedGpuIndices = new Set(parsedIndices);
                }
                if (this.localConfig.maxTrialNumPerGpu !== undefined) {
                    this.maxTrialNumPerGpu = this.localConfig.maxTrialNumPerGpu;
                }

                if (this.localConfig.useActiveGpu !== undefined) {
                    this.useActiveGpu = this.localConfig.useActiveGpu;
                }
                break;
            case TrialConfigMetadataKey.MULTI_PHASE:
                this.isMultiPhase = (value === 'true' || value === 'True');
                break;
            default:
        }
    }

    /**
     * Read back cluster metadata. Only the trial config key is supported.
     * @param key metadata key
     * @returns a promise resolved with the JSON text of the trial config; rejected with
     *          NNIError (NOT_FOUND) for any other key or when the trial config was never set
     */
    public getClusterMetadata(key: string): Promise<string> {
        if (key !== TrialConfigMetadataKey.TRIAL_CONFIG) {
            return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found'));
        }
        if (this.localTrialConfig === undefined) {
            return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
        }

        return Promise.resolve(JSON.stringify(this.localTrialConfig));
    }

316
    /**
     * Stop the service: raise the stopping flag, end every open job stream and,
     * when a GPU scheduler exists, stop it.
     */
    public async cleanUp(): Promise<void> {
        this.log.info('Stopping local machine training service...');
        this.stopping = true;

        this.jobStreamMap.forEach((jobStream: ts.Stream) => {
            jobStream.end(0);
            jobStream.emit('end');
        });

        if (this.gpuScheduler !== undefined) {
            await this.gpuScheduler.stop();
        }
    }

330
    /**
     * React to a status change of a trial job: close the job's stream once it reached a final
     * status, and give back its GPUs when it just left the RUNNING status.
     * @param trialJob the job, already carrying its new status
     * @param oldStatus the status the job had before the change
     * @throws Error when the stream or the GPU bookkeeping is inconsistent
     */
    private onTrialJobStatusChanged(trialJob: LocalTrialJobDetail, oldStatus: TrialJobStatus): void {
        // The job no longer runs in any of these statuses, so its stream is destroyed.
        const finishedStatuses: string[] = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED', 'EARLY_STOPPED'];
        if (finishedStatuses.includes(trialJob.status) && this.jobStreamMap.has(trialJob.id)) {
            const jobStream: ts.Stream | undefined = this.jobStreamMap.get(trialJob.id);
            if (jobStream === undefined) {
                throw new Error(`Could not find stream in trial ${trialJob.id}`);
            }
            //Refer https://github.com/Juul/tail-stream/issues/20
            jobStream.end(0);
            jobStream.emit('end');
            this.jobStreamMap.delete(trialJob.id);
        }

        const holdsGpus: boolean = trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0;
        const leftRunning: boolean = oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING';
        if (!holdsGpus || this.gpuScheduler === undefined || !leftRunning) {
            return;
        }
        for (const gpuIndex of <number[]>trialJob.gpuIndices) {
            const occupants: number | undefined = this.occupiedGpuIndexNumMap.get(gpuIndex);
            if (occupants === undefined) {
                throw new Error(`gpu resource schedule error`);
            }
            if (occupants === 1) {
                // Last trial on this GPU: drop the entry altogether.
                this.occupiedGpuIndexNumMap.delete(gpuIndex);
            } else {
                this.occupiedGpuIndexNumMap.set(gpuIndex, occupants - 1);
            }
        }
    }

360
361
    /**
     * Build the environment variables for a trial.
     * CUDA_VISIBLE_DEVICES is added only when gpuNum is defined: '-1' when there is no GPU
     * scheduler, otherwise the comma-joined GPU indices of the resource.
     * @param trialJobDetail the trial the variables are for
     * @param resource resource assigned to the trial
     * @param gpuNum GPU number of the trial config, if any
     * @returns list of key/value pairs
     */
    private getEnvironmentVariables(
        trialJobDetail: TrialJobDetail,
        resource: { gpuIndices: number[] },
        gpuNum: number | undefined): { key: string; value: string }[] {
        const workDir: string = trialJobDetail.workingDirectory;
        const envVariables: { key: string; value: string }[] = [
            { key: 'NNI_PLATFORM', value: 'local' },
            { key: 'NNI_EXP_ID', value: this.experimentId },
            { key: 'NNI_SYS_DIR', value: workDir },
            { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
            { key: 'NNI_OUTPUT_DIR', value: workDir },
            { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() },
            { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
        ];
        if (gpuNum === undefined) {
            return envVariables;
        }

        let visibleDevices: string = '-1';
        if (this.gpuScheduler !== undefined) {
            visibleDevices = resource.gpuIndices.join(',');
        }
        envVariables.push({ key: 'CUDA_VISIBLE_DEVICES', value: visibleDevices });

        return envVariables;
    }

    /**
     * Record on the trial job which GPU indices were assigned to it.
     * @param trialJobDetail the trial job to update
     * @param resource resource assigned to the trial
     */
    private setExtraProperties(trialJobDetail: LocalTrialJobDetail, resource: { gpuIndices: number[] }): void {
        const { gpuIndices } = resource;
        trialJobDetail.gpuIndices = gpuIndices;
    }

    /**
     * Try to find the resource for the next trial.
     * Without a GPU scheduler an empty resource is always granted. Otherwise the GPUs reported as
     * available are narrowed to those below maxTrialNumPerGpu and, if designated indices are set,
     * to the designated ones; the first gpuNum of them are granted.
     * @returns a tuple of (success, resource); resource has no GPU indices when success is false
     * @throws Error when the trial config is not set
     */
    private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
        if (this.localTrialConfig === undefined) {
            throw new Error('localTrialConfig is not initialized!');
        }

        const resource: { gpuIndices: number[] } = { gpuIndices: [] };
        if (this.gpuScheduler === undefined) {
            return [true, resource];
        }

        const availableGpuIndices: number[] = this.gpuScheduler.getAvailableGPUIndices(this.useActiveGpu, this.occupiedGpuIndexNumMap);
        // Keep the GPUs that can still take one more trial.
        let candidates: number[] = availableGpuIndices.filter((gpuIndex: number) => {
            const occupants: number | undefined = this.occupiedGpuIndexNumMap.get(gpuIndex);

            return occupants === undefined || occupants < this.maxTrialNumPerGpu;
        });

        if (this.designatedGpuIndices !== undefined) {
            this.checkSpecifiedGpuIndices();
            candidates = candidates.filter((gpuIndex: number) => this.designatedGpuIndices.has(gpuIndex));
        }

        if (candidates.length < this.localTrialConfig.gpuNum) {
            return [false, resource];
        }

        // Cut the list down to the first gpuNum candidates.
        candidates.splice(this.localTrialConfig.gpuNum);
        resource.gpuIndices = candidates;

        return [true, resource];
    }

421
422
423
424
425
426
427
428
429
    /**
     * Verify that every designated GPU index is below the GPU count reported by the scheduler.
     * @throws Error naming the first designated index that is not below that count
     */
    private checkSpecifiedGpuIndices(): void {
        const systemGpuCount: number = this.gpuScheduler.getSystemGpuCount();
        if (this.designatedGpuIndices === undefined) {
            return;
        }
        this.designatedGpuIndices.forEach((gpuIndex: number) => {
            if (gpuIndex >= systemGpuCount) {
                throw new Error(`Specified GPU index not found: ${gpuIndex}`);
            }
        });
    }

432
433
434
    /**
     * Count one more trial on every GPU index of the resource.
     * Does nothing when there is no GPU scheduler.
     * @param resource resource being handed to a trial
     */
    private occupyResource(resource: {gpuIndices: number[]}): void {
        if (this.gpuScheduler === undefined) {
            return;
        }
        for (const gpuIndex of resource.gpuIndices) {
            const occupants: number | undefined = this.occupiedGpuIndexNumMap.get(gpuIndex);
            this.occupiedGpuIndexNumMap.set(gpuIndex, occupants === undefined ? 1 : occupants + 1);
        }
    }

445
446
447
448
449
450
451
452
453
454
    /**
     * Main scheduling loop. Until the service is stopping, it works through the job queue from
     * the front: a WAITING job is started as soon as resource is available, any other queue
     * entry is dropped. When the head job cannot get resource, or the queue is empty, the loop
     * waits 5 seconds before looking again.
     */
    private async runJobLoop(): Promise<void> {
        while (!this.stopping) {
            while (!this.stopping && this.jobQueue.length !== 0) {
                const headJobId: string = this.jobQueue[0];
                const headJob: LocalTrialJobDetail | undefined = this.jobMap.get(headJobId);
                const isWaiting: boolean = headJob !== undefined && headJob.status === 'WAITING';
                if (isWaiting) {
                    const [granted, resource] = this.tryGetAvailableResource();
                    if (!granted) {
                        // Leave the job at the head of the queue and retry after the delay.
                        break;
                    }
                    this.occupyResource(resource);
                    await this.runTrialJob(headJobId, resource);
                }
                this.jobQueue.shift();
            }
            await delay(5000);
        }
    }

    /**
     * Set the status of a trial job and run the status-change handler.
     * Setting the status the job already has is a no-op.
     * @param trialJob the job to update
     * @param newStatus the status to set
     */
    private setTrialJobStatus(trialJob: LocalTrialJobDetail, newStatus: TrialJobStatus): void {
        const previousStatus: TrialJobStatus = trialJob.status;
        if (previousStatus === newStatus) {
            return;
        }
        trialJob.status = newStatus;
        this.onTrialJobStatusChanged(trialJob, previousStatus);
    }

473
    /**
     * Build the script lines that run the trial command and then record "<exit code> <timestamp>"
     * in the .nni/state file of the working directory. The stderr of the command is redirected
     * to the 'stderr' file of the working directory.
     * @param localTrialConfig trial config providing the command
     * @param workingDirectory working directory of the trial
     * @returns script lines: PowerShell statements on win32, shell statements otherwise
     */
    private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
        const stderrPath: string = path.join(workingDirectory, 'stderr');
        const statePath: string = path.join(workingDirectory, '.nni', 'state');

        if (process.platform === 'win32') {
            return [
                `cmd.exe /c ${localTrialConfig.command} 2>"${stderrPath}"`,
                `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
                `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
                `Write $LASTEXITCODE " " $NOW_DATE  | Out-File "${statePath}" -NoNewline -encoding utf8`
            ];
        }

        // https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
        // On darwin, considering the worst case, write 999 to avoid negative duration
        const dateFormat: string = process.platform === 'darwin' ? '+%s999' : '+%s%3N';

        return [
            `eval ${localTrialConfig.command} 2>"${stderrPath}"`,
            `echo $? \`date ${dateFormat}\` >'${statePath}'`
        ];
    }

495
    /**
     * Start a trial job: generate its run script, create its working directory and files,
     * launch the script, mark the job RUNNING and start forwarding its metrics.
     *
     * Metrics are read by tailing <workingDirectory>/.nni/metrics; every complete command
     * decoded from that file is emitted as a 'metric' event carrying the job id and the content.
     *
     * @param trialJobId trial job id
     * @param resource resource assigned to the trial
     * @throws Error when the job id is unknown or the trial config is not set
     */
    private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
        const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            // Fail with a clear message instead of a TypeError further down.
            throw new Error(`runTrialJob failed: ${trialJobId} not found`);
        }
        if (this.localTrialConfig === undefined) {
            throw new Error(`localTrialConfig not initialized!`);
        }
        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);

        // Assemble the run script: shebang (non-Windows), cd into the code directory,
        // environment variables, then the trial command with its state bookkeeping.
        const runScriptContent: string[] = [];
        if (process.platform !== 'win32') {
            runScriptContent.push('#!/bin/bash');
        }
        runScriptContent.push(`cd '${this.localTrialConfig.codeDir}'`);
        for (const variable of variables) {
            runScriptContent.push(setEnvironmentVariable(variable));
        }
        runScriptContent.push(...this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory));

        // Prepare the working directory before anything is launched.
        await execMkdir(trialJobDetail.workingDirectory);
        await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
        await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
        const scriptName: string = getScriptName('run');
        await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName),
                                    runScriptContent.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
        await this.writeParameterFile(trialJobDetail.workingDirectory, trialJobDetail.form.hyperParameters);

        const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName));
        this.setTrialJobStatus(trialJobDetail, 'RUNNING');
        trialJobDetail.startTime = Date.now();
        trialJobDetail.pid = trialJobProcess.pid;
        this.setExtraProperties(trialJobDetail, resource);

        // Bytes that do not yet form a complete command are kept until more data arrives.
        let buffer: Buffer = Buffer.alloc(0);
        const stream: ts.Stream = ts.createReadStream(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
        stream.on('data', (data: Buffer) => {
            buffer = Buffer.concat([buffer, data]);
            while (buffer.length > 0) {
                const [success, , content, remain] = decodeCommand(buffer);
                if (!success) {
                    break;
                }
                this.eventEmitter.emit('metric', {
                    id: trialJobDetail.id,
                    data: content
                });
                this.log.debug(`Sending metrics, job id: ${trialJobDetail.id}, metrics: ${content}`);
                buffer = remain;
            }
        });
        // Registered so the stream can be ended when the job finishes or the service stops.
        this.jobStreamMap.set(trialJobDetail.id, stream);
    }

chicm-ms's avatar
chicm-ms committed
550
    /**
     * Write the value of the hyper-parameters as a UTF-8 file into the given directory.
     * The file name is produced by generateParamFileName.
     * @param directory directory receiving the parameter file
     * @param hyperParameters hyper-parameters to write
     */
    private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise<void> {
        const paramFileName: string = generateParamFileName(hyperParameters);
        const paramFilePath: string = path.join(directory, paramFileName);
        await fs.promises.writeFile(paramFilePath, hyperParameters.value, { encoding: 'utf8' });
    }
Deshui Yu's avatar
Deshui Yu committed
554
555
556
}

export { LocalTrainingService };