localTrainingService.ts 23.9 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import * as ts from 'tail-stream';
27
import * as tkill from 'tree-kill';
28
import { NNIError, NNIErrorNames } from '../../common/errors';
29
import { getExperimentId } from '../../common/experimentStartupInfo';
30
import { getLogger, Logger } from '../../common/log';
Deshui Yu's avatar
Deshui Yu committed
31
import {
32
    HyperParameters, TrainingService, TrialJobApplicationForm,
Deshui Yu's avatar
Deshui Yu committed
33
34
    TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
35
36
37
import {
    delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString
} from '../../common/utils';
38
39
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
40
import { execMkdir, execNewFile, getScriptName, runScript, setEnvironmentVariable } from '../common/util';
41
import { GPUScheduler } from './gpuScheduler';
Deshui Yu's avatar
Deshui Yu committed
42
43
44
45
46
47
48
49

/**
 * Decode a command
 * @param data binary incoming data
 * @returns a tuple of (success, commandType, content, remain)
 *          success: true if the buffer contains at least one complete command; otherwise false
 *          remain: remaining data after the first command
 */
50
// tslint:disable:newline-per-chained-call informative-docs
Deshui Yu's avatar
Deshui Yu committed
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/**
 * Decode the first command at the head of `data`.
 *
 * Layout as read here: bytes 0-1 hold the command type, bytes 2-7 hold the content length
 * (decimal, counted in bytes), and the content follows the 8-byte header.
 *
 * @param data binary incoming data, possibly holding a partial command or several commands
 * @returns a tuple of (success, commandType, content, remain)
 *          success: true only if `data` starts with one complete, well-formed command
 *          remain: bytes following the first command; the untouched input when success is false
 */
function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
    const headerLength: number = 8;
    if (data.length < headerLength) {
        return [false, '', '', data];
    }
    const commandType: string = data.slice(0, 2).toString();
    const contentLength: number = parseInt(data.slice(2, headerLength).toString(), 10);
    // A malformed length field must not be reported as a decoded command: with NaN the
    // slices below would yield an empty content and hand back the whole input as `remain`,
    // so a caller that loops until the buffer is drained would never make progress.
    if (Number.isNaN(contentLength) || contentLength < 0) {
        return [false, '', '', data];
    }
    const commandEnd: number = headerLength + contentLength;
    if (data.length < commandEnd) {
        // Header is complete but the content has not fully arrived yet.
        return [false, '', '', data];
    }
    const content: string = data.slice(headerLength, commandEnd).toString();
    const remain: Buffer = data.slice(commandEnd);

    return [true, commandType, content, remain];
}
65
// tslint:enable:newline-per-chained-call informative-docs
Deshui Yu's avatar
Deshui Yu committed
66
67
68
69
70
71
72

/**
 * Trial job record kept by the local training service.
 * Besides the TrialJobDetail fields it carries the process id of the launched trial
 * and the GPU indices assigned to it.
 */
class LocalTrialJobDetail implements TrialJobDetail {
    public id: string;
    public status: TrialJobStatus;
    public submitTime: number;
    public startTime?: number;
    public endTime?: number;
    public tags?: string[];
    public url?: string;
    public workingDirectory: string;
    public form: TrialJobApplicationForm;
    // Set once the trial process has been launched.
    public pid?: number;
    // GPU indices assigned to this trial; starts out empty.
    public gpuIndices?: number[];

    /**
     * @param id trial job id
     * @param status initial job status
     * @param submitTime submission time stamp
     * @param workingDirectory directory the trial runs in; also used to build `url`
     * @param form job application form
     */
    constructor(
        id: string,
        status: TrialJobStatus,
        submitTime: number,
        workingDirectory: string,
        form: TrialJobApplicationForm
    ) {
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;
        this.url = `file://localhost:${workingDirectory}`;
        this.gpuIndices = [];
    }
}

/**
 * Local training service config.
 * Every setting is optional; a setting that is not supplied is left unassigned.
 */
class LocalConfig {
    public maxTrialNumPerGpu?: number;
    public gpuIndices?: string;
    public useActiveGpu?: boolean;

    /**
     * @param gpuIndices GPU indices as a single string
     * @param maxTrialNumPerGpu maximum number of trials per GPU
     * @param useActiveGpu value for the useActiveGpu setting
     */
    constructor(gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
        // Assign only what was actually provided so absent settings stay unset.
        if (gpuIndices !== undefined) {
            this.gpuIndices = gpuIndices;
        }
        if (maxTrialNumPerGpu !== undefined) {
            this.maxTrialNumPerGpu = maxTrialNumPerGpu;
        }
        if (useActiveGpu !== undefined) {
            this.useActiveGpu = useActiveGpu;
        }
    }
}

/**
chicm-ms's avatar
chicm-ms committed
117
 * Local machine training service
Deshui Yu's avatar
Deshui Yu committed
118
119
 */
class LocalTrainingService implements TrainingService {
120
121
122
    private readonly eventEmitter: EventEmitter;
    private readonly jobMap: Map<string, LocalTrialJobDetail>;
    private readonly jobQueue: string[];
Deshui Yu's avatar
Deshui Yu committed
123
124
125
    private initialized: boolean;
    private stopping: boolean;
    private rootDir!: string;
suiguoxin's avatar
suiguoxin committed
126
    private readonly experimentId! : string;
127
    private gpuScheduler!: GPUScheduler;
128
    private readonly occupiedGpuIndexNumMap: Map<number, number>;
129
    private designatedGpuIndices!: Set<number>;
130
    private readonly log: Logger;
131
    private localTrialConfig?: TrialConfig;
132
    private localConfig?: LocalConfig;
133
    private isMultiPhase: boolean;
134
    private readonly jobStreamMap: Map<string, ts.Stream>;
135
136
    private maxTrialNumPerGpu: number;
    private useActiveGpu: boolean;
Deshui Yu's avatar
Deshui Yu committed
137
138
139
140
141
142
143
144

    /**
     * Set up empty job bookkeeping and default settings.
     * No GPU scheduler, root directory or trial config is set here.
     */
    constructor() {
        // Job bookkeeping
        this.eventEmitter = new EventEmitter();
        this.jobMap = new Map<string, LocalTrialJobDetail>();
        this.jobQueue = [];
        this.jobStreamMap = new Map<string, ts.Stream>();
        this.occupiedGpuIndexNumMap = new Map<number, number>();

        // Lifecycle flags
        this.initialized = false;
        this.stopping = false;

        // Defaults
        this.maxTrialNumPerGpu = 1;
        this.useActiveGpu = false;
        this.isMultiPhase = false;

        this.log = getLogger();
        this.experimentId = getExperimentId();
        this.log.info('Construct local machine training service.');
    }

    /**
     * Run the job loop, plus the GPU scheduler when one has been created,
     * and resolve once all of them have finished.
     */
    public async run(): Promise<void> {
        this.log.info('Run local machine training service.');
        const jobLoop: Promise<void> = this.runJobLoop();
        const longRunningTasks: Promise<void>[] = this.gpuScheduler === undefined
            ? [jobLoop]
            : [jobLoop, this.gpuScheduler.run()];
        await Promise.all(longRunningTasks);
        this.log.info('Local machine training service exit.');
    }

    /**
     * List every known trial job, refreshing each one through getTrialJob.
     * Jobs are refreshed one after another, in map iteration order.
     */
    public async listTrialJobs(): Promise<TrialJobDetail[]> {
        const details: TrialJobDetail[] = [];
        for (const trialJobId of this.jobMap.keys()) {
            details.push(await this.getTrialJob(trialJobId));
        }

        return details;
    }

    /**
     * Look up a trial job and, if it is marked RUNNING but its process is gone,
     * settle its final status.
     *
     * A finished job is first marked FAILED with the current time as end time. If the
     * `.nni/state` file in the working directory starts with "<exit code> <timestamp>",
     * an exit code of 0 upgrades the status to SUCCEEDED and the timestamp becomes the end time.
     * Failures while reading the state file are ignored.
     *
     * @param trialJobId trial job id
     * @throws NNIError (NOT_FOUND) when the id is unknown
     */
    public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const trialJob: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (trialJob === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
        }
        if (trialJob.status !== 'RUNNING') {
            return trialJob;
        }
        const alive: boolean = await isAlive(trialJob.pid);
        if (alive) {
            return trialJob;
        }

        // Process has exited: assume failure until the state file says otherwise.
        trialJob.endTime = Date.now();
        this.setTrialJobStatus(trialJob, 'FAILED');
        try {
            const statePath: string = path.join(trialJob.workingDirectory, '.nni', 'state');
            const state: string = await fs.promises.readFile(statePath, 'utf8');
            const match: RegExpExecArray | null = /^(\d+)\s+(\d+)/.exec(state.trim());
            if (match !== null) {
                const [, code, timestamp] = match;
                if (parseInt(code, 10) === 0) {
                    this.setTrialJobStatus(trialJob, 'SUCCEEDED');
                }
                trialJob.endTime = parseInt(timestamp, 10);
            }
        } catch (error) {
            //ignore
        }
        this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);

        return trialJob;
    }

    /**
     * Subscribe a listener to the 'metric' events emitted by this service.
     * @param listener callback receiving each trial job metric
     */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.eventEmitter.addListener('metric', listener);
    }

    /**
     * Unsubscribe a listener from the 'metric' events emitted by this service.
     * @param listener callback previously passed to addTrialJobMetricListener
     */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.eventEmitter.removeListener('metric', listener);
    }

213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
    /**
     * Register a new trial job in WAITING state and append it to the job queue.
     * The job's working directory is `<rootDir>/trials/<trialJobId>`.
     *
     * @param form job application form
     * @returns the newly created trial job detail
     */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const trialJobId: string = uniqueString(5);
        const submitTime: number = Date.now();
        const workingDirectory: string = path.join(this.rootDir, 'trials', trialJobId);
        const trialJobDetail: LocalTrialJobDetail =
            new LocalTrialJobDetail(trialJobId, 'WAITING', submitTime, workingDirectory, form);

        this.jobQueue.push(trialJobId);
        this.jobMap.set(trialJobId, trialJobDetail);

        this.log.debug(`submitTrialJob: return: ${JSON.stringify(trialJobDetail)} `);

        return Promise.resolve(trialJobDetail);
    }

230
231
232
233
234
    /**
     * Update trial job for multi-phase: write the form's hyper parameters
     * into the working directory of an existing trial job.
     * @param trialJobId trial job id
     * @param form job application form
     * @returns the trial job detail that was updated
     * @throws Error when no trial job with the given id exists
     */
    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const existingJob: undefined | TrialJobDetail = this.jobMap.get(trialJobId);
        if (existingJob === undefined) {
            throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
        }
        await this.writeParameterFile(existingJob.workingDirectory, form.hyperParameters);

        return existingJob;
    }

    /**
     * Whether multi-phase jobs are supported by this training service; always true here.
     */
    public get isMultiPhaseJobSupported(): boolean {
        return true;
    }

QuanluZhang's avatar
QuanluZhang committed
252
    /**
     * Cancel a trial job.
     *
     * A job without a recorded pid is simply marked USER_CANCELED. Otherwise its process
     * tree is sent SIGKILL and the status is set to whatever getJobCancelStatus returns
     * for `isEarlyStopped`.
     *
     * @param trialJobId trial job id
     * @param isEarlyStopped forwarded to getJobCancelStatus to choose the resulting status
     * @throws NNIError (NOT_FOUND) when the id is unknown
     */
    public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
        const trialJob: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (trialJob === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
        }

        if (trialJob.pid !== undefined) {
            tkill(trialJob.pid, 'SIGKILL');
            this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped));
        } else {
            // No process to kill.
            this.setTrialJobStatus(trialJob, 'USER_CANCELED');
        }
    }

    /**
     * Apply one piece of cluster metadata to this service.
     *
     * On the first call the experiment root directory is resolved and created if missing.
     * Recognized keys:
     *  - TRIAL_CONFIG: JSON trial config; a GPU scheduler is created when gpuNum > 0.
     *  - LOCAL_CONFIG: JSON local config (gpuIndices, maxTrialNumPerGpu, useActiveGpu).
     *  - MULTI_PHASE: 'true' / 'True' enables multi-phase, anything else disables it.
     * Any other key is ignored.
     *
     * @param key metadata key
     * @param value metadata value (JSON text for the config keys)
     * @throws Error when the trial config parses to null, or when gpuIndices is specified
     *         but contains no usable index
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        if (!this.initialized) {
            this.rootDir = getExperimentRootDir();
            // tslint:disable-next-line:non-literal-fs-path
            if (!fs.existsSync(this.rootDir)) {
                // Use the shared directory helper (the one runTrialJob uses for trial
                // directories) instead of invoking one platform's shell directly.
                await execMkdir(this.rootDir);
            }
            this.initialized = true;
        }
        switch (key) {
            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                const parsedTrialConfig: TrialConfig | null | undefined = <TrialConfig | null>JSON.parse(value);
                // JSON.parse never yields undefined; the literal "null" parses to null.
                if (parsedTrialConfig === undefined || parsedTrialConfig === null) {
                    throw new Error('trial config parsed failed');
                }
                this.localTrialConfig = parsedTrialConfig;
                if (this.localTrialConfig.gpuNum !== undefined) {
                    this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
                    if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
                        this.gpuScheduler = new GPUScheduler();
                    }
                }
                break;
            }
            case TrialConfigMetadataKey.LOCAL_CONFIG:
                this.localConfig = <LocalConfig>JSON.parse(value);
                this.log.info(`Specified GPU indices: ${this.localConfig.gpuIndices}`);
                if (this.localConfig.gpuIndices !== undefined) {
                    // split() always returns at least one element, so entries that do not
                    // parse to a number are dropped before checking for emptiness.
                    const parsedIndices: number[] = this.localConfig.gpuIndices.split(',')
                        .map((x: string) => parseInt(x, 10))
                        .filter((index: number) => !Number.isNaN(index));
                    if (parsedIndices.length === 0) {
                        throw new Error('gpuIndices can not be empty if specified.');
                    }
                    this.designatedGpuIndices = new Set(parsedIndices);
                }
                if (this.localConfig.maxTrialNumPerGpu !== undefined) {
                    this.maxTrialNumPerGpu = this.localConfig.maxTrialNumPerGpu;
                }

                if (this.localConfig.useActiveGpu !== undefined) {
                    this.useActiveGpu = this.localConfig.useActiveGpu;
                }
                break;
            case TrialConfigMetadataKey.MULTI_PHASE:
                this.isMultiPhase = (value === 'true' || value === 'True');
                break;
            default:
        }
    }

    /**
     * Read back cluster metadata. Only the TRIAL_CONFIG key is supported.
     *
     * @param key metadata key
     * @returns the stored trial config as JSON text; rejects with NNIError (NOT_FOUND)
     *          when the key is unsupported or the trial config has not been set
     */
    public getClusterMetadata(key: string): Promise<string> {
        if (key !== TrialConfigMetadataKey.TRIAL_CONFIG) {
            return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found'));
        }
        if (this.localTrialConfig === undefined) {
            return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
        }

        return Promise.resolve(JSON.stringify(this.localTrialConfig));
    }

332
    /**
     * Stop the service: raise the stopping flag, end every open job stream,
     * and stop the GPU scheduler when one exists.
     */
    public async cleanUp(): Promise<void> {
        this.log.info('Stopping local machine training service...');
        this.stopping = true;
        this.jobStreamMap.forEach((stream: ts.Stream): void => {
            stream.end(0);
            stream.emit('end');
        });
        if (this.gpuScheduler !== undefined) {
            await this.gpuScheduler.stop();
        }
    }

346
    /**
     * React to a status change of a trial job.
     *
     * When the job reached a final status its job stream (if any) is ended and forgotten.
     * When the job left RUNNING and had GPUs assigned, the per-GPU occupancy counters
     * are decremented.
     *
     * @param trialJob the job, already carrying its new status
     * @param oldStatus the status the job had before the change
     * @throws Error when bookkeeping is inconsistent (stream or occupancy entry missing)
     */
    private onTrialJobStatusChanged(trialJob: LocalTrialJobDetail, oldStatus: TrialJobStatus): void {
        // If the job is no longer running, destroy its job stream.
        const finalStatuses: string[] = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED', 'EARLY_STOPPED'];
        if (finalStatuses.includes(trialJob.status) && this.jobStreamMap.has(trialJob.id)) {
            const stream: ts.Stream | undefined = this.jobStreamMap.get(trialJob.id);
            if (stream === undefined) {
                throw new Error(`Could not find stream in trial ${trialJob.id}`);
            }
            //Refer https://github.com/Juul/tail-stream/issues/20
            stream.end(0);
            stream.emit('end');
            this.jobStreamMap.delete(trialJob.id);
        }

        const hasGpus: boolean = trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0;
        const leftRunning: boolean = oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING';
        if (!hasGpus || this.gpuScheduler === undefined || !leftRunning) {
            return;
        }
        for (const gpuIndex of <number[]>trialJob.gpuIndices) {
            const occupied: number | undefined = this.occupiedGpuIndexNumMap.get(gpuIndex);
            if (occupied === undefined) {
                throw new Error(`gpu resource schedule error`);
            }
            if (occupied === 1) {
                this.occupiedGpuIndexNumMap.delete(gpuIndex);
            } else {
                this.occupiedGpuIndexNumMap.set(gpuIndex, occupied - 1);
            }
        }
    }

376
377
    /**
     * Build the environment variables exported to a trial.
     *
     * CUDA_VISIBLE_DEVICES is added only when `gpuNum` is defined: '-1' when there is no
     * GPU scheduler, otherwise the comma-joined indices from `resource`.
     *
     * @param trialJobDetail the trial job the variables are for
     * @param resource resources assigned to the trial
     * @param gpuNum GPU number from the trial config, if any
     * @returns key/value pairs in the order they should be exported
     */
    private getEnvironmentVariables(
        trialJobDetail: TrialJobDetail,
        resource: { gpuIndices: number[] },
        gpuNum: number | undefined): { key: string; value: string }[] {
        const workingDirectory: string = trialJobDetail.workingDirectory;
        const envVariables: { key: string; value: string }[] = [
            { key: 'NNI_PLATFORM', value: 'local' },
            { key: 'NNI_EXP_ID', value: this.experimentId },
            { key: 'NNI_SYS_DIR', value: workingDirectory },
            { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
            { key: 'NNI_OUTPUT_DIR', value: workingDirectory },
            { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() },
            { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
        ];
        if (gpuNum === undefined) {
            return envVariables;
        }

        const visibleDevices: string = this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',');
        envVariables.push({ key: 'CUDA_VISIBLE_DEVICES', value: visibleDevices });

        return envVariables;
    }

    /**
     * Record the GPU indices assigned to a trial on its job detail.
     * @param trialJobDetail job detail to update
     * @param resource resources assigned to the trial
     */
    private setExtraProperties(trialJobDetail: LocalTrialJobDetail, resource: { gpuIndices: number[] }): void {
        trialJobDetail.gpuIndices = resource.gpuIndices;
    }

    /**
     * Try to pick the resources for the next trial.
     *
     * Without a GPU scheduler the attempt always succeeds with no GPUs. Otherwise GPUs
     * reported available by the scheduler are kept when they run fewer than
     * maxTrialNumPerGpu trials and, if GPU indices were designated, belong to that set;
     * the first gpuNum of them are selected.
     *
     * @returns [true, resource] on success; [false, resource with no GPUs] when too few GPUs qualify
     * @throws Error when the trial config has not been set
     */
    private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
        if (this.localTrialConfig === undefined) {
            throw new Error('localTrialConfig is not initialized!');
        }
        if (this.gpuScheduler === undefined) {
            return [true, { gpuIndices: [] }];
        }

        const availableGpuIndices: number[] = this.gpuScheduler.getAvailableGPUIndices(this.useActiveGpu, this.occupiedGpuIndexNumMap);
        let candidates: number[] = availableGpuIndices.filter((index: number) => {
            const occupied: number | undefined = this.occupiedGpuIndexNumMap.get(index);

            return occupied === undefined || occupied < this.maxTrialNumPerGpu;
        });

        if (this.designatedGpuIndices !== undefined) {
            this.checkSpecifiedGpuIndices();
            candidates = candidates.filter((index: number) => this.designatedGpuIndices.has(index));
        }

        if (candidates.length < this.localTrialConfig.gpuNum) {
            return [false, { gpuIndices: [] }];
        }

        // Keep only the first gpuNum candidates.
        candidates.splice(this.localTrialConfig.gpuNum);

        return [true, { gpuIndices: candidates }];
    }

437
438
439
440
441
442
443
444
445
    /**
     * Verify that every designated GPU index is below the GPU count reported by the scheduler.
     * @throws Error naming the first designated index that is not below that count
     */
    private checkSpecifiedGpuIndices(): void {
        const gpuCount: number = this.gpuScheduler.getSystemGpuCount();
        if (this.designatedGpuIndices === undefined) {
            return;
        }
        for (const index of this.designatedGpuIndices) {
            if (index >= gpuCount) {
                throw new Error(`Specified GPU index not found: ${index}`);
            }
        }
    }

448
449
450
    /**
     * Increment the occupancy counter of every GPU in `resource`.
     * Does nothing when there is no GPU scheduler.
     * @param resource resources about to be used by a trial
     */
    private occupyResource(resource: {gpuIndices: number[]}): void {
        if (this.gpuScheduler === undefined) {
            return;
        }
        for (const gpuIndex of resource.gpuIndices) {
            const occupied: number | undefined = this.occupiedGpuIndexNumMap.get(gpuIndex);
            this.occupiedGpuIndexNumMap.set(gpuIndex, occupied === undefined ? 1 : occupied + 1);
        }
    }

461
462
463
464
465
466
467
468
469
470
    /**
     * Main scheduling loop; runs until the stopping flag is raised.
     *
     * Each pass drains the job queue from the front: a WAITING job is started as soon as
     * resources can be obtained, while entries that are unknown or no longer WAITING are
     * dropped. If resources are not available the pass ends with the job left at the head
     * of the queue. The loop then sleeps for 5 seconds before the next pass.
     */
    private async runJobLoop(): Promise<void> {
        while (!this.stopping) {
            while (!this.stopping && this.jobQueue.length !== 0) {
                const headJobId: string = this.jobQueue[0];
                const headJob: LocalTrialJobDetail | undefined = this.jobMap.get(headJobId);
                const isWaiting: boolean = headJob !== undefined && headJob.status === 'WAITING';
                if (isWaiting) {
                    const [success, resource] = this.tryGetAvailableResource();
                    if (!success) {
                        // Leave the job queued and retry on the next pass.
                        break;
                    }
                    this.occupyResource(resource);
                    await this.runTrialJob(headJobId, resource);
                }
                this.jobQueue.shift();
            }
            await delay(5000);
        }
    }

    /**
     * Change a job's status and run the status-change hook.
     * Nothing happens when the job already has the requested status.
     * @param trialJob job to update
     * @param newStatus status to set
     */
    private setTrialJobStatus(trialJob: LocalTrialJobDetail, newStatus: TrialJobStatus): void {
        const oldStatus: TrialJobStatus = trialJob.status;
        if (oldStatus === newStatus) {
            return;
        }
        trialJob.status = newStatus;
        this.onTrialJobStatusChanged(trialJob, oldStatus);
    }

489
    /**
     * Build the platform-specific script lines that run the trial command.
     *
     * The command's stderr is redirected to `<workingDirectory>/stderr`, and afterwards the
     * command's exit code followed by a millisecond timestamp is written to
     * `<workingDirectory>/.nni/state`.
     *
     * @param localTrialConfig trial config providing the command
     * @param workingDirectory working directory of the trial
     * @returns script lines, in execution order
     */
    private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
        const stderrPath: string = path.join(workingDirectory, 'stderr');
        const statePath: string = path.join(workingDirectory, '.nni', 'state');

        if (process.platform === 'win32') {
            return [
                `cmd.exe /c ${localTrialConfig.command} 2>"${stderrPath}"`,
                `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
                `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
                `Write $LASTEXITCODE " " $NOW_DATE  | Out-File "${statePath}" -NoNewline -encoding utf8`
            ];
        }

        // https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
        // On macOS, considering the worst case, write 999 to avoid negative duration
        const timestampFormat: string = process.platform === 'darwin' ? '+%s999' : '+%s%3N';

        return [
            `eval ${localTrialConfig.command} 2>"${stderrPath}"`,
            `echo $? \`date ${timestampFormat}\` >'${statePath}'`
        ];
    }

511
    /**
     * Launch a trial job as a local process.
     *
     * Steps: build the run script (cd into the code directory, export the environment
     * variables, then the lines from getScript), create the working directory with its
     * `.nni` folder and metrics file, write the script and the parameter file, start the
     * script, mark the job RUNNING, and tail the metrics file to emit 'metric' events.
     *
     * @param trialJobId id of the job to start
     * @param resource resources assigned to the job
     * @throws Error when the job id is unknown or the trial config has not been set
     */
    private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
        const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            // Fail with a clear message instead of a TypeError further down.
            throw new Error(`runTrialJob failed: ${trialJobId} not found`);
        }
        if (this.localTrialConfig === undefined) {
            throw new Error(`localTrialConfig not initialized!`);
        }
        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);

        const runScriptContent: string[] = [];
        if (process.platform !== 'win32') {
            runScriptContent.push('#!/bin/bash');
        }
        runScriptContent.push(`cd '${this.localTrialConfig.codeDir}'`);
        for (const variable of variables) {
            runScriptContent.push(setEnvironmentVariable(variable));
        }
        runScriptContent.push(...this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory));

        await execMkdir(trialJobDetail.workingDirectory);
        await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
        // The metrics file must exist before it is tailed below.
        await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
        const scriptName: string = getScriptName('run');
        await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName),
                                    runScriptContent.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
        await this.writeParameterFile(trialJobDetail.workingDirectory, trialJobDetail.form.hyperParameters);
        const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName));

        this.setTrialJobStatus(trialJobDetail, 'RUNNING');
        trialJobDetail.startTime = Date.now();
        trialJobDetail.pid = trialJobProcess.pid;
        this.setExtraProperties(trialJobDetail, resource);

        // Bytes read from the metrics file that do not yet form a complete command.
        let buffer: Buffer = Buffer.alloc(0);
        const stream: ts.Stream = ts.createReadStream(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
        stream.on('data', (data: Buffer) => {
            buffer = Buffer.concat([buffer, data]);
            while (buffer.length > 0) {
                const [success, , content, remain] = decodeCommand(buffer);
                if (!success) {
                    // Incomplete command: wait for more data.
                    break;
                }
                this.eventEmitter.emit('metric', {
                    id: trialJobDetail.id,
                    data: content
                });
                this.log.debug(`Sending metrics, job id: ${trialJobDetail.id}, metrics: ${content}`);
                buffer = remain;
            }
        });
        this.jobStreamMap.set(trialJobDetail.id, stream);
    }

chicm-ms's avatar
chicm-ms committed
566
    /**
     * Write a set of hyper parameters into a directory.
     * The file name comes from generateParamFileName; the content is `hyperParameters.value`.
     * @param directory target directory
     * @param hyperParameters hyper parameters to write
     */
    private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise<void> {
        const paramFileName: string = generateParamFileName(hyperParameters);
        await fs.promises.writeFile(path.join(directory, paramFileName), hyperParameters.value, { encoding: 'utf8' });
    }
Deshui Yu's avatar
Deshui Yu committed
570
571
572
}

export { LocalTrainingService };