remoteMachineTrainingService.ts 30.7 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import * as assert from 'assert';
Deshui Yu's avatar
Deshui Yu committed
7
8
9
10
11
12
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
SparkSnail's avatar
SparkSnail committed
13
import { NNIError, NNIErrorNames } from '../../common/errors';
14
import { getExperimentId } from '../../common/experimentStartupInfo';
Deshui Yu's avatar
Deshui Yu committed
15
16
17
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
18
    HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm,
19
    TrialJobDetail, TrialJobMetric
Deshui Yu's avatar
Deshui Yu committed
20
} from '../../common/trainingService';
21
22
23
24
25
import {
    delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getRemoteTmpDir,
    getVersion, uniqueString, unixPathJoin
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
26
import { GPUSummary } from '../common/gpuData';
27
28
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
29
import { execMkdir, validateCodeDir, getGpuMetricsCollectorBashScriptContent } from '../common/util';
Deshui Yu's avatar
Deshui Yu committed
30
31
import { GPUScheduler } from './gpuScheduler';
import {
32
    REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineMeta,
33
    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
34
    ScheduleResultType, ExecutorManager
Deshui Yu's avatar
Deshui Yu committed
35
} from './remoteMachineData';
SparkSnail's avatar
SparkSnail committed
36
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
37
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
Deshui Yu's avatar
Deshui Yu committed
38
39
40
41

/**
 * Training Service implementation for Remote Machine (Linux)
 */
SparkSnail's avatar
SparkSnail committed
42
@component.Singleton
Deshui Yu's avatar
Deshui Yu committed
43
class RemoteMachineTrainingService implements TrainingService {
44
    private readonly machineExecutorManagerMap: Map<RemoteMachineMeta, ExecutorManager>; //machine excutor map
45
    private readonly machineCopyExpCodeDirPromiseMap: Map<RemoteMachineMeta, Promise<void>>;
46
    private readonly trialExecutorMap: Map<string, ShellExecutor>; //trial excutor map
47
    private readonly trialJobsMap: Map<string, RemoteMachineTrialJobDetail>;
48
    private readonly MAX_TRIAL_NUMBER_PER_EXECUTOR: number = 5; // every excutor has a max trial concurrency number
49
50
    private readonly expRootDir: string;
    private readonly remoteExpRootDir: string;
51
    private readonly remoteExpCodeDir: string;
52
    private trialConfig: TrialConfig | undefined;
53
    private gpuScheduler?: GPUScheduler;
54
55
    private readonly jobQueue: string[];
    private readonly timer: ObservableTimer;
Deshui Yu's avatar
Deshui Yu committed
56
    private stopping: boolean = false;
57
58
    private readonly metricsEmitter: EventEmitter;
    private readonly log: Logger;
59
    private isMultiPhase: boolean = false;
60
    private trialSequenceId: number;
SparkSnail's avatar
SparkSnail committed
61
    private remoteRestServerPort?: number;
62
    private readonly remoteOS: string;
SparkSnail's avatar
SparkSnail committed
63
    private nniManagerIpConfig?: NNIManagerIpConfig;
64
    private versionCheck: boolean = true;
SparkSnail's avatar
SparkSnail committed
65
    private logCollection: string;
Deshui Yu's avatar
Deshui Yu committed
66
67

    /**
     * Create the service with empty bookkeeping structures.
     * Sets up the job queue and the trial/executor maps, resolves the local and
     * remote experiment directories, and stores the injected timer.
     * @param timer observable timer supplied by the component container
     */
    constructor(@component.Inject timer: ObservableTimer) {
        // State that must exist before the remote root directory is resolved
        this.remoteOS = 'linux';
        this.jobQueue = [];
        this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
        this.trialExecutorMap = new Map<string, ShellExecutor>();
        this.machineExecutorManagerMap = new Map<RemoteMachineMeta, ExecutorManager>();
        this.machineCopyExpCodeDirPromiseMap = new Map<RemoteMachineMeta, Promise<void>>();
        this.metricsEmitter = new EventEmitter();

        // Local and remote experiment directories
        this.expRootDir = getExperimentRootDir();
        this.remoteExpRootDir = this.getRemoteExperimentRootDir();
        this.remoteExpCodeDir = unixPathJoin(this.remoteExpRootDir, 'nni-code');

        // Remaining defaults
        this.timer = timer;
        this.log = getLogger();
        this.logCollection = 'none';
        this.trialSequenceId = -1;

        this.log.info('Construct remote machine training service.');
    }

    /**
     * Main loop of the training service.
     * Starts the REST server, then repeatedly tries to prepare the job at the
     * head of the queue until the service is asked to stop. Polls every 3 seconds.
     * @throws Error carrying the REST server's error message when the server reports one
     */
    public async run(): Promise<void> {
        const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
        await restServer.start();
        restServer.setEnableVersionCheck = this.versionCheck;
        this.log.info('Run remote machine training service.');
        while (!this.stopping) {
            while (this.jobQueue.length > 0) {
                this.updateGpuReservation();
                const trialJobId: string = this.jobQueue[0];
                const prepareResult: boolean = await this.prepareTrialJob(trialJobId);
                if (!prepareResult) {
                    // No GPU resource is available right now;
                    // wait and try to schedule the job in the next iteration
                    break;
                }
                // cancelTrialJob() may have spliced this job out of the queue while
                // prepareTrialJob() was awaiting. Only drop the head if it is still the
                // job that was just prepared, otherwise a different job would be lost.
                if (this.jobQueue[0] === trialJobId) {
                    this.jobQueue.shift();
                }
            }
            if (restServer.getErrorMessage !== undefined) {
                // Mark the service as stopping before throwing; a statement placed
                // after the throw would never execute.
                this.stopping = true;
                throw new Error(restServer.getErrorMessage);
            }
            await delay(3000);
        }
        this.log.info('Remote machine training service exit.');
    }
115

SparkSnail's avatar
SparkSnail committed
116
    /**
     * Assign a shell executor to a trial.
     * Asks the executor manager of the trial's machine for an available executor
     * and records it in trialExecutorMap under the trial id.
     * @param trial remote machine trial job detail; its rmMeta must already be set
     * @throws Error when rmMeta is unset or the machine has no executor manager
     */
    public async allocateExecutorForTrial(trial: RemoteMachineTrialJobDetail): Promise<void> {
        const machine: RemoteMachineMeta | undefined = trial.rmMeta;
        if (machine === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(machine);
        if (manager === undefined) {
            throw new Error(`executorManager not initialized`);
        }

        const allocated: ShellExecutor = await manager.getAvailableExecutor();
        this.trialExecutorMap.set(trial.id, allocated);
    }
131

SparkSnail's avatar
SparkSnail committed
132
133
    /**
     * Release the connection resource of a finished trial.
     * Hands the executor recorded for the trial back to the executor manager of
     * the trial's machine. The trialExecutorMap entry itself is left in place.
     * @param trial remote machine trial job detail; its rmMeta must already be set
     * @throws Error when rmMeta is unset or the machine has no executor manager
     */
    public releaseTrialExecutor(trial: RemoteMachineTrialJobDetail): void {
        const machine: RemoteMachineMeta | undefined = trial.rmMeta;
        if (machine === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(machine);
        if (manager === undefined) {
            throw new Error(`executorManager not initialized`);
        }

        const trialExecutor: ShellExecutor | undefined = this.trialExecutorMap.get(trial.id);
        manager.releaseConnection(trialExecutor);
    }
Deshui Yu's avatar
Deshui Yu committed
146
147
148
149

    /**
     * List submitted trial jobs.
     * Each job is fetched through getTrialJob(), one after another, in the
     * iteration order of trialJobsMap.
     * @returns details of every trial job known to this service
     */
    public async listTrialJobs(): Promise<TrialJobDetail[]> {
        const jobs: TrialJobDetail[] = [];
        for (const trialJobId of this.trialJobsMap.keys()) {
            jobs.push(await this.getTrialJob(trialJobId));
        }

        // An async function already wraps its return value in a promise,
        // so no Deferred is needed here.
        return jobs;
    }

    /**
     * Get trial job detail information.
     * A job whose status is 'RUNNING' or 'UNKNOWN' is passed through
     * updateTrialJobStatus() with its executor; any other job is returned as stored.
     * @param trialJobId ID of trial job
     * @throws NNIError (NOT_FOUND) when the id is unknown
     * @throws Error when a running/unknown job has no rmMeta or no executor
     */
    public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const job: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (job === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
        }

        //TO DO: add another job status, and design new job status change logic
        const needsStatusUpdate: boolean = job.status === 'RUNNING' || job.status === 'UNKNOWN';
        if (!needsStatusUpdate) {
            return job;
        }

        if (job.rmMeta === undefined) {
            throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
        }
        // Look up the executor the job is running on
        const jobExecutor: ShellExecutor | undefined = this.trialExecutorMap.get(job.id);
        if (jobExecutor === undefined) {
            throw new Error(`Invalid job id: ${trialJobId}, cannot find executor`);
        }

        return this.updateTrialJobStatus(job, jobExecutor);
    }

    /**
     * Register a callback on the internal emitter's 'metric' event.
     * @param listener callback invoked with each trial job metric
     */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        const emitter: EventEmitter = this.metricsEmitter;
        emitter.on('metric', listener);
    }

    /**
     * Unregister a callback from the internal emitter's 'metric' event.
     * @param listener callback previously passed to addTrialJobMetricListener
     */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        const emitter: EventEmitter = this.metricsEmitter;
        emitter.off('metric', listener);
    }

    /**
     * Submit trial job.
     * Creates a 'WAITING' job with a random id, registers it in trialJobsMap and
     * appends it to the job queue. The job is not launched here.
     * @param form trial job description form
     * @returns detail record of the newly queued job
     * @throws Error when the trial config has not been set yet
     */
    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        if (this.trialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }

        // Random trial job id and its working folder on the remote machine
        const newJobId: string = uniqueString(5);
        const remoteWorkingDir: string = unixPathJoin(this.remoteExpRootDir, 'trials', newJobId);
        const submittedAt: number = Date.now();

        const queuedJob: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
            newJobId, 'WAITING', submittedAt, remoteWorkingDir, form
        );
        this.trialJobsMap.set(newJobId, queuedJob);
        this.jobQueue.push(newJobId);

        return queuedJob;
    }

230
231
232
233
234
    /**
     * Update trial job for multi-phase.
     * Writes the form's hyper parameters through writeParameterFile() for an
     * already known trial job.
     * @param trialJobId trial job id
     * @param form job application form
     * @returns the stored detail of the trial job
     * @throws Error when the trial job id is unknown
     */
    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const existingJob: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (existingJob !== undefined) {
            await this.writeParameterFile(trialJobId, form.hyperParameters);

            return existingJob;
        }

        throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
    }
244

245
246
247
248
    /**
     * Whether multi-phase jobs are supported by this training service.
     * @returns always true
     */
    public get isMultiPhaseJobSupported(): boolean {
        const supported: boolean = true;

        return supported;
    }

Deshui Yu's avatar
Deshui Yu committed
252
253
254
255
    /**
     * Cancel trial job.
     * The job is removed from the waiting queue if it is still there. A job that
     * was never scheduled gets its cancel status directly. A scheduled job has
     * its child processes killed on the remote machine and its executor released.
     * @param trialJobId ID of trial job
     * @param isEarlyStopped whether the cancellation is an early stop
     * @throws Error when the job id is unknown or a scheduled job has no executor
     */
    public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
        const job: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (job === undefined) {
            throw new Error(`trial job id ${trialJobId} not found`);
        }

        // Drop the job from the waiting queue if it is still queued
        const queuePosition: number = this.jobQueue.indexOf(trialJobId);
        if (queuePosition !== -1) {
            this.jobQueue.splice(queuePosition, 1);
        }

        if (job.rmMeta === undefined) {
            // Job is not scheduled yet, set status to 'USER_CANCELLED' directly
            assert(isEarlyStopped === false, 'isEarlyStopped is not supposed to be true here.');
            job.status = getJobCancelStatus(isEarlyStopped);

            return;
        }

        // The job is already scheduled: find the executor it is running on
        const jobExecutor: ShellExecutor | undefined = this.trialExecutorMap.get(job.id);
        if (jobExecutor === undefined) {
            throw new Error(`Invalid job id ${trialJobId}, cannot find executor`);
        }

        if (job.status === 'UNKNOWN') {
            this.releaseTrialExecutor(job);
            job.status = 'USER_CANCELED';

            return;
        }

        const pidFile: string = this.getJobPidPath(job.id);
        try {
            // Mark the toEarlyStop tag here
            job.isEarlyStopped = isEarlyStopped;
            await jobExecutor.killChildProcesses(pidFile);
            this.releaseTrialExecutor(job);
        } catch (error) {
            // Not handle the error since pkill failed will not impact trial job's current status
            this.log.error(`remoteTrainingService.cancelTrialJob: ${error.message}`);
        }
    }

    /**
     * Set cluster metadata.
     * Handled keys:
     *   - NNI_MANAGER_IP: store the parsed NNI manager IP config
     *   - MACHINE_LIST: connect to every listed machine, then create the GPU scheduler
     *   - TRIAL_CONFIG: validate the trial config and start copying codeDir to each machine
     *   - MULTI_PHASE / VERSION_CHECK: boolean flags, true for 'true' or 'True'
     *   - LOG_COLLECTION: stored as given
     * @param key metadata key
     * @param value metadata value (JSON text for the first three keys)
     * @throws Error for an unknown key, an unusable trial config, or a codeDir that is not a directory
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.NNI_MANAGER_IP:
                this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
                break;
            case TrialConfigMetadataKey.MACHINE_LIST:
                await this.setupConnections(value);
                this.gpuScheduler = new GPUScheduler(this.machineExecutorManagerMap);
                break;
            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
                // JSON.parse never returns undefined, but the text 'null' parses to null.
                // `== null` rejects both so the property access below cannot blow up.
                if (remoteMachineTrailConfig == null) {
                    throw new Error('trial config parsed failed');
                }
                // codeDir is not a valid directory, throw Error
                if (!fs.lstatSync(remoteMachineTrailConfig.codeDir).isDirectory()) {
                    throw new Error(`codeDir ${remoteMachineTrailConfig.codeDir} is not a directory`);
                }

                try {
                    // Validate to make sure codeDir doesn't have too many files
                    await validateCodeDir(remoteMachineTrailConfig.codeDir);
                    // Start copying codeDir to every remote machine. The copy promises are
                    // stored, not awaited here; prepareTrialJob() awaits the one for the
                    // machine a trial is scheduled on.
                    for (const [rmMeta, executorManager] of this.machineExecutorManagerMap.entries()) {
                        const executor: ShellExecutor = await executorManager.getAvailableExecutor();
                        if (executor !== undefined) {
                            this.machineCopyExpCodeDirPromiseMap.set(
                                rmMeta,
                                executor.copyDirectoryToRemote(remoteMachineTrailConfig.codeDir, this.remoteExpCodeDir, this.remoteOS)
                            );
                        }
                    }
                } catch (error) {
                    this.log.error(error);

                    // Re-throw the original error so its type, message and stack survive;
                    // only wrap values that are not already Error instances.
                    throw error instanceof Error ? error : new Error(String(error));
                }

                this.trialConfig = remoteMachineTrailConfig;
                break;
            }
            case TrialConfigMetadataKey.MULTI_PHASE:
                this.isMultiPhase = (value === 'true' || value === 'True');
                break;
            case TrialConfigMetadataKey.VERSION_CHECK:
                this.versionCheck = (value === 'true' || value === 'True');
                break;
            case TrialConfigMetadataKey.LOG_COLLECTION:
                this.logCollection = value;
                break;
            default:
                //Reject for unknown keys
                throw new Error(`Uknown key: ${key}`);
        }
    }

    /**
     * Get cluster metadata.
     * No metadata is exposed by this implementation; the key is ignored.
     * @param _key metadata key (unused)
     * @returns always an empty string
     */
    public async getClusterMetadata(_key: string): Promise<string> {
        const noMetadata: string = '';

        return noMetadata;
    }
372

SparkSnail's avatar
SparkSnail committed
373
    /**
     * Stop the service and clean up remote connections.
     * Sets the stopping flag, then waits for cleanupConnections() or for
     * 10 seconds, whichever finishes first.
     */
    public async cleanUp(): Promise<void> {
        this.log.info('Stopping remote machine training service...');
        this.stopping = true;

        const timeout: Promise<void> = delay(10000);
        const connectionCleanup: Promise<void> = this.cleanupConnections();
        await Promise.race([timeout, connectionCleanup]);
    }
381

382
383
384
385
    /**
     * Remove the GPU reservation of every trial job that is neither 'WAITING'
     * nor 'RUNNING'. Does nothing when no GPU scheduler exists yet.
     */
    private updateGpuReservation(): void {
        const scheduler: GPUScheduler | undefined = this.gpuScheduler;
        if (!scheduler) {
            return;
        }

        for (const [trialJobId, trialJob] of this.trialJobsMap) {
            const stillActive: boolean = ['WAITING', 'RUNNING'].includes(trialJob.status);
            if (stillActive) {
                continue;
            }
            scheduler.removeGpuReservation(trialJobId, this.trialJobsMap);
        }
    }

SparkSnail's avatar
SparkSnail committed
395
396
397
398
    /**
     * Stop the gpu_metric_collector process on each remote machine, remove the
     * temporary scripts folder, and close all executors of that machine.
     * Cleanup is best-effort and per machine: a failure on one machine is logged
     * and does not prevent the remaining machines from being cleaned up.
     */
    private async cleanupConnections(): Promise<void> {
        for (const [rmMeta, executorManager] of this.machineExecutorManagerMap.entries()) {
            try {
                try {
                    const remoteScriptsPath: string = this.getRemoteScriptsPath(rmMeta.username);
                    const jobpidPath: string = unixPathJoin(remoteScriptsPath, 'pid');
                    const executor: ShellExecutor | undefined = executorManager.getFirstExecutor();
                    if (executor !== undefined) {
                        await executor.killChildProcesses(jobpidPath);
                        await executor.removeFolder(remoteScriptsPath);
                    }
                } finally {
                    // Close this machine's executors even if kill/remove failed
                    executorManager.closeAllExecutor();
                }
            } catch (error) {
                //ignore error, this function is called to cleanup remote connections when experiment is stopping
                this.log.error(`Cleanup connection exception, error is ${error.message}`);
            }
        }
    }

Deshui Yu's avatar
Deshui Yu committed
417
    /**
     * Connect to every machine in the list and initialize it.
     * For each machine an ExecutorManager is created, an executor is obtained,
     * the manager is registered in machineExecutorManagerMap, and
     * initRemoteMachineOnConnected() is run. Machines are set up concurrently.
     *
     * The returned promise resolves once all machines are initialized (at once
     * for an empty list) and rejects if any machine fails, so a failure can
     * no longer leave the caller waiting forever.
     * @param machineList JSON text of a RemoteMachineMeta array
     */
    private async setupConnections(machineList: string): Promise<void> {
        this.log.debug(`Connecting to remote machines: ${machineList}`);
        //TO DO: verify if value's format is wrong, and json parse failed, how to handle error
        const rmMetaList: RemoteMachineMeta[] = <RemoteMachineMeta[]>JSON.parse(machineList);

        await Promise.all(rmMetaList.map(async (rmMeta: RemoteMachineMeta): Promise<void> => {
            rmMeta.occupiedGpuIndexMap = new Map<number, number>();
            const executorManager: ExecutorManager = new ExecutorManager([], this.MAX_TRIAL_NUMBER_PER_EXECUTOR, rmMeta);
            const executor: ShellExecutor = await executorManager.getAvailableExecutor();
            this.machineExecutorManagerMap.set(rmMeta, executorManager);
            await this.initRemoteMachineOnConnected(rmMeta, executor);
        }));
    }

438
439
    /**
     * Prepare a freshly connected remote machine.
     * Creates the remote experiment root and the scripts folder, adjusts
     * permissions under the NNI root, starts the GPU metrics collector script,
     * and subscribes to the timer to refresh rmMeta.gpuSummary from the
     * collector's 'gpu_metrics' file.
     * @param rmMeta metadata of the remote machine
     * @param executor shell executor connected to that machine
     */
    private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise<void> {
        // Root working directory, created once the executor is ready
        const remoteNniRoot: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni');
        await executor.createFolder(this.remoteExpRootDir);

        // Directory that stores temp scripts on the remote machine
        const scriptsDir: string = this.getRemoteScriptsPath(rmMeta.username);
        await executor.createFolder(scriptsDir, true);
        await executor.allowPermission(false, remoteNniRoot, `${remoteNniRoot}/*`, `${remoteNniRoot}/scripts/*`);

        // Begin to execute gpu_metrics_collection scripts.
        // NOTE(review): the call is not awaited; presumably the collector keeps running
        // in the background. Confirm against ShellExecutor.executeScript.
        const collectorScript = getGpuMetricsCollectorBashScriptContent(scriptsDir);
        executor.executeScript(collectorScript, false, true);

        const subscription: Rx.IDisposable = this.timer.subscribe(async () => {
            const lastLines = await executor.readLastLines(unixPathJoin(scriptsDir, 'gpu_metrics'));
            if (lastLines === "") {
                return;
            }
            rmMeta.gpuSummary = <GPUSummary>JSON.parse(lastLines);
            if (rmMeta.gpuSummary.gpuCount === 0) {
                // Nothing to monitor on this machine: stop polling it
                this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
                this.timer.unsubscribe(subscription);
            }
        });
    }

    /**
     * Try to schedule and launch one queued trial job.
     * @param trialJobId id of the trial job at the head of the queue
     * @returns true when the job needs no further scheduling (it was launched, or
     *          it is no longer 'WAITING'); false when no GPU is available right
     *          now and scheduling should be retried later
     * @throws Error when the trial config or GPU scheduler is not initialized,
     *         or the scheduler returns an unexpected result type
     * @throws NNIError when the job id is unknown (INVALID_JOB_DETAIL) or the
     *         requested GPU number can never be satisfied (RESOURCE_NOT_AVAILABLE)
     */
    private async prepareTrialJob(trialJobId: string): Promise<boolean> {
        if (this.trialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }
        if (this.gpuScheduler === undefined) {
            throw new Error('gpuScheduler is not initialized');
        }
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);
        }
        // If job is not WAITING, don't prepare and return true immediately
        if (trialJobDetail.status !== 'WAITING') {
            return true;
        }

        // Ask the scheduler for a machine
        const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobDetail);

        if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
            const errorMessage: string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`;
            this.log.error(errorMessage);
            // Throw directly: rejecting a separate, never-returned promise as well
            // would only produce an unhandled rejection.
            throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
        }

        if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
            && rmScheduleResult.scheduleInfo !== undefined) {
            const rmScheduleInfo: RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
            const trialWorkingFolder: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);

            trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
            // Wait until the experiment code has been copied to the chosen machine
            const copyExpCodeDirPromise = this.machineCopyExpCodeDirPromiseMap.get(trialJobDetail.rmMeta);
            if (copyExpCodeDirPromise !== undefined) {
                await copyExpCodeDirPromise;
            }

            await this.allocateExecutorForTrial(trialJobDetail);
            await this.launchTrialOnScheduledMachine(
                trialJobId, trialWorkingFolder, trialJobDetail.form, rmScheduleInfo);

            trialJobDetail.status = 'RUNNING';
            trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`;
            trialJobDetail.startTime = Date.now();
            this.trialJobsMap.set(trialJobId, trialJobDetail);

            return true;
        }

        if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
            this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);

            return false;
        }

        // Fail with a real Error (not a bare string) so callers get a stack trace
        throw new Error(`Invalid schedule resutl type: ${rmScheduleResult.resultType}`);
    }

    private async launchTrialOnScheduledMachine(trialJobId: string, trialWorkingFolder: string, form: TrialJobApplicationForm,
524
        rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
525
        if (this.trialConfig === undefined) {
Deshui Yu's avatar
Deshui Yu committed
526
527
            throw new Error('trial config is not initialized');
        }
chicm-ms's avatar
chicm-ms committed
528
        const cudaVisibleDevice: string = rmScheduleInfo.cudaVisibleDevice;
529
530
531
        const executor: ShellExecutor | undefined = this.trialExecutorMap.get(trialJobId);
        if (executor === undefined) {
            assert(false, 'ShellExecutor is undefined.');
532
533
534
535

            // for lint
            return;
        }
536
537
538
539
540
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new Error(`Can not get trial job detail for job: ${trialJobId}`);
        }

Deshui Yu's avatar
Deshui Yu committed
541
542
        const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);

543
544
        await executor.createFolder(trialWorkingFolder);
        await executor.createFolder(unixPathJoin(trialWorkingFolder, '.nni'));
Deshui Yu's avatar
Deshui Yu committed
545
546
547

        // RemoteMachineRunShellFormat is the run shell format string,
        // See definition in remoteMachineData.ts
SparkSnail's avatar
SparkSnail committed
548
549

        let command: string;
chicm-ms's avatar
chicm-ms committed
550
551
        // Set CUDA_VISIBLE_DEVICES environment variable based on cudaVisibleDevice
        // If no valid cudaVisibleDevice is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
SparkSnail's avatar
SparkSnail committed
552
553
554
        // If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script
        if (this.trialConfig.gpuNum === undefined) {
            command = this.trialConfig.command;
SparkSnail's avatar
SparkSnail committed
555
        } else {
chicm-ms's avatar
chicm-ms committed
556
557
            if (typeof cudaVisibleDevice === 'string' && cudaVisibleDevice.length > 0) {
                command = `CUDA_VISIBLE_DEVICES=${cudaVisibleDevice} ${this.trialConfig.command}`;
SparkSnail's avatar
SparkSnail committed
558
559
560
            } else {
                command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`;
            }
SparkSnail's avatar
SparkSnail committed
561
        }
562
563
        const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
        if (this.remoteRestServerPort === undefined) {
SparkSnail's avatar
SparkSnail committed
564
565
566
            const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
            this.remoteRestServerPort = restServer.clusterRestServerPort;
        }
567
        const version: string = this.versionCheck ? await getVersion() : '';
SparkSnail's avatar
SparkSnail committed
568
569
570
        const runScriptTrialContent: string = String.Format(
            REMOTEMACHINE_TRIAL_COMMAND_FORMAT,
            trialWorkingFolder,
Deshui Yu's avatar
Deshui Yu committed
571
572
            trialWorkingFolder,
            trialJobId,
SparkSnail's avatar
SparkSnail committed
573
            getExperimentId(),
574
            trialJobDetail.form.sequenceId.toString(),
575
            this.isMultiPhase,
576
            this.remoteExpCodeDir,
577
            unixPathJoin(trialWorkingFolder, '.nni', 'jobpid'),
SparkSnail's avatar
SparkSnail committed
578
579
580
            command,
            nniManagerIp,
            this.remoteRestServerPort,
581
            version,
SparkSnail's avatar
SparkSnail committed
582
            this.logCollection,
583
            unixPathJoin(trialWorkingFolder, '.nni', 'code')
584
        );
Deshui Yu's avatar
Deshui Yu committed
585
586

        //create tmp trial working folder locally.
587
        await execMkdir(path.join(trialLocalTempFolder, '.nni'));
588
589
        // Write install_nni.sh
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' });
590
        // Write file content ( run.sh and parameter.cfg ) to local tmp files
SparkSnail's avatar
SparkSnail committed
591
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), runScriptTrialContent, { encoding: 'utf8' });
chicm-ms's avatar
chicm-ms committed
592
        await this.writeParameterFile(trialJobId, form.hyperParameters);
Deshui Yu's avatar
Deshui Yu committed
593
        // Copy files in codeDir to remote working directory
594
        await executor.copyDirectoryToRemote(trialLocalTempFolder, trialWorkingFolder, this.remoteOS);
Deshui Yu's avatar
Deshui Yu committed
595
        // Execute command in remote machine
596
        executor.executeScript(unixPathJoin(trialWorkingFolder, 'run.sh'), true, true);
Deshui Yu's avatar
Deshui Yu committed
597
598
599
    }

    /**
     * Find the remote machine metadata whose `ip` equals the given host.
     *
     * The candidates are the keys of `machineExecutorManagerMap`; the first
     * key (in iteration order) with a matching `ip` is returned.
     *
     * @param host value compared against each machine's `ip` field
     * @returns the matching machine metadata
     * @throws Error when none of the known machines has that ip
     */
    private getRmMetaByHost(host: string): RemoteMachineMeta {
        const knownMachines: RemoteMachineMeta[] = Array.from(this.machineExecutorManagerMap.keys());
        const matched: RemoteMachineMeta | undefined =
            knownMachines.find((candidate: RemoteMachineMeta) => candidate.ip === host);
        if (matched === undefined) {
            throw new Error(`Host not found: ${host}`);
        }

        return matched;
    }

608
    /**
     * Refresh a trial job's status from the state of its process on the remote machine.
     *
     * While the process recorded in the job's pid file is alive, the job is left untouched.
     * Once it is gone, the remote return-code file is read; when its trimmed content is
     * "<exit code> <timestamp>" (two integers), the job is marked SUCCEEDED for exit code 0,
     * FAILED for a non-zero code when `isEarlyStopped` was never set, or the status given by
     * `getJobCancelStatus` otherwise. The timestamp becomes the job's `endTime` and the
     * trial's executor is released. Content in any other shape leaves the job unchanged.
     *
     * Errors raised while querying the remote machine are logged. Any error other than an
     * NNIError named NOT_FOUND additionally sets the status to 'UNKNOWN'.
     *
     * @param trialJob trial job detail, updated in place
     * @param executor shell executor used to query the remote machine
     * @returns promise resolving to the same `trialJob` object
     */
    private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, executor: ShellExecutor): Promise<TrialJobDetail> {
        const deferred: Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();
        const pidFilePath: string = this.getJobPidPath(trialJob.id);
        const returnCodeFilePath: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJob.id, '.nni', 'code');
        /* eslint-disable require-atomic-updates */
        try {
            const processRunning = await executor.isProcessAlive(pidFilePath);
            // Only inspect the return code once the trial process has gone away
            if (!processRunning) {
                const rawReturnCode: string = await executor.getRemoteFileContent(returnCodeFilePath);
                this.log.debug(`trailjob ${trialJob.id} return code: ${rawReturnCode}`);
                const parsed: RegExpExecArray | null = /^(\d+)\s+(\d+)$/.exec(rawReturnCode.trim());
                if (parsed !== null) {
                    const [, exitCodeText, endTimeText] = parsed;
                    if (parseInt(exitCodeText, 10) === 0) {
                        trialJob.status = 'SUCCEEDED';
                    } else if (trialJob.isEarlyStopped === undefined) {
                        // isEarlyStopped was never set, so NNI did not cancel the job: a non-zero exit code is a failure
                        trialJob.status = 'FAILED';
                    } else {
                        trialJob.status = getJobCancelStatus(trialJob.isEarlyStopped);
                    }
                    trialJob.endTime = parseInt(endTimeText, 10);
                    this.releaseTrialExecutor(trialJob);
                }
                this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`);
            }
        } catch (error) {
            this.log.error(`Update job status exception, error is ${error.message}`);
            const isNotFound: boolean = error instanceof NNIError && error.name === NNIErrorNames.NOT_FOUND;
            if (!isNotFound) {
                trialJob.status = 'UNKNOWN';
            }
        }
        /* eslint-enable require-atomic-updates */

        // Every path that reaches this point hands back the (possibly updated) job detail
        deferred.resolve(trialJob);

        return deferred.promise;
    }

SparkSnail's avatar
SparkSnail committed
653
    /**
     * Build the path `<remote tmp dir>/<userName>/nni/scripts` for the configured remote OS.
     *
     * @param userName path segment placed directly under the remote tmp directory
     * @returns the joined unix-style path
     */
    private getRemoteScriptsPath(userName: string): string {
        const remoteTmpDir = getRemoteTmpDir(this.remoteOS);

        return unixPathJoin(remoteTmpDir, userName, 'nni', 'scripts');
    }

    /**
     * Build the path `<remoteExpRootDir>/hostjobs/<jobId>`.
     *
     * @param jobId identifier used as the last path segment
     * @returns the joined unix-style path
     */
    private getHostJobRemoteDir(jobId: string): string {
        const experimentRoot = this.remoteExpRootDir;

        return unixPathJoin(experimentRoot, 'hostjobs', jobId);
    }

661
    /**
     * Build the path `<remote tmp dir>/nni/experiments/<experiment id>` for the configured remote OS.
     *
     * @returns the joined unix-style path
     */
    private getRemoteExperimentRootDir(): string {
        const remoteTmpDir = getRemoteTmpDir(this.remoteOS);
        const experimentId = getExperimentId();

        return unixPathJoin(remoteTmpDir, 'nni', 'experiments', experimentId);
    }

chicm-ms's avatar
chicm-ms committed
665
    /**
     * Accessor exposing this service's `metricsEmitter` instance.
     *
     * @returns the EventEmitter stored in `metricsEmitter`
     */
    public get MetricsEmitter(): EventEmitter {
        const { metricsEmitter } = this;

        return metricsEmitter;
    }

Deshui Yu's avatar
Deshui Yu committed
669
670
671
672
673
674
    /**
     * Build the path of the pid file (`<workingDirectory>/.nni/jobpid`) for a known trial job.
     *
     * @param jobId id of the trial job to look up in `trialJobsMap`
     * @returns the joined unix-style path to the job's pid file
     * @throws NNIError (INVALID_JOB_DETAIL) when the job id is not present in `trialJobsMap`
     */
    private getJobPidPath(jobId: string): string {
        const jobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(jobId);
        if (jobDetail !== undefined) {
            return unixPathJoin(jobDetail.workingDirectory, '.nni', 'jobpid');
        }

        throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${jobId}`);
    }
chicm-ms's avatar
chicm-ms committed
677

chicm-ms's avatar
chicm-ms committed
678
    /**
     * Write a trial's hyper-parameter file locally and copy it to the trial's remote working folder.
     *
     * The file name comes from `generateParamFileName(hyperParameters)`. The content
     * (`hyperParameters.value`) is written as UTF-8 under `<expRootDir>/trials-local/<trialJobId>/`
     * and then copied to `<remoteExpRootDir>/trials/<trialJobId>/` through the trial's executor.
     *
     * @param trialJobId id of the trial the parameters belong to
     * @param hyperParameters parameters whose `value` is written to the file
     * @throws Error when no executor is registered for the trial in `trialExecutorMap`
     */
    private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise<void> {
        const shell: ShellExecutor | undefined = this.trialExecutorMap.get(trialJobId);
        if (shell === undefined) {
            throw new Error('ShellExecutor is undefined.');
        }

        const remoteTrialDir: string = unixPathJoin(this.remoteExpRootDir, 'trials', trialJobId);
        const localTrialDir: string = path.join(this.expRootDir, 'trials-local', trialJobId);
        const paramFileName: string = generateParamFileName(hyperParameters);

        // Stage the parameter file locally first, then push that file to the remote machine
        const localParamFile: string = path.join(localTrialDir, paramFileName);
        await fs.promises.writeFile(localParamFile, hyperParameters.value, { encoding: 'utf8' });

        const remoteParamFile: string = unixPathJoin(remoteTrialDir, paramFileName);
        await shell.copyFileToRemote(localParamFile, remoteParamFile);
    }
Deshui Yu's avatar
Deshui Yu committed
693
694
695
}

export { RemoteMachineTrainingService };