"src/targets/gpu/device/unary_not.cpp" did not exist on "b74d9628bc9be1c2dfe60317e1e03c2d7de04caa"
remoteMachineTrainingService.ts 30 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import * as assert from 'assert';
Deshui Yu's avatar
Deshui Yu committed
7
8
9
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
10
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
Deshui Yu's avatar
Deshui Yu committed
11
12
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
SparkSnail's avatar
SparkSnail committed
13
import { NNIError, NNIErrorNames } from '../../common/errors';
14
import { getExperimentId } from '../../common/experimentStartupInfo';
Deshui Yu's avatar
Deshui Yu committed
15
16
17
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
18
    HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm,
19
    TrialJobDetail, TrialJobMetric
Deshui Yu's avatar
Deshui Yu committed
20
} from '../../common/trainingService';
21
import {
22
23
    delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus,
    getVersion, uniqueString
24
25
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
26
import { GPUSummary, ScheduleResultType } from '../common/gpuData';
27
28
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
29
import { execMkdir, validateCodeDir } from '../common/util';
Deshui Yu's avatar
Deshui Yu committed
30
31
import { GPUScheduler } from './gpuScheduler';
import {
32
33
    ExecutorManager, RemoteMachineMeta,
    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
Deshui Yu's avatar
Deshui Yu committed
34
} from './remoteMachineData';
SparkSnail's avatar
SparkSnail committed
35
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
Deshui Yu's avatar
Deshui Yu committed
36
37
38
39

/**
 * Training Service implementation for Remote Machine (Linux)
 */
SparkSnail's avatar
SparkSnail committed
40
@component.Singleton
Deshui Yu's avatar
Deshui Yu committed
41
class RemoteMachineTrainingService implements TrainingService {
42
    // ---- infrastructure ----
    /** Logger obtained from getLogger() in the constructor. */
    private readonly log: Logger;
    /** Timer injected through the constructor; used to poll GPU metrics on each machine. */
    private readonly timer: ObservableTimer;
    /** Emitter on which 'metric' listeners are registered. */
    private readonly metricsEmitter: EventEmitter;
    /** Experiment root directory, taken from getExperimentRootDir(). */
    private readonly expRootDir: string;

    // ---- remote machines ----
    /** Executor id used for the per-machine setup and cleanup connection. */
    private readonly initExecutorId = "initConnection";
    /** Executor manager of every configured remote machine. */
    private readonly machineExecutorManagerMap: Map<RemoteMachineMeta, ExecutorManager>;
    /** Pending upload of the experiment code directory, per remote machine. */
    private readonly machineCopyExpCodeDirPromiseMap: Map<RemoteMachineMeta, Promise<void>>;
    /** Machine initialisation promises that run() awaits before scheduling. */
    private sshConnectionPromises: any[];
    /** Created once the machine list has been received. */
    private gpuScheduler?: GPUScheduler;

    // ---- trials ----
    /** Executor manager assigned to each trial, keyed by trial job id. */
    private readonly trialExecutorManagerMap: Map<string, ExecutorManager>;
    /** Every submitted trial job, keyed by trial job id. */
    private readonly trialJobsMap: Map<string, RemoteMachineTrialJobDetail>;
    /** Ids of submitted trials that still have to be prepared, in submission order. */
    private readonly jobQueue: string[];

    // ---- configuration received through setClusterMetadata ----
    private trialConfig: TrialConfig | undefined;
    private nniManagerIpConfig?: NNIManagerIpConfig;
    private isMultiPhase = false;
    private versionCheck = true;
    private logCollection: string;

    // ---- runtime state ----
    /** Set by cleanUp(); makes the run loop and the GPU pollers stop. */
    private stopping = false;
    /** Port of the job REST server, looked up lazily when a trial is launched. */
    private remoteRestServerPort?: number;
Deshui Yu's avatar
Deshui Yu committed
61
62
63
64

    /**
     * Build the service with empty bookkeeping structures; no connection is opened here.
     * @param timer observable timer supplied by the component container
     */
    constructor(@component.Inject timer: ObservableTimer) {
        // Plain in-memory state first.
        this.metricsEmitter = new EventEmitter();
        this.jobQueue = [];
        this.sshConnectionPromises = [];
        this.trialJobsMap = new Map();
        this.trialExecutorManagerMap = new Map();
        this.machineCopyExpCodeDirPromiseMap = new Map();
        this.machineExecutorManagerMap = new Map();
        this.logCollection = 'none';

        // Then everything that comes from the environment.
        this.expRootDir = getExperimentRootDir();
        this.timer = timer;
        this.log = getLogger();
        this.log.info('Construct remote machine training service.');
    }

    /**
     * Main loop of the training service.
     *
     * Starts the job REST server, waits for pending machine initialisation, then polls the job
     * queue every three seconds and launches queued trials until the service is stopping.
     * Throws when the REST server reports an error message.
     */
    public async run(): Promise<void> {
        const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
        await restServer.start();
        restServer.setEnableVersionCheck = this.versionCheck;
        this.log.info('Run remote machine training service.');

        if (this.sshConnectionPromises.length > 0) {
            await Promise.all(this.sshConnectionPromises);
            this.log.info('ssh connection initialized!');
            // Emptied so that a later run() does not print the message again.
            this.sshConnectionPromises = [];
        }

        while (!this.stopping) {
            let keepScheduling = this.jobQueue.length > 0;
            while (keepScheduling) {
                this.updateGpuReservation();
                const headTrialJobId: string = this.jobQueue[0];
                const launched: boolean = await this.prepareTrialJob(headTrialJobId);
                if (launched) {
                    // The head of the queue has been handled, drop it and look at the next one.
                    this.jobQueue.shift();
                    keepScheduling = this.jobQueue.length > 0;
                } else {
                    // No GPU is free right now; retry on the next outer iteration.
                    keepScheduling = false;
                }
            }

            if (restServer.getErrorMessage !== undefined) {
                this.stopping = true;
                throw new Error(restServer.getErrorMessage);
            }

            await delay(3000);
        }

        this.log.info('RemoteMachineTrainingService run loop exited.');
    }
113

SparkSnail's avatar
SparkSnail committed
114
    /**
     * Bind a trial to the executor manager of the machine it was scheduled on.
     * @param trial remote machine trial job detail; its rmMeta must already be set
     */
    public allocateExecutorManagerForTrial(trial: RemoteMachineTrialJobDetail): void {
        const machine = trial.rmMeta;
        if (machine === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(machine);
        if (manager === undefined) {
            throw new Error(`executorManager not initialized`);
        }

        this.trialExecutorManagerMap.set(trial.id, manager);
    }
128

SparkSnail's avatar
SparkSnail committed
129
130
    /**
     * Release the executor held by a finished trial.
     * @param trial remote machine trial job detail
     */
    public releaseTrialResource(trial: RemoteMachineTrialJobDetail): void {
        if (trial.rmMeta === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager = this.trialExecutorManagerMap.get(trial.id);
        if (manager === undefined) {
            throw new Error(`ExecutorManager is not assigned for trial ${trial.id}`);
        }

        // The trial -> manager mapping is kept on purpose: nni manager may still send
        // requests for this trial after it has finished.
        manager.releaseExecutor(trial.id);
    }
Deshui Yu's avatar
Deshui Yu committed
144
145
146
147

    /**
     * List every submitted trial job.
     * Each entry goes through getTrialJob(), one job at a time.
     */
    public async listTrialJobs(): Promise<TrialJobDetail[]> {
        const details: TrialJobDetail[] = [];
        for (const trialJobId of this.trialJobsMap.keys()) {
            const detail = await this.getTrialJob(trialJobId);
            details.push(detail);
        }

        return details;
    }

    /**
     * Look up one trial job.
     *
     * Jobs in state RUNNING or UNKNOWN are passed through updateTrialJobStatus() with the
     * trial's executor; jobs in any other state are returned as stored.
     * @param trialJobId ID of trial job
     */
    public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const detail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (detail === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
        }

        //TO DO: add another job status, and design new job status change logic
        const mayHaveChanged = detail.status === 'RUNNING' || detail.status === 'UNKNOWN';
        if (!mayHaveChanged) {
            return detail;
        }

        if (detail.rmMeta === undefined) {
            throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
        }
        // Executor of the machine where the job is running.
        const executor = await this.getExecutor(detail.id);

        return this.updateTrialJobStatus(detail, executor);
    }

    /**
     * Register a callback for the 'metric' event of the internal emitter.
     * @param listener callback listener
     */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.metricsEmitter.addListener('metric', listener);
    }

    /**
     * Unregister a callback previously added for the 'metric' event.
     * @param listener callback listener
     */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.metricsEmitter.removeListener('metric', listener);
    }

    /**
     * Submit a trial job.
     *
     * The job only gets an id and is queued in state WAITING here; it is launched later
     * by the run loop.
     * @param form trial job description form
     */
    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        if (this.trialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }

        // Random trial job id.
        const newJobId: string = uniqueString(5);
        const submittedAt = Date.now();
        // Working directory stays "unset" until the trial is scheduled on a machine.
        const queuedJob = new RemoteMachineTrialJobDetail(newJobId, 'WAITING', submittedAt, "unset", form);

        this.jobQueue.push(newJobId);
        this.trialJobsMap.set(newJobId, queuedJob);

        return queuedJob;
    }

224
225
226
227
228
    /**
     * Update a trial job for multi-phase: write the new hyper parameters for an existing trial.
     * @param trialJobId trial job id
     * @param form job application form
     */
    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const existing: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (existing !== undefined) {
            await this.writeParameterFile(trialJobId, form.hyperParameters);

            return existing;
        }

        throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
    }
238

239
240
241
242
    /**
     * Whether this training service supports multi-phase jobs; always true here.
     */
    public get isMultiPhaseJobSupported(): boolean {
        return true;
    }

Deshui Yu's avatar
Deshui Yu committed
246
247
248
249
    /**
     * Cancel a trial job.
     *
     * The job is first removed from the pending queue. A job that was never scheduled
     * (no rmMeta) only has its status changed. A scheduled job has its processes killed on the
     * remote machine and its executor released; a failure to kill is logged, not thrown.
     * @param trialJobId ID of trial job
     * @param isEarlyStopped recorded on the trial and passed to getJobCancelStatus()
     */
    public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
        const target: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (target === undefined) {
            throw new Error(`trial job id ${trialJobId} not found`);
        }

        // Make sure the run loop will not pick the job up any more.
        const queuePosition: number = this.jobQueue.indexOf(trialJobId);
        if (queuePosition >= 0) {
            this.jobQueue.splice(queuePosition, 1);
        }

        if (target.rmMeta === undefined) {
            // Not scheduled yet: nothing runs remotely, only the status changes.
            assert(isEarlyStopped === false, 'isEarlyStopped is not supposed to be true here.');
            target.status = getJobCancelStatus(isEarlyStopped);

            return;
        }

        // Already scheduled: talk to the machine where the job is running.
        const executor = await this.getExecutor(target.id);

        if (target.status === 'UNKNOWN') {
            target.status = 'USER_CANCELED';
            this.releaseTrialResource(target);

            return;
        }

        const pidFilePath: string = this.getJobPidPath(executor, target.id);
        try {
            // Mark the early-stop flag before the processes are killed.
            target.isEarlyStopped = isEarlyStopped;
            await executor.killChildProcesses(pidFilePath);
            this.releaseTrialResource(target);
        } catch (error) {
            // A failed kill does not change the trial's current status, so it is only logged.
            this.log.error(`remoteTrainingService.cancelTrialJob: ${error}`);
        }
    }

    /**
     * Set cluster metadata.
     *
     * Handled keys:
     *  - NNI_MANAGER_IP: JSON-encoded NNI manager IP configuration
     *  - MACHINE_LIST:   JSON-encoded machine list; connects to every machine and creates the GPU scheduler
     *  - TRIAL_CONFIG:   JSON-encoded trial configuration; validates codeDir and starts uploading it
     *  - MULTI_PHASE, VERSION_CHECK: 'true' / 'True' enable the flag, anything else disables it
     *  - LOG_COLLECTION: stored as given
     * @param key metadata key
     * @param value metadata value
     * @throws Error for an unknown key, an unusable trial config, or a failed codeDir validation
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.NNI_MANAGER_IP:
                this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
                break;
            case TrialConfigMetadataKey.MACHINE_LIST:
                await this.setupConnections(value);
                this.gpuScheduler = new GPUScheduler(this.machineExecutorManagerMap);
                break;
            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                const parsedTrialConfig: TrialConfig = <TrialConfig>JSON.parse(value);
                // JSON.parse never returns undefined, but the payload "null" parses to null and
                // would otherwise fail below with an unrelated TypeError.
                if (parsedTrialConfig === undefined || parsedTrialConfig === null) {
                    throw new Error('trial config parsed failed');
                }
                // codeDir is not a valid directory, throw Error
                if (!fs.lstatSync(parsedTrialConfig.codeDir).isDirectory()) {
                    throw new Error(`codeDir ${parsedTrialConfig.codeDir} is not a directory`);
                }

                try {
                    // Validate to make sure codeDir doesn't have too many files
                    await validateCodeDir(parsedTrialConfig.codeDir);
                    // Start copying codeDir to every remote machine. The copy is not awaited here;
                    // prepareTrialJob waits for the stored promise before launching on that machine.
                    for (const [rmMeta, executorManager] of this.machineExecutorManagerMap.entries()) {
                        const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
                        if (executor !== undefined) {
                            this.machineCopyExpCodeDirPromiseMap.set(
                                rmMeta,
                                executor.copyDirectoryToRemote(parsedTrialConfig.codeDir, executor.getRemoteCodePath(getExperimentId()))
                            );
                        }
                    }
                } catch (error) {
                    this.log.error(error);

                    // Re-throw the original error so its type, message and stack survive;
                    // only non-Error values are wrapped.
                    throw error instanceof Error ? error : new Error(String(error));
                }

                this.trialConfig = parsedTrialConfig;
                break;
            }
            case TrialConfigMetadataKey.MULTI_PHASE:
                this.isMultiPhase = (value === 'true' || value === 'True');
                break;
            case TrialConfigMetadataKey.VERSION_CHECK:
                this.versionCheck = (value === 'true' || value === 'True');
                break;
            case TrialConfigMetadataKey.LOG_COLLECTION:
                this.logCollection = value;
                break;
            default:
                //Reject for unknown keys
                throw new Error(`Uknown key: ${key}`);
        }
    }

    /**
     * Get cluster metadata. This service exposes none, so the result is always an empty string.
     * @param _key metadata key (ignored)
     */
    public async getClusterMetadata(_key: string): Promise<string> {
        return "";
    }
363

SparkSnail's avatar
SparkSnail committed
364
    /**
     * Stop the service: raise the stopping flag so the run loop exits, then clean up the
     * remote connections.
     * NOTE(review): the previous comment mentioned a 10s timeout; none is enforced in this method.
     */
    public async cleanUp(): Promise<void> {
        this.log.info('Stopping remote machine training service...');
        this.stopping = true;

        await this.cleanupConnections();
    }

    /**
     * Fetch the shell executor of a trial from the executor manager assigned to it.
     * @param trialId trial job id
     * @throws Error when no executor manager has been assigned to the trial
     */
    private async getExecutor(trialId: string): Promise<ShellExecutor> {
        const manager = this.trialExecutorManagerMap.get(trialId);
        if (manager !== undefined) {
            return manager.getExecutor(trialId);
        }

        throw new Error(`ExecutorManager is not assigned for trial ${trialId}`);
    }
380

381
382
383
384
    /**
     * Remove the GPU reservation of every trial that is neither WAITING nor RUNNING.
     * Does nothing while no GPU scheduler exists.
     */
    private updateGpuReservation(): void {
        const scheduler = this.gpuScheduler;
        if (!scheduler) {
            return;
        }

        for (const [trialJobId, detail] of this.trialJobsMap) {
            const stillNeedsGpu = detail.status === 'WAITING' || detail.status === 'RUNNING';
            if (!stillNeedsGpu) {
                scheduler.removeGpuReservation(trialJobId, this.trialJobsMap);
            }
        }
    }

SparkSnail's avatar
SparkSnail committed
394
395
396
397
    /**
     * Stop the gpu metric collector process on every remote machine and release all executors.
     *
     * Called while the experiment is stopping, so errors are logged and never thrown. Each
     * machine is handled on its own: a failure on one machine neither skips the release of its
     * executors nor prevents the remaining machines from being cleaned up.
     */
    private async cleanupConnections(): Promise<void> {
        for (const executorManager of this.machineExecutorManagerMap.values()) {
            try {
                const executor = await executorManager.getExecutor(this.initExecutorId);
                if (executor !== undefined) {
                    this.log.info(`killing gpu metric collector on ${executor.name}`);
                    const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
                    await executor.killChildProcesses(gpuJobPidPath, true);
                }
            } catch (error) {
                this.log.error(`Cleanup connection exception, error is ${error}`);
            }

            // Always release, even when stopping the collector failed.
            try {
                executorManager.releaseAllExecutor();
            } catch (error) {
                this.log.error(`Cleanup connection exception, error is ${error}`);
            }
        }
    }

Deshui Yu's avatar
Deshui Yu committed
414
    /**
     * Connect to every machine of the list, one after another, and queue its initialisation.
     *
     * The initialisation promises are stored in sshConnectionPromises and awaited by run().
     * @param machineList JSON-encoded array of remote machine descriptions
     */
    private async setupConnections(machineList: string): Promise<void> {
        this.log.debug(`Connecting to remote machines: ${machineList}`);
        //TO DO: verify if value's format is wrong, and json parse failed, how to handle error
        const machines = <RemoteMachineMeta[]>JSON.parse(machineList);

        for (const machine of machines) {
            machine.occupiedGpuIndexMap = new Map<number, number>();
            const manager = new ExecutorManager(machine);

            this.log.info(`connecting to ${machine.username}@${machine.ip}:${machine.port}`);
            const initExecutor: ShellExecutor = await manager.getExecutor(this.initExecutorId);
            this.log.debug(`reached ${initExecutor.name}`);

            this.machineExecutorManagerMap.set(machine, manager);

            this.log.debug(`initializing ${initExecutor.name}`);
            const initialisation = this.initRemoteMachineOnConnected(machine, initExecutor);
            this.sshConnectionPromises.push(initialisation);
            this.log.info(`connecting to ${initExecutor.name}`);
        }
    }

432
433
    /**
     * Prepare a freshly connected machine and start polling its GPU metrics.
     *
     * Creates the experiment root and the scripts directory on the machine, starts the GPU
     * stats script, and subscribes to the timer to read the latest 'gpu_metrics' line into
     * rmMeta.gpuSummary. Polling stops when the machine reports zero GPUs or when the service
     * is stopping.
     * @param rmMeta description of the remote machine; its gpuSummary is updated in place
     * @param executor shell executor connected to that machine
     */
    private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise<void> {
        // Create root working directory after executor is ready
        const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni');
        await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId()));

        // the directory to store temp scripts in remote machine
        const remoteGpuScriptCollectorDir: string = executor.getRemoteScriptsPath(getExperimentId());

        // clean up previous result.
        await executor.createFolder(remoteGpuScriptCollectorDir, true);
        await executor.allowPermission(true, nniRootDir);

        //Begin to execute gpu_metrics_collection scripts
        const script = executor.generateGpuStatsScript(getExperimentId());
        // NOTE(review): the result is not awaited - presumably the collector keeps running in
        // the background; confirm against ShellExecutor.executeScript.
        executor.executeScript(script, false, true);

        // The timer fires every second, which would start overlapping reads on the server.
        // This flag lets only one read be in flight at a time.
        let collecting = false;

        const disposable: Rx.IDisposable = this.timer.subscribe(
            async () => {
                if (collecting) {
                    return;
                }
                collecting = true;
                try {
                    const cmdresult = await executor.readLastLines(executor.joinPath(remoteGpuScriptCollectorDir, 'gpu_metrics'));
                    if (cmdresult !== "") {
                        rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult);
                        if (rmMeta.gpuSummary.gpuCount === 0) {
                            this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
                            this.timer.unsubscribe(disposable);
                        }
                    }
                } catch (error) {
                    // A failed read or a malformed line must not end polling: log it and let
                    // the next tick try again.
                    this.log.warning(`Failed to collect GPU metrics from ${rmMeta.ip}: ${error}`);
                } finally {
                    if (this.stopping) {
                        this.timer.unsubscribe(disposable);
                        this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
                    }
                    // Cleared on every path, otherwise one error would block all later ticks.
                    collecting = false;
                }
            }
        );
    }

    /**
     * Try to schedule one queued trial on a remote machine and launch it.
     *
     * @param trialJobId id of the trial job at the head of the queue
     * @returns true when the job needs no further scheduling (it was launched, or it is no
     *          longer WAITING); false when no GPU is available right now and the caller
     *          should retry later
     * @throws NNIError when the job is unknown or requires more GPUs than any machine has
     * @throws Error when the service is not configured or the scheduler returns an unexpected result
     */
    private async prepareTrialJob(trialJobId: string): Promise<boolean> {
        if (this.trialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }
        if (this.gpuScheduler === undefined) {
            throw new Error('gpuScheduler is not initialized');
        }
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);
        }

        // A job that is no longer WAITING needs no preparation; report it as handled.
        if (trialJobDetail.status !== 'WAITING') {
            return true;
        }

        // Ask the scheduler for a machine.
        const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobDetail);

        if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
            const errorMessage: string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`;
            this.log.error(errorMessage);
            // Throwing is enough to reject the promise of this async method; rejecting a
            // separate promise that nobody awaits would surface as an unhandled rejection.
            throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
        }

        if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
            && rmScheduleResult.scheduleInfo !== undefined) {
            const rmScheduleInfo: RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;

            trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
            // Wait until the experiment code has been uploaded to that machine, if an upload was started.
            const copyExpCodeDirPromise = this.machineCopyExpCodeDirPromiseMap.get(trialJobDetail.rmMeta);
            if (copyExpCodeDirPromise !== undefined) {
                await copyExpCodeDirPromise;
            }

            this.allocateExecutorManagerForTrial(trialJobDetail);
            const executor = await this.getExecutor(trialJobDetail.id);

            trialJobDetail.workingDirectory = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobDetail.id);

            await this.launchTrialOnScheduledMachine(
                trialJobId, trialJobDetail.form, rmScheduleInfo);

            trialJobDetail.status = 'RUNNING';
            trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialJobDetail.workingDirectory}`;
            trialJobDetail.startTime = Date.now();

            this.trialJobsMap.set(trialJobId, trialJobDetail);

            return true;
        }

        if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
            this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);

            return false;
        }

        // Any other result (including SUCCEED without schedule info) is unexpected.
        throw new Error(`Invalid schedule resutl type: ${rmScheduleResult.resultType}`);
    }

533
    /**
     * Prepare a trial's files and start its run script on the machine it was scheduled to.
     *
     * The method creates the remote `.nni` folder, generates the start script through the
     * executor, stages the install script, run script and parameter file in a local temp
     * folder, copies that folder to the remote working directory and finally kicks off
     * the run script.
     *
     * @param trialJobId id of the trial job to launch
     * @param form application form; its `hyperParameters` are written to the parameter file
     * @param rmScheduleInfo schedule information; its `cudaVisibleDevice` is used to build
     *        the CUDA_VISIBLE_DEVICES setting handed to the start script
     * @throws Error when the trial config is not initialized or no job detail exists for `trialJobId`
     */
    private async launchTrialOnScheduledMachine(trialJobId: string, form: TrialJobApplicationForm,
        rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
        if (this.trialConfig === undefined) {
            throw new Error('trial config is not initialized');
        }
        const cudaVisibleDevice: string = rmScheduleInfo.cudaVisibleDevice;
        const executor = await this.getExecutor(trialJobId);
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new Error(`Can not get trial job detail for job: ${trialJobId}`);
        }

        const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);

        await executor.createFolder(executor.joinPath(trialJobDetail.workingDirectory, '.nni'));

        // Build the CUDA_VISIBLE_DEVICES fragment passed to the start script:
        //  - gpuNum undefined: pass an empty string (presumably so the script leaves the variable unset)
        //  - a non-empty cudaVisibleDevice: expose exactly those devices
        //  - otherwise: set the variable to " " to hide GPU devices from the trial
        let cudaVisible: string;
        if (this.trialConfig.gpuNum === undefined) {
            cudaVisible = "";
        } else if (typeof cudaVisibleDevice === 'string' && cudaVisibleDevice.length > 0) {
            cudaVisible = `CUDA_VISIBLE_DEVICES=${cudaVisibleDevice}`;
        } else {
            cudaVisible = `CUDA_VISIBLE_DEVICES=" "`;
        }

        const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
        if (this.remoteRestServerPort === undefined) {
            // Lazily look up the REST server port the first time a trial is launched.
            const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
            this.remoteRestServerPort = restServer.clusterRestServerPort;
        }
        const version: string = this.versionCheck ? await getVersion() : '';
        const runScriptTrialContent: string = executor.generateStartScript(
            trialJobDetail.workingDirectory,
            trialJobId,
            getExperimentId(),
            trialJobDetail.form.sequenceId.toString(),
            this.isMultiPhase,
            this.trialConfig.command,
            nniManagerIp,
            this.remoteRestServerPort,
            version,
            this.logCollection, cudaVisible);

        // Create the temporary trial working folder locally.
        await execMkdir(path.join(trialLocalTempFolder, '.nni'));

        // Write install_nni.sh, it's not used in Windows platform.
        await fs.promises.writeFile(path.join(trialLocalTempFolder, executor.getScriptName("install_nni")), CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' });
        // Write the run script and the parameter file to the local temp folder.
        await fs.promises.writeFile(path.join(trialLocalTempFolder, executor.getScriptName("run")), runScriptTrialContent, { encoding: 'utf8' });
        await this.writeParameterFile(trialJobId, form.hyperParameters);
        // Copy the staged files to the remote working directory.
        await executor.copyDirectoryToRemote(trialLocalTempFolder, trialJobDetail.workingDirectory);

        // Start the run script on the remote machine. The call is deliberately not awaited,
        // but a rejection handler is attached so that a failure is logged instead of
        // surfacing as an unhandled promise rejection.
        const remoteRunScript: string = executor.joinPath(trialJobDetail.workingDirectory, executor.getScriptName("run"));
        executor.executeScript(remoteRunScript, true, true)
            .catch((error: Error) => {
                this.log.error(`Failed to execute run script for trial ${trialJobId}: ${error.message}`);
            });
    }

597
    /**
     * Refresh a trial job's status from the state of its process on the remote machine.
     *
     * If the process recorded in the job's pid file is no longer alive, the remote return
     * code file is read. When its content has the form "<code> <timestamp>", the status
     * becomes SUCCEEDED for code 0, otherwise FAILED or the cancel status derived from
     * `isEarlyStopped`; the end time is set and the trial's resources are released.
     *
     * Errors during the check never reject the returned promise: a NOT_FOUND NNIError
     * leaves the status untouched, any other error marks the job as UNKNOWN.
     *
     * @param trialJob trial job detail to update in place
     * @param executor shell executor connected to the machine running the trial
     * @returns the same `trialJob` object, after the update
     * @throws NNIError from `getJobPidPath` when no job detail exists for the trial id
     */
    private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, executor: ShellExecutor): Promise<TrialJobDetail> {
        const jobpidPath: string = this.getJobPidPath(executor, trialJob.id);
        const trialReturnCodeFilePath: string = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJob.id, '.nni', 'code');
        /* eslint-disable require-atomic-updates */
        try {
            const isAlive = await executor.isProcessAlive(jobpidPath);
            // Only inspect the return code once the process of jobpid is not alive any more.
            if (!isAlive) {
                const trialReturnCode: string = await executor.getRemoteFileContent(trialReturnCodeFilePath);
                this.log.debug(`trailjob ${trialJob.id} return code: ${trialReturnCode}`);
                const match: RegExpMatchArray | null = trialReturnCode.trim()
                    .match(/^-?(\d+)\s+(\d+)$/);
                if (match !== null) {
                    const { 1: code, 2: timestamp } = match;
                    // Update trial job's status based on result code
                    if (parseInt(code, 10) === 0) {
                        trialJob.status = 'SUCCEEDED';
                    } else if (trialJob.isEarlyStopped === undefined) {
                        // isEarlyStopped was never set, so the trial was not cancelled by NNI:
                        // a non-zero exit code means it failed.
                        trialJob.status = 'FAILED';
                    } else {
                        trialJob.status = getJobCancelStatus(trialJob.isEarlyStopped);
                    }
                    trialJob.endTime = parseInt(timestamp, 10);
                    this.releaseTrialResource(trialJob);
                }
                this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`);
            }
        } catch (error) {
            this.log.debug(`(Ignorable mostly)Update job status exception, error is ${error.message}`);
            // A NOT_FOUND error keeps the current status; anything else makes the status unknown.
            if (!(error instanceof NNIError && error.name === NNIErrorNames.NOT_FOUND)) {
                trialJob.status = 'UNKNOWN';
            }
        }
        /* eslint-enable require-atomic-updates */

        return trialJob;
    }

chicm-ms's avatar
chicm-ms committed
642
    /**
     * Accessor for the service's metrics event emitter.
     *
     * @returns the `metricsEmitter` instance held by this service
     */
    public get MetricsEmitter(): EventEmitter {
        return this.metricsEmitter;
    }

646
    /**
     * Build the path of a trial job's pid file (`.nni/jobpid` under its working directory).
     *
     * @param executor executor whose `joinPath` is used to assemble the path
     * @param jobId id of the trial job
     * @returns path of the job's pid file
     * @throws NNIError (INVALID_JOB_DETAIL) when no job detail is registered for `jobId`
     */
    private getJobPidPath(executor: ShellExecutor, jobId: string): string {
        const jobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(jobId);
        if (jobDetail !== undefined) {
            return executor.joinPath(jobDetail.workingDirectory, '.nni', 'jobpid');
        }

        throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${jobId}`);
    }
chicm-ms's avatar
chicm-ms committed
654

chicm-ms's avatar
chicm-ms committed
655
    /**
     * Write a trial's hyper-parameters to a local parameter file and copy that file
     * into the trial's folder on the remote machine.
     *
     * @param trialJobId id of the trial job the parameters belong to
     * @param hyperParameters hyper-parameters; `value` is the file content and the
     *        file name is derived via `generateParamFileName`
     */
    private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise<void> {
        const executor = await this.getExecutor(trialJobId);

        // Remote folder of the trial under the experiment root directory.
        const remoteTrialDir: string = executor.joinPath(
            executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobId);

        const paramFileName: string = generateParamFileName(hyperParameters);

        // Stage the file in the local temp folder of the trial first.
        const localParamFile: string = path.join(this.expRootDir, 'trials-local', trialJobId, paramFileName);
        await fs.promises.writeFile(localParamFile, hyperParameters.value, { encoding: 'utf8' });

        const remoteParamFile: string = executor.joinPath(remoteTrialDir, paramFileName);
        await executor.copyFileToRemote(localParamFile, remoteParamFile);
    }
Deshui Yu's avatar
Deshui Yu committed
667
668
669
}

export { RemoteMachineTrainingService };