"src/vscode:/vscode.git/clone" did not exist on "eb094e57f305f55e2e15a64a233522cf315c10ce"
remoteMachineTrainingService.ts 27.6 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import * as assert from 'assert';
Deshui Yu's avatar
Deshui Yu committed
7
8
9
import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
10
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
Deshui Yu's avatar
Deshui Yu committed
11
12
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
13
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
14
import { getExperimentId } from '../../common/experimentStartupInfo';
Deshui Yu's avatar
Deshui Yu committed
15
16
17
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
18
    HyperParameters, TrainingService, TrialJobApplicationForm,
Yuge Zhang's avatar
Yuge Zhang committed
19
    TrialJobDetail, TrialJobMetric
Deshui Yu's avatar
Deshui Yu committed
20
} from '../../common/trainingService';
21
import {
22
23
    delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus,
    getVersion, uniqueString
24
} from '../../common/utils';
25
import { ExperimentConfig, RemoteConfig, RemoteMachineConfig, flattenConfig } from '../../common/experimentConfig';
26
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
27
import { GPUSummary, ScheduleResultType } from '../common/gpuData';
28
import { execMkdir, validateCodeDir } from '../common/util';
Deshui Yu's avatar
Deshui Yu committed
29
30
import { GPUScheduler } from './gpuScheduler';
import {
31
    ExecutorManager, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
Deshui Yu's avatar
Deshui Yu committed
32
} from './remoteMachineData';
SparkSnail's avatar
SparkSnail committed
33
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
Deshui Yu's avatar
Deshui Yu committed
34

35
36
/**
 * Experiment configuration combined with the remote-platform settings.
 * This is the type this file gives to the result of `flattenConfig(config, 'remote')`.
 */
interface FlattenRemoteConfig extends ExperimentConfig, RemoteConfig { }

Deshui Yu's avatar
Deshui Yu committed
37
38
39
/**
 * Training Service implementation for Remote Machine (Linux)
 */
SparkSnail's avatar
SparkSnail committed
40
@component.Singleton
Deshui Yu's avatar
Deshui Yu committed
41
class RemoteMachineTrainingService implements TrainingService {
42
    private readonly initExecutorId = "initConnection";
43
44
    private readonly machineExecutorManagerMap: Map<RemoteMachineConfig, ExecutorManager>; //machine excutor map
    private readonly machineCopyExpCodeDirPromiseMap: Map<RemoteMachineConfig, Promise<void>>;
45
    private readonly trialExecutorManagerMap: Map<string, ExecutorManager>; //trial excutor map
46
47
    private readonly trialJobsMap: Map<string, RemoteMachineTrialJobDetail>;
    private readonly expRootDir: string;
48
    private gpuScheduler?: GPUScheduler;
49
50
    private readonly jobQueue: string[];
    private readonly timer: ObservableTimer;
Deshui Yu's avatar
Deshui Yu committed
51
    private stopping: boolean = false;
52
53
    private readonly metricsEmitter: EventEmitter;
    private readonly log: Logger;
SparkSnail's avatar
SparkSnail committed
54
    private remoteRestServerPort?: number;
55
    private versionCheck: boolean = true;
56
    private logCollection: string = 'none';
57
    private sshConnectionPromises: any[];
58
    private config: FlattenRemoteConfig;
Deshui Yu's avatar
Deshui Yu committed
59

60
    /**
     * Set up the in-memory bookkeeping, validate the trial code directory and
     * start connecting to every machine listed in the configuration.
     * The connection promises are only stored here; they are awaited in run().
     * @param config experiment configuration, flattened with the 'remote' platform settings
     * @throws Error when the configured trial code directory is not a directory
     */
    constructor(config: ExperimentConfig) {
        // Containers tracking trials, machines and pending work
        this.metricsEmitter = new EventEmitter();
        this.trialJobsMap = new Map();
        this.trialExecutorManagerMap = new Map();
        this.machineCopyExpCodeDirPromiseMap = new Map();
        this.machineExecutorManagerMap = new Map();
        this.jobQueue = [];
        this.sshConnectionPromises = [];

        // Shared services
        this.expRootDir = getExperimentRootDir();
        this.timer = component.get(ObservableTimer);
        this.log = getLogger('RemoteMachineTrainingService');
        this.log.info('Construct remote machine training service.');

        this.config = flattenConfig(config, 'remote');

        const codeDirStat = fs.lstatSync(this.config.trialCodeDirectory);
        if (!codeDirStat.isDirectory()) {
            throw new Error(`codeDir ${this.config.trialCodeDirectory} is not a directory`);
        }
        validateCodeDir(this.config.trialCodeDirectory);

        // One pending connection per configured machine
        const pendingConnections: Promise<void>[] = [];
        for (const machine of this.config.machineList) {
            pendingConnections.push(this.initRemoteMachineOnConnected(machine));
        }
        this.sshConnectionPromises = pendingConnections;
    }

    /**
     * Main loop of the training service.
     *
     * Starts the REST server, waits for the ssh connections created in the constructor,
     * creates the GPU scheduler and kicks off the copy of the trial code directory to
     * every connected machine. It then tries to launch queued trials every 3 seconds
     * until the service is asked to stop.
     *
     * @throws Error when the REST server reports an error message, or when preparing a trial fails
     */
    public async run(): Promise<void> {
        const restServer = new RemoteMachineJobRestServer(this);
        await restServer.start();
        restServer.setEnableVersionCheck = this.versionCheck;
        this.log.info('Run remote machine training service.');

        if (this.sshConnectionPromises.length > 0) {
            await Promise.all(this.sshConnectionPromises);
            this.log.info('ssh connection initialized!');
            // set sshConnectionPromises to [] to avoid log information duplicated
            this.sshConnectionPromises = [];
            // initialize gpuScheduler
            this.gpuScheduler = new GPUScheduler(this.machineExecutorManagerMap);
            // Copy codeDir to remote machine; the promises are stored and awaited
            // later, right before a trial is launched on the corresponding machine.
            for (const [machineConfig, executorManager] of this.machineExecutorManagerMap.entries()) {
                const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
                if (executor !== undefined) {
                    this.machineCopyExpCodeDirPromiseMap.set(
                        machineConfig,
                        executor.copyDirectoryToRemote(this.config.trialCodeDirectory, executor.getRemoteCodePath(getExperimentId()))
                    );
                }
            }
        }

        while (!this.stopping) {
            while (this.jobQueue.length > 0) {
                this.updateGpuReservation();
                const trialJobId: string = this.jobQueue[0];
                const prepareResult: boolean = await this.prepareTrialJob(trialJobId);
                if (!prepareResult) {
                    // Break the while loop since no GPU resource is available right now,
                    // Wait to schedule job in next time iteration
                    break;
                }
                // Remove the prepared job by id rather than by position: the queue may
                // have been modified (e.g. by cancelTrialJob) while prepareTrialJob was
                // awaiting, in which case a blind shift() would drop a different job.
                const queueIndex: number = this.jobQueue.indexOf(trialJobId);
                if (queueIndex >= 0) {
                    this.jobQueue.splice(queueIndex, 1);
                }
            }
            if (restServer.getErrorMessage !== undefined) {
                this.stopping = true;
                throw new Error(restServer.getErrorMessage);
            }
            await delay(3000);
        }
        this.log.info('RemoteMachineTrainingService run loop exited.');
    }
132

SparkSnail's avatar
SparkSnail committed
133
    /**
     * Bind a trial to the executor manager of the machine recorded in its rmMeta.
     * @param trial remote machine trial job detail
     * @throws Error when rmMeta is not set, or no executor manager is registered for that machine
     */
    public allocateExecutorManagerForTrial(trial: RemoteMachineTrialJobDetail): void {
        const machineMeta = trial.rmMeta;
        if (machineMeta === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager: ExecutorManager | undefined = this.machineExecutorManagerMap.get(machineMeta.config);
        if (manager === undefined) {
            throw new Error(`executorManager not initialized`);
        }

        this.trialExecutorManagerMap.set(trial.id, manager);
    }
147

SparkSnail's avatar
SparkSnail committed
148
149
    /**
     * Release the executor held for a trial.
     * The trial's entry in trialExecutorManagerMap is kept on purpose, because the
     * nni manager may still send requests about this trial afterwards.
     * @param trial remote machine trial job detail
     * @throws Error when rmMeta is not set, or no executor manager is assigned to the trial
     */
    public releaseTrialResource(trial: RemoteMachineTrialJobDetail): void {
        if (trial.rmMeta === undefined) {
            throw new Error(`rmMeta not set in trial ${trial.id}`);
        }

        const manager = this.trialExecutorManagerMap.get(trial.id);
        if (manager === undefined) {
            throw new Error(`ExecutorManager is not assigned for trial ${trial.id}`);
        }

        manager.releaseExecutor(trial.id);
    }
Deshui Yu's avatar
Deshui Yu committed
163
164
165
166

    /**
     * List every submitted trial job.
     * Each entry is obtained through getTrialJob(), one trial after another.
     * @returns details of all trials known to this service
     */
    public async listTrialJobs(): Promise<TrialJobDetail[]> {
        const details: TrialJobDetail[] = [];
        for (const trialJobId of this.trialJobsMap.keys()) {
            details.push(await this.getTrialJob(trialJobId));
        }

        return details;
    }

    /**
     * Get the detail of one trial job.
     * Trials whose status is 'RUNNING' or 'UNKNOWN' are refreshed through
     * updateTrialJobStatus(); any other trial is returned as stored.
     * @param trialJobId ID of trial job
     * @throws NNIError (NOT_FOUND) when the id is unknown
     * @throws Error when a trial that needs refreshing has no rmMeta
     */
    public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const trialJob: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJob === undefined) {
            throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
        }

        //TO DO: add another job status, and design new job status change logic
        const needsRefresh = trialJob.status === 'RUNNING' || trialJob.status === 'UNKNOWN';
        if (!needsRefresh) {
            return trialJob;
        }

        if (trialJob.rmMeta === undefined) {
            throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
        }
        // Executor of the machine where the job is running
        const executor = await this.getExecutor(trialJob.id);

        return this.updateTrialJobStatus(trialJob, executor);
    }

202
203
204
205
206
    /**
     * Get a file of a trial job. Not supported by this training service.
     * @param _trialJobId ID of trial job (unused)
     * @param _fileName name of the requested file (unused)
     * @throws MethodNotImplementedError always
     */
    public async getTrialFile(_trialJobId: string, _fileName: string): Promise<string | Buffer> {
        throw new MethodNotImplementedError();
    }

Deshui Yu's avatar
Deshui Yu committed
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
    /**
     * Register a callback for the 'metric' event of the metrics emitter.
     * @param listener callback receiving each trial job metric
     */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.metricsEmitter.addListener('metric', listener);
    }

    /**
     * Unregister a callback from the 'metric' event of the metrics emitter.
     * @param listener callback previously passed to addTrialJobMetricListener
     */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
        this.metricsEmitter.removeListener('metric', listener);
    }

    /**
     * Submit a trial job: create its detail record in 'WAITING' state, append it
     * to the job queue and register it in the trial map. Launching happens later
     * in the run() loop.
     * @param form trial job description form
     * @returns the newly created trial job detail
     */
    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        // Random trial job id
        const newTrialId: string = uniqueString(5);
        const submitTime: number = Date.now();

        const detail: RemoteMachineTrialJobDetail = new RemoteMachineTrialJobDetail(
            newTrialId, 'WAITING', submitTime, "unset", form);

        this.jobQueue.push(newTrialId);
        this.trialJobsMap.set(newTrialId, detail);

        return detail;
    }

248
249
250
251
252
    /**
     * Update a trial job for multi-phase: write the hyper-parameters of the given
     * form through writeParameterFile().
     * @param trialJobId trial job id
     * @param form job application form carrying the new hyper-parameters
     * @returns the stored detail of the trial
     * @throws Error when the trial id is unknown
     */
    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        const existing: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (existing === undefined) {
            throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
        }

        await this.writeParameterFile(trialJobId, form.hyperParameters);

        return existing;
    }
262

Deshui Yu's avatar
Deshui Yu committed
263
264
265
266
    /**
     * Cancel a trial job.
     *
     * The trial is first removed from the job queue if it is still there. A trial
     * that was never scheduled (no rmMeta) only gets its status changed. For a
     * scheduled trial the remote processes recorded in its pid file are killed and
     * its executor is released; a failure while killing is logged, not rethrown.
     *
     * @param trialJobId ID of trial job
     * @param isEarlyStopped whether the cancellation is an early stop; must be false for unscheduled trials
     * @throws Error when the trial id is unknown
     */
    public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
        const trialJob: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJob === undefined) {
            throw new Error(`trial job id ${trialJobId} not found`);
        }

        // Drop the trial from the pending queue
        const queuePosition: number = this.jobQueue.indexOf(trialJobId);
        if (queuePosition >= 0) {
            this.jobQueue.splice(queuePosition, 1);
        }

        if (trialJob.rmMeta === undefined) {
            // Job is not scheduled yet, set status to 'USER_CANCELLED' directly
            assert(isEarlyStopped === false, 'isEarlyStopped is not supposed to be true here.');
            trialJob.status = getJobCancelStatus(isEarlyStopped);
            return;
        }

        // The trial job is already scheduled: get the executor of the machine it runs on
        const executor = await this.getExecutor(trialJob.id);

        if (trialJob.status === 'UNKNOWN') {
            trialJob.status = 'USER_CANCELED';
            this.releaseTrialResource(trialJob);
            return;
        }

        const pidFilePath: string = this.getJobPidPath(executor, trialJob.id);
        try {
            // Mark the toEarlyStop tag here
            trialJob.isEarlyStopped = isEarlyStopped;
            await executor.killChildProcesses(pidFilePath);
            this.releaseTrialResource(trialJob);
        } catch (error) {
            // Not handle the error since pkill failed will not impact trial job's current status
            this.log.error(`remoteTrainingService.cancelTrialJob: ${error}`);
        }
    }

307
308
    /**
     * Accepts a cluster metadata entry and ignores it; nothing is stored.
     * @param _key metadata key (unused)
     * @param _value metadata value (unused)
     */
    public async setClusterMetadata(_key: string, _value: string): Promise<void> {
        return;
    }
    /**
     * Cluster metadata is not kept by this service; every key yields an empty string.
     * @param _key metadata key (unused)
     * @returns always ''
     */
    public async getClusterMetadata(_key: string): Promise<string> {
        return '';
    }
309

SparkSnail's avatar
SparkSnail committed
310
    /**
     * Stop the service: raise the stopping flag (which ends the run() loop and the
     * GPU metric polling) and then clean up the remote connections.
     * NOTE(review): the previous comment mentioned a 10s timeout for cleaning remote
     * connections; no timeout is applied in this method itself - confirm where it is enforced.
     */
    public async cleanUp(): Promise<void> {
        this.log.info('Stopping remote machine training service...');
        this.stopping = true;

        await this.cleanupConnections();
    }

    /**
     * Get the shell executor of a trial from the executor manager assigned to it.
     * @param trialId ID of trial job
     * @throws Error when no executor manager has been assigned to the trial
     */
    private async getExecutor(trialId: string): Promise<ShellExecutor> {
        const manager = this.trialExecutorManagerMap.get(trialId);
        if (manager !== undefined) {
            return manager.getExecutor(trialId);
        }
        throw new Error(`ExecutorManager is not assigned for trial ${trialId}`);
    }
326

327
328
329
330
    /**
     * Remove the GPU reservation of every trial that is neither 'WAITING' nor 'RUNNING'.
     * Does nothing while the GPU scheduler has not been created.
     */
    private updateGpuReservation(): void {
        const scheduler = this.gpuScheduler;
        if (!scheduler) {
            return;
        }

        for (const [trialId, detail] of this.trialJobsMap) {
            const mayHoldGpu = detail.status === 'WAITING' || detail.status === 'RUNNING';
            if (!mayHoldGpu) {
                scheduler.removeGpuReservation(trialId, this.trialJobsMap);
            }
        }
    }

SparkSnail's avatar
SparkSnail committed
340
341
342
343
    /**
     * Stop the gpu metric collector process on every remote machine and release
     * all executors of each machine.
     *
     * Errors are logged and never rethrown, because this runs while the experiment
     * is stopping. Each machine is handled independently, so a failure on one
     * machine neither skips releasing its executors nor prevents the cleanup of
     * the remaining machines.
     */
    private async cleanupConnections(): Promise<void> {
        for (const executorManager of this.machineExecutorManagerMap.values()) {
            try {
                const executor = await executorManager.getExecutor(this.initExecutorId);
                if (executor !== undefined) {
                    this.log.info(`killing gpu metric collector on ${executor.name}`);
                    const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
                    await executor.killChildProcesses(gpuJobPidPath, true);
                }
            } catch (error) {
                //ignore error, this function is called to cleanup remote connections when experiment is stopping
                this.log.error(`Cleanup connection exception, error is ${error}`);
            }

            // Always release the executors of this machine, even when killing the collector failed.
            try {
                executorManager.releaseAllExecutor();
            } catch (error) {
                this.log.error(`Cleanup connection exception, error is ${error}`);
            }
        }
    }

360
361
362
    /**
     * Connect to one remote machine and prepare it for the experiment.
     *
     * Registers the machine's executor manager, creates the remote experiment root
     * and scripts directories, starts the GPU statistics script and subscribes to
     * the shared timer to keep reading the last lines of the 'gpu_metrics' file
     * into the machine's gpuSummary.
     *
     * @param machineConfig configuration of the remote machine to connect to
     */
    private async initRemoteMachineOnConnected(machineConfig: RemoteMachineConfig): Promise<void> {
        const executorManager: ExecutorManager = new ExecutorManager(machineConfig);
        this.log.info(`connecting to ${machineConfig.user}@${machineConfig.host}:${machineConfig.port}`);
        const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
        this.log.debug(`reached ${executor.name}`);
        this.machineExecutorManagerMap.set(machineConfig, executorManager);
        this.log.debug(`initializing ${executor.name}`);

        // Create root working directory after executor is ready
        const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni');
        await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId()));

        // the directory to store temp scripts in remote machine
        const remoteGpuScriptCollectorDir: string = executor.getRemoteScriptsPath(getExperimentId());

        // clean up previous result.
        await executor.createFolder(remoteGpuScriptCollectorDir, true);
        await executor.allowPermission(true, nniRootDir);

        //Begin to execute gpu_metrics_collection scripts (started without awaiting its completion)
        const script = executor.generateGpuStatsScript(getExperimentId());
        executor.executeScript(script, false, true);

        // the timer is trigger in 1 second, it causes multiple runs on server.
        // So reduce it's freqeunce, only allow one of it run.
        let collecting = false;

        const disposable: Rx.IDisposable = this.timer.subscribe(
            async () => {
                if (collecting) {
                    return;
                }
                collecting = true;
                try {
                    const cmdresult = await executor.readLastLines(executor.joinPath(remoteGpuScriptCollectorDir, 'gpu_metrics'));
                    if (cmdresult !== "") {
                        executorManager.rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult);
                        if (executorManager.rmMeta.gpuSummary.gpuCount === 0) {
                            this.log.warning(`No GPU found on remote machine ${machineConfig.host}`);
                            this.timer.unsubscribe(disposable);
                        }
                    }
                } catch (error) {
                    // A failed read or a partially written metrics file must not stop
                    // the collection for good; report it and retry on the next tick.
                    this.log.debug(`Failed to collect GPU metrics on ${machineConfig.host}: ${error}`);
                } finally {
                    if (this.stopping) {
                        this.timer.unsubscribe(disposable);
                        this.log.debug(`Stopped GPU collector on ${machineConfig.host}, since experiment is exiting.`);
                    }
                    // Always release the guard so that later ticks can collect again.
                    collecting = false;
                }
            }
        );
    }

    /**
     * Try to schedule and launch one queued trial.
     *
     * @param trialJobId ID of the trial job to prepare
     * @returns true when the trial needs no further scheduling (it was launched, or it
     *          is no longer 'WAITING'); false when no GPU is available right now and the
     *          trial should be retried later
     * @throws Error when the GPU scheduler is not initialized or the scheduler returns an
     *         unexpected result type
     * @throws NNIError when the trial is unknown (INVALID_JOB_DETAIL) or it requires more
     *         GPUs than any machine can provide (RESOURCE_NOT_AVAILABLE)
     */
    private async prepareTrialJob(trialJobId: string): Promise<boolean> {
        if (this.gpuScheduler === undefined) {
            throw new Error('gpuScheduler is not initialized');
        }
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);
        }
        // If job is not WAITING, don't prepare and report it as handled immediately
        if (trialJobDetail.status !== 'WAITING') {
            return true;
        }

        // get an executor from scheduler
        const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.config.trialGpuNumber, trialJobDetail);

        if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
            const errorMessage: string = `Required GPU number ${this.config.trialGpuNumber} is too large, no machine can meet`;
            this.log.error(errorMessage);
            throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
        }

        if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
            && rmScheduleResult.scheduleInfo !== undefined) {
            const rmScheduleInfo: RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;

            trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
            // Wait until the code directory copy started in run() has finished for this machine
            const copyExpCodeDirPromise = this.machineCopyExpCodeDirPromiseMap.get(rmScheduleInfo.rmMeta.config);
            if (copyExpCodeDirPromise !== undefined) {
                await copyExpCodeDirPromise;
            }

            this.allocateExecutorManagerForTrial(trialJobDetail);
            const executor = await this.getExecutor(trialJobDetail.id);

            trialJobDetail.workingDirectory = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJobDetail.id);

            await this.launchTrialOnScheduledMachine(
                trialJobId, trialJobDetail.form, rmScheduleInfo);

            trialJobDetail.status = 'RUNNING';
            trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.config.host}:${trialJobDetail.workingDirectory}`;
            trialJobDetail.startTime = Date.now();

            this.trialJobsMap.set(trialJobId, trialJobDetail);

            return true;
        }

        if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
            this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);

            return false;
        }

        // Fail with a real Error (not a bare string) so callers get a stack trace
        throw new Error(`Invalid schedule resutl type: ${rmScheduleResult.resultType}`);
    }

465
    /**
     * Launch a trial on the machine chosen by the scheduler.
     *
     * Generates the start script, writes it together with the install_nni script and
     * the parameter file into a local temporary folder, copies that folder to the
     * trial's remote working directory and starts the run script there. The run
     * script is started without awaiting its completion.
     *
     * @param trialJobId ID of the trial job to launch
     * @param form application form whose hyper-parameters are written for the trial
     * @param rmScheduleInfo schedule result providing the visible CUDA devices
     * @throws Error when the trial detail cannot be found or no executor manager is assigned
     */
    private async launchTrialOnScheduledMachine(trialJobId: string, form: TrialJobApplicationForm,
        rmScheduleInfo: RemoteMachineScheduleInfo): Promise<void> {
        const executor = await this.getExecutor(trialJobId);

        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new Error(`Can not get trial job detail for job: ${trialJobId}`);
        }

        const localStagingDir: string = path.join(this.expRootDir, 'trials-local', trialJobId);

        await executor.createFolder(executor.joinPath(trialJobDetail.workingDirectory, '.nni'));

        // RemoteMachineRunShellFormat is the run shell format string,
        // See definition in remoteMachineData.ts

        // CUDA_VISIBLE_DEVICES setting for the run script:
        //  - gpu number undefined: nothing is set in the script
        //  - a non-empty device list: expose exactly those devices
        //  - otherwise: a blank value, which hides the GPU devices
        let cudaVisible: string = "";
        if (this.config.trialGpuNumber !== undefined) {
            const deviceList: string = rmScheduleInfo.cudaVisibleDevice;
            const hasDevices = typeof deviceList === 'string' && deviceList.length > 0;
            cudaVisible = hasDevices ? `CUDA_VISIBLE_DEVICES=${deviceList}` : `CUDA_VISIBLE_DEVICES=" "`;
        }

        const nniManagerIp: string = this.config.nniManagerIp || getIPV4Address();

        // Look up the REST server port once and cache it
        if (this.remoteRestServerPort === undefined) {
            const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
            this.remoteRestServerPort = restServer.clusterRestServerPort;
        }

        let version: string = '';
        if (this.versionCheck) {
            version = await getVersion();
        }

        const runScriptTrialContent: string = executor.generateStartScript(
            trialJobDetail.workingDirectory,
            trialJobId,
            getExperimentId(),
            trialJobDetail.form.sequenceId.toString(),
            false,  // multi-phase
            this.config.trialCommand,
            nniManagerIp,
            this.remoteRestServerPort,
            version,
            this.logCollection,
            cudaVisible);

        //create tmp trial working folder locally.
        await execMkdir(path.join(localStagingDir, '.nni'));

        // Write install_nni.sh, it's not used in Windows platform.
        const installScriptPath: string = path.join(localStagingDir, executor.getScriptName("install_nni"));
        await fs.promises.writeFile(installScriptPath, CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' });

        // Write file content ( run.sh and parameter.cfg ) to local tmp files
        const runScriptPath: string = path.join(localStagingDir, executor.getScriptName("run"));
        await fs.promises.writeFile(runScriptPath, runScriptTrialContent, { encoding: 'utf8' });
        await this.writeParameterFile(trialJobId, form.hyperParameters);

        // Copy the locally prepared files to the remote working directory
        await executor.copyDirectoryToRemote(localStagingDir, trialJobDetail.workingDirectory);

        // Execute command in remote machine
        executor.executeScript(executor.joinPath(trialJobDetail.workingDirectory, executor.getScriptName("run")), true, true);
    }

527
    /**
     * Refresh the status of a trial job by checking whether its process is still alive
     * on the remote machine and, if it is not, reading the return-code file the trial
     * left behind.
     *
     * The return-code file is expected to contain `<exit code> <timestamp>`. When it
     * matches, the job becomes `SUCCEEDED` for exit code 0; otherwise `FAILED` if
     * `isEarlyStopped` was never set, or the status given by `getJobCancelStatus`
     * when it was. The end time is recorded and the trial's resources are released.
     *
     * Failures while polling are logged and never propagate: a NOT_FOUND `NNIError`
     * leaves the status untouched, any other error marks the job `UNKNOWN`.
     *
     * @param trialJob trial job to refresh; mutated in place
     * @param executor shell executor connected to the machine running the trial
     * @returns the same `trialJob` instance after the update
     * @throws NNIError (INVALID_JOB_DETAIL) from `getJobPidPath` when the job is not registered
     */
    private async updateTrialJobStatus(trialJob: RemoteMachineTrialJobDetail, executor: ShellExecutor): Promise<TrialJobDetail> {
        const jobpidPath: string = this.getJobPidPath(executor, trialJob.id);
        const trialReturnCodeFilePath: string = executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), 'trials', trialJob.id, '.nni', 'code');
        /* eslint-disable require-atomic-updates */
        try {
            const isAlive = await executor.isProcessAlive(jobpidPath);
            // Only a finished process can have a final status to report.
            if (!isAlive) {
                const trialReturnCode: string = await executor.getRemoteFileContent(trialReturnCodeFilePath);
                this.log.debug(`trailjob ${trialJob.id} return code: ${trialReturnCode}`);
                const match: RegExpMatchArray | null = trialReturnCode.trim()
                    .match(/^-?(\d+)\s+(\d+)$/);
                // If the content does not match, the status is left unchanged.
                if (match !== null) {
                    const { 1: code, 2: timestamp } = match;
                    // Update trial job's status based on result code
                    if (parseInt(code, 10) === 0) {
                        trialJob.status = 'SUCCEEDED';
                    } else if (trialJob.isEarlyStopped === undefined) {
                        // isEarlyStopped was never set, so NNI did not cancel the trial:
                        // a non-zero exit code means the trial itself failed.
                        trialJob.status = 'FAILED';
                    } else {
                        trialJob.status = getJobCancelStatus(trialJob.isEarlyStopped);
                    }
                    trialJob.endTime = parseInt(timestamp, 10);
                    this.releaseTrialResource(trialJob);
                }
                this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`);
            }
        } catch (error) {
            // Guard the message lookup: a rejection value is not guaranteed to be an Error,
            // and dereferencing null/undefined here would turn a best-effort poll into a failure.
            const message: string = error instanceof Error ? error.message : String(error);
            this.log.debug(`(Ignorable mostly)Update job status exception, error is ${message}`);
            // A missing file is expected and leaves the status as-is; anything else is unknown.
            if (!(error instanceof NNIError && error.name === NNIErrorNames.NOT_FOUND)) {
                trialJob.status = 'UNKNOWN';
            }
        }
        /* eslint-enable require-atomic-updates */

        return trialJob;
    }

chicm-ms's avatar
chicm-ms committed
572
    /**
     * Expose the event emitter held in `metricsEmitter`.
     *
     * @returns the emitter instance stored on this service
     */
    public get MetricsEmitter(): EventEmitter {
        const { metricsEmitter } = this;

        return metricsEmitter;
    }

576
    /**
     * Build the path of the `.nni/jobpid` file inside a trial's working directory.
     *
     * @param executor shell executor used to join path segments
     * @param jobId id of the trial job to look up in `trialJobsMap`
     * @returns path of the job pid file under the trial's working directory
     * @throws NNIError (INVALID_JOB_DETAIL) when no trial job is registered under `jobId`
     */
    private getJobPidPath(executor: ShellExecutor, jobId: string): string {
        const registeredJob: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(jobId);
        if (registeredJob !== undefined) {
            return executor.joinPath(registeredJob.workingDirectory, '.nni', 'jobpid');
        }

        throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${jobId}`);
    }
chicm-ms's avatar
chicm-ms committed
584

chicm-ms's avatar
chicm-ms committed
585
    /**
     * Write a trial's hyper-parameters to a local parameter file and copy it to the
     * remote machine.
     *
     * The file is created under `trials-local/<trialJobId>` in the local experiment
     * root, then copied under the same name into `trials/<trialJobId>` of the remote
     * experiment root.
     *
     * @param trialJobId id of the trial job; selects the executor and both folders
     * @param hyperParameters hyper-parameters whose `value` becomes the file content
     */
    private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters): Promise<void> {
        const executor = await this.getExecutor(trialJobId);

        // Resolve the remote and local trial folders up front.
        const remoteExperimentRoot = executor.getRemoteExperimentRootDir(getExperimentId());
        const remoteTrialDir: string = executor.joinPath(remoteExperimentRoot, 'trials', trialJobId);
        const localTrialDir: string = path.join(this.expRootDir, 'trials-local', trialJobId);

        // Write the parameter file locally first.
        const paramFileName: string = generateParamFileName(hyperParameters);
        const localParamFile: string = path.join(localTrialDir, paramFileName);
        await fs.promises.writeFile(localParamFile, hyperParameters.value, { encoding: 'utf8' });

        // Then ship it to the remote trial folder under the same file name.
        const remoteParamFile = executor.joinPath(remoteTrialDir, paramFileName);
        await executor.copyFileToRemote(localParamFile, remoteParamFile);
    }
J-shang's avatar
J-shang committed
597
598
599
600
601
602
603
604

    /**
     * Not implemented by this training service.
     *
     * @param _trialJobId id of the trial job (unused)
     * @throws MethodNotImplementedError always, synchronously
     */
    public getTrialOutputLocalPath(_trialJobId: string): Promise<string> {
        const notImplemented = new MethodNotImplementedError();
        throw notImplemented;
    }

    /**
     * Not implemented by this training service.
     *
     * @param _trialJobId id of the trial job (unused)
     * @param _subpath sub-path of the trial output (unused)
     * @throws MethodNotImplementedError always, synchronously
     */
    public fetchTrialOutput(_trialJobId: string, _subpath: string): Promise<void> {
        const notImplemented = new MethodNotImplementedError();
        throw notImplemented;
    }
Deshui Yu's avatar
Deshui Yu committed
605
606
607
}

export { RemoteMachineTrainingService };