dlcEnvironmentService.ts 6.2 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
5
6
import fs from 'fs';
import path from 'path';
import * as component from 'common/component';
7
import { Deferred } from 'ts-deferred';
8
import { getLogger, Logger } from 'common/log';
liuzhe-lz's avatar
liuzhe-lz committed
9
import { DlcConfig } from 'common/experimentConfig';
10
import { ExperimentStartupInfo } from 'common/experimentStartupInfo';
11
12
13
14
15
16
17
18
import { DlcClient } from '../dlc/dlcClient';
import { DlcEnvironmentInformation } from '../dlc/dlcConfig';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import { EventEmitter } from "events";
import { FileCommandChannel } from '../channels/fileCommandChannel';
import { MountedStorageService } from '../storages/mountedStorageService';
import { Scope } from 'typescript-ioc';
import { StorageService } from '../storageService';
19
import { getLogDir } from 'common/utils';
20
import { setTimeout } from 'timers/promises';
21
22
23
24
25
26
27
28
29

/**
 * Environment service that runs trial environments as jobs on a DLC cluster.
 *
 * For every environment it submits one job through `DlcClient`, polls the
 * job status and mirrors it onto the local `EnvironmentInformation` object.
 * Files are shared through `MountedStorageService` and commands are exchanged
 * through `FileCommandChannel`.
 */
@component.Singleton
export class DlcEnvironmentService extends EnvironmentService {

    private readonly log: Logger = getLogger('dlcEnvironmentService');
    private readonly experimentId: string;
    private readonly config: DlcConfig;

    /**
     * Stores the configuration and registers/initializes the storage service.
     *
     * Side effect: binds `StorageService` to `MountedStorageService` in the
     * IoC container (singleton scope) and initializes it with roots derived
     * from `config.localStorageMountPoint`.
     *
     * @param config DLC training-service configuration.
     * @param info Startup information; only `experimentId` is read here.
     */
    constructor(config: DlcConfig, info: ExperimentStartupInfo) {
        super();
        this.experimentId = info.experimentId;
        this.config = config;

        component.Container.bind(StorageService).to(MountedStorageService).scope(Scope.Singleton);
        const storageService = component.get<StorageService>(StorageService);
        // Both roots live under the local mount point; the "remote" root is the
        // per-experiment sub-folder of the shared 'nni-experiments' directory.
        const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments', this.experimentId);
        const localRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments');
        storageService.initialize(localRoot, remoteRoot);
    }

    /** This service always provides a storage service (set up in the constructor). */
    public get hasStorageService(): boolean {
        return true;
    }

    /**
     * Creates the file-based command channel used to talk to the environments.
     *
     * @param eventEmitter Emitter handed to the `FileCommandChannel`.
     */
    public initCommandChannel(eventEmitter: EventEmitter): void {
        this.commandChannel = new FileCommandChannel(eventEmitter);
    }

    /**
     * Creates the DLC-specific environment record.
     *
     * @param envId Environment id.
     * @param envName Environment name.
     * @returns A new `DlcEnvironmentInformation`.
     */
    public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation {
        return new DlcEnvironmentInformation(envId, envName);
    }

    /** Identifier of this environment service: `'dlc'`. */
    public get getName(): string {
        return 'dlc';
    }

    /**
     * Polls DLC for the status of every given environment and updates each
     * environment's local status accordingly.
     *
     * All environments are refreshed concurrently and the returned promise
     * settles only after every refresh has finished, so callers observe the
     * updated statuses once it resolves. A failed job is reported as `FAILED`
     * only after a 60 second back-off, which this method waits for.
     *
     * @param environments Environments to refresh; each is expected to be a
     *        `DlcEnvironmentInformation` whose `dlcClient` has been set by
     *        `startEnvironment`.
     * @throws Error if an environment has no DLC client, or whatever
     *         `dlcClient.updateStatus` rejects with.
     */
    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        // Delay before a failed job is surfaced, so a replacement job is not created right away.
        const failedJobBackoffMs = 60000;

        // `map` + `Promise.all` (instead of `forEach` with an async callback) so the
        // refreshes are actually awaited and their errors reach the caller.
        await Promise.all(environments.map(async (environment) => {
            const dlcClient = (environment as DlcEnvironmentInformation).dlcClient;
            if (!dlcClient) {
                throw new Error('DLC client not initialized!');
            }
            const newStatus = await dlcClient.updateStatus(environment.status);
            switch (newStatus.toUpperCase()) {
                case 'CREATING':
                case 'CREATED':
                case 'WAITING':
                case 'QUEUED':
                    environment.setStatus('WAITING');
                    break;
                case 'RUNNING':
                    environment.setStatus('RUNNING');
                    break;
                case 'COMPLETED':
                case 'SUCCEEDED':
                    environment.setStatus('SUCCEEDED');
                    break;
                case 'FAILED':
                    // The job failed: announce the back-off, wait, then report the failure.
                    this.log.debug(`await 60s to create new job,DLC: job ${environment.id} is failed!`);
                    await setTimeout(failedJobBackoffMs);
                    environment.setStatus('FAILED');
                    break;
                case 'STOPPED':
                case 'STOPPING':
                    environment.setStatus('USER_CANCELED');
                    break;
                default:
                    environment.setStatus('UNKNOWN');
            }
        }));
    }

    /**
     * Prepares the working folders for an environment, submits it as a DLC
     * job and records the job id, tracking URL and client on the environment.
     *
     * Note that `environment.id` is replaced by the value returned from
     * `dlcClient.submit()`; the folders and the log path are derived from the
     * id the environment had before submission.
     *
     * @param environment Environment to start; treated as a `DlcEnvironmentInformation`.
     */
    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
        const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation;

        // Same experiment folder seen from inside the container and from the local mount.
        const environmentRoot = path.join(this.config.containerStorageMountPoint, `/nni-experiments/${this.experimentId}`);
        const localRoot = path.join(this.config.localStorageMountPoint, `/nni-experiments/${this.experimentId}`);

        dlcEnvironment.workingFolder = `${localRoot}/envs/${environment.id}`;
        dlcEnvironment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`;

        // environment id dir and command dir, folder created on DLC side can't be accessed on DSW.
        // A recursive mkdir succeeds when the directory already exists, so no prior existence check is needed.
        await fs.promises.mkdir(`${dlcEnvironment.workingFolder}/commands`, {recursive: true});

        // Run from the experiment root and capture the runner's stdout/stderr in its working folder.
        environment.command = `cd ${environmentRoot} && ${environment.command} 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`;

        const dlcClient = new DlcClient(
            this.config.type,
            this.config.image,
            this.config.jobType,
            this.config.podCount,
            this.experimentId,
            environment.id,
            this.config.ecsSpec,
            this.config.region,
            this.config.workspaceId,
            this.config.nasDataSourceId,
            this.config.accessKeyId,
            this.config.accessKeySecret,
            environment.command,
            path.join(getLogDir(), `envs/${environment.id}`),
            this.config.ossDataSourceId,
        );

        dlcEnvironment.id = await dlcClient.submit();
        this.log.debug('dlc: before getTrackingUrl');
        dlcEnvironment.trackingUrl = await dlcClient.getTrackingUrl();
        this.log.debug(`dlc trackingUrl: ${dlcEnvironment.trackingUrl}`);
        dlcEnvironment.dlcClient = dlcClient;
    }

    /**
     * Asks DLC to stop the job backing the given environment.
     *
     * @param environment Environment to stop; treated as a `DlcEnvironmentInformation`.
     * @throws Error if the environment has no DLC client (it was never started).
     */
    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
        const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation;
        const dlcClient = dlcEnvironment.dlcClient;
        if (!dlcClient) {
            throw new Error('DLC client not initialized!');
        }
        // NOTE(review): the result of stop() is not awaited; if it returns a promise it should be — confirm against DlcClient.
        dlcClient.stop();
    }
}