amlEnvironmentService.ts 5.5 KB
Newer Older
SparkSnail's avatar
SparkSnail committed
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
5
6
7
import fs from 'fs';
import path from 'path';
import * as component from 'common/component';
import { getLogger, Logger } from 'common/log';
liuzhe-lz's avatar
liuzhe-lz committed
8
import { AmlConfig } from 'common/experimentConfig';
9
10
import { ExperimentStartupInfo } from 'common/experimentStartupInfo';
import { validateCodeDir } from 'training_service/common/util';
11
import { AMLClient } from '../aml/amlClient';
liuzhe-lz's avatar
liuzhe-lz committed
12
import { AMLEnvironmentInformation } from '../aml/amlConfig';
13
import { EnvironmentInformation, EnvironmentService } from '../environment';
14
15
import { EventEmitter } from "events";
import { AMLCommandChannel } from '../channels/amlCommandChannel';
16
import { SharedStorageService } from '../sharedStorage'
SparkSnail's avatar
SparkSnail committed
17
18

/**
 * Environment service backed by AML.
 *
 * Submits each environment as an AML run through `AMLClient`, collects the
 * run status from AML and mirrors it onto the local environment object, and
 * stops runs on request.
 */
@component.Singleton
export class AMLEnvironmentService extends EnvironmentService {

    private readonly log: Logger = getLogger('AMLEnvironmentService');
    private readonly experimentId: string;
    private readonly experimentRootDir: string;
    private readonly config: AmlConfig;

    /**
     * @param config AML section of the experiment configuration.
     * @param info Startup information; supplies the experiment id and the log
     *             directory, which is used as the experiment root directory.
     */
    constructor(config: AmlConfig, info: ExperimentStartupInfo) {
        super();
        this.experimentId = info.experimentId;
        this.experimentRootDir = info.logDir;
        this.config = config;
        // NOTE(review): presumably throws when the code directory is unacceptable — confirm in validateCodeDir.
        validateCodeDir(this.config.trialCodeDirectory);
    }

    /** This service does not provide its own storage service. */
    public get hasStorageService(): boolean {
        return false;
    }

    /**
     * Creates the AML command channel used to talk to environments.
     *
     * @param eventEmitter Emitter handed to the command channel.
     */
    public initCommandChannel(eventEmitter: EventEmitter): void {
        this.commandChannel = new AMLCommandChannel(eventEmitter);
    }

    /**
     * Creates the AML-specific environment information object.
     *
     * @param envId Environment id.
     * @param envName Environment name.
     * @returns A new `AMLEnvironmentInformation`.
     */
    public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation {
        return new AMLEnvironmentInformation(envId, envName);
    }

    /** Name of this environment service (`'aml'`). */
    public get getName(): string {
        return 'aml';
    }

    /**
     * Refreshes the status of every given environment from its AML run.
     *
     * All environments are refreshed concurrently and the returned promise
     * resolves only after every refresh has finished. A failure while
     * refreshing one environment is logged and does not reject the returned
     * promise, so one broken environment cannot block the others.
     *
     * @param environments Environments whose status should be refreshed.
     */
    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        // `forEach` with an async callback would neither wait for the updates nor
        // handle their rejections; map to promises and await them all instead.
        await Promise.all(environments.map(async (environment) => {
            try {
                await this.updateAmlEnvironmentStatus(environment);
            } catch (error) {
                const reason = error instanceof Error ? error.message : String(error);
                this.log.error(`AML: failed to refresh status of environment ${environment.envId}: ${reason}`);
            }
        }));
    }

    /**
     * Queries AML for the run status of one environment and maps it onto the
     * environment status.
     *
     * @param environment Environment to update; must carry an AML client.
     * @throws Error when the environment has no AML client yet, or whatever
     *         `amlClient.updateStatus` throws.
     */
    private async updateAmlEnvironmentStatus(environment: EnvironmentInformation): Promise<void> {
        const amlClient = (environment as AMLEnvironmentInformation).amlClient;
        if (!amlClient) {
            throw new Error('AML client not initialized!');
        }
        const newStatus = await amlClient.updateStatus(environment.status);
        switch (newStatus.toUpperCase()) {
            case 'WAITING':
            case 'QUEUED':
                environment.setStatus('WAITING');
                break;
            case 'RUNNING':
                environment.setStatus('RUNNING');
                break;
            case 'COMPLETED':
            case 'SUCCEEDED':
                environment.setStatus('SUCCEEDED');
                break;
            case 'FAILED':
                environment.setStatus('FAILED');
                this.log.error(`AML: job ${environment.envId} is failed!`);
                break;
            case 'STOPPED':
            case 'STOPPING':
                environment.setStatus('USER_CANCELED');
                break;
            default:
                environment.setStatus('UNKNOWN');
        }
    }

    /**
     * Submits the environment to AML.
     *
     * Wraps the environment command in a small Python script
     * (`nni_script.py`) written to `<experimentRootDir>/environment-temp`,
     * submits it through a new `AMLClient`, and records the resulting run id,
     * tracking URL and client on the environment.
     *
     * @param environment Environment to start; treated as `AMLEnvironmentInformation`.
     */
    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
        const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
        const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
        // Recursive mkdir succeeds when the directory already exists, so no
        // separate (synchronous, racy) existence check is needed.
        await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true});

        if (amlEnvironment.useSharedStorage) {
            const sharedStorage = component.get<SharedStorageService>(SharedStorageService);
            const environmentRoot = sharedStorage.remoteWorkingRoot;
            const remoteMountCommand = sharedStorage.remoteMountCommand;
            amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`.replace(/"/g, `\\"`);
        } else {
            amlEnvironment.command = `mv envs outputs/envs && cd outputs && ${amlEnvironment.command}`;
        }
        // NOTE(review): the command is embedded in a single-quoted Python literal
        // without escaping; a command containing `'` or a trailing backslash would
        // produce an invalid script. Left as-is because callers may rely on the
        // current escaping — confirm before changing.
        amlEnvironment.command = `import os\nos.system('${amlEnvironment.command}')`;
        amlEnvironment.maxTrialNumberPerGpu = this.config.maxTrialNumberPerGpu;

        await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' });
        const amlClient = new AMLClient(
            this.config.subscriptionId,
            this.config.resourceGroup,
            this.config.workspaceName,
            this.experimentId,
            this.config.computeTarget,
            this.config.dockerImage,
            'nni_script.py',
            environmentLocalTempFolder
        );
        amlEnvironment.id = await amlClient.submit();
        this.log.debug('aml: before getTrackingUrl');
        amlEnvironment.trackingUrl = await amlClient.getTrackingUrl();
        this.log.debug('aml: after getTrackingUrl');
        amlEnvironment.amlClient = amlClient;
    }

    /**
     * Stops the AML run behind the environment and logs the outcome.
     *
     * @param environment Environment to stop; treated as `AMLEnvironmentInformation`.
     * @throws Error when the environment has no AML client (it was never started).
     */
    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
        const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
        const amlClient = amlEnvironment.amlClient;
        if (!amlClient) {
            throw new Error('AML client not initialized!');
        }
        const result = await amlClient.stop();
        if (result) {
            this.log.info(`Stop aml run ${environment.id} success!`);
        } else {
            this.log.info(`Stop aml run ${environment.id} failed!`);
        }
    }
}