amlEnvironmentService.ts 5.61 KB
Newer Older
SparkSnail's avatar
SparkSnail committed
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
5
6
7
8
9
10
import fs from 'fs';
import path from 'path';
import * as component from 'common/component';
import { getLogger, Logger } from 'common/log';
import { ExperimentConfig, AmlConfig, flattenConfig } from 'common/experimentConfig';
import { ExperimentStartupInfo } from 'common/experimentStartupInfo';
import { validateCodeDir } from 'training_service/common/util';
11
import { AMLClient } from '../aml/amlClient';
liuzhe-lz's avatar
liuzhe-lz committed
12
import { AMLEnvironmentInformation } from '../aml/amlConfig';
13
import { EnvironmentInformation, EnvironmentService } from '../environment';
14
15
import { EventEmitter } from "events";
import { AMLCommandChannel } from '../channels/amlCommandChannel';
16
import { SharedStorageService } from '../sharedStorage'
SparkSnail's avatar
SparkSnail committed
17

liuzhe-lz's avatar
liuzhe-lz committed
18
/**
 * Configuration shape held by the AML environment service: every field of
 * the generic experiment config together with every field of the AML config.
 */
interface FlattenAmlConfig extends ExperimentConfig, AmlConfig {
    // Intentionally empty: the type only combines its two parents.
}
SparkSnail's avatar
SparkSnail committed
19
20

/**
 * Environment service backed by AML jobs.
 *
 * Starts an environment by submitting a job through an {@link AMLClient},
 * polls the job status and mirrors it onto the local environment object,
 * and stops the job on request.
 */
@component.Singleton
export class AMLEnvironmentService extends EnvironmentService {

    private readonly log: Logger = getLogger('AMLEnvironmentService');
    private readonly experimentId: string;
    private readonly experimentRootDir: string;
    private readonly config: FlattenAmlConfig;

    /**
     * @param config Experiment configuration; its `aml` training-service section is flattened into `this.config`.
     * @param info Startup information supplying the experiment id and the log directory used as the local root.
     */
    constructor(config: ExperimentConfig, info: ExperimentStartupInfo) {
        super();
        this.experimentId = info.experimentId;
        this.experimentRootDir = info.logDir;
        this.config = flattenConfig(config, 'aml');
        validateCodeDir(this.config.trialCodeDirectory);
    }

    /** This service does not provide a storage service. */
    public get hasStorageService(): boolean {
        return false;
    }

    /**
     * Creates the AML command channel and stores it on `this.commandChannel`.
     *
     * @param eventEmitter Emitter handed to the channel.
     */
    public initCommandChannel(eventEmitter: EventEmitter): void {
        this.commandChannel = new AMLCommandChannel(eventEmitter);
    }

    /**
     * Builds the AML-specific environment record.
     *
     * @param envId Environment identifier.
     * @param envName Environment name.
     * @returns A new {@link AMLEnvironmentInformation}.
     */
    public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation {
        return new AMLEnvironmentInformation(envId, envName);
    }

    /** Name of this environment service. */
    public get getName(): string {
        return 'aml';
    }

    /**
     * Refreshes the status of every given environment from its AML job.
     *
     * All environments are refreshed concurrently and the returned promise
     * settles only after every refresh has finished. A failure for one
     * environment (missing client, status query error, failed job) is logged
     * and does not prevent the others from being refreshed; it is not
     * propagated to the caller, matching the previous behaviour where this
     * method never rejected.
     *
     * @param environments Environments whose status should be updated in place.
     */
    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        // Array.prototype.forEach ignores the promises returned by an async
        // callback, so map + Promise.all is used to actually wait for them.
        await Promise.all(environments.map(async (environment) => {
            try {
                await this.refreshEnvironmentStatus(environment);
            } catch (error) {
                this.log.error(`AML: failed to refresh status of environment ${environment.envId}: ${error}`);
            }
        }));
    }

    /**
     * Queries the AML job status for one environment and maps it onto the
     * environment's status.
     *
     * @throws Error when the environment has no AML client or its job reports `FAILED`.
     */
    private async refreshEnvironmentStatus(environment: EnvironmentInformation): Promise<void> {
        const amlClient = (environment as AMLEnvironmentInformation).amlClient;
        if (!amlClient) {
            throw new Error('AML client not initialized!');
        }
        const newStatus = await amlClient.updateStatus(environment.status);
        switch (newStatus.toUpperCase()) {
            case 'WAITING':
            case 'QUEUED':
                environment.setStatus('WAITING');
                break;
            case 'RUNNING':
                environment.setStatus('RUNNING');
                break;
            case 'COMPLETED':
            case 'SUCCEEDED':
                environment.setStatus('SUCCEEDED');
                break;
            case 'FAILED':
                // Record the status first so it is visible even though an error is raised.
                environment.setStatus('FAILED');
                throw new Error(`AML: job ${environment.envId} is failed!`);
            case 'STOPPED':
            case 'STOPPING':
                environment.setStatus('USER_CANCELED');
                break;
            default:
                environment.setStatus('UNKNOWN');
        }
    }

    /**
     * Starts an environment by submitting it as an AML job.
     *
     * Wraps the environment command in a small Python script written to
     * `<experimentRootDir>/environment-temp/nni_script.py`, submits that
     * folder through a new {@link AMLClient}, and records the returned job id,
     * tracking URL and client on the environment.
     *
     * @param environment Environment to start; treated as an {@link AMLEnvironmentInformation}.
     */
    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
        const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
        const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp");
        // With `recursive: true` mkdir succeeds when the directory already
        // exists, so no separate (blocking) existence check is needed.
        await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true});

        if (amlEnvironment.useSharedStorage) {
            const sharedStorage = component.get<SharedStorageService>(SharedStorageService);
            const environmentRoot = sharedStorage.remoteWorkingRoot;
            const remoteMountCommand = sharedStorage.remoteMountCommand;
            // Mount the shared storage, switch to its working root, then run the command.
            amlEnvironment.command = `${remoteMountCommand} && cd ${environmentRoot} && ${amlEnvironment.command}`.replace(/"/g, `\\"`);
        } else {
            // Move `envs` under `outputs` and run the command from there.
            amlEnvironment.command = `mv envs outputs/envs && cd outputs && ${amlEnvironment.command}`;
        }
        // NOTE(review): the command is embedded in a single-quoted Python
        // string literal; a single quote or backslash inside it would change
        // the generated script. Confirm commands never contain them.
        amlEnvironment.command = `import os\nos.system('${amlEnvironment.command}')`;

        if (this.config.deprecated && this.config.deprecated.useActiveGpu !== undefined) {
            amlEnvironment.useActiveGpu = this.config.deprecated.useActiveGpu;
        }
        amlEnvironment.maxTrialNumberPerGpu = this.config.maxTrialNumberPerGpu;

        await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' });
        const amlClient = new AMLClient(
            this.config.subscriptionId,
            this.config.resourceGroup,
            this.config.workspaceName,
            this.experimentId,
            this.config.computeTarget,
            this.config.dockerImage,
            'nni_script.py',
            environmentLocalTempFolder
        );
        amlEnvironment.id = await amlClient.submit();
        this.log.debug('aml: before getTrackingUrl');
        amlEnvironment.trackingUrl = await amlClient.getTrackingUrl();
        this.log.debug('aml: after getTrackingUrl');
        // Keep the client so later status refreshes and stop requests can reach the job.
        amlEnvironment.amlClient = amlClient;
    }

    /**
     * Stops the AML job behind an environment.
     *
     * @param environment Environment to stop; treated as an {@link AMLEnvironmentInformation}.
     * @throws Error when the environment has no AML client (it was never started).
     */
    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
        const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
        const amlClient = amlEnvironment.amlClient;
        if (!amlClient) {
            throw new Error('AML client not initialized!');
        }
        // Await so that a failing stop request reaches the caller instead of
        // being dropped; harmless if stop() turns out to be synchronous.
        await amlClient.stop();
    }
}