environment.ts 7.38 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
5
6
import { getLogger, Logger } from "common/log";
import { TrialJobStatus } from "common/trainingService";
import { GPUInfo } from "training_service/common/gpuData";
7
import { CommandChannel } from "./commandChannel";
8
9
import { WebCommandChannel } from './channels/webCommandChannel';
import { EventEmitter } from "events";
10
11
12
13
14


// Status values an environment can hold; this is the type of EnvironmentInformation.status.
export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED';
// Names of the command channels a runner can be configured with; this is the type of RunnerSettings.commandChannel.
export type Channel = "web" | "file" | "aml" | "ut";

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

/**
 * Snapshot of the GPU state reported for one machine, together with the
 * bookkeeping map that records which GPUs have been assigned.
 */
export class TrialGpuSummary {
    /** GPU count on the machine. */
    public gpuCount: number;
    /** Timestamp at which this GPU summary was queried. */
    public timestamp: string;
    /** Per-card GPU information, one entry for each GPU. */
    public gpuInfos: GPUInfo[];
    /** GPU assignment status; starts out empty. */
    public assignedGpuIndexMap: Map<number, number> = new Map();

    /**
     * @param gpuCount GPU count on the machine.
     * @param timestamp Timestamp at which the summary was queried.
     * @param gpuInfos GPU information for each card.
     */
    constructor(gpuCount: number, timestamp: string, gpuInfos: GPUInfo[]) {
        this.gpuCount = gpuCount;
        this.timestamp = timestamp;
        this.gpuInfos = gpuInfos;
    }
}

33
/**
 * Bookkeeping record for a single environment: its identifiers, status flags,
 * trial counters, per-node GPU summaries and optional GPU usage settings.
 */
export class EnvironmentInformation {
    // Key under which a GPU summary is stored when no node id is supplied.
    // node id is 5 chars, so won't conflict.
    private readonly defaultNodeId = "default";
    private log: Logger;
    // Set after the "no default gpu found" warning has been logged, so that
    // repeated reads of defaultGpuSummary do not log it again.
    private isNoGpuWarned: boolean = false;

    // key states
    // true: environment is running, waiting, or unknown.
    public isAlive: boolean = true;
    // true: Runner is initialized, and can receive trials.
    public isRunnerReady: boolean = false;
    // don't set status in environment directly, use setFinalState function to set a final state.
    // NOTE(review): setFinalState is not defined in this class; only setStatus is. Confirm where it lives.
    public status: EnvironmentStatus = "UNKNOWN";

    // Trial counter; judging by the name, the number of trials currently running on
    // this environment. This class never modifies it, so callers maintain it.
    public runningTrialCount: number = 0;
    // used to count how many trials have run on this environment.
    // it can be used in many scenarios, but for now it is used for reusable environments.
    public assignedTrialCount: number = 0;
    // it is used to get environment idle time interval; -1 is the initial value.
    public latestTrialReleasedTime: number = -1;

    // NNI environment ID
    public id: string;
    // training platform unique job ID.
    public envId: string;
    // training platform job friendly name, in case it's different with job ID.
    public name: string;
    public trackingUrl: string = "";
    public workingFolder: string = "";
    public runnerWorkingFolder: string = "";
    public command: string = "";
    public nodeCount: number = 1;

    // it's used to aggregate node status for multiple node trial
    public nodes: Map<string, NodeInformation>;
    // GPU summaries keyed by node id (defaultNodeId when no node id was given).
    public gpuSummaries: Map<string, TrialGpuSummary> = new Map<string, TrialGpuSummary>();

    // user can specify which gpus can be used by NNI.
    // it's usable for sharable environment like remote machine.
    public usableGpus?: number[];
    // user can specify how to use GPU resource for an environment, like local and remote.
    public maxTrialNumberPerGpu?: number;
    public useActiveGpu?: boolean;

    public environmentService?: EnvironmentService;

    public useSharedStorage?: boolean;

    /**
     * @param id NNI environment ID.
     * @param name Friendly name of the training platform job.
     * @param envId Training platform job ID. When omitted (or an empty string),
     *              `name` is used as the job ID instead.
     */
    constructor(id: string, name: string, envId?: string) {
        this.log = getLogger('EnvironmentInformation');
        this.id = id;
        this.name = name;
        this.envId = envId ? envId : name;
        this.nodes = new Map<string, NodeInformation>();
    }

    /**
     * Updates `status`, logging the transition. Does nothing (and logs nothing)
     * when the new status equals the current one.
     */
    public setStatus(status: EnvironmentStatus): void {
        if (this.status !== status) {
            this.log.info(`EnvironmentInformation: ${this.envId} change status from ${this.status} to ${status}.`);
            this.status = status;
        }
    }

    /**
     * Records the latest GPU summary for a node.
     *
     * The first summary seen for a node is stored as-is, except that its
     * `assignedGpuIndexMap` is replaced with a fresh empty map (this mutates the
     * object passed in). Later summaries only refresh `gpuCount`, `timestamp`
     * and `gpuInfos` on the stored object, so the stored assignment map survives.
     *
     * @param nodeId Node the summary belongs to; null/undefined selects the default node.
     * @param newGpuSummary Freshly collected GPU summary.
     */
    public setGpuSummary(nodeId: string, newGpuSummary: TrialGpuSummary): void {
        // Callers may pass null/undefined at runtime despite the declared type.
        if (nodeId === null || nodeId === undefined) {
            nodeId = this.defaultNodeId;
        }

        const originalGpuSummary = this.gpuSummaries.get(nodeId);
        if (originalGpuSummary === undefined) {
            newGpuSummary.assignedGpuIndexMap = new Map<number, number>();
            this.gpuSummaries.set(nodeId, newGpuSummary);
        } else {
            originalGpuSummary.gpuCount = newGpuSummary.gpuCount;
            originalGpuSummary.timestamp = newGpuSummary.timestamp;
            originalGpuSummary.gpuInfos = newGpuSummary.gpuInfos;
        }
    }

    /**
     * GPU summary stored under the default node id, or undefined when there is none.
     *
     * Reading this property has a side effect: the first miss logs a warning,
     * and further misses stay silent until a summary has been found again.
     */
    public get defaultGpuSummary(): TrialGpuSummary | undefined {
        const gpuSummary = this.gpuSummaries.get(this.defaultNodeId);
        if (gpuSummary === undefined) {
            if (!this.isNoGpuWarned) {
                this.log.warning(`EnvironmentInformation: ${this.envId} no default gpu found. current gpu info`, this.gpuSummaries);
                this.isNoGpuWarned = true;
            }
        } else {
            // Re-arm the warning so a later disappearance is reported again.
            this.isNoGpuWarned = false;
        }
        return gpuSummary;
    }
}
SparkSnail's avatar
SparkSnail committed
126

127
128
/**
 * Base class for the services that create, monitor and stop environments.
 * Subclasses implement the abstract members; the remaining members provide
 * defaults that a subclass may override.
 */
export abstract class EnvironmentService {

    /** Optional asynchronous setup hook; the base implementation does nothing. */
    public async init(): Promise<void> {
        return;
    }

    public abstract get hasStorageService(): boolean;
    public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>;
    public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>;
    public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;

    // Command channel set by initCommandChannel; undefined until then.
    // NOTE(review): the previous comment said "Make public for ut", but the member is
    // declared protected. Outside code reads it through the getCommandChannel accessor.
    protected commandChannel: CommandChannel | undefined;

    // It is used to set prefetched environment count. The default value is 0 for OpenPAI and AML mode;
    // in remote mode, this value is set to the length of the machine list.
    public get prefetchedEnvironmentCount(): number {
        return 0;
    }

    public abstract get getName(): string;

    // Initialize command channel, use WebCommandChannel as default command channel
    public initCommandChannel(eventEmitter: EventEmitter): void {
        this.commandChannel = WebCommandChannel.getInstance(eventEmitter);
    }

    /**
     * The initialized command channel.
     * @throws Error when no command channel has been set on this service yet.
     */
    public get getCommandChannel(): CommandChannel {
        if (this.commandChannel === undefined) {
            throw new Error("Command channel not initialized!");
        }
        return this.commandChannel;
    }

    // It depends on environment pressure and settings.
    // For example, OpenPAI relies on API calls, and there is a limit on call frequency, so it needs to be bigger.
    // NOTE(review): the unit is not stated here; 5000 is presumably milliseconds. Confirm against the caller.
    public get environmentMaintenceLoopInterval(): number {
        return 5000;
    }

    // it's needed in two scenarios
    // 1. remote machines have a fixed number, so it can return false when all environments are assigned.
    // 2. If there are consistent errors on requested environments, for example, authentication failure on platform.
    public get hasMoreEnvironments(): boolean {
        return true;
    }

    /**
     * Factory for the EnvironmentInformation objects used with this service.
     *
     * NOTE(review): `envId` is passed as the constructor's first (`id`) argument and no
     * third argument is given, so the created object's `envId` field falls back to
     * `envName`. Confirm this is intended.
     */
    public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation {
        return new EnvironmentInformation(envId, envName);
    }
}

J-shang's avatar
J-shang committed
178
/**
 * Status record for one node of an environment; instances are the values of
 * EnvironmentInformation.nodes.
 */
export class NodeInformation {
    // Node identifier.
    public id: string;
    // Job status of this node; "UNKNOWN" until something sets it.
    public status: TrialJobStatus = "UNKNOWN";
    // End time of the node; left unset by this class.
    public endTime?: number;

    /**
     * @param id Node identifier.
     */
    constructor(id: string) {
        this.id = id;
    }
}

/**
 * Plain settings object for a runner. Every field has a default, so a new
 * instance can be filled in field by field.
 */
export class RunnerSettings {
    public experimentId: string = "";
    public platform: string = "";
    public nniManagerIP: string = "";
    public nniManagerPort: number = 8081;
    public nniManagerVersion: string = "";
    public logCollection: string = "none";
    public command: string = "";
    public enableGpuCollector: boolean = true;

    // specify which communication channel is used by runner.
    // Allowed values are the members of the Channel type; the default is "file".
    public commandChannel: Channel = "file";
}