// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

'use strict';

6
import { EventEmitter } from "events";
7
8
import { getLogger, Logger } from "../../common/log";
import { TrialJobStatus } from "../../common/trainingService";
9
import { GPUInfo } from "../../training_service/common/gpuData";
10
11
12
13
14
15
16
import { WebCommandChannel } from "./channels/webCommandChannel";
import { CommandChannel } from "./commandChannel";


/** Status values that an environment can hold. */
export type EnvironmentStatus =
    | 'UNKNOWN'
    | 'WAITING'
    | 'RUNNING'
    | 'SUCCEEDED'
    | 'FAILED'
    | 'USER_CANCELED';

/** Names of the communication channels that can be selected for the runner. */
export type Channel =
    | "web"
    | "file"
    | "aml"
    | "ut";

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

/**
 * GPU summary data of one machine, plus the GPU assigned status kept
 * alongside it.
 */
export class TrialGpuSummary {
    /** Number of GPUs on the machine. */
    public gpuCount: number;
    /** Timestamp at which the GPU summary data was queried. */
    public timestamp: string;
    /** GPU information, one element per GPU card. */
    public gpuInfos: GPUInfo[];
    /** GPU assigned status; every new summary starts with an empty map. */
    public assignedGpuIndexMap: Map<number, number> = new Map();

    /**
     * @param gpuCount GPU count on the machine
     * @param timestamp timestamp when the GPU summary data was queried
     * @param gpuInfos GPU information for each GPU card
     */
    constructor(gpuCount: number, timestamp: string, gpuInfos: GPUInfo[]) {
        this.gpuCount = gpuCount;
        this.timestamp = timestamp;
        this.gpuInfos = gpuInfos;
    }
}

35
/**
 * State kept for one environment: its identifiers, liveness flags, trial
 * counters, node map and the GPU summaries registered per node.
 */
export class EnvironmentInformation {
    // node id is 5 chars, so won't conflict.
    private readonly defaultNodeId = "default";
    private log: Logger;
    // true once the "no default gpu found" warning was logged; avoids repeating it on every read.
    private isNoGpuWarned: boolean = false;

    // key states
    // true: environment is running, waiting, or unknown.
    public isAlive: boolean = true;
    // true: Runner is initialized, and can receive trials.
    public isRunnerReady: boolean = false;
    // don't assign status directly, call setStatus so that the change gets logged.
    public status: EnvironmentStatus = "UNKNOWN";

    // running trial count of this environment; it is not modified inside this class.
    public runningTrialCount: number = 0;
    // used to count how many trials ran on this environment.
    // it can be used in many scenarios, but for now, it is used for reusable.
    public assignedTrialCount: number = 0;

    // NNI environment ID
    public id: string;
    // training platform unique job ID.
    public envId: string;
    // training platform job friendly name, in case it's different with job ID.
    public name: string;
    public trackingUrl: string = "";
    public workingFolder: string = "";
    public runnerWorkingFolder: string = "";
    public command: string = "";
    public nodeCount: number = 1;

    // it's used to aggregate node status for multiple node trial
    public nodes: Map<string, NodeInformation>;
    // GPU summaries keyed by node id; summaries without a node id are stored under defaultNodeId.
    public gpuSummaries: Map<string, TrialGpuSummary> = new Map<string, TrialGpuSummary>();

    // user can specify which gpus can be used by NNI.
    // it's usable for sharable environment like remote machine.
    public usableGpus?: number[];
    // user can specify how to use GPU resource for an environment, like local and remote.
    public maxTrialNumberPerGpu?: number;
    public useActiveGpu?: boolean;

    /**
     * @param id NNI environment ID
     * @param name training platform job friendly name
     * @param envId training platform job ID; falls back to `name` when omitted or empty
     */
    constructor(id: string, name: string, envId?: string) {
        this.log = getLogger();
        this.id = id;
        this.name = name;
        this.envId = envId ? envId : name;
        this.nodes = new Map<string, NodeInformation>();
    }

    /**
     * Updates the status and logs the transition. Nothing is logged when the
     * status stays the same.
     *
     * @param status the new status
     */
    public setStatus(status: EnvironmentStatus): void {
        if (this.status !== status) {
            this.log.info(`EnvironmentInformation: ${this.envId} change status from ${this.status} to ${status}.`)
            this.status = status;
        }
    }

    /**
     * Registers or refreshes the GPU summary of a node.
     *
     * The first summary of a node is stored as-is after its
     * assignedGpuIndexMap has been reset to an empty map. Later summaries only
     * refresh gpuCount, timestamp and gpuInfos of the stored object, so the
     * stored assignedGpuIndexMap is kept.
     *
     * @param nodeId node the summary belongs to; null/undefined selects the default node
     * @param newGpuSummary the summary to register or to copy fresh values from
     */
    public setGpuSummary(nodeId: string, newGpuSummary: TrialGpuSummary): void {
        if (nodeId === null || nodeId === undefined) {
            nodeId = this.defaultNodeId;
        }

        const originalGpuSummary = this.gpuSummaries.get(nodeId);
        if (undefined === originalGpuSummary) {
            newGpuSummary.assignedGpuIndexMap = new Map<number, number>();
            this.gpuSummaries.set(nodeId, newGpuSummary);
        } else {
            originalGpuSummary.gpuCount = newGpuSummary.gpuCount;
            originalGpuSummary.timestamp = newGpuSummary.timestamp;
            originalGpuSummary.gpuInfos = newGpuSummary.gpuInfos;
        }
    }

    /**
     * GPU summary stored for the default node, or undefined when there is none.
     *
     * A missing summary is reported with a warning once; the warning is armed
     * again after a default summary has been seen.
     */
    public get defaultGpuSummary(): TrialGpuSummary | undefined {
        const gpuSummary = this.gpuSummaries.get(this.defaultNodeId);
        if (gpuSummary === undefined) {
            if (false === this.isNoGpuWarned) {
                this.log.warning(`EnvironmentInformation: ${this.envId} no default gpu found. current gpu info ${this.describeGpuSummaries()}`);
                this.isNoGpuWarned = true;
            }
        } else {
            this.isNoGpuWarned = false;
        }
        return gpuSummary;
    }

    /**
     * Serializes gpuSummaries for log output.
     *
     * JSON.stringify renders a Map as "{}", so every Map (the summaries map
     * itself and any Map nested in a summary) is converted to an array of
     * [key, value] entries first.
     */
    private describeGpuSummaries(): string {
        return JSON.stringify(
            Array.from(this.gpuSummaries.entries()),
            (_key, value) => (value instanceof Map ? Array.from(value.entries()) : value),
        );
    }
}
SparkSnail's avatar
SparkSnail committed
122

123
124
125
126
127
128
/**
 * Base class of environment services. Subclasses implement the abstract
 * members; the getters and factory methods below provide defaults that a
 * subclass may override.
 */
export abstract class EnvironmentService {

    public abstract get hasStorageService(): boolean;
    public abstract config(key: string, value: string): Promise<void>;
    public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>;
    public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>;
    public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;

    // It is used to set the prefetched environment count. The default value is 0 for OpenPAI and AML mode;
    // in remote mode, this value is set to the length of the machine list.
    public get prefetchedEnvironmentCount(): number {
        return 0;
    }

    // It depends on environment pressure and settings.
    // For example, OpenPAI relies on API calls, and there is a limitation on frequency, so it needs to be bigger.
    public get environmentMaintenceLoopInterval(): number {
        return 5000;
    }

    // It's needed in two scenarios:
    // 1. The number of remote machines is fixed, so it can return false when all environments are assigned.
    // 2. There are consistent errors on requested environments, for example, authentication failure on platform.
    public get hasMoreEnvironments(): boolean {
        return true;
    }

    /**
     * Creates the command channel for this service; the default is a
     * WebCommandChannel built on the given emitter.
     *
     * @param commandEmitter emitter handed to the channel constructor
     */
    public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
        return new WebCommandChannel(commandEmitter);
    }

    /**
     * Creates the environment information object for a new environment.
     *
     * Both arguments are forwarded positionally to the EnvironmentInformation
     * constructor.
     *
     * @param envId first constructor argument
     * @param envName second constructor argument
     */
    public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation {
        return new EnvironmentInformation(envId, envName);
    }
}

J-shang's avatar
J-shang committed
159
/**
 * Status record of a single node.
 */
export class NodeInformation {
    /** Node id given at construction. */
    public id: string;
    /** Node status; starts as "UNKNOWN". */
    public status: TrialJobStatus = "UNKNOWN";
    /** Optional end time; left undefined by the constructor. */
    public endTime?: number;

    /**
     * @param id id of the node
     */
    constructor(id: string) {
        this.id = id;
    }
}

/**
 * Settings object for the runner; every field has a default value.
 */
export class RunnerSettings {
    public experimentId: string = "";
    public platform: string = "";
    public nniManagerIP: string = "";
    public nniManagerPort: number = 8081;
    public nniManagerVersion: string = "";
    public logCollection: string = "none";
    public command: string = "";
    public enableGpuCollector: boolean = true;

    // specify which communication channel is used by runner.
    // the value must be one of the Channel type: "web", "file", "aml" or "ut".
    public commandChannel: Channel = "file";
}