"torchvision/tv_tensors/_bounding_box.py" did not exist on "be798eff3f1255f647d56b7c245bdd8f4652e0a7"
gpuScheduler.ts 10.5 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import * as assert from 'assert';
Deshui Yu's avatar
Deshui Yu committed
7
import { getLogger, Logger } from '../../common/log';
8
import { randomSelect } from '../../common/utils';
9
10
import { RemoteMachineConfig } from '../../common/experimentConfig';
import { GPUInfo, ScheduleResultType } from '../common/gpuData';
11
import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';
Deshui Yu's avatar
Deshui Yu committed
12

chicm-ms's avatar
chicm-ms committed
13
14
// Names of the supported policies for picking one machine out of several candidates.
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

Deshui Yu's avatar
Deshui Yu committed
15
16
17
18
19
/**
 * A simple GPU scheduler implementation.
 *
 * Chooses a remote machine (and, when GPUs are requested, concrete GPUs on it) for a
 * trial job, and records per-GPU reservations in each machine's `occupiedGpuIndexMap`.
 */
export class GPUScheduler {

    /** Live map from remote machine configuration to the executor manager of that machine. */
    private readonly machineExecutorMap: Map<RemoteMachineConfig, ExecutorManager>;
    private readonly log: Logger = getLogger('GPUScheduler');
    /** Policy used by `selectMachine`. */
    private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin';
    /** Position in `configuredRMs` at which the next round-robin scan starts. */
    private roundRobinIndex: number = 0;
    /** Ordered snapshot of machine metadata that defines the round-robin rotation order. */
    private configuredRMs: RemoteMachineMeta[] = [];

    /**
     * Constructor
     * @param machineExecutorMap map from remote machine to executor; must not be empty
     */
    constructor(machineExecutorMap: Map<RemoteMachineConfig, ExecutorManager>) {
        assert(machineExecutorMap.size > 0);
        this.machineExecutorMap = machineExecutorMap;
        this.configuredRMs = Array.from(
            machineExecutorMap.values(),
            (manager: ExecutorManager) => manager.rmMeta
        );
    }

    /**
     * Schedule a machine according to the constraints (requiredGPUNum).
     * On success the chosen machine and GPUs are also written onto `trialJobDetail`.
     * @param requiredGPUNum required GPU number; `undefined` is treated as 0
     * @param trialJobDetail trial job to place
     * @returns SUCCEED with the schedule info, REQUIRE_EXCEED_TOTAL when no machine could
     *          ever satisfy the request, or TMP_NO_AVAILABLE_GPU when nothing is free right now
     */
    public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
        const gpuNum: number = requiredGPUNum === undefined ? 0 : requiredGPUNum;
        assert(gpuNum >= 0);
        const allRMs: RemoteMachineMeta[] = Array.from(
            this.machineExecutorMap.values(),
            (manager: ExecutorManager) => manager.rmMeta
        );
        assert(allRMs.length > 0);

        // Step 1: Check that the required GPU number does not exceed the GPU count of every machine.
        // A machine without a GPU summary is counted as eligible here.
        const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
            rmMeta.gpuSummary === undefined || gpuNum === 0 || rmMeta.gpuSummary.gpuCount >= gpuNum);
        if (eligibleRM.length === 0) {
            // The request exceeds the GPU count of every machine: report REQUIRE_EXCEED_TOTAL directly.
            return {
                resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
                scheduleInfo: undefined
            };
        }

        // Step 2: Allocate host/GPU for the specified trial job.
        if (gpuNum === 0) {
            // Trial job does not need GPU: any machine will do.
            return this.allocateHost(gpuNum, this.selectMachine(allRMs), [], trialJobDetail);
        }

        // Trial job requires GPU.
        const gpuResult: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(gpuNum, trialJobDetail);
        if (gpuResult !== undefined) {
            return gpuResult;
        }
        this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

        return {
            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
            scheduleInfo: undefined
        };
    }

    /**
     * Remove the job's GPU reservation.
     * Decrements the occupancy counter of every GPU held by the job (deleting the entry when
     * it would reach zero), then clears the job's GPU list and stores the job back in the map.
     * @param trialJobId id of the trial job whose GPUs are released
     * @param trialJobMap map from trial job id to trial job detail
     * @throws Error when `trialJobId` is not present in `trialJobMap`
     */
    public removeGpuReservation(trialJobId: string, trialJobMap: Map<string, RemoteMachineTrialJobDetail>): void {
        const trialJobDetail: RemoteMachineTrialJobDetail | undefined = trialJobMap.get(trialJobId);
        if (trialJobDetail === undefined) {
            throw new Error(`could not get trialJobDetail by id ${trialJobId}`);
        }

        const rmMeta: RemoteMachineMeta | undefined = trialJobDetail.rmMeta;
        const reservedGpus: GPUInfo[] | undefined = trialJobDetail.gpuIndices;
        if (rmMeta !== undefined && rmMeta.occupiedGpuIndexMap !== undefined && reservedGpus !== undefined) {
            for (const gpuInfo of reservedGpus) {
                const occupancy: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
                if (occupancy === undefined) {
                    continue;
                }
                if (occupancy === 1) {
                    rmMeta.occupiedGpuIndexMap.delete(gpuInfo.index);
                } else {
                    rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, occupancy - 1);
                }
            }
        }
        trialJobDetail.gpuIndices = [];
        trialJobMap.set(trialJobId, trialJobDetail);
    }

    /**
     * Try to place a GPU-requiring trial job on a machine with enough available GPUs.
     * @param requiredGPUNum required GPU number
     * @param trialJobDetail trial job to place
     * @returns the successful schedule result, or `undefined` when no machine currently has
     *          at least `requiredGPUNum` available GPUs
     */
    private scheduleGPUHost(requiredGPUNum: number, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult | undefined {
        const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = this.gpuResourceDetection();
        const qualifiedRMs: RemoteMachineMeta[] = [];
        totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => {
            if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) {
                qualifiedRMs.push(rmMeta);
            }
        });

        if (qualifiedRMs.length > 0) {
            const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs);
            const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm);
            if (gpuInfos !== undefined) { // should always be true: the machine came from the map's keys
                return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobDetail);
            }
            assert(false, 'gpuInfos is undefined');
        }

        return undefined;
    }

    /**
     * Detect the currently available GPUs of every remote machine that has a GPU summary.
     *
     * A GPU is considered available when it is allowed by the machine's `gpuIndices`
     * configuration (or no such restriction is configured) and either
     *  - it is not reserved by any trial and (`useActiveGpu` is set or it has no active process), or
     *  - it is reserved by fewer trials than `maxTrialNumberPerGpu`.
     *
     * @returns map from machine metadata to that machine's available GPUs; machines without
     *          a GPU summary are omitted
     * @throws Error when a configured GPU index is not below the machine's GPU count, or when
     *         a machine's `occupiedGpuIndexMap` is undefined
     */
    private gpuResourceDetection(): Map<RemoteMachineMeta, GPUInfo[]> {
        const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = new Map<RemoteMachineMeta, GPUInfo[]>();
        this.machineExecutorMap.forEach((executorManager: ExecutorManager, machineConfig: RemoteMachineConfig) => {
            const rmMeta = executorManager.rmMeta;
            if (rmMeta.gpuSummary === undefined) {
                return;
            }

            const designatedGpuIndices: number[] | undefined = machineConfig.gpuIndices;
            if (designatedGpuIndices !== undefined) {
                for (const gpuIndex of designatedGpuIndices) {
                    if (gpuIndex >= rmMeta.gpuSummary.gpuCount) {
                        throw new Error(`Specified GPU index not found: ${gpuIndex}`);
                    }
                }
            }
            this.log.debug(`designated gpu indices: ${designatedGpuIndices}`);

            const availableGPUs: GPUInfo[] = [];
            rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
                // Skip GPUs excluded by the gpuIndices configuration of this machine.
                if (designatedGpuIndices !== undefined && !designatedGpuIndices.includes(gpuInfo.index)) {
                    return;
                }
                if (rmMeta.occupiedGpuIndexMap === undefined) {
                    throw new Error(`occupiedGpuIndexMap initialize error!`);
                }
                const occupancy: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
                // Unreserved GPU: usable if idle, or regardless of activity when useActiveGpu is set.
                // Reserved GPU: usable while the trial count is below maxTrialNumberPerGpu.
                const usable: boolean = occupancy === undefined
                    ? Boolean(machineConfig.useActiveGpu || gpuInfo.activeProcessNum === 0)
                    : occupancy < machineConfig.maxTrialNumberPerGpu;
                if (usable) {
                    availableGPUs.push(gpuInfo);
                }
            });
            totalResourceMap.set(rmMeta, availableGPUs);
        });

        return totalResourceMap;
    }

    /**
     * Pick one machine from a non-empty candidate list according to `policyName`.
     * @param rmMetas candidate machines
     * @throws Error when the policy name is not supported
     */
    private selectMachine(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta {
        assert(rmMetas !== undefined && rmMetas.length > 0);

        if (this.policyName === 'random') {
            return randomSelect(rmMetas);
        } else if (this.policyName === 'round-robin') {
            return this.roundRobinSelect(rmMetas);
        } else {
            throw new Error(`Unsupported schedule policy: ${this.policyName}`);
        }
    }

    /**
     * Pick the next candidate in rotation order.
     *
     * The rotation order is the `configuredRMs` snapshot. If no candidate is part of the
     * snapshot (the executor map was modified after the snapshot was taken), the snapshot is
     * rebuilt from the live map and the scan is retried once; as a last resort the first
     * candidate is returned. The scan is bounded, so this method always terminates.
     * @param rmMetas non-empty list of candidate machines
     */
    private roundRobinSelect(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta {
        let selected: RemoteMachineMeta | undefined = this.nextConfiguredCandidate(rmMetas);
        if (selected === undefined) {
            this.configuredRMs = Array.from(
                this.machineExecutorMap.values(),
                (manager: ExecutorManager) => manager.rmMeta
            );
            this.roundRobinIndex = 0;
            selected = this.nextConfiguredCandidate(rmMetas);
        }

        return selected !== undefined ? selected : rmMetas[0];
    }

    /**
     * Scan `configuredRMs` once, starting at `roundRobinIndex`, for the first machine that is
     * also a candidate, and advance `roundRobinIndex` just past it.
     * @param rmMetas candidate machines
     * @returns the selected machine, or `undefined` when no candidate is in `configuredRMs`
     */
    private nextConfiguredCandidate(rmMetas: RemoteMachineMeta[]): RemoteMachineMeta | undefined {
        const total: number = this.configuredRMs.length;
        for (let step = 0; step < total; step++) {
            const position: number = (this.roundRobinIndex + step) % total;
            const candidate: RemoteMachineMeta = this.configuredRMs[position];
            if (rmMetas.includes(candidate)) {
                // Keep the index within [0, total) so it cannot grow without bound.
                this.roundRobinIndex = (position + 1) % total;

                return candidate;
            }
        }

        return undefined;
    }

    /**
     * Choose which of the available GPUs a trial gets.
     * @param gpuInfos available GPUs
     * @param requiredGPUNum number of GPUs to take
     * @returns the first `requiredGPUNum` entries of `gpuInfos`
     */
    private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] {
        // Sequentially allocate GPUs
        return gpuInfos.slice(0, requiredGPUNum);
    }

    /**
     * Reserve GPUs on the given machine for the trial job and build the SUCCEED result.
     * Increments the occupancy counter of every allocated GPU and records the allocation
     * (`gpuIndices`, `rmMeta`) on `trialJobDetail`.
     * @param requiredGPUNum number of GPUs to reserve
     * @param rmMeta machine to allocate on
     * @param gpuInfos available GPUs on that machine; must contain at least `requiredGPUNum` entries
     * @param trialJobDetail trial job receiving the allocation
     * @throws Error when GPUs are allocated but the machine's `occupiedGpuIndexMap` is undefined
     */
    private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
        gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
        assert(gpuInfos.length >= requiredGPUNum);
        const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
        for (const gpuInfo of allocatedGPUs) {
            if (rmMeta.occupiedGpuIndexMap === undefined) {
                throw new Error(`Machine ${rmMeta.config.host} occupiedGpuIndexMap initialize error!`);
            }
            const occupancy: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
            rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, (occupancy === undefined ? 0 : occupancy) + 1);
        }
        trialJobDetail.gpuIndices = allocatedGPUs;
        trialJobDetail.rmMeta = rmMeta;

        return {
            resultType: ScheduleResultType.SUCCEED,
            scheduleInfo: {
                rmMeta: rmMeta,
                // Comma-separated GPU indices, e.g. "0,1"; empty string when no GPU is allocated.
                cudaVisibleDevice: allocatedGPUs
                    .map((gpuInfo: GPUInfo) => gpuInfo.index)
                    .join(',')
            }
        };
    }
}