Unverified Commit cd05da66 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Add recently-idle environment scheduler in reuse mode (#3375)

parent aea82d71
......@@ -51,6 +51,8 @@ export class EnvironmentInformation {
// used to count how many trials run on this environment.
// it can be used in many scenarios, but for now it is only used for environment reuse.
public assignedTrialCount: number = 0;
// it is used to get environment idle time interval
public latestTrialReleasedTime: number = -1;
// NNI environment ID
public id: string;
......
......@@ -10,7 +10,7 @@ import { GPUInfo, ScheduleResultType } from '../common/gpuData';
import { EnvironmentInformation } from './environment';
import { TrialDetail } from './trial';
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin' | 'recently-idle';
export class GpuSchedulerSetting {
public useActiveGpu: boolean = false;
......@@ -30,7 +30,7 @@ export class GpuScheduler {
// private readonly machineExecutorMap: Set<TrialDetail>;
private readonly log: Logger = getLogger();
private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin';
private readonly policyName: SCHEDULE_POLICY_NAME = 'recently-idle';
private defaultSetting: GpuSchedulerSetting;
private roundRobinIndex: number = 0;
......@@ -101,6 +101,7 @@ export class GpuScheduler {
trial.environment.defaultGpuSummary !== undefined &&
trial.assignedGpus !== undefined &&
trial.assignedGpus.length > 0) {
for (const gpuInfo of trial.assignedGpus) {
const defaultGpuSummary = trial.environment.defaultGpuSummary;
const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
......@@ -190,10 +191,30 @@ export class GpuScheduler {
return randomSelect(qualifiedEnvironments);
} else if (this.policyName === 'round-robin') {
return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
} else if (this.policyName === 'recently-idle') {
return this.recentlyIdleSelect(qualifiedEnvironments, allEnvironments);
} else {
throw new Error(`Unsupported schedule policy: ${this.policyName}`);
}
}
/**
 * Picks the qualified environment whose last trial was released most recently.
 *
 * Environments whose `latestTrialReleasedTime` is not positive are skipped.
 * On a tie, the environment that appears first in `qualifiedEnvironments` wins.
 * The chosen environment's `latestTrialReleasedTime` is reset to -1 before it is returned.
 * When no qualified environment has a positive release time, selection is
 * delegated to `roundRobinSelect`.
 *
 * @param qualifiedEnvironments environments eligible for selection
 * @param allEnvironments passed through to `roundRobinSelect` on fallback
 * @returns the selected environment
 */
private recentlyIdleSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
    const currentTime = Date.now();
    let shortestIdle = Number.MAX_SAFE_INTEGER;
    let candidate: EnvironmentInformation | undefined;

    for (const env of qualifiedEnvironments) {
        const releasedAt = env.latestTrialReleasedTime;
        // Skip environments without a recorded (positive) release timestamp.
        if (!(releasedAt > 0)) {
            continue;
        }
        const idleFor = currentTime - releasedAt;
        // Strict comparison keeps the earliest-listed environment on equal idle times.
        if (idleFor < shortestIdle) {
            shortestIdle = idleFor;
            candidate = env;
        }
    }

    if (candidate !== undefined) {
        // Clear the timestamp so this environment is not picked again as "recently idle".
        candidate.latestTrialReleasedTime = -1;
        return candidate;
    }
    return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
}
private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
while (!qualifiedEnvironments.includes(allEnvironments[this.roundRobinIndex % allEnvironments.length])) {
......
......@@ -732,6 +732,7 @@ class TrialDispatcher implements TrainingService {
throw new Error(`TrialDispatcher: environment ${trial.environment.id} has no counted running trial!`);
}
trial.environment.runningTrialCount--;
trial.environment.latestTrialReleasedTime = Date.now();
trial.environment = undefined;
}
if (true === this.enableGpuScheduler) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment