Unverified Commit cd05da66 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Add recently-idle environment scheduler in reuse mode (#3375)

parent aea82d71
...@@ -51,6 +51,8 @@ export class EnvironmentInformation { ...@@ -51,6 +51,8 @@ export class EnvironmentInformation {
// uses to count how many trial runs on this environment. // uses to count how many trial runs on this environment.
// it can be used in many scenarios, but for now, it uses for reusable. // it can be used in many scenarios, but for now, it uses for reusable.
public assignedTrialCount: number = 0; public assignedTrialCount: number = 0;
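// Timestamp (from Date.now()) of the most recent trial release on this environment, used to compute how long it has been idle; -1 by default and after the scheduler selects the environment.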
public latestTrialReleasedTime: number = -1;
// NNI environment ID // NNI environment ID
public id: string; public id: string;
......
...@@ -10,7 +10,7 @@ import { GPUInfo, ScheduleResultType } from '../common/gpuData'; ...@@ -10,7 +10,7 @@ import { GPUInfo, ScheduleResultType } from '../common/gpuData';
import { EnvironmentInformation } from './environment'; import { EnvironmentInformation } from './environment';
import { TrialDetail } from './trial'; import { TrialDetail } from './trial';
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin'; type SCHEDULE_POLICY_NAME = 'random' | 'round-robin' | 'recently-idle';
export class GpuSchedulerSetting { export class GpuSchedulerSetting {
public useActiveGpu: boolean = false; public useActiveGpu: boolean = false;
...@@ -30,7 +30,7 @@ export class GpuScheduler { ...@@ -30,7 +30,7 @@ export class GpuScheduler {
// private readonly machineExecutorMap: Set<TrialDetail>; // private readonly machineExecutorMap: Set<TrialDetail>;
private readonly log: Logger = getLogger(); private readonly log: Logger = getLogger();
private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin'; private readonly policyName: SCHEDULE_POLICY_NAME = 'recently-idle';
private defaultSetting: GpuSchedulerSetting; private defaultSetting: GpuSchedulerSetting;
private roundRobinIndex: number = 0; private roundRobinIndex: number = 0;
...@@ -101,6 +101,7 @@ export class GpuScheduler { ...@@ -101,6 +101,7 @@ export class GpuScheduler {
trial.environment.defaultGpuSummary !== undefined && trial.environment.defaultGpuSummary !== undefined &&
trial.assignedGpus !== undefined && trial.assignedGpus !== undefined &&
trial.assignedGpus.length > 0) { trial.assignedGpus.length > 0) {
for (const gpuInfo of trial.assignedGpus) { for (const gpuInfo of trial.assignedGpus) {
const defaultGpuSummary = trial.environment.defaultGpuSummary; const defaultGpuSummary = trial.environment.defaultGpuSummary;
const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index); const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
...@@ -190,11 +191,31 @@ export class GpuScheduler { ...@@ -190,11 +191,31 @@ export class GpuScheduler {
return randomSelect(qualifiedEnvironments); return randomSelect(qualifiedEnvironments);
} else if (this.policyName === 'round-robin') { } else if (this.policyName === 'round-robin') {
return this.roundRobinSelect(qualifiedEnvironments, allEnvironments); return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
} else if (this.policyName === 'recently-idle') {
return this.recentlyIdleSelect(qualifiedEnvironments, allEnvironments);
} else { } else {
throw new Error(`Unsupported schedule policy: ${this.policyName}`); throw new Error(`Unsupported schedule policy: ${this.policyName}`);
} }
} }
/**
 * Chooses the qualified environment whose latest trial was released most recently.
 *
 * Only environments with a positive `latestTrialReleasedTime` take part in the
 * comparison. When none of the qualified environments has one, the decision is
 * handed over to `roundRobinSelect`. On ties the earlier entry in
 * `qualifiedEnvironments` wins.
 *
 * Side effect: the chosen environment's `latestTrialReleasedTime` is reset to -1
 * before it is returned.
 *
 * @param qualifiedEnvironments environments that may be selected
 * @param allEnvironments forwarded unchanged to `roundRobinSelect` for the fallback
 * @returns the selected environment
 */
private recentlyIdleSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
    // Take one timestamp up front so every candidate is measured against the same instant.
    const currentTime = Date.now();
    let shortestIdle = Number.MAX_SAFE_INTEGER;
    let candidate: EnvironmentInformation | undefined = undefined;

    for (const env of qualifiedEnvironments) {
        const releasedAt = env.latestTrialReleasedTime;
        if (releasedAt > 0) {
            const idleFor = currentTime - releasedAt;
            // Strict comparison: the first environment with the smallest idle interval is kept.
            if (idleFor < shortestIdle) {
                shortestIdle = idleFor;
                candidate = env;
            }
        }
    }

    if (candidate !== undefined) {
        candidate.latestTrialReleasedTime = -1;
        return candidate;
    }

    // No qualified environment carries a release timestamp: fall back to round robin.
    return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
}
private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation { private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
while (!qualifiedEnvironments.includes(allEnvironments[this.roundRobinIndex % allEnvironments.length])) { while (!qualifiedEnvironments.includes(allEnvironments[this.roundRobinIndex % allEnvironments.length])) {
this.roundRobinIndex++; this.roundRobinIndex++;
......
...@@ -732,6 +732,7 @@ class TrialDispatcher implements TrainingService { ...@@ -732,6 +732,7 @@ class TrialDispatcher implements TrainingService {
throw new Error(`TrialDispatcher: environment ${trial.environment.id} has no counted running trial!`); throw new Error(`TrialDispatcher: environment ${trial.environment.id} has no counted running trial!`);
} }
trial.environment.runningTrialCount--; trial.environment.runningTrialCount--;
trial.environment.latestTrialReleasedTime = Date.now();
trial.environment = undefined; trial.environment = undefined;
} }
if (true === this.enableGpuScheduler) { if (true === this.enableGpuScheduler) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment