Unverified commit 4b1961e2, authored by SparkSnail and committed by GitHub
Browse files

Fix remoteTrainingService gpuScheduler (#1749)

parent 5845ca04
...@@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService {
private readonly expRootDir: string; private readonly expRootDir: string;
private readonly remoteExpRootDir: string; private readonly remoteExpRootDir: string;
private trialConfig: TrialConfig | undefined; private trialConfig: TrialConfig | undefined;
private readonly gpuScheduler: GPUScheduler; private gpuScheduler?: GPUScheduler;
private readonly jobQueue: string[]; private readonly jobQueue: string[];
private readonly timer: ObservableTimer; private readonly timer: ObservableTimer;
private stopping: boolean = false; private stopping: boolean = false;
...@@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService {
this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>(); this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
this.trialSSHClientMap = new Map<string, Client>(); this.trialSSHClientMap = new Map<string, Client>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, SSHClientManager>(); this.machineSSHClientMap = new Map<RemoteMachineMeta, SSHClientManager>();
this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap);
this.jobQueue = []; this.jobQueue = [];
this.expRootDir = getExperimentRootDir(); this.expRootDir = getExperimentRootDir();
this.remoteExpRootDir = this.getRemoteExperimentRootDir(); this.remoteExpRootDir = this.getRemoteExperimentRootDir();
...@@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService {
break; break;
case TrialConfigMetadataKey.MACHINE_LIST: case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value); await this.setupConnections(value);
this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap);
break; break;
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value); const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
...@@ -397,9 +397,11 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -397,9 +397,11 @@ class RemoteMachineTrainingService implements TrainingService {
* remove gpu reservation when job is not running * remove gpu reservation when job is not running
*/ */
private updateGpuReservation(): void { private updateGpuReservation(): void {
for (const [key, value] of this.trialJobsMap) { if (this.gpuScheduler) {
if (!['WAITING', 'RUNNING'].includes(value.status)) { for (const [key, value] of this.trialJobsMap) {
this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap); if (!['WAITING', 'RUNNING'].includes(value.status)) {
this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
}
} }
} }
} }
...@@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService {
if (this.trialConfig === undefined) { if (this.trialConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
if (this.gpuScheduler === undefined) {
throw new Error('gpuScheduler is not initialized');
}
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) { if (trialJobDetail === undefined) {
throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`); throw new NNIError(NNIErrorNames.INVALID_JOB_DETAIL, `Invalid job detail information for trial job ${trialJobId}`);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment