Unverified commit 013adb1f, authored by SparkSnail and committed by GitHub
Browse files

fix remote connection logic (#2812)

* add trial job detail link

* refactor remote connection

* fix logic for gpuscheduler
parent e408e146
......@@ -87,6 +87,8 @@ class RemoteMachineTrainingService implements TrainingService {
this.log.info('ssh connection initialized!');
// set sshConnectionPromises to [] to avoid log information duplicated
this.sshConnectionPromises = [];
// initialize gpuScheduler
this.gpuScheduler = new GPUScheduler(this.machineExecutorManagerMap);
}
while (!this.stopping) {
while (this.jobQueue.length > 0) {
......@@ -310,7 +312,6 @@ class RemoteMachineTrainingService implements TrainingService {
break;
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
this.gpuScheduler = new GPUScheduler(this.machineExecutorManagerMap);
break;
case TrialConfigMetadataKey.TRIAL_CONFIG: {
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
......@@ -426,19 +427,19 @@ class RemoteMachineTrainingService implements TrainingService {
const rmMetaList: RemoteMachineMeta[] = <RemoteMachineMeta[]>JSON.parse(machineList);
for (const rmMeta of rmMetaList) {
rmMeta.occupiedGpuIndexMap = new Map<number, number>();
const executorManager: ExecutorManager = new ExecutorManager(rmMeta);
this.log.info(`connecting to ${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`);
const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
this.log.debug(`reached ${executor.name}`);
this.machineExecutorManagerMap.set(rmMeta, executorManager);
this.log.debug(`initializing ${executor.name}`);
this.sshConnectionPromises.push(this.initRemoteMachineOnConnected(rmMeta, executor));
this.log.info(`connecting to ${executor.name}`);
this.sshConnectionPromises.push(this.initRemoteMachineOnConnected(rmMeta));
}
}
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise<void> {
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta): Promise<void> {
rmMeta.occupiedGpuIndexMap = new Map<number, number>();
const executorManager: ExecutorManager = new ExecutorManager(rmMeta);
this.log.info(`connecting to ${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`);
const executor: ShellExecutor = await executorManager.getExecutor(this.initExecutorId);
this.log.debug(`reached ${executor.name}`);
this.machineExecutorManagerMap.set(rmMeta, executorManager);
this.log.debug(`initializing ${executor.name}`);
// Create root working directory after executor is ready
const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni');
await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId()));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment