"...composable_kernel.git" did not exist on "309b1c64618c714b7f47ceb038bba68b61fa4e4e"
Unverified Commit de9e2842 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Fix ssh connection error (#829)

An SSH client has a maximum number of open channels per connection. If trialConcurrency is set too high, our ssh client executes commands over the same connection too frequently, and we hit the error `Error: (SSH) Channel open failure: open failed`.
Refactor the code so that each connection has a maximum trial concurrency; when the number of trials reaches the per-connection limit, a new ssh connection is created to execute trial commands.
parent 7d91796c
...@@ -24,21 +24,21 @@ import { Client } from 'ssh2'; ...@@ -24,21 +24,21 @@ import { Client } from 'ssh2';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils'; import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData'; import { GPUInfo } from '../common/gpuData';
import { RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType } from './remoteMachineData'; import { RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
/** /**
* A simple GPU scheduler implementation * A simple GPU scheduler implementation
*/ */
export class GPUScheduler { export class GPUScheduler {
private readonly machineSSHClientMap : Map<RemoteMachineMeta, Client>; private readonly machineSSHClientMap : Map<RemoteMachineMeta, SSHClientManager>;
private log: Logger = getLogger(); private log: Logger = getLogger();
/** /**
* Constructor * Constructor
* @param machineSSHClientMap map from remote machine to sshClient * @param machineSSHClientMap map from remote machine to sshClient
*/ */
constructor(machineSSHClientMap : Map<RemoteMachineMeta, Client>) { constructor(machineSSHClientMap : Map<RemoteMachineMeta, SSHClientManager>) {
this.machineSSHClientMap = machineSSHClientMap; this.machineSSHClientMap = machineSSHClientMap;
} }
...@@ -113,7 +113,7 @@ export class GPUScheduler { ...@@ -113,7 +113,7 @@ export class GPUScheduler {
*/ */
private gpuResourceDetection() : Map<RemoteMachineMeta, GPUInfo[]> { private gpuResourceDetection() : Map<RemoteMachineMeta, GPUInfo[]> {
const totalResourceMap : Map<RemoteMachineMeta, GPUInfo[]> = new Map<RemoteMachineMeta, GPUInfo[]>(); const totalResourceMap : Map<RemoteMachineMeta, GPUInfo[]> = new Map<RemoteMachineMeta, GPUInfo[]>();
this.machineSSHClientMap.forEach((client: Client, rmMeta: RemoteMachineMeta) => { this.machineSSHClientMap.forEach((sshClientManager: SSHClientManager, rmMeta: RemoteMachineMeta) => {
// Assgin totoal GPU count as init available GPU number // Assgin totoal GPU count as init available GPU number
if (rmMeta.gpuSummary !== undefined) { if (rmMeta.gpuSummary !== undefined) {
const availableGPUs: GPUInfo[] = []; const availableGPUs: GPUInfo[] = [];
......
...@@ -21,6 +21,9 @@ ...@@ -21,6 +21,9 @@
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary } from '../common/gpuData';
import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred';
import * as fs from 'fs';
/** /**
...@@ -94,6 +97,138 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { ...@@ -94,6 +97,138 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
} }
} }
/**
 * One ssh2 connection to a remote machine, paired with a counter of how many
 * consumers are currently sharing it.
 *
 * The counter is pure bookkeeping: this class never opens or closes the
 * underlying connection itself.
 */
export class SSHClient {
    private readonly sshClient: Client;
    // Number of consumers currently holding this connection.
    private usedConnectionNumber: number;

    /**
     * @param sshClient the already-created ssh2 client to wrap
     * @param usedConnectionNumber initial value of the usage counter
     */
    constructor(sshClient: Client, usedConnectionNumber: number) {
        this.sshClient = sshClient;
        this.usedConnectionNumber = usedConnectionNumber;
    }

    /** The wrapped ssh2 client (property accessor, not a method). */
    public get getSSHClientInstance(): Client {
        return this.sshClient;
    }

    /** Current value of the usage counter (property accessor, not a method). */
    public get getUsedConnectionNumber(): number {
        return this.usedConnectionNumber;
    }

    /** Record one more consumer on this connection. */
    public addUsedConnectionNumber(): void {
        this.usedConnectionNumber += 1;
    }

    /**
     * Record that one consumer released this connection.
     *
     * The counter is clamped at zero: a duplicated release must not push it
     * negative, otherwise the connection would later be handed out to more
     * consumers than the per-connection limit allows.
     */
    public minusUsedConnectionNumber(): void {
        if (this.usedConnectionNumber > 0) {
            this.usedConnectionNumber -= 1;
        }
    }
}
/**
 * Pool of ssh connections to a single remote machine.
 *
 * Each pooled connection is shared by at most `maxTrialNumberPerConnection`
 * consumers; once every pooled connection is at that limit a new connection
 * is opened on demand.
 */
export class SSHClientManager {
    private readonly sshClientArray: SSHClient[];
    private readonly maxTrialNumberPerConnection: number;
    private readonly rmMeta: RemoteMachineMeta;

    /**
     * @param sshClientArray initial pool (may be empty)
     * @param maxTrialNumberPerConnection usage limit per pooled connection
     * @param rmMeta connection settings (ip, port, username, credentials) of the machine
     */
    constructor(sshClientArray: SSHClient[], maxTrialNumberPerConnection: number, rmMeta: RemoteMachineMeta) {
        this.rmMeta = rmMeta;
        this.sshClientArray = sshClientArray;
        this.maxTrialNumberPerConnection = maxTrialNumberPerConnection;
    }

    /**
     * Open a new ssh connection to the machine and add it to the pool once it is ready.
     *
     * @returns a promise resolved with the connected client, or rejected when the
     *          credentials are unusable or the connection reports an error
     */
    private initNewSSHClient(): Promise<Client> {
        const deferred: Deferred<Client> = new Deferred<Client>();
        const connectConfig: ConnectConfig = {
            host: this.rmMeta.ip,
            port: this.rmMeta.port,
            username: this.rmMeta.username
        };

        if (this.rmMeta.passwd) {
            connectConfig.password = this.rmMeta.passwd;
        } else if (this.rmMeta.sshKeyPath) {
            if (!fs.existsSync(this.rmMeta.sshKeyPath)) {
                // SSH key path is not a valid file: reject and stop, do not try to read it
                deferred.reject(new Error(`${this.rmMeta.sshKeyPath} does not exist.`));
                return deferred.promise;
            }
            try {
                connectConfig.privateKey = fs.readFileSync(this.rmMeta.sshKeyPath, 'utf8');
            } catch (err) {
                // e.g. the file vanished or is unreadable: surface it as a rejection
                deferred.reject(err instanceof Error ? err : new Error(String(err)));
                return deferred.promise;
            }
            connectConfig.passphrase = this.rmMeta.passphrase;
        } else {
            // Nothing to authenticate with: reject and stop, do not attempt to connect
            deferred.reject(new Error(`No valid passwd or sshKeyPath is configed.`));
            return deferred.promise;
        }

        const conn: Client = new Client();
        conn.on('ready', () => {
            this.addNewSSHClient(conn);
            deferred.resolve(conn);
        }).on('error', (err: Error) => {
            // SSH connection error, reject with error message
            deferred.reject(new Error(err.message));
        }).connect(connectConfig);

        return deferred.promise;
    }

    /**
     * Reserve a connection for one more consumer.
     *
     * Returns the first pooled connection that is still below the usage limit
     * (incrementing its counter). If none qualifies, a new connection is opened;
     * it enters the pool with a counter of 1.
     */
    public async getAvailableSSHClient(): Promise<Client> {
        for (const pooled of this.sshClientArray) {
            if (pooled.getUsedConnectionNumber < this.maxTrialNumberPerConnection) {
                pooled.addUsedConnectionNumber();
                return pooled.getSSHClientInstance;
            }
        }
        // init a new ssh client if could not get an available one
        return this.initNewSSHClient();
    }

    /**
     * Add a connected client to the pool, counted as already used once.
     * @param client the ssh2 client to pool
     */
    public addNewSSHClient(client: Client): void {
        this.sshClientArray.push(new SSHClient(client, 1));
    }

    /**
     * The first pooled connection, which is the one used for the gpu collector and host jobs.
     * @throws Error when the pool is still empty
     */
    public getFirstSSHClient(): Client {
        if (this.sshClientArray.length === 0) {
            throw new Error('No ssh client has been initialized for this remote machine.');
        }
        return this.sshClientArray[0].getSSHClientInstance;
    }

    /**
     * End every pooled ssh connection.
     */
    public closeAllSSHClient(): void {
        for (const pooled of this.sshClientArray) {
            pooled.getSSHClientInstance.end();
        }
    }

    /**
     * Give back one reservation on the pooled connection wrapping `client`.
     * A client that is not in the pool is ignored.
     * @param client the client previously obtained from getAvailableSSHClient
     * @throws Error when client is undefined
     */
    public releaseConnection(client: Client | undefined): void {
        if (!client) {
            throw new Error(`could not release a undefined ssh client`);
        }
        const owner: SSHClient | undefined =
            this.sshClientArray.find((pooled: SSHClient) => pooled.getSSHClientInstance === client);
        if (owner !== undefined) {
            owner.minusUsedConnectionNumber();
        }
    }
}
export type RemoteMachineScheduleResult = { scheduleInfo : RemoteMachineScheduleInfo | undefined; resultType : ScheduleResultType}; export type RemoteMachineScheduleResult = { scheduleInfo : RemoteMachineScheduleInfo | undefined; resultType : ScheduleResultType};
export type RemoteMachineScheduleInfo = { rmMeta : RemoteMachineMeta; cuda_visible_device : string}; export type RemoteMachineScheduleInfo = { rmMeta : RemoteMachineMeta; cuda_visible_device : string};
......
...@@ -43,7 +43,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; ...@@ -43,7 +43,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler'; import { GPUScheduler } from './gpuScheduler';
import { import {
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta, HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT,
GPU_COLLECTOR_FORMAT GPU_COLLECTOR_FORMAT
} from './remoteMachineData'; } from './remoteMachineData';
...@@ -58,8 +58,10 @@ import { mkDirP } from '../../common/utils'; ...@@ -58,8 +58,10 @@ import { mkDirP } from '../../common/utils';
*/ */
@component.Singleton @component.Singleton
class RemoteMachineTrainingService implements TrainingService { class RemoteMachineTrainingService implements TrainingService {
private machineSSHClientMap: Map<RemoteMachineMeta, Client>; private machineSSHClientMap: Map<RemoteMachineMeta, SSHClientManager>; //machine ssh client map
private trialSSHClientMap: Map<string, Client>; //trial ssh client map
private trialJobsMap: Map<string, RemoteMachineTrialJobDetail>; private trialJobsMap: Map<string, RemoteMachineTrialJobDetail>;
private readonly MAX_TRIAL_NUMBER_PER_SSHCONNECTION: number = 5 // every ssh client has a max trial concurrency number
private expRootDir: string; private expRootDir: string;
private remoteExpRootDir: string; private remoteExpRootDir: string;
private trialConfig: TrialConfig | undefined; private trialConfig: TrialConfig | undefined;
...@@ -79,7 +81,8 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -79,7 +81,8 @@ class RemoteMachineTrainingService implements TrainingService {
this.remoteOS = 'linux'; this.remoteOS = 'linux';
this.metricsEmitter = new EventEmitter(); this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>(); this.trialJobsMap = new Map<string, RemoteMachineTrialJobDetail>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, Client>(); this.trialSSHClientMap = new Map<string, Client>();
this.machineSSHClientMap = new Map<RemoteMachineMeta, SSHClientManager>();
this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap); this.gpuScheduler = new GPUScheduler(this.machineSSHClientMap);
this.jobQueue = []; this.jobQueue = [];
this.expRootDir = getExperimentRootDir(); this.expRootDir = getExperimentRootDir();
...@@ -115,6 +118,40 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -115,6 +118,40 @@ class RemoteMachineTrainingService implements TrainingService {
} }
this.log.info('Remote machine training service exit.'); this.log.info('Remote machine training service exit.');
} }
/**
 * Reserve an ssh connection for a trial and remember it under the trial id.
 *
 * The connection is obtained from the SSHClientManager registered for the
 * trial's machine and stored in trialSSHClientMap keyed by trial.id.
 *
 * @param trial trial whose rmMeta identifies the target machine
 * @returns a promise that settles once the connection is stored; it rejects if
 *          trial.rmMeta is unset, if no manager is registered for that machine,
 *          or if obtaining a connection fails
 */
public async allocateSSHClientForTrial(trial: RemoteMachineTrialJobDetail): Promise<void> {
    if (!trial.rmMeta) {
        throw new Error(`rmMeta not set in trial ${trial.id}`);
    }
    const sshClientManager: SSHClientManager | undefined = this.machineSSHClientMap.get(trial.rmMeta);
    if (!sshClientManager) {
        throw new Error(`remoteSSHClient not initialized`);
    }
    // The method is async, so returning normally resolves the promise; no Deferred is needed.
    const sshClient: Client = await sshClientManager.getAvailableSSHClient();
    this.trialSSHClientMap.set(trial.id, sshClient);
}
/**
 * Hand the ssh connection held by a trial back to its machine's connection manager.
 *
 * Looks up the client stored for trial.id in trialSSHClientMap and passes it to
 * the manager's releaseConnection; the lookup result is forwarded as-is, even
 * when no client is stored for the trial.
 *
 * @param trial the trial whose connection reservation should be released
 * @throws Error if trial.rmMeta is unset or no manager is registered for that machine
 */
public releaseTrialSSHClient(trial: RemoteMachineTrialJobDetail): void {
    if (!trial.rmMeta) {
        throw new Error(`rmMeta not set in trial ${trial.id}`);
    }

    const manager: SSHClientManager | undefined = this.machineSSHClientMap.get(trial.rmMeta);
    if (!manager) {
        throw new Error(`sshClientManager not initialized`);
    }

    const heldClient: Client | undefined = this.trialSSHClientMap.get(trial.id);
    manager.releaseConnection(heldClient);
}
/** /**
* List submitted trial jobs * List submitted trial jobs
...@@ -148,7 +185,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -148,7 +185,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (trialJob.rmMeta === undefined) { if (trialJob.rmMeta === undefined) {
throw new Error(`rmMeta not set for submitted job ${trialJobId}`); throw new Error(`rmMeta not set for submitted job ${trialJobId}`);
} }
const sshClient: Client | undefined = this.machineSSHClientMap.get(trialJob.rmMeta); const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id);
if (!sshClient) { if (!sshClient) {
throw new Error(`Invalid job id: ${trialJobId}, cannot find ssh client`); throw new Error(`Invalid job id: ${trialJobId}, cannot find ssh client`);
} }
...@@ -179,7 +216,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -179,7 +216,7 @@ class RemoteMachineTrainingService implements TrainingService {
* Submit trial job * Submit trial job
* @param form trial job description form * @param form trial job description form
*/ */
public submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if (!this.trialConfig) { if (!this.trialConfig) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
...@@ -271,7 +308,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -271,7 +308,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Get ssh client where the job is running // Get ssh client where the job is running
if (trialJob.rmMeta !== undefined) { if (trialJob.rmMeta !== undefined) {
// If the trial job is already scheduled, check its status and kill the trial process in remote machine // If the trial job is already scheduled, check its status and kill the trial process in remote machine
const sshClient: Client | undefined = this.machineSSHClientMap.get(trialJob.rmMeta); const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJob.id);
if (!sshClient) { if (!sshClient) {
deferred.reject(); deferred.reject();
throw new Error(`Invalid job id ${trialJobId}, cannot find ssh client`); throw new Error(`Invalid job id ${trialJobId}, cannot find ssh client`);
...@@ -282,6 +319,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -282,6 +319,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Mark the toEarlyStop tag here // Mark the toEarlyStop tag here
trialJob.isEarlyStopped = isEarlyStopped; trialJob.isEarlyStopped = isEarlyStopped;
await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, sshClient); await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, sshClient);
this.releaseTrialSSHClient(trialJob);
} catch (error) { } catch (error) {
// Not handle the error since pkill failed will not impact trial job's current status // Not handle the error since pkill failed will not impact trial job's current status
this.log.error(`remoteTrainingService.cancelTrialJob: ${error.message}`); this.log.error(`remoteTrainingService.cancelTrialJob: ${error.message}`);
...@@ -364,10 +402,14 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -364,10 +402,14 @@ class RemoteMachineTrainingService implements TrainingService {
*/ */
private async cleanupConnections(): Promise<void> { private async cleanupConnections(): Promise<void> {
try{ try{
for (const [rmMeta, client] of this.machineSSHClientMap.entries()) { for (const [rmMeta, sshClientManager] of this.machineSSHClientMap.entries()) {
let jobpidPath: string = path.join(this.getRemoteScriptsPath(rmMeta.username), 'pid'); let jobpidPath: string = path.join(this.getRemoteScriptsPath(rmMeta.username), 'pid');
await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client); let client: Client | undefined = sshClientManager.getFirstSSHClient();
await SSHClientUtility.remoteExeCommand(`rm -rf ${this.getRemoteScriptsPath(rmMeta.username)}`, client); if(client) {
await SSHClientUtility.remoteExeCommand(`pkill -P \`cat ${jobpidPath}\``, client);
await SSHClientUtility.remoteExeCommand(`rm -rf ${this.getRemoteScriptsPath(rmMeta.username)}`, client);
}
sshClientManager.closeAllSSHClient();
} }
}catch (error) { }catch (error) {
//ignore error, this function is called to cleanup remote connections when experiment is stopping //ignore error, this function is called to cleanup remote connections when experiment is stopping
...@@ -410,37 +452,14 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -410,37 +452,14 @@ class RemoteMachineTrainingService implements TrainingService {
const rmMetaList: RemoteMachineMeta[] = <RemoteMachineMeta[]>JSON.parse(machineList); const rmMetaList: RemoteMachineMeta[] = <RemoteMachineMeta[]>JSON.parse(machineList);
let connectedRMNum: number = 0; let connectedRMNum: number = 0;
rmMetaList.forEach((rmMeta: RemoteMachineMeta) => { rmMetaList.forEach(async (rmMeta: RemoteMachineMeta) => {
const conn: Client = new Client(); let sshClientManager: SSHClientManager = new SSHClientManager([], this.MAX_TRIAL_NUMBER_PER_SSHCONNECTION, rmMeta);
let connectConfig: ConnectConfig = { let sshClient: Client = await sshClientManager.getAvailableSSHClient();
host: rmMeta.ip, this.machineSSHClientMap.set(rmMeta, sshClientManager);
port: rmMeta.port, await this.initRemoteMachineOnConnected(rmMeta, sshClient);
username: rmMeta.username }; if (++connectedRMNum === rmMetaList.length) {
if (rmMeta.passwd) { deferred.resolve();
connectConfig.password = rmMeta.passwd;
} else if(rmMeta.sshKeyPath) {
if(!fs.existsSync(rmMeta.sshKeyPath)) {
//SSh key path is not a valid file, reject
deferred.reject(new Error(`${rmMeta.sshKeyPath} does not exist.`));
}
const privateKey: string = fs.readFileSync(rmMeta.sshKeyPath, 'utf8');
connectConfig.privateKey = privateKey;
connectConfig.passphrase = rmMeta.passphrase;
} else {
deferred.reject(new Error(`No valid passwd or sshKeyPath is configed.`));
} }
this.machineSSHClientMap.set(rmMeta, conn);
conn.on('ready', async () => {
this.machineSSHClientMap.set(rmMeta, conn);
await this.initRemoteMachineOnConnected(rmMeta, conn);
if (++connectedRMNum === rmMetaList.length) {
deferred.resolve();
}
}).on('error', (err: Error) => {
// SSH connection error, reject with error message
deferred.reject(new Error(err.message));
}).connect(connectConfig);
}); });
return deferred.promise; return deferred.promise;
} }
...@@ -499,13 +518,16 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -499,13 +518,16 @@ class RemoteMachineTrainingService implements TrainingService {
&& rmScheduleResult.scheduleInfo !== undefined) { && rmScheduleResult.scheduleInfo !== undefined) {
const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo; const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
await this.allocateSSHClientForTrial(trialJobDetail);
await this.launchTrialOnScheduledMachine( await this.launchTrialOnScheduledMachine(
trialJobId, trialWorkingFolder, <TrialJobApplicationForm>trialJobDetail.form, rmScheduleInfo); trialJobId, trialWorkingFolder, <TrialJobApplicationForm>trialJobDetail.form, rmScheduleInfo);
trialJobDetail.status = 'RUNNING'; trialJobDetail.status = 'RUNNING';
trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`; trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`;
trialJobDetail.startTime = Date.now(); trialJobDetail.startTime = Date.now();
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
deferred.resolve(true); deferred.resolve(true);
} else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) { } else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
...@@ -524,7 +546,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -524,7 +546,7 @@ class RemoteMachineTrainingService implements TrainingService {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
const cuda_visible_device: string = rmScheduleInfo.cuda_visible_device; const cuda_visible_device: string = rmScheduleInfo.cuda_visible_device;
const sshClient: Client | undefined = this.machineSSHClientMap.get(rmScheduleInfo.rmMeta); const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJobId);
if (sshClient === undefined) { if (sshClient === undefined) {
assert(false, 'sshClient is undefined.'); assert(false, 'sshClient is undefined.');
...@@ -592,10 +614,11 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -592,10 +614,11 @@ class RemoteMachineTrainingService implements TrainingService {
private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> { private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> {
const rmMeta: RemoteMachineMeta = this.getRmMetaByHost(form.host); const rmMeta: RemoteMachineMeta = this.getRmMetaByHost(form.host);
const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta); const sshClientManager: SSHClientManager | undefined = this.machineSSHClientMap.get(rmMeta);
if (sshClient === undefined) { if (sshClientManager === undefined) {
throw new Error('sshClient not found.'); throw new Error('sshClient not found.');
} }
let sshClient: Client = sshClientManager.getFirstSSHClient();
const jobId: string = uniqueString(5); const jobId: string = uniqueString(5);
const localDir: string = path.join(this.expRootDir, 'hostjobs-local', jobId); const localDir: string = path.join(this.expRootDir, 'hostjobs-local', jobId);
const remoteDir: string = this.getHostJobRemoteDir(jobId); const remoteDir: string = this.getHostJobRemoteDir(jobId);
...@@ -654,6 +677,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -654,6 +677,7 @@ class RemoteMachineTrainingService implements TrainingService {
} }
} }
trialJob.endTime = parseInt(timestamp, 10); trialJob.endTime = parseInt(timestamp, 10);
this.releaseTrialSSHClient(trialJob);
} }
this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`); this.log.debug(`trailJob status update: ${trialJob.id}, ${trialJob.status}`);
} }
...@@ -705,7 +729,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -705,7 +729,7 @@ class RemoteMachineTrainingService implements TrainingService {
} }
private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters, rmMeta: RemoteMachineMeta): Promise<void> { private async writeParameterFile(trialJobId: string, hyperParameters: HyperParameters, rmMeta: RemoteMachineMeta): Promise<void> {
const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta); const sshClient: Client | undefined = this.trialSSHClientMap.get(trialJobId);
if (sshClient === undefined) { if (sshClient === undefined) {
throw new Error('sshClient is undefined.'); throw new Error('sshClient is undefined.');
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment