Unverified Commit 0c3827b3 authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

fix ip detection (#3934)


Co-authored-by: liuzhe <zhe.liu@microsoft.com>
parent 994a2d23
...@@ -223,7 +223,7 @@ let cachedIpv4Address: string | null = null; ...@@ -223,7 +223,7 @@ let cachedIpv4Address: string | null = null;
/** /**
* Get IPv4 address of current machine. * Get IPv4 address of current machine.
*/ */
function getIPV4Address(): string { async function getIPV4Address(): Promise<string> {
if (cachedIpv4Address !== null) { if (cachedIpv4Address !== null) {
return cachedIpv4Address; return cachedIpv4Address;
} }
...@@ -232,12 +232,20 @@ function getIPV4Address(): string { ...@@ -232,12 +232,20 @@ function getIPV4Address(): string {
// since udp is connectionless, this does not send actual packets. // since udp is connectionless, this does not send actual packets.
const socket = dgram.createSocket('udp4'); const socket = dgram.createSocket('udp4');
socket.connect(1, '192.0.2.0'); socket.connect(1, '192.0.2.0');
cachedIpv4Address = socket.address().address; for (let i = 0; i < 10; i++) { // wait the system to initialize "connection"
await yield_();
try { cachedIpv4Address = socket.address().address; } catch (error) { /* retry */ }
}
cachedIpv4Address = socket.address().address; // if it still fails, throw the error
socket.close(); socket.close();
return cachedIpv4Address; return cachedIpv4Address;
} }
/**
 * No-op async helper.
 *
 * Performs no work and resolves with no value. Because it is an `async`
 * function, `await yield_()` suspends the caller and resumes it on a later
 * microtask rather than continuing synchronously.
 *
 * @returns A promise that resolves to `undefined`.
 */
async function yield_(): Promise<void> {
    // Intentionally empty: the only effect is the suspension point created by awaiting the result.
    return;
}
/** /**
* Get the status of canceled jobs according to the hint isEarlyStopped * Get the status of canceled jobs according to the hint isEarlyStopped
*/ */
......
...@@ -277,7 +277,7 @@ abstract class KubernetesTrainingService { ...@@ -277,7 +277,7 @@ abstract class KubernetesTrainingService {
if (gpuNum === 0) { if (gpuNum === 0) {
nvidiaScript = 'export CUDA_VISIBLE_DEVICES='; nvidiaScript = 'export CUDA_VISIBLE_DEVICES=';
} }
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : await getIPV4Address();
const version: string = this.versionCheck ? await getVersion() : ''; const version: string = this.versionCheck ? await getVersion() : '';
const runScript: string = String.Format( const runScript: string = String.Format(
kubernetesScriptFormat, kubernetesScriptFormat,
......
...@@ -23,10 +23,7 @@ import { PAIJobInfoCollector } from './paiJobInfoCollector'; ...@@ -23,10 +23,7 @@ import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { PAIJobRestServer } from './paiJobRestServer'; import { PAIJobRestServer } from './paiJobRestServer';
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiConfig'; import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiConfig';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { import { generateParamFileName, getIPV4Address, uniqueString } from '../../common/utils';
generateParamFileName,
getIPV4Address, uniqueString
} from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; import { execMkdir, validateCodeDir, execCopydir } from '../common/util';
...@@ -332,7 +329,7 @@ class PAITrainingService implements TrainingService { ...@@ -332,7 +329,7 @@ class PAITrainingService implements TrainingService {
return trialJobDetail; return trialJobDetail;
} }
private generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): string { private async generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): Promise<string> {
const containerNFSExpCodeDir = `${this.config.containerStorageMountPoint}/${this.experimentId}/nni-code`; const containerNFSExpCodeDir = `${this.config.containerStorageMountPoint}/${this.experimentId}/nni-code`;
const containerWorkingDir: string = `${this.config.containerStorageMountPoint}/${this.experimentId}/${trialJobDetail.id}`; const containerWorkingDir: string = `${this.config.containerStorageMountPoint}/${this.experimentId}/${trialJobDetail.id}`;
const nniPaiTrialCommand: string = String.Format( const nniPaiTrialCommand: string = String.Format(
...@@ -345,7 +342,7 @@ class PAITrainingService implements TrainingService { ...@@ -345,7 +342,7 @@ class PAITrainingService implements TrainingService {
false, // multi-phase false, // multi-phase
containerNFSExpCodeDir, containerNFSExpCodeDir,
command, command,
this.config.nniManagerIp || getIPV4Address(), this.config.nniManagerIp || await getIPV4Address(),
this.paiRestServerPort, this.paiRestServerPort,
this.nniVersion, this.nniVersion,
this.logCollection this.logCollection
...@@ -356,7 +353,7 @@ class PAITrainingService implements TrainingService { ...@@ -356,7 +353,7 @@ class PAITrainingService implements TrainingService {
} }
private generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): any { private async generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): Promise<any> {
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}` const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}`
let nniJobConfig: any = undefined; let nniJobConfig: any = undefined;
...@@ -367,7 +364,7 @@ class PAITrainingService implements TrainingService { ...@@ -367,7 +364,7 @@ class PAITrainingService implements TrainingService {
// Each command will be formatted to NNI style // Each command will be formatted to NNI style
for (const taskRoleIndex in nniJobConfig.taskRoles) { for (const taskRoleIndex in nniJobConfig.taskRoles) {
const commands = nniJobConfig.taskRoles[taskRoleIndex].commands const commands = nniJobConfig.taskRoles[taskRoleIndex].commands
const nniTrialCommand = this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')); const nniTrialCommand = await this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1'));
nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand] nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand]
} }
...@@ -399,7 +396,7 @@ class PAITrainingService implements TrainingService { ...@@ -399,7 +396,7 @@ class PAITrainingService implements TrainingService {
memoryMB: toMegaBytes(this.config.trialMemorySize) memoryMB: toMegaBytes(this.config.trialMemorySize)
}, },
commands: [ commands: [
this.generateNNITrialCommand(trialJobDetail, this.config.trialCommand) await this.generateNNITrialCommand(trialJobDetail, this.config.trialCommand)
] ]
} }
}, },
...@@ -456,7 +453,7 @@ class PAITrainingService implements TrainingService { ...@@ -456,7 +453,7 @@ class PAITrainingService implements TrainingService {
} }
//Generate Job Configuration in yaml format //Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); const paiJobConfig = await this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig); this.log.debug(paiJobConfig);
// Step 2. Submit PAI job via Rest call // Step 2. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
......
...@@ -491,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -491,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService {
cudaVisible = `CUDA_VISIBLE_DEVICES=" "`; cudaVisible = `CUDA_VISIBLE_DEVICES=" "`;
} }
} }
const nniManagerIp: string = this.config.nniManagerIp ? this.config.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.config.nniManagerIp ? this.config.nniManagerIp : await getIPV4Address();
if (this.remoteRestServerPort === undefined) { if (this.remoteRestServerPort === undefined) {
const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer); const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
this.remoteRestServerPort = restServer.clusterRestServerPort; this.remoteRestServerPort = restServer.clusterRestServerPort;
......
...@@ -216,7 +216,7 @@ class TrialDispatcher implements TrainingService { ...@@ -216,7 +216,7 @@ class TrialDispatcher implements TrainingService {
for(const environmentService of this.environmentServiceList) { for(const environmentService of this.environmentServiceList) {
const runnerSettings: RunnerSettings = new RunnerSettings(); const runnerSettings: RunnerSettings = new RunnerSettings();
runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? getIPV4Address() : this.config.nniManagerIp; runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? await getIPV4Address() : this.config.nniManagerIp;
runnerSettings.nniManagerPort = getBasePort() + 1; runnerSettings.nniManagerPort = getBasePort() + 1;
runnerSettings.commandChannel = environmentService.getCommandChannel.channelName; runnerSettings.commandChannel = environmentService.getCommandChannel.channelName;
runnerSettings.enableGpuCollector = this.enableGpuScheduler; runnerSettings.enableGpuCollector = this.enableGpuScheduler;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment