Unverified commit 0c3827b3, authored by liuzhe-lz, committed by GitHub
Browse files

fix ip detection (#3934)


Co-authored-by: liuzhe <zhe.liu@microsoft.com>
parent 994a2d23
......@@ -223,7 +223,7 @@ let cachedIpv4Address: string | null = null;
/**
* Get IPv4 address of current machine.
*/
function getIPV4Address(): string {
async function getIPV4Address(): Promise<string> {
if (cachedIpv4Address !== null) {
return cachedIpv4Address;
}
......@@ -232,12 +232,20 @@ function getIPV4Address(): string {
// since udp is connectionless, this does not send actual packets.
const socket = dgram.createSocket('udp4');
socket.connect(1, '192.0.2.0');
cachedIpv4Address = socket.address().address;
for (let i = 0; i < 10; i++) { // wait the system to initialize "connection"
await yield_();
try { cachedIpv4Address = socket.address().address; } catch (error) { /* retry */ }
}
cachedIpv4Address = socket.address().address; // if it still fails, throw the error
socket.close();
return cachedIpv4Address;
}
/**
 * Hand back a promise that is already resolved, doing no other work.
 *
 * Callers `await` the result purely to suspend themselves for a moment
 * (getIPV4Address does so between its attempts to read the socket address).
 *
 * NOTE(review): awaiting an already-resolved promise presumably defers only
 * to the microtask queue, not to timers or I/O callbacks; confirm this is
 * enough for the pending work the callers are waiting on.
 *
 * @returns a promise resolved with no value
 */
function yield_(): Promise<void> {
    return Promise.resolve();
}
/**
* Get the status of canceled jobs according to the hint isEarlyStopped
*/
......
......@@ -277,7 +277,7 @@ abstract class KubernetesTrainingService {
if (gpuNum === 0) {
nvidiaScript = 'export CUDA_VISIBLE_DEVICES=';
}
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : await getIPV4Address();
const version: string = this.versionCheck ? await getVersion() : '';
const runScript: string = String.Format(
kubernetesScriptFormat,
......
......@@ -23,10 +23,7 @@ import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { PAIJobRestServer } from './paiJobRestServer';
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiConfig';
import { String } from 'typescript-string-operations';
import {
generateParamFileName,
getIPV4Address, uniqueString
} from '../../common/utils';
import { generateParamFileName, getIPV4Address, uniqueString } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { execMkdir, validateCodeDir, execCopydir } from '../common/util';
......@@ -332,7 +329,7 @@ class PAITrainingService implements TrainingService {
return trialJobDetail;
}
private generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): string {
private async generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): Promise<string> {
const containerNFSExpCodeDir = `${this.config.containerStorageMountPoint}/${this.experimentId}/nni-code`;
const containerWorkingDir: string = `${this.config.containerStorageMountPoint}/${this.experimentId}/${trialJobDetail.id}`;
const nniPaiTrialCommand: string = String.Format(
......@@ -345,7 +342,7 @@ class PAITrainingService implements TrainingService {
false, // multi-phase
containerNFSExpCodeDir,
command,
this.config.nniManagerIp || getIPV4Address(),
this.config.nniManagerIp || await getIPV4Address(),
this.paiRestServerPort,
this.nniVersion,
this.logCollection
......@@ -356,7 +353,7 @@ class PAITrainingService implements TrainingService {
}
private generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): any {
private async generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): Promise<any> {
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}`
let nniJobConfig: any = undefined;
......@@ -367,7 +364,7 @@ class PAITrainingService implements TrainingService {
// Each command will be formatted to NNI style
for (const taskRoleIndex in nniJobConfig.taskRoles) {
const commands = nniJobConfig.taskRoles[taskRoleIndex].commands
const nniTrialCommand = this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1'));
const nniTrialCommand = await this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1'));
nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand]
}
......@@ -399,7 +396,7 @@ class PAITrainingService implements TrainingService {
memoryMB: toMegaBytes(this.config.trialMemorySize)
},
commands: [
this.generateNNITrialCommand(trialJobDetail, this.config.trialCommand)
await this.generateNNITrialCommand(trialJobDetail, this.config.trialCommand)
]
}
},
......@@ -456,7 +453,7 @@ class PAITrainingService implements TrainingService {
}
//Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
const paiJobConfig = await this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig);
// Step 2. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
......
......@@ -491,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService {
cudaVisible = `CUDA_VISIBLE_DEVICES=" "`;
}
}
const nniManagerIp: string = this.config.nniManagerIp ? this.config.nniManagerIp : getIPV4Address();
const nniManagerIp: string = this.config.nniManagerIp ? this.config.nniManagerIp : await getIPV4Address();
if (this.remoteRestServerPort === undefined) {
const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
this.remoteRestServerPort = restServer.clusterRestServerPort;
......
......@@ -216,7 +216,7 @@ class TrialDispatcher implements TrainingService {
for(const environmentService of this.environmentServiceList) {
const runnerSettings: RunnerSettings = new RunnerSettings();
runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? getIPV4Address() : this.config.nniManagerIp;
runnerSettings.nniManagerIP = this.config.nniManagerIp === undefined? await getIPV4Address() : this.config.nniManagerIp;
runnerSettings.nniManagerPort = getBasePort() + 1;
runnerSettings.commandChannel = environmentService.getCommandChannel.channelName;
runnerSettings.enableGpuCollector = this.enableGpuScheduler;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment