Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
0c3827b3
Unverified
Commit
0c3827b3
authored
Jul 14, 2021
by
liuzhe-lz
Committed by
GitHub
Jul 14, 2021
Browse files
fix ip detection (#3934)
Co-authored-by:
liuzhe
<
zhe.liu@microsoft.com
>
parent
994a2d23
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
20 additions
and
15 deletions
+20
-15
ts/nni_manager/common/utils.ts
ts/nni_manager/common/utils.ts
+10
-2
ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+1
-1
ts/nni_manager/training_service/pai/paiTrainingService.ts
ts/nni_manager/training_service/pai/paiTrainingService.ts
+7
-10
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+1
-1
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+1
-1
No files found.
ts/nni_manager/common/utils.ts
View file @
0c3827b3
...
...
@@ -223,7 +223,7 @@ let cachedIpv4Address: string | null = null;
/**
* Get IPv4 address of current machine.
*/
function
getIPV4Address
():
string
{
async
function
getIPV4Address
():
Promise
<
string
>
{
if
(
cachedIpv4Address
!==
null
)
{
return
cachedIpv4Address
;
}
...
...
@@ -232,12 +232,20 @@ function getIPV4Address(): string {
// since udp is connectionless, this does not send actual packets.
const
socket
=
dgram
.
createSocket
(
'
udp4
'
);
socket
.
connect
(
1
,
'
192.0.2.0
'
);
cachedIpv4Address
=
socket
.
address
().
address
;
for
(
let
i
=
0
;
i
<
10
;
i
++
)
{
// wait the system to initialize "connection"
await
yield_
();
try
{
cachedIpv4Address
=
socket
.
address
().
address
;
}
catch
(
error
)
{
/* retry */
}
}
cachedIpv4Address
=
socket
.
address
().
address
;
// if it still fails, throw the error
socket
.
close
();
return
cachedIpv4Address
;
}
/**
 * Return an already-completed promise so that a caller can `await` it and
 * thereby suspend itself for one scheduling step.
 *
 * The body is intentionally empty. `getIPV4Address` awaits this between
 * attempts to read the local address of its UDP socket.
 *
 * NOTE: `async` and `function` must stay on the same line. A line break
 * between them makes this a plain function that returns `undefined`.
 *
 * @returns A promise that resolves to `undefined`.
 */
async function yield_(): Promise<void> {
    /* trigger the scheduler, do nothing */
}
/**
* Get the status of canceled jobs according to the hint isEarlyStopped
*/
...
...
ts/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
0c3827b3
...
...
@@ -277,7 +277,7 @@ abstract class KubernetesTrainingService {
if
(
gpuNum
===
0
)
{
nvidiaScript
=
'
export CUDA_VISIBLE_DEVICES=
'
;
}
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
nniManagerIpConfig
?
this
.
nniManagerIpConfig
.
nniManagerIp
:
await
getIPV4Address
();
const
version
:
string
=
this
.
versionCheck
?
await
getVersion
()
:
''
;
const
runScript
:
string
=
String
.
Format
(
kubernetesScriptFormat
,
...
...
ts/nni_manager/training_service/pai/paiTrainingService.ts
View file @
0c3827b3
...
...
@@ -23,10 +23,7 @@ import { PAIJobInfoCollector } from './paiJobInfoCollector';
import
{
PAIJobRestServer
}
from
'
./paiJobRestServer
'
;
import
{
PAITrialJobDetail
,
PAI_TRIAL_COMMAND_FORMAT
}
from
'
./paiConfig
'
;
import
{
String
}
from
'
typescript-string-operations
'
;
import
{
generateParamFileName
,
getIPV4Address
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
generateParamFileName
,
getIPV4Address
,
uniqueString
}
from
'
../../common/utils
'
;
import
{
CONTAINER_INSTALL_NNI_SHELL_FORMAT
}
from
'
../common/containerJobData
'
;
import
{
execMkdir
,
validateCodeDir
,
execCopydir
}
from
'
../common/util
'
;
...
...
@@ -332,7 +329,7 @@ class PAITrainingService implements TrainingService {
return
trialJobDetail
;
}
private
generateNNITrialCommand
(
trialJobDetail
:
PAITrialJobDetail
,
command
:
string
):
string
{
private
async
generateNNITrialCommand
(
trialJobDetail
:
PAITrialJobDetail
,
command
:
string
):
Promise
<
string
>
{
const
containerNFSExpCodeDir
=
`
${
this
.
config
.
containerStorageMountPoint
}
/
${
this
.
experimentId
}
/nni-code`
;
const
containerWorkingDir
:
string
=
`
${
this
.
config
.
containerStorageMountPoint
}
/
${
this
.
experimentId
}
/
${
trialJobDetail
.
id
}
`
;
const
nniPaiTrialCommand
:
string
=
String
.
Format
(
...
...
@@ -345,7 +342,7 @@ class PAITrainingService implements TrainingService {
false
,
// multi-phase
containerNFSExpCodeDir
,
command
,
this
.
config
.
nniManagerIp
||
getIPV4Address
(),
this
.
config
.
nniManagerIp
||
await
getIPV4Address
(),
this
.
paiRestServerPort
,
this
.
nniVersion
,
this
.
logCollection
...
...
@@ -356,7 +353,7 @@ class PAITrainingService implements TrainingService {
}
private
generateJobConfigInYamlFormat
(
trialJobDetail
:
PAITrialJobDetail
):
any
{
private
async
generateJobConfigInYamlFormat
(
trialJobDetail
:
PAITrialJobDetail
):
Promise
<
any
>
{
const
jobName
=
`nni_exp_
${
this
.
experimentId
}
_trial_
${
trialJobDetail
.
id
}
`
let
nniJobConfig
:
any
=
undefined
;
...
...
@@ -367,7 +364,7 @@ class PAITrainingService implements TrainingService {
// Each command will be formatted to NNI style
for
(
const
taskRoleIndex
in
nniJobConfig
.
taskRoles
)
{
const
commands
=
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
const
nniTrialCommand
=
this
.
generateNNITrialCommand
(
trialJobDetail
,
commands
.
join
(
"
&&
"
).
replace
(
/
([
"'$`
\\])
/g
,
'
\\
$1
'
));
const
nniTrialCommand
=
await
this
.
generateNNITrialCommand
(
trialJobDetail
,
commands
.
join
(
"
&&
"
).
replace
(
/
([
"'$`
\\])
/g
,
'
\\
$1
'
));
nniJobConfig
.
taskRoles
[
taskRoleIndex
].
commands
=
[
nniTrialCommand
]
}
...
...
@@ -399,7 +396,7 @@ class PAITrainingService implements TrainingService {
memoryMB
:
toMegaBytes
(
this
.
config
.
trialMemorySize
)
},
commands
:
[
this
.
generateNNITrialCommand
(
trialJobDetail
,
this
.
config
.
trialCommand
)
await
this
.
generateNNITrialCommand
(
trialJobDetail
,
this
.
config
.
trialCommand
)
]
}
},
...
...
@@ -456,7 +453,7 @@ class PAITrainingService implements TrainingService {
}
//Generate Job Configuration in yaml format
const
paiJobConfig
=
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
const
paiJobConfig
=
await
this
.
generateJobConfigInYamlFormat
(
trialJobDetail
);
this
.
log
.
debug
(
paiJobConfig
);
// Step 2. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
...
...
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
0c3827b3
...
...
@@ -491,7 +491,7 @@ class RemoteMachineTrainingService implements TrainingService {
cudaVisible
=
`CUDA_VISIBLE_DEVICES=" "`
;
}
}
const
nniManagerIp
:
string
=
this
.
config
.
nniManagerIp
?
this
.
config
.
nniManagerIp
:
getIPV4Address
();
const
nniManagerIp
:
string
=
this
.
config
.
nniManagerIp
?
this
.
config
.
nniManagerIp
:
await
getIPV4Address
();
if
(
this
.
remoteRestServerPort
===
undefined
)
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
this
.
remoteRestServerPort
=
restServer
.
clusterRestServerPort
;
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
0c3827b3
...
...
@@ -216,7 +216,7 @@ class TrialDispatcher implements TrainingService {
for
(
const
environmentService
of
this
.
environmentServiceList
)
{
const
runnerSettings
:
RunnerSettings
=
new
RunnerSettings
();
runnerSettings
.
nniManagerIP
=
this
.
config
.
nniManagerIp
===
undefined
?
getIPV4Address
()
:
this
.
config
.
nniManagerIp
;
runnerSettings
.
nniManagerIP
=
this
.
config
.
nniManagerIp
===
undefined
?
await
getIPV4Address
()
:
this
.
config
.
nniManagerIp
;
runnerSettings
.
nniManagerPort
=
getBasePort
()
+
1
;
runnerSettings
.
commandChannel
=
environmentService
.
getCommandChannel
.
channelName
;
runnerSettings
.
enableGpuCollector
=
this
.
enableGpuScheduler
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment