Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
109d9a32
Unverified
Commit
109d9a32
authored
Aug 07, 2020
by
SparkSnail
Committed by
GitHub
Aug 07, 2020
Browse files
Fix remote machine connection logic (#2725)
parent
2488aa65
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
5 deletions
+10
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+10
-5
No files found.
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
109d9a32
...
@@ -57,6 +57,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -57,6 +57,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
versionCheck
:
boolean
=
true
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
private
logCollection
:
string
;
private
sshConnectionPromises
:
any
[];
constructor
(@
component
.
Inject
timer
:
ObservableTimer
)
{
constructor
(@
component
.
Inject
timer
:
ObservableTimer
)
{
this
.
metricsEmitter
=
new
EventEmitter
();
this
.
metricsEmitter
=
new
EventEmitter
();
...
@@ -65,6 +66,7 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -65,6 +66,7 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
();
this
.
machineCopyExpCodeDirPromiseMap
=
new
Map
<
RemoteMachineMeta
,
Promise
<
void
>>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineMeta
,
ExecutorManager
>
();
this
.
jobQueue
=
[];
this
.
jobQueue
=
[];
this
.
sshConnectionPromises
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
expRootDir
=
getExperimentRootDir
();
this
.
timer
=
timer
;
this
.
timer
=
timer
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
...
@@ -80,6 +82,12 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -80,6 +82,12 @@ class RemoteMachineTrainingService implements TrainingService {
await
restServer
.
start
();
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
'
Run remote machine training service.
'
);
this
.
log
.
info
(
'
Run remote machine training service.
'
);
if
(
this
.
sshConnectionPromises
.
length
>
0
)
{
await
Promise
.
all
(
this
.
sshConnectionPromises
);
this
.
log
.
info
(
'
ssh connection initialized!
'
);
// set sshConnectionPromises to [] to avoid log information duplicated
this
.
sshConnectionPromises
=
[];
}
while
(
!
this
.
stopping
)
{
while
(
!
this
.
stopping
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
this
.
updateGpuReservation
();
this
.
updateGpuReservation
();
...
@@ -408,7 +416,6 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -408,7 +416,6 @@ class RemoteMachineTrainingService implements TrainingService {
//TO DO: verify if value's format is wrong, and json parse failed, how to handle error
//TO DO: verify if value's format is wrong, and json parse failed, how to handle error
const
rmMetaList
:
RemoteMachineMeta
[]
=
<
RemoteMachineMeta
[]
>
JSON
.
parse
(
machineList
);
const
rmMetaList
:
RemoteMachineMeta
[]
=
<
RemoteMachineMeta
[]
>
JSON
.
parse
(
machineList
);
const
connectionPromises
=
[];
for
(
const
rmMeta
of
rmMetaList
)
{
for
(
const
rmMeta
of
rmMetaList
)
{
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
rmMeta
.
occupiedGpuIndexMap
=
new
Map
<
number
,
number
>
();
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
rmMeta
);
const
executorManager
:
ExecutorManager
=
new
ExecutorManager
(
rmMeta
);
...
@@ -417,11 +424,9 @@ class RemoteMachineTrainingService implements TrainingService {
...
@@ -417,11 +424,9 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
log
.
debug
(
`reached
${
executor
.
name
}
`
);
this
.
machineExecutorManagerMap
.
set
(
rmMeta
,
executorManager
);
this
.
machineExecutorManagerMap
.
set
(
rmMeta
,
executorManager
);
this
.
log
.
debug
(
`initializing
${
executor
.
name
}
`
);
this
.
log
.
debug
(
`initializing
${
executor
.
name
}
`
);
c
onnectionPromises
.
push
(
this
.
initRemoteMachineOnConnected
(
rmMeta
,
executor
));
this
.
sshC
onnectionPromises
.
push
(
this
.
initRemoteMachineOnConnected
(
rmMeta
,
executor
));
this
.
log
.
info
(
`connect
ed
to
${
executor
.
name
}
`
);
this
.
log
.
info
(
`connect
ing
to
${
executor
.
name
}
`
);
}
}
await
Promise
.
all
(
connectionPromises
);
}
}
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
executor
:
ShellExecutor
):
Promise
<
void
>
{
private
async
initRemoteMachineOnConnected
(
rmMeta
:
RemoteMachineMeta
,
executor
:
ShellExecutor
):
Promise
<
void
>
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment