Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
4b1961e2
Unverified
Commit
4b1961e2
authored
Nov 22, 2019
by
SparkSnail
Committed by
GitHub
Nov 22, 2019
Browse files
Fix remoteTrainingService gpuScheduler (#1749)
parent
5845ca04
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
5 deletions
+10
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+10
-5
No files found.
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
4b1961e2
...
...
@@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
readonly
gpuScheduler
:
GPUScheduler
;
private
gpuScheduler
?
:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
...
...
@@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialSSHClientMap
=
new
Map
<
string
,
Client
>
();
this
.
machineSSHClientMap
=
new
Map
<
RemoteMachineMeta
,
SSHClientManager
>
();
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineSSHClientMap
);
this
.
jobQueue
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
...
...
@@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineSSHClientMap
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
...
...
@@ -397,9 +397,11 @@ class RemoteMachineTrainingService implements TrainingService {
* remove gpu reversion when job is not running
*/
private
updateGpuReservation
():
void
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
if
(
this
.
gpuScheduler
)
{
for
(
const
[
key
,
value
]
of
this
.
trialJobsMap
)
{
if
(
!
[
'
WAITING
'
,
'
RUNNING
'
].
includes
(
value
.
status
))
{
this
.
gpuScheduler
.
removeGpuReservation
(
key
,
this
.
trialJobsMap
);
}
}
}
}
...
...
@@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
gpuScheduler
===
undefined
)
{
throw
new
Error
(
'
gpuScheduler is not initialized
'
);
}
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
INVALID_JOB_DETAIL
,
`Invalid job detail information for trial job
${
trialJobId
}
`
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment