Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
4b1961e2
Unverified
Commit
4b1961e2
authored
Nov 22, 2019
by
SparkSnail
Committed by
GitHub
Nov 22, 2019
Browse files
Fix remoteTrainingService gpuScheduler (#1749)
parent
5845ca04
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
5 deletions
+10
-5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+10
-5
No files found.
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
4b1961e2
...
...
@@ -67,7 +67,7 @@ class RemoteMachineTrainingService implements TrainingService {
private
readonly
expRootDir
:
string
;
private
readonly
remoteExpRootDir
:
string
;
private
trialConfig
:
TrialConfig
|
undefined
;
private
readonly
gpuScheduler
:
GPUScheduler
;
private
gpuScheduler
?
:
GPUScheduler
;
private
readonly
jobQueue
:
string
[];
private
readonly
timer
:
ObservableTimer
;
private
stopping
:
boolean
=
false
;
...
...
@@ -87,7 +87,6 @@ class RemoteMachineTrainingService implements TrainingService {
this
.
trialJobsMap
=
new
Map
<
string
,
RemoteMachineTrialJobDetail
>
();
this
.
trialSSHClientMap
=
new
Map
<
string
,
Client
>
();
this
.
machineSSHClientMap
=
new
Map
<
RemoteMachineMeta
,
SSHClientManager
>
();
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineSSHClientMap
);
this
.
jobQueue
=
[];
this
.
expRootDir
=
getExperimentRootDir
();
this
.
remoteExpRootDir
=
this
.
getRemoteExperimentRootDir
();
...
...
@@ -334,6 +333,7 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
await
this
.
setupConnections
(
value
);
this
.
gpuScheduler
=
new
GPUScheduler
(
this
.
machineSSHClientMap
);
break
;
case
TrialConfigMetadataKey
.
TRIAL_CONFIG
:
const
remoteMachineTrailConfig
:
TrialConfig
=
<
TrialConfig
>
JSON
.
parse
(
value
);
...
...
@@ -397,12 +397,14 @@ class RemoteMachineTrainingService implements TrainingService {
* remove gpu reservation when job is not running
*/
private updateGpuReservation(): void {
    // gpuScheduler is an optional field; when it is not set there is nothing to update.
    if (!this.gpuScheduler) {
        return;
    }
    for (const [trialJobId, trialJobDetail] of this.trialJobsMap) {
        const jobStatus = trialJobDetail.status;
        const isWaitingOrRunning: boolean = jobStatus === 'WAITING' || jobStatus === 'RUNNING';
        if (isWaitingOrRunning) {
            continue;
        }
        // Status is neither WAITING nor RUNNING: remove this trial's GPU reservation.
        this.gpuScheduler.removeGpuReservation(trialJobId, this.trialJobsMap);
    }
}
/**
* stop gpu_metric_collector process in remote machine and remove unused scripts
...
...
@@ -483,6 +485,9 @@ class RemoteMachineTrainingService implements TrainingService {
if
(
this
.
trialConfig
===
undefined
)
{
throw
new
Error
(
'
trial config is not initialized
'
);
}
if
(
this
.
gpuScheduler
===
undefined
)
{
throw
new
Error
(
'
gpuScheduler is not initialized
'
);
}
const
trialJobDetail
:
RemoteMachineTrialJobDetail
|
undefined
=
this
.
trialJobsMap
.
get
(
trialJobId
);
if
(
trialJobDetail
===
undefined
)
{
throw
new
NNIError
(
NNIErrorNames
.
INVALID_JOB_DETAIL
,
`Invalid job detail information for trial job
${
trialJobId
}
`
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment