Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
761638d8
Commit
761638d8
authored
Nov 01, 2018
by
Gems Guo
Committed by
goooxu
Nov 05, 2018
Browse files
Refactor close experiment implementation
parent
5a2721be
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
36 additions
and
21 deletions
+36
-21
src/nni_manager/common/log.ts
src/nni_manager/common/log.ts
+8
-2
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+8
-12
src/nni_manager/main.ts
src/nni_manager/main.ts
+10
-1
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+0
-2
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+7
-2
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+1
-1
src/nni_manager/types/tail-stream/index.d.ts
src/nni_manager/types/tail-stream/index.d.ts
+1
-0
tools/nnicmd/nnictl_utils.py
tools/nnicmd/nnictl_utils.py
+1
-1
No files found.
src/nni_manager/common/log.ts
View file @
761638d8
...
...
@@ -70,17 +70,23 @@ class Logger {
private
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
private
level
:
number
=
DEBUG
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
writble
:
Writable
;
constructor
(
fileName
?:
string
)
{
let
logFile
:
string
|
undefined
=
fileName
;
if
(
logFile
===
undefined
)
{
logFile
=
this
.
DEFAULT_LOGFILE
;
}
this
.
bufferSerialEmitter
=
new
BufferSerialEmitter
(
fs
.
createWriteStream
(
logFile
,
{
this
.
writble
=
fs
.
createWriteStream
(
logFile
,
{
flags
:
'
a+
'
,
encoding
:
'
utf8
'
,
autoClose
:
true
}));
});
this
.
bufferSerialEmitter
=
new
BufferSerialEmitter
(
this
.
writble
);
}
/**
 * Close the log file.
 *
 * Ends the underlying write stream instead of destroying it, so that log
 * data already handed to the stream is flushed before the file is closed
 * (the stream is created with autoClose enabled).
 */
public close(): void {
    this.writble.end();
}
public
debug
(...
param
:
any
[]):
void
{
...
...
src/nni_manager/core/nnimanager.ts
View file @
761638d8
...
...
@@ -35,7 +35,7 @@ import {
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
import
{
delay
,
getLogDir
,
getMsgDispatcherCommand
}
from
'
../common/utils
'
;
import
{
delay
,
getLogDir
,
getMsgDispatcherCommand
}
from
'
../common/utils
'
;
import
{
ADD_CUSTOMIZED_TRIAL_JOB
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
NO_MORE_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
...
...
@@ -123,7 +123,7 @@ class NNIManager implements Manager {
this
.
log
.
debug
(
'
Setup tuner...
'
);
// Set up multiphase config
if
(
expParams
.
multiPhase
&&
this
.
trainingService
.
isMultiPhaseJobSupported
)
{
if
(
expParams
.
multiPhase
&&
this
.
trainingService
.
isMultiPhaseJobSupported
)
{
this
.
trainingService
.
setClusterMetadata
(
'
multiPhase
'
,
expParams
.
multiPhase
.
toString
());
}
...
...
@@ -217,10 +217,9 @@ class NNIManager implements Manager {
return
this
.
dataStore
.
getTrialJobStatistics
();
}
public
stopExperiment
():
Promise
<
void
>
{
public
async
stopExperiment
():
Promise
<
void
>
{
this
.
status
.
status
=
'
STOPPING
'
;
return
Promise
.
resolve
();
await
this
.
experimentDoneCleanUp
();
}
public
async
getMetricData
(
trialJobId
?:
string
,
metricType
?:
MetricType
):
Promise
<
MetricDataRecord
[]
>
{
...
...
@@ -342,7 +341,7 @@ class NNIManager implements Manager {
private
async
periodicallyUpdateExecDuration
():
Promise
<
void
>
{
let
count
:
number
=
1
;
for
(;
;
)
{
while
(
this
.
status
.
status
!==
'
STOPPING
'
)
{
await
delay
(
1000
*
1
);
// 1 seconds
if
(
this
.
status
.
status
===
'
EXPERIMENT_RUNNING
'
)
{
this
.
experimentProfile
.
execDuration
+=
1
;
...
...
@@ -396,10 +395,7 @@ class NNIManager implements Manager {
throw
new
Error
(
'
Error: tuner has not been setup
'
);
}
let
allFinishedTrialJobNum
:
number
=
0
;
for
(;
;)
{
if
(
this
.
status
.
status
===
'
STOPPING
'
)
{
break
;
}
while
(
this
.
status
.
status
!==
'
STOPPING
'
)
{
const
finishedTrialJobNum
:
number
=
await
this
.
requestTrialJobsStatus
();
allFinishedTrialJobNum
+=
finishedTrialJobNum
;
...
...
@@ -477,7 +473,7 @@ class NNIManager implements Manager {
}
await
delay
(
1000
*
5
);
// 5 seconds
}
this
.
log
.
info
(
'
Experiment done, cleaning up...
'
);
await
this
.
experimentDoneCleanUp
();
this
.
log
.
info
(
'
Experiment done.
'
);
...
...
@@ -563,7 +559,7 @@ class NNIManager implements Manager {
};
await
this
.
trainingService
.
updateTrialJob
(
tunerCommand
.
trial_job_id
,
trialJobForm
);
await
this
.
dataStore
.
storeTrialJobEvent
(
'
ADD_HYPERPARAMETER
'
,
tunerCommand
.
trial_job_id
,
content
,
undefined
);
'
ADD_HYPERPARAMETER
'
,
tunerCommand
.
trial_job_id
,
content
,
undefined
);
break
;
case
NO_MORE_TRIAL_JOBS
:
//this.trialJobsMaintainer.setNoMoreTrials();
...
...
src/nni_manager/main.ts
View file @
761638d8
...
...
@@ -50,7 +50,7 @@ async function initContainer(platformMode: string): Promise<void> {
Container
.
bind
(
TrainingService
).
to
(
LocalTrainingServiceForGPU
).
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
remote
'
)
{
Container
.
bind
(
TrainingService
).
to
(
RemoteMachineTrainingService
).
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
pai
'
){
}
else
if
(
platformMode
===
'
pai
'
)
{
Container
.
bind
(
TrainingService
).
to
(
PAITrainingService
).
scope
(
Scope
.
Singleton
);
}
else
{
throw
new
Error
(
`Error: unsupported mode:
${
mode
}
`
);
...
...
@@ -108,3 +108,12 @@ mkDirP(getLogDir()).then(async () => {
}).
catch
((
err
:
Error
)
=>
{
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
});
// Shutdown on SIGTERM: close the data store, stop the REST server, then
// close the log file. Every step is attempted even when an earlier one
// fails, so a single failing component cannot leave the others open or
// surface as an unhandled promise rejection.
process.on('SIGTERM', async (): Promise<void> => {
    // Run one cleanup step and report (rather than propagate) its failure.
    const attempt = async (what: string, step: () => void | Promise<void>): Promise<void> => {
        try {
            await step();
        } catch (err) {
            console.error(`Failed to ${what}: ${err instanceof Error ? err.stack : err}`);
        }
    };

    await attempt('close data store', async (): Promise<void> => {
        const ds: DataStore = component.get(DataStore);
        await ds.close();
    });
    await attempt('stop rest server', async (): Promise<void> => {
        const restServer: NNIRestServer = component.get(NNIRestServer);
        await restServer.stop();
    });
    // The logger is closed last so earlier steps can still write to it.
    await attempt('close log', (): void => {
        const log: Logger = getLogger();
        log.close();
    });
});
\ No newline at end of file
src/nni_manager/rest_server/restHandler.ts
View file @
761638d8
...
...
@@ -164,8 +164,6 @@ class NNIRestHandler {
await
this
.
tb
.
cleanUp
();
await
this
.
nniManager
.
stopExperiment
();
res
.
send
();
this
.
log
.
debug
(
'
Stopping rest server
'
);
await
this
.
restServer
.
stop
();
}
catch
(
err
)
{
this
.
handle_error
(
err
,
res
);
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
761638d8
...
...
@@ -26,7 +26,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
...
...
@@ -103,6 +103,7 @@ class LocalTrainingService implements TrainingService {
protected
log
:
Logger
;
protected
localTrailConfig
?:
TrialConfig
;
private
isMultiPhase
:
boolean
=
false
;
private
streams
:
Array
<
ts
.
Stream
>
;
constructor
()
{
this
.
eventEmitter
=
new
EventEmitter
();
...
...
@@ -112,6 +113,7 @@ class LocalTrainingService implements TrainingService {
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
trialSequenceId
=
-
1
;
this
.
streams
=
new
Array
<
ts
.
Stream
>
();
}
public
async
run
():
Promise
<
void
>
{
...
...
@@ -295,7 +297,9 @@ class LocalTrainingService implements TrainingService {
/**
 * Mark the service as stopping and destroy every tail stream that has
 * been recorded in this.streams.
 *
 * @returns an already-resolved promise
 */
public cleanUp(): Promise<void> {
    this.stopping = true;
    this.streams.forEach((tailStream: ts.Stream): void => {
        tailStream.destroy();
    });

    return Promise.resolve();
}
...
...
@@ -382,6 +386,7 @@ class LocalTrainingService implements TrainingService {
buffer
=
remain
;
}
});
this
.
streams
.
push
(
stream
);
}
private
async
runHostJob
(
form
:
HostJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
761638d8
...
...
@@ -136,7 +136,7 @@ export namespace HDFSClientUtility {
let
timeoutId
:
NodeJS
.
Timer
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
timeoutId
=
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
]).
finally
(()
=>
clearTimeout
(
timeoutId
));
...
...
src/nni_manager/types/tail-stream/index.d.ts
View file @
761638d8
/**
 * Minimal ambient typings for the 'tail-stream' package; only the members
 * declared here are described.
 */
declare module 'tail-stream' {
    /** Stream handle returned by createReadStream. */
    export interface Stream {
        /** Register a callback for 'data' events; the callback receives a Buffer. */
        on(type: 'data', callback: (data: Buffer) => void): void;
        /** Destroy the stream. */
        destroy(): void;
    }

    /** Create a Stream for the given path. */
    export function createReadStream(path: string): Stream;
}
\ No newline at end of file
tools/nnicmd/nnictl_utils.py
View file @
761638d8
...
...
@@ -190,7 +190,7 @@ def stop_experiment(args):
time
.
sleep
(
3
)
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
if
rest_pid
:
stop_rest_cmds
=
[
'
p
kill'
,
'-P'
,
str
(
rest_pid
)]
stop_rest_cmds
=
[
'kill'
,
str
(
rest_pid
)]
call
(
stop_rest_cmds
)
tensorboard_pid_list
=
nni_config
.
get_config
(
'tensorboardPidList'
)
if
tensorboard_pid_list
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment