Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
761638d8
"git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "54a89dd0e6292083d147cf65b8264453029cd43f"
Commit
761638d8
authored
Nov 01, 2018
by
Gems Guo
Committed by
goooxu
Nov 05, 2018
Browse files
Refactor close experiment implementation
parent
5a2721be
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
36 additions
and
21 deletions
+36
-21
src/nni_manager/common/log.ts
src/nni_manager/common/log.ts
+8
-2
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+8
-12
src/nni_manager/main.ts
src/nni_manager/main.ts
+10
-1
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+0
-2
src/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+7
-2
src/nni_manager/training_service/pai/hdfsClientUtility.ts
src/nni_manager/training_service/pai/hdfsClientUtility.ts
+1
-1
src/nni_manager/types/tail-stream/index.d.ts
src/nni_manager/types/tail-stream/index.d.ts
+1
-0
tools/nnicmd/nnictl_utils.py
tools/nnicmd/nnictl_utils.py
+1
-1
No files found.
src/nni_manager/common/log.ts
View file @
761638d8
...
@@ -70,17 +70,23 @@ class Logger {
...
@@ -70,17 +70,23 @@ class Logger {
private
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
private
DEFAULT_LOGFILE
:
string
=
path
.
join
(
getLogDir
(),
'
nnimanager.log
'
);
private
level
:
number
=
DEBUG
;
private
level
:
number
=
DEBUG
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
writble
:
Writable
;
constructor
(
fileName
?:
string
)
{
constructor
(
fileName
?:
string
)
{
let
logFile
:
string
|
undefined
=
fileName
;
let
logFile
:
string
|
undefined
=
fileName
;
if
(
logFile
===
undefined
)
{
if
(
logFile
===
undefined
)
{
logFile
=
this
.
DEFAULT_LOGFILE
;
logFile
=
this
.
DEFAULT_LOGFILE
;
}
}
this
.
bufferSerialEmitter
=
new
BufferSerialEmitter
(
fs
.
createWriteStream
(
logFile
,
{
this
.
writble
=
fs
.
createWriteStream
(
logFile
,
{
flags
:
'
a+
'
,
flags
:
'
a+
'
,
encoding
:
'
utf8
'
,
encoding
:
'
utf8
'
,
autoClose
:
true
autoClose
:
true
}));
});
this
.
bufferSerialEmitter
=
new
BufferSerialEmitter
(
this
.
writble
);
}
public
close
()
{
this
.
writble
.
destroy
();
}
}
public
debug
(...
param
:
any
[]):
void
{
public
debug
(...
param
:
any
[]):
void
{
...
...
src/nni_manager/core/nnimanager.ts
View file @
761638d8
...
@@ -35,7 +35,7 @@ import {
...
@@ -35,7 +35,7 @@ import {
import
{
import
{
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
TrainingService
,
TrialJobApplicationForm
,
TrialJobDetail
,
TrialJobMetric
,
TrialJobStatus
}
from
'
../common/trainingService
'
;
}
from
'
../common/trainingService
'
;
import
{
delay
,
getLogDir
,
getMsgDispatcherCommand
}
from
'
../common/utils
'
;
import
{
delay
,
getLogDir
,
getMsgDispatcherCommand
}
from
'
../common/utils
'
;
import
{
import
{
ADD_CUSTOMIZED_TRIAL_JOB
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
NO_MORE_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
ADD_CUSTOMIZED_TRIAL_JOB
,
KILL_TRIAL_JOB
,
NEW_TRIAL_JOB
,
NO_MORE_TRIAL_JOBS
,
REPORT_METRIC_DATA
,
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
REQUEST_TRIAL_JOBS
,
SEND_TRIAL_JOB_PARAMETER
,
TERMINATE
,
TRIAL_END
,
UPDATE_SEARCH_SPACE
...
@@ -123,7 +123,7 @@ class NNIManager implements Manager {
...
@@ -123,7 +123,7 @@ class NNIManager implements Manager {
this
.
log
.
debug
(
'
Setup tuner...
'
);
this
.
log
.
debug
(
'
Setup tuner...
'
);
// Set up multiphase config
// Set up multiphase config
if
(
expParams
.
multiPhase
&&
this
.
trainingService
.
isMultiPhaseJobSupported
)
{
if
(
expParams
.
multiPhase
&&
this
.
trainingService
.
isMultiPhaseJobSupported
)
{
this
.
trainingService
.
setClusterMetadata
(
'
multiPhase
'
,
expParams
.
multiPhase
.
toString
());
this
.
trainingService
.
setClusterMetadata
(
'
multiPhase
'
,
expParams
.
multiPhase
.
toString
());
}
}
...
@@ -217,10 +217,9 @@ class NNIManager implements Manager {
...
@@ -217,10 +217,9 @@ class NNIManager implements Manager {
return
this
.
dataStore
.
getTrialJobStatistics
();
return
this
.
dataStore
.
getTrialJobStatistics
();
}
}
public
stopExperiment
():
Promise
<
void
>
{
public
async
stopExperiment
():
Promise
<
void
>
{
this
.
status
.
status
=
'
STOPPING
'
;
this
.
status
.
status
=
'
STOPPING
'
;
await
this
.
experimentDoneCleanUp
();
return
Promise
.
resolve
();
}
}
public
async
getMetricData
(
trialJobId
?:
string
,
metricType
?:
MetricType
):
Promise
<
MetricDataRecord
[]
>
{
public
async
getMetricData
(
trialJobId
?:
string
,
metricType
?:
MetricType
):
Promise
<
MetricDataRecord
[]
>
{
...
@@ -342,7 +341,7 @@ class NNIManager implements Manager {
...
@@ -342,7 +341,7 @@ class NNIManager implements Manager {
private
async
periodicallyUpdateExecDuration
():
Promise
<
void
>
{
private
async
periodicallyUpdateExecDuration
():
Promise
<
void
>
{
let
count
:
number
=
1
;
let
count
:
number
=
1
;
for
(;
;
)
{
while
(
this
.
status
.
status
!==
'
STOPPING
'
)
{
await
delay
(
1000
*
1
);
// 1 seconds
await
delay
(
1000
*
1
);
// 1 seconds
if
(
this
.
status
.
status
===
'
EXPERIMENT_RUNNING
'
)
{
if
(
this
.
status
.
status
===
'
EXPERIMENT_RUNNING
'
)
{
this
.
experimentProfile
.
execDuration
+=
1
;
this
.
experimentProfile
.
execDuration
+=
1
;
...
@@ -396,10 +395,7 @@ class NNIManager implements Manager {
...
@@ -396,10 +395,7 @@ class NNIManager implements Manager {
throw
new
Error
(
'
Error: tuner has not been setup
'
);
throw
new
Error
(
'
Error: tuner has not been setup
'
);
}
}
let
allFinishedTrialJobNum
:
number
=
0
;
let
allFinishedTrialJobNum
:
number
=
0
;
for
(;
;)
{
while
(
this
.
status
.
status
!==
'
STOPPING
'
)
{
if
(
this
.
status
.
status
===
'
STOPPING
'
)
{
break
;
}
const
finishedTrialJobNum
:
number
=
await
this
.
requestTrialJobsStatus
();
const
finishedTrialJobNum
:
number
=
await
this
.
requestTrialJobsStatus
();
allFinishedTrialJobNum
+=
finishedTrialJobNum
;
allFinishedTrialJobNum
+=
finishedTrialJobNum
;
...
@@ -477,7 +473,7 @@ class NNIManager implements Manager {
...
@@ -477,7 +473,7 @@ class NNIManager implements Manager {
}
}
await
delay
(
1000
*
5
);
// 5 seconds
await
delay
(
1000
*
5
);
// 5 seconds
}
}
this
.
log
.
info
(
'
Experiment done, cleaning up...
'
);
this
.
log
.
info
(
'
Experiment done, cleaning up...
'
);
await
this
.
experimentDoneCleanUp
();
await
this
.
experimentDoneCleanUp
();
this
.
log
.
info
(
'
Experiment done.
'
);
this
.
log
.
info
(
'
Experiment done.
'
);
...
@@ -563,7 +559,7 @@ class NNIManager implements Manager {
...
@@ -563,7 +559,7 @@ class NNIManager implements Manager {
};
};
await
this
.
trainingService
.
updateTrialJob
(
tunerCommand
.
trial_job_id
,
trialJobForm
);
await
this
.
trainingService
.
updateTrialJob
(
tunerCommand
.
trial_job_id
,
trialJobForm
);
await
this
.
dataStore
.
storeTrialJobEvent
(
await
this
.
dataStore
.
storeTrialJobEvent
(
'
ADD_HYPERPARAMETER
'
,
tunerCommand
.
trial_job_id
,
content
,
undefined
);
'
ADD_HYPERPARAMETER
'
,
tunerCommand
.
trial_job_id
,
content
,
undefined
);
break
;
break
;
case
NO_MORE_TRIAL_JOBS
:
case
NO_MORE_TRIAL_JOBS
:
//this.trialJobsMaintainer.setNoMoreTrials();
//this.trialJobsMaintainer.setNoMoreTrials();
...
...
src/nni_manager/main.ts
View file @
761638d8
...
@@ -50,7 +50,7 @@ async function initContainer(platformMode: string): Promise<void> {
...
@@ -50,7 +50,7 @@ async function initContainer(platformMode: string): Promise<void> {
Container
.
bind
(
TrainingService
).
to
(
LocalTrainingServiceForGPU
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
TrainingService
).
to
(
LocalTrainingServiceForGPU
).
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
remote
'
)
{
}
else
if
(
platformMode
===
'
remote
'
)
{
Container
.
bind
(
TrainingService
).
to
(
RemoteMachineTrainingService
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
TrainingService
).
to
(
RemoteMachineTrainingService
).
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
pai
'
){
}
else
if
(
platformMode
===
'
pai
'
)
{
Container
.
bind
(
TrainingService
).
to
(
PAITrainingService
).
scope
(
Scope
.
Singleton
);
Container
.
bind
(
TrainingService
).
to
(
PAITrainingService
).
scope
(
Scope
.
Singleton
);
}
else
{
}
else
{
throw
new
Error
(
`Error: unsupported mode:
${
mode
}
`
);
throw
new
Error
(
`Error: unsupported mode:
${
mode
}
`
);
...
@@ -108,3 +108,12 @@ mkDirP(getLogDir()).then(async () => {
...
@@ -108,3 +108,12 @@ mkDirP(getLogDir()).then(async () => {
}).
catch
((
err
:
Error
)
=>
{
}).
catch
((
err
:
Error
)
=>
{
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
console
.
error
(
`Failed to create log dir:
${
err
.
stack
}
`
);
});
});
process
.
on
(
'
SIGTERM
'
,
async
()
=>
{
const
ds
:
DataStore
=
component
.
get
(
DataStore
);
await
ds
.
close
();
const
restServer
:
NNIRestServer
=
component
.
get
(
NNIRestServer
);
await
restServer
.
stop
();
const
log
:
Logger
=
getLogger
();
log
.
close
();
})
\ No newline at end of file
src/nni_manager/rest_server/restHandler.ts
View file @
761638d8
...
@@ -164,8 +164,6 @@ class NNIRestHandler {
...
@@ -164,8 +164,6 @@ class NNIRestHandler {
await
this
.
tb
.
cleanUp
();
await
this
.
tb
.
cleanUp
();
await
this
.
nniManager
.
stopExperiment
();
await
this
.
nniManager
.
stopExperiment
();
res
.
send
();
res
.
send
();
this
.
log
.
debug
(
'
Stopping rest server
'
);
await
this
.
restServer
.
stop
();
}
catch
(
err
)
{
}
catch
(
err
)
{
this
.
handle_error
(
err
,
res
);
this
.
handle_error
(
err
,
res
);
}
}
...
...
src/nni_manager/training_service/local/localTrainingService.ts
View file @
761638d8
...
@@ -26,7 +26,7 @@ import { EventEmitter } from 'events';
...
@@ -26,7 +26,7 @@ import { EventEmitter } from 'events';
import
*
as
fs
from
'
fs
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
path
from
'
path
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
*
as
ts
from
'
tail-stream
'
;
import
{
MethodNotImplementedError
,
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../../common/errors
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
getLogger
,
Logger
}
from
'
../../common/log
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfig
}
from
'
../common/trialConfig
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
import
{
TrialConfigMetadataKey
}
from
'
../common/trialConfigMetadataKey
'
;
...
@@ -103,6 +103,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -103,6 +103,7 @@ class LocalTrainingService implements TrainingService {
protected
log
:
Logger
;
protected
log
:
Logger
;
protected
localTrailConfig
?:
TrialConfig
;
protected
localTrailConfig
?:
TrialConfig
;
private
isMultiPhase
:
boolean
=
false
;
private
isMultiPhase
:
boolean
=
false
;
private
streams
:
Array
<
ts
.
Stream
>
;
constructor
()
{
constructor
()
{
this
.
eventEmitter
=
new
EventEmitter
();
this
.
eventEmitter
=
new
EventEmitter
();
...
@@ -112,6 +113,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -112,6 +113,7 @@ class LocalTrainingService implements TrainingService {
this
.
stopping
=
false
;
this
.
stopping
=
false
;
this
.
log
=
getLogger
();
this
.
log
=
getLogger
();
this
.
trialSequenceId
=
-
1
;
this
.
trialSequenceId
=
-
1
;
this
.
streams
=
new
Array
<
ts
.
Stream
>
();
}
}
public
async
run
():
Promise
<
void
>
{
public
async
run
():
Promise
<
void
>
{
...
@@ -295,7 +297,9 @@ class LocalTrainingService implements TrainingService {
...
@@ -295,7 +297,9 @@ class LocalTrainingService implements TrainingService {
public
cleanUp
():
Promise
<
void
>
{
public
cleanUp
():
Promise
<
void
>
{
this
.
stopping
=
true
;
this
.
stopping
=
true
;
for
(
const
stream
of
this
.
streams
)
{
stream
.
destroy
();
}
return
Promise
.
resolve
();
return
Promise
.
resolve
();
}
}
...
@@ -382,6 +386,7 @@ class LocalTrainingService implements TrainingService {
...
@@ -382,6 +386,7 @@ class LocalTrainingService implements TrainingService {
buffer
=
remain
;
buffer
=
remain
;
}
}
});
});
this
.
streams
.
push
(
stream
);
}
}
private
async
runHostJob
(
form
:
HostJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
private
async
runHostJob
(
form
:
HostJobApplicationForm
):
Promise
<
TrialJobDetail
>
{
...
...
src/nni_manager/training_service/pai/hdfsClientUtility.ts
View file @
761638d8
...
@@ -136,7 +136,7 @@ export namespace HDFSClientUtility {
...
@@ -136,7 +136,7 @@ export namespace HDFSClientUtility {
let
timeoutId
:
NodeJS
.
Timer
let
timeoutId
:
NodeJS
.
Timer
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
const
delayTimeout
:
Promise
<
boolean
>
=
new
Promise
<
boolean
>
((
resolve
:
Function
,
reject
:
Function
)
:
void
=>
{
// Set timeout and reject the promise once reach timeout (5 seconds)
// Set timeout and reject the promise once reach timeout (5 seconds)
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
timeoutId
=
setTimeout
(()
=>
deferred
.
reject
(
`Check HDFS path
${
hdfsPath
}
exists timeout`
),
5000
);
});
});
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
]).
finally
(()
=>
clearTimeout
(
timeoutId
));
return
Promise
.
race
([
deferred
.
promise
,
delayTimeout
]).
finally
(()
=>
clearTimeout
(
timeoutId
));
...
...
src/nni_manager/types/tail-stream/index.d.ts
View file @
761638d8
declare
module
'
tail-stream
'
{
declare
module
'
tail-stream
'
{
export
interface
Stream
{
export
interface
Stream
{
on
(
type
:
'
data
'
,
callback
:
(
data
:
Buffer
)
=>
void
):
void
;
on
(
type
:
'
data
'
,
callback
:
(
data
:
Buffer
)
=>
void
):
void
;
destroy
():
void
;
}
}
export
function
createReadStream
(
path
:
string
):
Stream
;
export
function
createReadStream
(
path
:
string
):
Stream
;
}
}
\ No newline at end of file
tools/nnicmd/nnictl_utils.py
View file @
761638d8
...
@@ -190,7 +190,7 @@ def stop_experiment(args):
...
@@ -190,7 +190,7 @@ def stop_experiment(args):
time
.
sleep
(
3
)
time
.
sleep
(
3
)
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
if
rest_pid
:
if
rest_pid
:
stop_rest_cmds
=
[
'
p
kill'
,
'-P'
,
str
(
rest_pid
)]
stop_rest_cmds
=
[
'kill'
,
str
(
rest_pid
)]
call
(
stop_rest_cmds
)
call
(
stop_rest_cmds
)
tensorboard_pid_list
=
nni_config
.
get_config
(
'tensorboardPidList'
)
tensorboard_pid_list
=
nni_config
.
get_config
(
'tensorboardPidList'
)
if
tensorboard_pid_list
:
if
tensorboard_pid_list
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment