Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
99f7d79c
Unverified
Commit
99f7d79c
authored
Sep 26, 2019
by
SparkSnail
Committed by
GitHub
Sep 26, 2019
Browse files
Support experiment view (#1524)
parent
0b7d6260
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
208 additions
and
145 deletions
+208
-145
docs/en_US/Tutorial/Nnictl.md
docs/en_US/Tutorial/Nnictl.md
+30
-0
src/nni_manager/common/experimentStartupInfo.ts
src/nni_manager/common/experimentStartupInfo.ts
+19
-5
src/nni_manager/common/log.ts
src/nni_manager/common/log.ts
+18
-8
src/nni_manager/common/manager.ts
src/nni_manager/common/manager.ts
+6
-2
src/nni_manager/core/nnimanager.ts
src/nni_manager/core/nnimanager.ts
+23
-3
src/nni_manager/main.ts
src/nni_manager/main.ts
+16
-8
src/nni_manager/rest_server/restHandler.ts
src/nni_manager/rest_server/restHandler.ts
+17
-17
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+72
-101
tools/nni_cmd/nnictl.py
tools/nni_cmd/nnictl.py
+7
-1
No files found.
docs/en_US/Tutorial/Nnictl.md
View file @
99f7d79c
...
...
@@ -10,6 +10,7 @@ nnictl support commands:
*
[
nnictl create
](
#create
)
*
[
nnictl resume
](
#resume
)
*
[
nnictl view
](
#view
)
*
[
nnictl stop
](
#stop
)
*
[
nnictl update
](
#update
)
*
[
nnictl trial
](
#trial
)
...
...
@@ -104,6 +105,35 @@ Debug mode will disable version check function in Trialkeeper.
nnictl resume
[
experiment_id]
--port
8088
```
<a
name=
"view"
></a>

`nnictl view`
*
Description
You can use this command to view a stopped experiment.
*
Usage
```
bash
nnictl view
[
OPTIONS]
```
*
Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| True| |The id of the experiment you want to view|
|--port, -p| False| |Rest port of the experiment you want to view|
*
Example
> view an experiment with specified port 8088
```
bash
nnictl view
[
experiment_id]
--port
8088
```
<a
name=
"stop"
></a>

`nnictl stop`
...
...
src/nni_manager/common/experimentStartupInfo.ts
View file @
99f7d79c
...
...
@@ -33,11 +33,11 @@ class ExperimentStartupInfo {
private
initTrialSequenceID
:
number
=
0
;
private
logDir
:
string
=
''
;
private
logLevel
:
string
=
''
;
private
readonly
:
boolean
=
false
;
public
setStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
):
void
{
public
setStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
,
readonly
?:
boolean
):
void
{
assert
(
!
this
.
initialized
);
assert
(
experimentId
.
trim
().
length
>
0
);
this
.
newExperiment
=
newExperiment
;
this
.
experimentId
=
experimentId
;
this
.
basePort
=
basePort
;
...
...
@@ -52,6 +52,10 @@ class ExperimentStartupInfo {
if
(
logLevel
!==
undefined
&&
logLevel
.
length
>
1
)
{
this
.
logLevel
=
logLevel
;
}
if
(
readonly
!==
undefined
)
{
this
.
readonly
=
readonly
;
}
}
public
getExperimentId
():
string
{
...
...
@@ -84,6 +88,12 @@ class ExperimentStartupInfo {
return
this
.
logLevel
;
}
public
isReadonly
():
boolean
{
assert
(
this
.
initialized
);
return
this
.
readonly
;
}
public
setInitTrialSequenceId
(
initSequenceId
:
number
):
void
{
assert
(
this
.
initialized
);
this
.
initTrialSequenceID
=
initSequenceId
;
...
...
@@ -121,10 +131,14 @@ function getExperimentStartupInfo(): ExperimentStartupInfo {
}
function
setExperimentStartupInfo
(
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
):
void
{
newExperiment
:
boolean
,
experimentId
:
string
,
basePort
:
number
,
logDir
?:
string
,
logLevel
?:
string
,
readonly
?:
boolean
):
void
{
component
.
get
<
ExperimentStartupInfo
>
(
ExperimentStartupInfo
)
.
setStartupInfo
(
newExperiment
,
experimentId
,
basePort
,
logDir
,
logLevel
);
.
setStartupInfo
(
newExperiment
,
experimentId
,
basePort
,
logDir
,
logLevel
,
readonly
);
}
/**
 * Tell whether the current experiment is in readonly mode.
 *
 * Delegates to the ExperimentStartupInfo instance obtained from the component
 * container and returns whatever its isReadonly() method reports.
 */
function isReadonly(): boolean {
    const startupInfo: ExperimentStartupInfo = component.get<ExperimentStartupInfo>(ExperimentStartupInfo);

    return startupInfo.isReadonly();
}
export
{
ExperimentStartupInfo
,
getBasePort
,
getExperimentId
,
isNewExperiment
,
getExperimentStartupInfo
,
setExperimentStartupInfo
,
setInitTrialSequenceId
,
getInitTrialSequenceId
};
setExperimentStartupInfo
,
setInitTrialSequenceId
,
getInitTrialSequenceId
,
isReadonly
};
src/nni_manager/common/log.ts
View file @
99f7d79c
...
...
@@ -26,7 +26,7 @@ import { Writable } from 'stream';
import
{
WritableStreamBuffer
}
from
'
stream-buffers
'
;
import
{
format
}
from
'
util
'
;
import
*
as
component
from
'
../common/component
'
;
import
{
getExperimentStartupInfo
}
from
'
./experimentStartupInfo
'
;
import
{
getExperimentStartupInfo
,
isReadonly
}
from
'
./experimentStartupInfo
'
;
import
{
getLogDir
}
from
'
./utils
'
;
const
FATAL
:
number
=
1
;
...
...
@@ -76,6 +76,7 @@ class Logger {
private
level
:
number
=
INFO
;
private
bufferSerialEmitter
:
BufferSerialEmitter
;
private
writable
:
Writable
;
private
readonly
:
boolean
=
false
;
constructor
(
fileName
?:
string
)
{
let
logFile
:
string
|
undefined
=
fileName
;
...
...
@@ -95,6 +96,8 @@ class Logger {
if
(
logLevel
!==
undefined
)
{
this
.
level
=
logLevel
;
}
this
.
readonly
=
isReadonly
();
}
public
close
()
{
...
...
@@ -134,14 +137,21 @@ class Logger {
public
fatal
(...
param
:
any
[]):
void
{
this
.
log
(
'
FATAL
'
,
param
);
}
/**
* if the experiment is not in readonly mode, write log content to stream
* @param level log level
* @param param the params to be written
*/
private
log
(
level
:
string
,
param
:
any
[]):
void
{
const
buffer
:
WritableStreamBuffer
=
new
WritableStreamBuffer
();
buffer
.
write
(
`[
${(
new
Date
()).
toLocaleString
()}
]
${
level
}
`
);
buffer
.
write
(
format
(
param
));
buffer
.
write
(
'
\n
'
);
buffer
.
end
();
this
.
bufferSerialEmitter
.
feed
(
buffer
.
getContents
());
if
(
!
this
.
readonly
)
{
const
buffer
:
WritableStreamBuffer
=
new
WritableStreamBuffer
();
buffer
.
write
(
`[
${(
new
Date
()).
toLocaleString
()}
]
${
level
}
`
);
buffer
.
write
(
format
(
param
));
buffer
.
write
(
'
\n
'
);
buffer
.
end
();
this
.
bufferSerialEmitter
.
feed
(
buffer
.
getContents
());
}
}
}
...
...
src/nni_manager/common/manager.ts
View file @
99f7d79c
...
...
@@ -24,6 +24,10 @@ import { TrialJobStatus } from './trainingService';
type
ProfileUpdateType
=
'
TRIAL_CONCURRENCY
'
|
'
MAX_EXEC_DURATION
'
|
'
SEARCH_SPACE
'
|
'
MAX_TRIAL_NUM
'
;
type
ExperimentStatus
=
'
INITIALIZED
'
|
'
RUNNING
'
|
'
ERROR
'
|
'
STOPPING
'
|
'
STOPPED
'
|
'
DONE
'
|
'
NO_MORE_TRIAL
'
|
'
TUNER_NO_MORE_TRIAL
'
;
namespace
ExperimentStartUpMode
{
export
const
NEW
=
'
new
'
;
export
const
RESUME
=
'
resume
'
;
}
interface
ExperimentParams
{
authorName
:
string
;
...
...
@@ -95,7 +99,7 @@ interface NNIManagerStatus {
abstract
class
Manager
{
public
abstract
startExperiment
(
experimentParams
:
ExperimentParams
):
Promise
<
string
>
;
public
abstract
resumeExperiment
():
Promise
<
void
>
;
public
abstract
resumeExperiment
(
readonly
:
boolean
):
Promise
<
void
>
;
public
abstract
stopExperiment
():
Promise
<
void
>
;
public
abstract
getExperimentProfile
():
Promise
<
ExperimentProfile
>
;
public
abstract
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
;
...
...
@@ -115,4 +119,4 @@ abstract class Manager {
public
abstract
getStatus
():
NNIManagerStatus
;
}
export
{
Manager
,
ExperimentParams
,
ExperimentProfile
,
TrialJobStatistics
,
ProfileUpdateType
,
NNIManagerStatus
,
ExperimentStatus
};
export
{
Manager
,
ExperimentParams
,
ExperimentProfile
,
TrialJobStatistics
,
ProfileUpdateType
,
NNIManagerStatus
,
ExperimentStatus
,
ExperimentStartUpMode
};
src/nni_manager/core/nnimanager.ts
View file @
99f7d79c
...
...
@@ -59,6 +59,7 @@ class NNIManager implements Manager {
private
waitingTrials
:
string
[];
private
trialJobs
:
Map
<
string
,
TrialJobDetail
>
;
private
trialDataForTuner
:
string
;
private
readonly
:
boolean
;
private
trialJobMetricListener
:
(
metric
:
TrialJobMetric
)
=>
void
;
...
...
@@ -72,6 +73,7 @@ class NNIManager implements Manager {
this
.
waitingTrials
=
[];
this
.
trialJobs
=
new
Map
<
string
,
TrialJobDetail
>
();
this
.
trialDataForTuner
=
''
;
this
.
readonly
=
false
;
this
.
log
=
getLogger
();
this
.
dataStore
=
component
.
get
(
DataStore
);
...
...
@@ -88,6 +90,9 @@ class NNIManager implements Manager {
}
public
updateExperimentProfile
(
experimentProfile
:
ExperimentProfile
,
updateType
:
ProfileUpdateType
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not update experiment profile in readonly mode!
'
));
}
switch
(
updateType
)
{
case
'
TRIAL_CONCURRENCY
'
:
this
.
updateTrialConcurrency
(
experimentProfile
.
params
.
trialConcurrency
);
...
...
@@ -109,6 +114,9 @@ class NNIManager implements Manager {
}
public
importData
(
data
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not import data in readonly mode!
'
));
}
if
(
this
.
dispatcher
===
undefined
)
{
return
Promise
.
reject
(
new
Error
(
'
tuner has not been setup
'
)
...
...
@@ -124,6 +132,9 @@ class NNIManager implements Manager {
}
public
addCustomizedTrialJob
(
hyperParams
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not add customized trial job in readonly mode!
'
));
}
if
(
this
.
currSubmittedTrialNum
>=
this
.
experimentProfile
.
params
.
maxTrialNum
)
{
return
Promise
.
reject
(
new
Error
(
'
reach maxTrialNum
'
)
...
...
@@ -136,6 +147,9 @@ class NNIManager implements Manager {
}
public
async
cancelTrialJobByUser
(
trialJobId
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not cancel trial job in readonly mode!
'
));
}
this
.
log
.
info
(
`User cancelTrialJob:
${
trialJobId
}
`
);
await
this
.
trainingService
.
cancelTrialJob
(
trialJobId
);
await
this
.
dataStore
.
storeTrialJobEvent
(
'
USER_TO_CANCEL
'
,
trialJobId
,
''
);
...
...
@@ -180,13 +194,16 @@ class NNIManager implements Manager {
return
this
.
experimentProfile
.
id
;
}
public
async
resumeExperiment
():
Promise
<
void
>
{
public
async
resumeExperiment
(
readonly
:
boolean
):
Promise
<
void
>
{
this
.
log
.
info
(
`Resuming experiment:
${
this
.
experimentProfile
.
id
}
`
);
//Fetch back the experiment profile
const
experimentId
:
string
=
getExperimentId
();
this
.
experimentProfile
=
await
this
.
dataStore
.
getExperimentProfile
(
experimentId
);
this
.
readonly
=
readonly
;
if
(
readonly
)
{
return
Promise
.
resolve
();
}
const
expParams
:
ExperimentParams
=
this
.
experimentProfile
.
params
;
setInitTrialSequenceId
(
this
.
experimentProfile
.
maxSequenceId
+
1
);
// Set up multiphase config
...
...
@@ -196,7 +213,7 @@ class NNIManager implements Manager {
// Set up versionCheck config
if
(
expParams
.
versionCheck
!==
undefined
)
{
this
.
trainingService
.
setClusterMetadata
(
'
version
C
heck
'
,
expParams
.
versionCheck
.
toString
());
this
.
trainingService
.
setClusterMetadata
(
'
version
_c
heck
'
,
expParams
.
versionCheck
.
toString
());
}
const
dispatcherCommand
:
string
=
getMsgDispatcherCommand
(
expParams
.
tuner
,
expParams
.
assessor
,
expParams
.
advisor
,
...
...
@@ -247,6 +264,9 @@ class NNIManager implements Manager {
}
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
if
(
this
.
readonly
)
{
return
Promise
.
reject
(
new
Error
(
'
Error: can not set cluster metadata in readonly mode!
'
));
}
this
.
log
.
info
(
`NNIManager setClusterMetadata, key:
${
key
}
, value:
${
value
}
`
);
let
timeoutId
:
NodeJS
.
Timer
;
// TO DO: move timeout value to constants file
...
...
src/nni_manager/main.ts
View file @
99f7d79c
...
...
@@ -26,7 +26,7 @@ import * as component from './common/component';
import
{
Database
,
DataStore
}
from
'
./common/datastore
'
;
import
{
setExperimentStartupInfo
}
from
'
./common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
,
logLevelNameMap
}
from
'
./common/log
'
;
import
{
Manager
}
from
'
./common/manager
'
;
import
{
Manager
,
ExperimentStartUpMode
}
from
'
./common/manager
'
;
import
{
TrainingService
}
from
'
./common/trainingService
'
;
import
{
getLogDir
,
mkDirP
,
parseArg
,
uniqueString
}
from
'
./common/utils
'
;
import
{
NNIDataStore
}
from
'
./core/nniDataStore
'
;
...
...
@@ -43,10 +43,10 @@ import {
function
initStartupInfo
(
startExpMode
:
string
,
resumeExperimentId
:
string
,
basePort
:
number
,
logDirectory
:
string
,
experimentLogLevel
:
string
):
void
{
const
createNew
:
boolean
=
(
startExpMode
===
'
new
'
);
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
):
void
{
const
createNew
:
boolean
=
(
startExpMode
===
ExperimentStartUpMode
.
NEW
);
const
expId
:
string
=
createNew
?
uniqueString
(
8
)
:
resumeExperimentId
;
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
logDirectory
,
experimentLogLevel
);
setExperimentStartupInfo
(
createNew
,
expId
,
basePort
,
logDirectory
,
experimentLogLevel
,
readonly
);
}
async
function
initContainer
(
platformMode
:
string
):
Promise
<
void
>
{
...
...
@@ -108,15 +108,15 @@ if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode
}
const
startMode
:
string
=
parseArg
([
'
--start_mode
'
,
'
-s
'
]);
if
(
!
[
'
new
'
,
'
resume
'
].
includes
(
startMode
))
{
if
(
!
[
ExperimentStartUpMode
.
NEW
,
ExperimentStartUpMode
.
RESUME
].
includes
(
startMode
))
{
console
.
log
(
`FATAL: unknown start_mode:
${
startMode
}
`
);
usage
();
process
.
exit
(
1
);
}
const
experimentId
:
string
=
parseArg
([
'
--experiment_id
'
,
'
-id
'
]);
if
(
startMode
===
'
resume
'
&&
experimentId
.
trim
().
length
<
1
)
{
console
.
log
(
`FATAL: cannot resume experiment, invalid experiment_id:
${
experimentId
}
`
);
if
(
(
startMode
===
ExperimentStartUpMode
.
RESUME
)
&&
experimentId
.
trim
().
length
<
1
)
{
console
.
log
(
`FATAL: cannot resume
the
experiment, invalid experiment_id:
${
experimentId
}
`
);
usage
();
process
.
exit
(
1
);
}
...
...
@@ -133,7 +133,15 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
console
.
log
(
`FATAL: invalid log_level:
${
logLevel
}
`
);
}
initStartupInfo
(
startMode
,
experimentId
,
port
,
logDir
,
logLevel
);
// Parse the optional --readonly/-r flag.
// The previous check, `('true' || 'false').includes(arg)`, collapsed to the
// substring test `'true'.includes(arg)`: it rejected an explicit "false" and
// accepted fragments such as "t" or "ru". Validate against the allowed values
// instead. The empty string stays accepted (the old check let it through as
// well), so invocations that do not pass the flag behave as before.
// NOTE(review): presumably parseArg returns '' when the flag is absent - confirm.
const readonlyArg: string = parseArg(['--readonly', '-r']);
if (!['', 'true', 'false'].includes(readonlyArg.toLowerCase())) {
    console.log(`FATAL: readonly property should only be true or false`);
    usage();
    process.exit(1);
}
// Anything other than a case-insensitive "true" means a normal, writable experiment.
const readonly = readonlyArg.toLowerCase() === 'true';
initStartupInfo(startMode, experimentId, port, logDir, logLevel, readonly);
mkDirP
(
getLogDir
())
.
then
(
async
()
=>
{
...
...
src/nni_manager/rest_server/restHandler.ts
View file @
99f7d79c
...
...
@@ -25,9 +25,9 @@ import * as path from 'path';
import
*
as
component
from
'
../common/component
'
;
import
{
DataStore
,
MetricDataRecord
,
TrialJobInfo
}
from
'
../common/datastore
'
;
import
{
NNIError
,
NNIErrorNames
}
from
'
../common/errors
'
;
import
{
isNewExperiment
}
from
'
../common/experimentStartupInfo
'
;
import
{
isNewExperiment
,
isReadonly
}
from
'
../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../common/log
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
}
from
'
../common/manager
'
;
import
{
ExperimentProfile
,
Manager
,
TrialJobStatistics
,
ExperimentStartUpMode
}
from
'
../common/manager
'
;
import
{
ValidationSchemas
}
from
'
./restValidationSchemas
'
;
import
{
NNIRestServer
}
from
'
./nniRestServer
'
;
import
{
getVersion
}
from
'
../common/utils
'
;
...
...
@@ -86,11 +86,11 @@ class NNIRestHandler {
return
router
;
}
private
handle_error
(
err
:
Error
,
res
:
Response
,
isFatal
:
boolean
=
false
):
void
{
private
handle_error
(
err
:
Error
,
res
:
Response
,
isFatal
:
boolean
=
false
,
errorCode
:
number
=
500
):
void
{
if
(
err
instanceof
NNIError
&&
err
.
name
===
NNIErrorNames
.
NOT_FOUND
)
{
res
.
status
(
404
);
}
else
{
res
.
status
(
500
);
res
.
status
(
errorCode
);
}
res
.
send
({
error
:
err
.
message
...
...
@@ -169,13 +169,13 @@ class NNIRestHandler {
this
.
handle_error
(
err
,
res
);
});
}
else
{
this
.
nniManager
.
resumeExperiment
().
then
(()
=>
{
this
.
nniManager
.
resumeExperiment
(
isReadonly
()
).
then
(()
=>
{
res
.
send
();
}).
catch
((
err
:
Error
)
=>
{
// Resuming the experiment is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
err
,
res
);
});
}
}
});
}
...
...
@@ -193,18 +193,18 @@ class NNIRestHandler {
router
.
put
(
'
/experiment/cluster-metadata
'
,
expressJoi
(
ValidationSchemas
.
SETCLUSTERMETADATA
),
async
(
req
:
Request
,
res
:
Response
)
=>
{
// tslint:disable-next-line:no-any
const
metadata
:
any
=
req
.
body
;
const
keys
:
string
[]
=
Object
.
keys
(
metadata
);
try
{
for
(
const
key
of
keys
)
{
await
this
.
nniManager
.
setClusterMetadata
(
key
,
JSON
.
stringify
(
metadata
[
key
]));
// tslint:disable-next-line:no-any
const
metadata
:
any
=
req
.
body
;
const
keys
:
string
[]
=
Object
.
keys
(
metadata
);
try
{
for
(
const
key
of
keys
)
{
await
this
.
nniManager
.
setClusterMetadata
(
key
,
JSON
.
stringify
(
metadata
[
key
]));
}
res
.
send
();
}
catch
(
err
)
{
// setClusterMetadata is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
NNIError
.
FromError
(
err
),
res
,
true
);
}
res
.
send
();
}
catch
(
err
)
{
// setClusterMetadata is a step of initialization, so any exception thrown is fatal
this
.
handle_error
(
NNIError
.
FromError
(
err
),
res
,
true
);
}
});
}
...
...
tools/nni_cmd/launcher.py
View file @
99f7d79c
...
...
@@ -118,12 +118,17 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
node_command
=
'node'
if
sys
.
platform
==
'win32'
:
node_command
=
os
.
path
.
join
(
entry_dir
[:
-
3
],
'Scripts'
,
'node.exe'
)
cmds
=
[
node_command
,
entry_file
,
'--port'
,
str
(
port
),
'--mode'
,
platform
,
'--start_mode'
,
mode
]
cmds
=
[
node_command
,
entry_file
,
'--port'
,
str
(
port
),
'--mode'
,
platform
]
if
mode
==
'view'
:
cmds
+=
[
'--start_mode'
,
'resume'
]
cmds
+=
[
'--readonly'
,
'true'
]
else
:
cmds
+=
[
'--start_mode'
,
mode
]
if
log_dir
is
not
None
:
cmds
+=
[
'--log_dir'
,
log_dir
]
if
log_level
is
not
None
:
cmds
+=
[
'--log_level'
,
log_level
]
if
mode
==
'resume'
:
if
mode
in
[
'resume'
,
'view'
]
:
cmds
+=
[
'--experiment_id'
,
experiment_id
]
stdout_full_path
,
stderr_full_path
=
get_log_path
(
config_file_name
)
with
open
(
stdout_full_path
,
'a+'
)
as
stdout_file
,
open
(
stderr_full_path
,
'a+'
)
as
stderr_file
:
...
...
@@ -156,7 +161,6 @@ def set_trial_config(experiment_config, port, config_file_name):
def
set_local_config
(
experiment_config
,
port
,
config_file_name
):
'''set local configuration'''
#set machine_list
request_data
=
dict
()
if
experiment_config
.
get
(
'localConfig'
):
request_data
[
'local_config'
]
=
experiment_config
[
'localConfig'
]
...
...
@@ -177,7 +181,7 @@ def set_local_config(experiment_config, port, config_file_name):
fout
.
write
(
json
.
dumps
(
json
.
loads
(
err_message
),
indent
=
4
,
sort_keys
=
True
,
separators
=
(
','
,
':'
)))
return
False
,
err_message
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
)
return
set_trial_config
(
experiment_config
,
port
,
config_file_name
)
,
None
def
set_remote_config
(
experiment_config
,
port
,
config_file_name
):
'''Call setClusterMetadata to pass trial'''
...
...
@@ -345,7 +349,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{
'key'
:
'frameworkcontroller_config'
,
'value'
:
experiment_config
[
'frameworkcontrollerConfig'
]})
request_data
[
'clusterMetaData'
].
append
(
{
'key'
:
'trial_config'
,
'value'
:
experiment_config
[
'trial'
]})
response
=
rest_post
(
experiment_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
,
show_error
=
True
)
if
check_response
(
response
):
return
response
...
...
@@ -357,6 +360,33 @@ def set_experiment(experiment_config, mode, port, config_file_name):
print_error
(
'Setting experiment error, error message is {}'
.
format
(
response
.
text
))
return
None
def set_platform_config(platform, experiment_config, port, config_file_name, rest_process):
    '''Call the platform-specific set_*_config function and report the result.

    platform: training service platform name; one of 'local', 'remote', 'pai',
        'kubeflow' or 'frameworkcontroller'.
    experiment_config, port, config_file_name: forwarded unchanged to the
        platform-specific setter.
    rest_process: process object whose pid is passed to kill_command when the
        configuration step fails.

    Returns None when the setter reports success. On failure the error message
    is printed, kill_command is invoked on the rest process and the program
    exits with status 1.

    Raises Exception for an unsupported platform, or when kill_command itself
    raises while cleaning up after a failed configuration.
    '''
    print_normal('Setting {0} config...'.format(platform))
    # Every setter takes the same three arguments and yields (result, err_msg),
    # so a dispatch table replaces the former if/elif chain.
    config_setters = {
        'local': set_local_config,
        'remote': set_remote_config,
        'pai': set_pai_config,
        'kubeflow': set_kubeflow_config,
        'frameworkcontroller': set_frameworkcontroller_config,
    }
    if platform not in config_setters:
        # The original had an unreachable exit(1) after this raise; dropped.
        raise Exception(ERROR_INFO % 'Unsupported platform!')
    config_result, err_msg = config_setters[platform](experiment_config, port, config_file_name)
    if config_result:
        print_normal('Successfully set {0} config!'.format(platform))
        return
    print_error('Failed! Error is: {}'.format(err_msg))
    try:
        kill_command(rest_process.pid)
    except Exception:
        raise Exception(ERROR_INFO % 'Rest server stopped!')
    exit(1)
def
launch_experiment
(
args
,
experiment_config
,
mode
,
config_file_name
,
experiment_id
=
None
):
'''follow steps to start rest server and start experiment'''
nni_config
=
Config
(
config_file_name
)
...
...
@@ -381,8 +411,10 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
exit
(
1
)
log_dir
=
experiment_config
[
'logDir'
]
if
experiment_config
.
get
(
'logDir'
)
else
None
log_level
=
experiment_config
[
'logLevel'
]
if
experiment_config
.
get
(
'logLevel'
)
else
None
if
log_level
not
in
[
'trace'
,
'debug'
]
and
(
args
.
debug
or
experiment_config
.
get
(
'debug'
)
is
True
):
log_level
=
'debug'
# view mode does not need the debug function: when viewing an experiment, no new logs are created
if
mode
!=
'view'
:
if
log_level
not
in
[
'trace'
,
'debug'
]
and
(
args
.
debug
or
experiment_config
.
get
(
'debug'
)
is
True
):
log_level
=
'debug'
# start rest server
rest_process
,
start_time
=
start_rest_server
(
args
.
port
,
experiment_config
[
'trainingServicePlatform'
],
mode
,
config_file_name
,
experiment_id
,
log_dir
,
log_level
)
nni_config
.
set_config
(
'restServerPid'
,
rest_process
.
pid
)
...
...
@@ -416,83 +448,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
# set remote config
if
experiment_config
[
'trainingServicePlatform'
]
==
'remote'
:
print_normal
(
'Setting remote config...'
)
config_result
,
err_msg
=
set_remote_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set remote config!'
)
else
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
# set local config
if
experiment_config
[
'trainingServicePlatform'
]
==
'local'
:
print_normal
(
'Setting local config...'
)
if
set_local_config
(
experiment_config
,
args
.
port
,
config_file_name
):
print_normal
(
'Successfully set local config!'
)
else
:
print_error
(
'Set local config failed!'
)
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Rest server stopped!'
)
exit
(
1
)
#set pai config
if
experiment_config
[
'trainingServicePlatform'
]
==
'pai'
:
print_normal
(
'Setting pai config...'
)
config_result
,
err_msg
=
set_pai_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set pai config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
#set kubeflow config
if
experiment_config
[
'trainingServicePlatform'
]
==
'kubeflow'
:
print_normal
(
'Setting kubeflow config...'
)
config_result
,
err_msg
=
set_kubeflow_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set kubeflow config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
#set frameworkcontroller config
if
experiment_config
[
'trainingServicePlatform'
]
==
'frameworkcontroller'
:
print_normal
(
'Setting frameworkcontroller config...'
)
config_result
,
err_msg
=
set_frameworkcontroller_config
(
experiment_config
,
args
.
port
,
config_file_name
)
if
config_result
:
print_normal
(
'Successfully set frameworkcontroller config!'
)
else
:
if
err_msg
:
print_error
(
'Failed! Error is: {}'
.
format
(
err_msg
))
try
:
kill_command
(
rest_process
.
pid
)
except
Exception
:
raise
Exception
(
ERROR_INFO
%
'Restful server stopped!'
)
exit
(
1
)
if
mode
!=
'view'
:
# set platform configuration
set_platform_config
(
experiment_config
[
'trainingServicePlatform'
],
experiment_config
,
args
.
port
,
config_file_name
,
rest_process
)
# start a new experiment
print_normal
(
'Starting experiment...'
)
# set debug configuration
if
experiment_config
.
get
(
'debug'
)
is
None
:
if
mode
!=
'view'
and
experiment_config
.
get
(
'debug'
)
is
None
:
experiment_config
[
'debug'
]
=
args
.
debug
response
=
set_experiment
(
experiment_config
,
mode
,
args
.
port
,
config_file_name
)
if
response
:
...
...
@@ -519,8 +482,23 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
print_normal
(
EXPERIMENT_SUCCESS_INFO
%
(
experiment_id
,
' '
.
join
(
web_ui_url_list
)))
def
resume_experiment
(
args
):
'''resume an experiment'''
def
create_experiment
(
args
):
'''start a new experiment'''
config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
nni_config
=
Config
(
config_file_name
)
config_path
=
os
.
path
.
abspath
(
args
.
config
)
if
not
os
.
path
.
exists
(
config_path
):
print_error
(
'Please set correct config path!'
)
exit
(
1
)
experiment_config
=
get_yml_content
(
config_path
)
validate_all_content
(
experiment_config
,
config_path
)
nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
launch_experiment
(
args
,
experiment_config
,
'new'
,
config_file_name
)
nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
def
manage_stopped_experiment
(
args
,
mode
):
'''view a stopped experiment'''
update_experiment
()
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
...
...
@@ -528,38 +506,31 @@ def resume_experiment(args):
experiment_endTime
=
None
#find the latest stopped experiment
if
not
args
.
id
:
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl
resume
{id}
\'
to
resume
a stopped experiment!
\n
'
\
'You could use
\'
nnictl experiment list --all
\'
to show all experiments!'
)
print_error
(
'Please set experiment id!
\n
You could use
\'
nnictl
{0}
{id}
\'
to
{0}
a stopped experiment!
\n
'
\
'You could use
\'
nnictl experiment list --all
\'
to show all experiments!'
.
format
(
mode
)
)
exit
(
1
)
else
:
if
experiment_dict
.
get
(
args
.
id
)
is
None
:
print_error
(
'Id %s not exist!'
%
args
.
id
)
exit
(
1
)
if
experiment_dict
[
args
.
id
][
'status'
]
!=
'STOPPED'
:
print_error
(
'Only stopped experiments can be
resumed!'
)
print_error
(
'Only stopped experiments can be
{0}ed!'
.
format
(
mode
)
)
exit
(
1
)
experiment_id
=
args
.
id
print_normal
(
'
Resuming
experiment
%s
...'
%
experiment_id
)
print_normal
(
'
{0}
experiment
{1}
...'
.
format
(
mode
,
experiment_id
)
)
nni_config
=
Config
(
experiment_dict
[
experiment_id
][
'fileName'
])
experiment_config
=
nni_config
.
get_config
(
'experimentConfig'
)
experiment_id
=
nni_config
.
get_config
(
'experimentId'
)
new_config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
new_nni_config
=
Config
(
new_config_file_name
)
new_nni_config
.
set_config
(
'experimentConfig'
,
experiment_config
)
launch_experiment
(
args
,
experiment_config
,
'resume'
,
new_config_file_name
,
experiment_id
)
launch_experiment
(
args
,
experiment_config
,
mode
,
new_config_file_name
,
experiment_id
)
new_nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
def
create_experiment
(
args
):
'''start a new experiment'''
config_file_name
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
nni_config
=
Config
(
config_file_name
)
config_path
=
os
.
path
.
abspath
(
args
.
config
)
if
not
os
.
path
.
exists
(
config_path
):
print_error
(
'Please set correct config path!'
)
exit
(
1
)
experiment_config
=
get_yml_content
(
config_path
)
validate_all_content
(
experiment_config
,
config_path
)
def view_experiment(args):
    '''Entry point of `nnictl view`: handle a stopped experiment in 'view' mode.'''
    manage_stopped_experiment(args, mode='view')
nni_config
.
set_config
(
'experimentConfig'
,
experiment
_config
)
launch_experiment
(
args
,
experiment_config
,
'new'
,
config_file_name
)
nni_config
.
set_config
(
'restServerPort'
,
args
.
port
)
def resume_experiment(args):
    '''Entry point of `nnictl resume`: handle a stopped experiment in 'resume' mode.'''
    manage_stopped_experiment(args, mode='resume')
\ No newline at end of file
tools/nni_cmd/nnictl.py
View file @
99f7d79c
...
...
@@ -21,7 +21,7 @@
import
argparse
import
pkg_resources
from
.launcher
import
create_experiment
,
resume_experiment
from
.launcher
import
create_experiment
,
resume_experiment
,
view_experiment
from
.updater
import
update_searchspace
,
update_concurrency
,
update_duration
,
update_trialnum
,
import_data
from
.nnictl_utils
import
*
from
.package_management
import
*
...
...
@@ -66,6 +66,12 @@ def parse_args():
parser_resume
.
add_argument
(
'--debug'
,
'-d'
,
action
=
'store_true'
,
help
=
' set debug mode'
)
parser_resume
.
set_defaults
(
func
=
resume_experiment
)
# parse view command
parser_resume
=
subparsers
.
add_parser
(
'view'
,
help
=
'view a stopped experiment'
)
parser_resume
.
add_argument
(
'id'
,
nargs
=
'?'
,
help
=
'The id of the experiment you want to view'
)
parser_resume
.
add_argument
(
'--port'
,
'-p'
,
default
=
DEFAULT_REST_PORT
,
dest
=
'port'
,
help
=
'the port of restful server'
)
parser_resume
.
set_defaults
(
func
=
view_experiment
)
# parse update command
parser_updater
=
subparsers
.
add_parser
(
'update'
,
help
=
'update the experiment'
)
#add subparsers for parser_updater
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment